{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1282.75, "completions/mean_length": 1749.984375, "completions/mean_terminated_length": 972.3125, "completions/min_length": 1233.0, "completions/min_terminated_length": 721.0, "epoch": 0.0005, "grad_norm": 0.2678317427635193, "kl": 0.0005869865417480469, "learning_rate": 0.0, "loss": 0.121, "num_tokens": 120639.0, "reward": 0.18359375, "reward_std": 0.044233137741684914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.08846627920866013, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1427.75, "completions/mean_length": 1939.875, "completions/mean_terminated_length": 1257.1795043945312, "completions/min_length": 1429.25, "completions/min_terminated_length": 917.25, "epoch": 0.001, "grad_norm": 0.20804205536842346, "kl": 0.0005259513854980469, "learning_rate": 5e-09, "loss": 0.0152, "num_tokens": 252423.0, "reward": 0.19877984374761581, "reward_std": 0.12046680599451065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012475860305130482, "rewards/penalized_accuracy_reward/std": 0.04990344122052193, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1243.0, 
"completions/mean_length": 1852.25, "completions/mean_terminated_length": 1075.2083435058594, "completions/min_length": 1393.5, "completions/min_terminated_length": 881.5, "epoch": 0.0015, "grad_norm": 0.2779929041862488, "kl": 0.0005402565002441406, "learning_rate": 1e-08, "loss": 0.0562, "num_tokens": 381111.0, "reward": 0.171875, "reward_std": 0.05849890783429146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.11699781939387321, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 1825.5, "completions/max_terminated_length": 1270.0, "completions/mean_length": 1620.5625, "completions/mean_terminated_length": 1053.3401489257812, "completions/min_length": 1391.75, "completions/min_terminated_length": 879.75, "epoch": 0.002, "grad_norm": 0.24853765964508057, "kl": 0.000507354736328125, "learning_rate": 1.5e-08, "loss": 0.0122, "num_tokens": 494059.0, "reward": 0.24528497457504272, "reward_std": 0.14659234508872032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024986235424876213, "rewards/penalized_accuracy_reward/std": 0.06827539205551147, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.25, "completions/mean_length": 1686.328125, "completions/mean_terminated_length": 1338.6002807617188, "completions/min_length": 821.25, "completions/min_terminated_length": 821.25, "epoch": 0.0025, "grad_norm": 0.2926381230354309, "kl": 0.0006008148193359375, "learning_rate": 2e-08, "loss": 0.1818, "num_tokens": 610576.0, "reward": 0.2700623571872711, "reward_std": 0.21374760195612907, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03737492859363556, "rewards/penalized_accuracy_reward/std": 0.0803537368774414, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.12444132193922997, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1526.5, "completions/mean_length": 1830.90625, "completions/mean_terminated_length": 1295.8541870117188, "completions/min_length": 1022.5, "completions/min_terminated_length": 1022.5, "epoch": 0.003, "grad_norm": 0.2165934443473816, "kl": 0.0004878044128417969, "learning_rate": 2.5e-08, "loss": 0.1206, "num_tokens": 736426.0, "reward": 0.2065669298171997, "reward_std": 0.1528809405863285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012463153339922428, "rewards/penalized_accuracy_reward/std": 0.04985261708498001, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.1287429817020893, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 1737.25, "completions/max_terminated_length": 1307.75, "completions/mean_length": 1412.640625, "completions/mean_terminated_length": 990.1302032470703, "completions/min_length": 697.25, "completions/min_terminated_length": 697.25, "epoch": 0.0035, "grad_norm": 0.264034628868103, "kl": 0.0006909370422363281, "learning_rate": 3e-08, "loss": 0.1614, "num_tokens": 835347.0, "reward": 0.36813443899154663, "reward_std": 0.23992962948977947, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.10666721127927303, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, 
"completions/max_length": 2027.5, "completions/max_terminated_length": 1345.0, "completions/mean_length": 1656.203125, "completions/mean_terminated_length": 951.5178833007812, "completions/min_length": 1213.75, "completions/min_terminated_length": 701.75, "epoch": 0.004, "grad_norm": 0.15212257206439972, "kl": 0.0004897117614746094, "learning_rate": 3.5e-08, "loss": -0.0014, "num_tokens": 949536.0, "reward": 0.203125, "reward_std": 0.03324815817177296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.06649631634354591, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.25, "completions/mean_length": 1821.21875, "completions/mean_terminated_length": 1152.9722290039062, "completions/min_length": 744.5, "completions/min_terminated_length": 744.5, "epoch": 0.0045, "grad_norm": 0.22132602334022522, "kl": 0.0004668235778808594, "learning_rate": 4e-08, "loss": 0.1732, "num_tokens": 1077214.0, "reward": 0.162109375, "reward_std": 0.0502830371260643, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.32421875, "rewards/tag_count_reward/std": 0.10056607984006405, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 1954.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 1714.5625, "completions/mean_terminated_length": 1460.4531555175781, "completions/min_length": 1321.75, "completions/min_terminated_length": 1321.75, "epoch": 0.005, "grad_norm": 0.2768813967704773, "kl": 0.0005021095275878906, "learning_rate": 4.5e-08, "loss": 0.1112, "num_tokens": 1197634.0, "reward": 
0.6040506139397621, "reward_std": 0.338875412940979, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21218155696988106, "rewards/penalized_accuracy_reward/std": 0.1486714631319046, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.08306500501930714, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1764.75, "completions/max_terminated_length": 1235.75, "completions/mean_length": 1637.125, "completions/mean_terminated_length": 987.6000061035156, "completions/min_length": 1284.5, "completions/min_terminated_length": 772.5, "epoch": 0.0055, "grad_norm": 0.3173900544643402, "kl": 0.0005846023559570312, "learning_rate": 5e-08, "loss": 0.0204, "num_tokens": 1310586.0, "reward": 0.23200411349534988, "reward_std": 0.1589034628123045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486432678997517, "rewards/penalized_accuracy_reward/std": 0.049945730715990067, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.15557121112942696, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 1957.0, "completions/max_terminated_length": 1875.25, "completions/mean_length": 1646.375, "completions/mean_terminated_length": 1490.9500122070312, "completions/min_length": 1241.75, "completions/min_terminated_length": 1241.75, "epoch": 0.006, "grad_norm": 0.26154857873916626, "kl": 0.00037860870361328125, "learning_rate": 5.4999999999999996e-08, "loss": 0.0482, "num_tokens": 1423314.0, "reward": 0.212890625, "reward_std": 0.0683181881904602, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.1366363763809204, "step": 12 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1819.75, "completions/mean_length": 1739.453125, "completions/mean_terminated_length": 1304.6648254394531, "completions/min_length": 782.75, "completions/min_terminated_length": 782.75, "epoch": 0.0065, "grad_norm": 0.2854880094528198, "kl": 0.0005950927734375, "learning_rate": 6e-08, "loss": 0.1349, "num_tokens": 1542959.0, "reward": 0.29121362417936325, "reward_std": 0.3285956550389528, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049903687089681625, "rewards/penalized_accuracy_reward/std": 0.13636285066604614, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.1327147539705038, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.75, "completions/mean_length": 1572.109375, "completions/mean_terminated_length": 1357.8984680175781, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.007, "grad_norm": 0.2734803855419159, "kl": 0.0004181861877441406, "learning_rate": 6.5e-08, "loss": 0.1399, "num_tokens": 1650614.0, "reward": 0.220703125, "reward_std": 0.06361151300370693, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44140625, "rewards/tag_count_reward/std": 0.12722302973270416, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1937.25, "completions/mean_length": 1680.4375, "completions/mean_terminated_length": 1434.8178100585938, "completions/min_length": 966.25, "completions/min_terminated_length": 966.25, "epoch": 0.0075, "grad_norm": 0.26038095355033875, "kl": 
0.0005350112915039062, "learning_rate": 7e-08, "loss": 0.1027, "num_tokens": 1773954.0, "reward": 0.25885436683893204, "reward_std": 0.23788997158408165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024934994988143444, "rewards/penalized_accuracy_reward/std": 0.09973998367786407, "rewards/tag_count_reward/mean": 0.41796875, "rewards/tag_count_reward/std": 0.11241647228598595, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1465.75, "completions/mean_length": 1762.53125, "completions/mean_terminated_length": 1032.7095642089844, "completions/min_length": 1183.5, "completions/min_terminated_length": 671.5, "epoch": 0.008, "grad_norm": 0.21983151137828827, "kl": 0.00047969818115234375, "learning_rate": 7.5e-08, "loss": 0.0856, "num_tokens": 1894740.0, "reward": 0.2602503150701523, "reward_std": 0.262044258415699, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03735172376036644, "rewards/penalized_accuracy_reward/std": 0.1178455762565136, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.09440502151846886, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 991.5, "completions/mean_length": 1934.03125, "completions/mean_terminated_length": 784.1666870117188, "completions/min_length": 1563.25, "completions/min_terminated_length": 539.25, "epoch": 0.0085, "grad_norm": 0.22865381836891174, "kl": 0.0004444122314453125, "learning_rate": 8e-08, "loss": 0.0264, "num_tokens": 2026710.0, "reward": 0.19287124276161194, "reward_std": 0.14259367994964123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01245124638080597, "rewards/penalized_accuracy_reward/std": 
0.04980498552322388, "rewards/tag_count_reward/mean": 0.3359375, "rewards/tag_count_reward/std": 0.1068628653883934, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1296.5, "completions/mean_length": 1683.421875, "completions/mean_terminated_length": 974.5112609863281, "completions/min_length": 1184.75, "completions/min_terminated_length": 672.75, "epoch": 0.009, "grad_norm": 0.2942623496055603, "kl": 0.0005745887756347656, "learning_rate": 8.500000000000001e-08, "loss": 0.1302, "num_tokens": 2141505.0, "reward": 0.38894355297088623, "reward_std": 0.24314025975763798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09974521398544312, "rewards/penalized_accuracy_reward/std": 0.10301648825407028, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.0854202676564455, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 501.75, "completions/mean_length": 2047.359375, "completions/mean_terminated_length": 501.75, "completions/min_length": 2037.75, "completions/min_terminated_length": 501.75, "epoch": 0.0095, "grad_norm": 0.0956871509552002, "kl": 0.0005245208740234375, "learning_rate": 9e-08, "loss": 0.0012, "num_tokens": 2280744.0, "reward": 0.126953125, "reward_std": 0.0078125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25390625, "rewards/tag_count_reward/std": 0.015625, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1733.0, "completions/max_terminated_length": 1514.25, "completions/mean_length": 1402.171875, "completions/mean_terminated_length": 1223.0649108886719, "completions/min_length": 
734.25, "completions/min_terminated_length": 734.25, "epoch": 0.01, "grad_norm": 0.2281389981508255, "kl": 0.0004749298095703125, "learning_rate": 9.499999999999999e-08, "loss": 0.0775, "num_tokens": 2378963.0, "reward": 0.38963261246681213, "reward_std": 0.2470595482736826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08739441633224487, "rewards/penalized_accuracy_reward/std": 0.10234588384628296, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.10577632114291191, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1477.75, "completions/mean_length": 1837.78125, "completions/mean_terminated_length": 1104.3492431640625, "completions/min_length": 1231.75, "completions/min_terminated_length": 719.75, "epoch": 0.0105, "grad_norm": 0.25088387727737427, "kl": 0.0005035400390625, "learning_rate": 1e-07, "loss": 0.1206, "num_tokens": 2505637.0, "reward": 0.32934245467185974, "reward_std": 0.2541283257305622, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07482747733592987, "rewards/penalized_accuracy_reward/std": 0.09977006912231445, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.1256135143339634, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1477.75, "completions/mean_length": 1917.765625, "completions/mean_terminated_length": 1186.0472412109375, "completions/min_length": 1450.0, "completions/min_terminated_length": 938.0, "epoch": 0.011, "grad_norm": 0.24813935160636902, "kl": 0.0005106925964355469, "learning_rate": 1.0499999999999999e-07, "loss": 0.0631, "num_tokens": 2636422.0, "reward": 0.169921875, "reward_std": 0.06318480707705021, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.12636961787939072, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 1938.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 1662.171875, "completions/mean_terminated_length": 918.6354370117188, "completions/min_length": 1131.75, "completions/min_terminated_length": 619.75, "epoch": 0.0115, "grad_norm": 0.21569512784481049, "kl": 0.0003943443298339844, "learning_rate": 1.0999999999999999e-07, "loss": 0.0836, "num_tokens": 2752017.0, "reward": 0.2294965386390686, "reward_std": 0.22110073268413544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024904518388211727, "rewards/penalized_accuracy_reward/std": 0.09961806982755661, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.0625, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.75, "completions/mean_length": 1795.0, "completions/mean_terminated_length": 1450.3611145019531, "completions/min_length": 1031.75, "completions/min_terminated_length": 1031.75, "epoch": 0.012, "grad_norm": 0.2750110924243927, "kl": 0.0005893707275390625, "learning_rate": 1.15e-07, "loss": 0.1157, "num_tokens": 2876881.0, "reward": 0.22019581496715546, "reward_std": 0.16302404552698135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012441657483577728, "rewards/penalized_accuracy_reward/std": 0.04976662993431091, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.15576278418302536, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1316.5, "completions/mean_length": 1682.8125, "completions/mean_terminated_length": 1013.3879089355469, "completions/min_length": 1278.25, "completions/min_terminated_length": 766.25, "epoch": 0.0125, "grad_norm": 0.316141277551651, "kl": 0.0005574226379394531, "learning_rate": 1.2e-07, "loss": -0.0092, "num_tokens": 2995221.0, "reward": 0.19140625, "reward_std": 0.03697281330823898, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.07394563034176826, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.25, "completions/mean_length": 1667.390625, "completions/mean_terminated_length": 1160.6090087890625, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.013, "grad_norm": 0.29166877269744873, "kl": 0.000637054443359375, "learning_rate": 1.25e-07, "loss": 0.1606, "num_tokens": 3110238.0, "reward": 0.197265625, "reward_std": 0.08549676090478897, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.17099352926015854, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 1857.5, "completions/max_terminated_length": 1774.75, "completions/mean_length": 1736.4375, "completions/mean_terminated_length": 1645.96875, "completions/min_length": 1552.75, "completions/min_terminated_length": 1552.75, "epoch": 0.0135, "grad_norm": 0.2818675637245178, "kl": 0.0005917549133300781, "learning_rate": 1.3e-07, "loss": 0.0129, "num_tokens": 3230682.0, "reward": 0.25450141727924347, "reward_std": 
0.20308996364474297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740696236491203, "rewards/penalized_accuracy_reward/std": 0.08042258024215698, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.10221691615879536, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 1743.75, "completions/max_terminated_length": 596.25, "completions/mean_length": 1483.609375, "completions/mean_terminated_length": 445.95001220703125, "completions/min_length": 1223.5, "completions/min_terminated_length": 199.5, "epoch": 0.014, "grad_norm": 0.2801688015460968, "kl": 0.0005373954772949219, "learning_rate": 1.35e-07, "loss": 0.0097, "num_tokens": 3335409.0, "reward": 0.203125, "reward_std": 0.04175759106874466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.08351518586277962, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 1715.9375, "completions/mean_terminated_length": 523.6495971679688, "completions/min_length": 1264.25, "completions/min_terminated_length": 240.25, "epoch": 0.0145, "grad_norm": 0.2773332893848419, "kl": 0.0006761550903320312, "learning_rate": 1.4e-07, "loss": 0.1383, "num_tokens": 3456621.0, "reward": 0.18359375, "reward_std": 0.05425935424864292, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.10851870849728584, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1750.75, "completions/mean_length": 1704.1875, "completions/mean_terminated_length": 1398.55419921875, "completions/min_length": 978.5, "completions/min_terminated_length": 978.5, "epoch": 0.015, "grad_norm": 0.2570420503616333, "kl": 0.0004572868347167969, "learning_rate": 1.45e-07, "loss": 0.1202, "num_tokens": 3574969.0, "reward": 0.203125, "reward_std": 0.060551310889422894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.12110262177884579, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 1957.0, "completions/max_terminated_length": 1310.75, "completions/mean_length": 1654.15625, "completions/mean_terminated_length": 1000.2522583007812, "completions/min_length": 1239.25, "completions/min_terminated_length": 727.25, "epoch": 0.0155, "grad_norm": 0.282610148191452, "kl": 0.000667572021484375, "learning_rate": 1.5e-07, "loss": 0.0661, "num_tokens": 3692963.0, "reward": 0.1953125, "reward_std": 0.058096304535865784, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.11619261465966702, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 848.5, "completions/mean_length": 1866.796875, "completions/mean_terminated_length": 730.0096130371094, "completions/min_length": 1640.5, "completions/min_terminated_length": 616.5, "epoch": 0.016, "grad_norm": 0.16582348942756653, "kl": 0.0005125999450683594, "learning_rate": 1.55e-07, "loss": 0.0297, "num_tokens": 3820230.0, "reward": 0.18898441642522812, "reward_std": 
0.1136630903929472, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012460959143936634, "rewards/penalized_accuracy_reward/std": 0.049843836575746536, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.027950851246714592, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1596.5, "completions/mean_length": 1522.015625, "completions/mean_terminated_length": 1169.3999938964844, "completions/min_length": 592.25, "completions/min_terminated_length": 592.25, "epoch": 0.0165, "grad_norm": 0.3382783830165863, "kl": 0.00052642822265625, "learning_rate": 1.6e-07, "loss": 0.1605, "num_tokens": 3926759.0, "reward": 0.22808826714754105, "reward_std": 0.16330134309828281, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012481633573770523, "rewards/penalized_accuracy_reward/std": 0.04992653802037239, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.1456743534654379, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1214.25, "completions/mean_length": 1958.328125, "completions/mean_terminated_length": 1103.6499938964844, "completions/min_length": 1525.0, "completions/min_terminated_length": 1013.0, "epoch": 0.017, "grad_norm": 0.219075545668602, "kl": 0.0004405975341796875, "learning_rate": 1.65e-07, "loss": 0.0505, "num_tokens": 4060412.0, "reward": 0.15234375, "reward_std": 0.039282044395804405, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3046875, "rewards/tag_count_reward/std": 0.07856409437954426, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.328125, "completions/max_length": 1849.5, "completions/max_terminated_length": 1336.25, "completions/mean_length": 1462.03125, "completions/mean_terminated_length": 927.836669921875, "completions/min_length": 1194.75, "completions/min_terminated_length": 682.75, "epoch": 0.0175, "grad_norm": 0.2220667153596878, "kl": 0.0004000663757324219, "learning_rate": 1.7000000000000001e-07, "loss": 0.0504, "num_tokens": 4165646.0, "reward": 0.25, "reward_std": 0.0611990075558424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.1223980188369751, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1877.75, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1390.296875, "completions/mean_terminated_length": 1102.187515258789, "completions/min_length": 733.5, "completions/min_terminated_length": 733.5, "epoch": 0.018, "grad_norm": 0.3474196493625641, "kl": 0.0005612373352050781, "learning_rate": 1.75e-07, "loss": 0.0176, "num_tokens": 4264561.0, "reward": 0.2109375, "reward_std": 0.05627823993563652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.11255648173391819, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1825.5, "completions/max_terminated_length": 1801.75, "completions/mean_length": 1329.859375, "completions/mean_terminated_length": 1260.9531555175781, "completions/min_length": 756.25, "completions/min_terminated_length": 756.25, "epoch": 0.0185, "grad_norm": 0.25084978342056274, "kl": 0.0004429817199707031, "learning_rate": 1.8e-07, "loss": 0.0901, "num_tokens": 4357064.0, 
"reward": 0.3266914188861847, "reward_std": 0.21751300431787968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037369146943092346, "rewards/penalized_accuracy_reward/std": 0.08034129440784454, "rewards/tag_count_reward/mean": 0.50390625, "rewards/tag_count_reward/std": 0.11366083472967148, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1820.75, "completions/max_terminated_length": 1564.75, "completions/mean_length": 1480.4375, "completions/mean_terminated_length": 1221.9933166503906, "completions/min_length": 925.25, "completions/min_terminated_length": 925.25, "epoch": 0.019, "grad_norm": 0.24586114287376404, "kl": 0.0004696846008300781, "learning_rate": 1.85e-07, "loss": 0.0948, "num_tokens": 4460644.0, "reward": 0.2725646048784256, "reward_std": 0.17986449040472507, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0249541774392128, "rewards/penalized_accuracy_reward/std": 0.06818779557943344, "rewards/tag_count_reward/mean": 0.4453125, "rewards/tag_count_reward/std": 0.11949022859334946, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1935.5, "completions/max_terminated_length": 1352.5, "completions/mean_length": 1421.53125, "completions/mean_terminated_length": 785.1718902587891, "completions/min_length": 876.5, "completions/min_terminated_length": 364.5, "epoch": 0.0195, "grad_norm": 0.28956305980682373, "kl": 0.0006070137023925781, "learning_rate": 1.8999999999999998e-07, "loss": 0.129, "num_tokens": 4562422.0, "reward": 0.43180055916309357, "reward_std": 0.40428076684474945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09968933835625648, "rewards/penalized_accuracy_reward/std": 0.17832984775304794, "rewards/tag_count_reward/mean": 0.46484375, 
"rewards/tag_count_reward/std": 0.15116634219884872, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 1949.75, "completions/max_terminated_length": 1336.5, "completions/mean_length": 1676.71875, "completions/mean_terminated_length": 982.1904907226562, "completions/min_length": 1199.0, "completions/min_terminated_length": 687.0, "epoch": 0.02, "grad_norm": 0.2751065194606781, "kl": 0.0006046295166015625, "learning_rate": 1.9499999999999999e-07, "loss": 0.1012, "num_tokens": 4678788.0, "reward": 0.19921875, "reward_std": 0.07069835998117924, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.14139672368764877, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2018.75, "completions/max_terminated_length": 1394.5, "completions/mean_length": 1529.390625, "completions/mean_terminated_length": 909.4973449707031, "completions/min_length": 1111.0, "completions/min_terminated_length": 599.0, "epoch": 0.0205, "grad_norm": 0.24613972008228302, "kl": 0.000499725341796875, "learning_rate": 2e-07, "loss": 0.1099, "num_tokens": 4784573.0, "reward": 0.3852745294570923, "reward_std": 0.24712130427360535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08716852217912674, "rewards/penalized_accuracy_reward/std": 0.10208141058683395, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.12433474138379097, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1040.25, "completions/mean_length": 1687.359375, "completions/mean_terminated_length": 944.9729309082031, "completions/min_length": 1296.75, 
"completions/min_terminated_length": 784.75, "epoch": 0.021, "grad_norm": 0.2400922328233719, "kl": 0.0006103515625, "learning_rate": 2.0499999999999997e-07, "loss": 0.0424, "num_tokens": 4904820.0, "reward": 0.23339340090751648, "reward_std": 0.22976340353488922, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024899822659790516, "rewards/penalized_accuracy_reward/std": 0.09959929063916206, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.11361231282353401, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1853.75, "completions/max_terminated_length": 1632.75, "completions/mean_length": 1452.109375, "completions/mean_terminated_length": 1161.586669921875, "completions/min_length": 762.5, "completions/min_terminated_length": 762.5, "epoch": 0.0215, "grad_norm": 0.22884468734264374, "kl": 0.000469207763671875, "learning_rate": 2.0999999999999997e-07, "loss": 0.1383, "num_tokens": 5005707.0, "reward": 0.201171875, "reward_std": 0.04396170936524868, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40234375, "rewards/tag_count_reward/std": 0.08792342618107796, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1488.5, "completions/mean_length": 1844.3125, "completions/mean_terminated_length": 1050.175048828125, "completions/min_length": 1150.5, "completions/min_terminated_length": 638.5, "epoch": 0.022, "grad_norm": 0.21332745254039764, "kl": 0.0005483627319335938, "learning_rate": 2.1499999999999998e-07, "loss": 0.1145, "num_tokens": 5135999.0, "reward": 0.25446535646915436, "reward_std": 0.21538694202899933, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.03738892823457718, "rewards/penalized_accuracy_reward/std": 0.08038385957479477, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.14370574057102203, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 953.5, "completions/mean_length": 1991.328125, "completions/mean_terminated_length": 814.5249938964844, "completions/min_length": 1752.5, "completions/min_terminated_length": 728.5, "epoch": 0.0225, "grad_norm": 0.19154949486255646, "kl": 0.0006303787231445312, "learning_rate": 2.1999999999999998e-07, "loss": 0.0402, "num_tokens": 5273892.0, "reward": 0.142578125, "reward_std": 0.028222277760505676, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28515625, "rewards/tag_count_reward/std": 0.05644455552101135, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2016.25, "completions/max_terminated_length": 1690.75, "completions/mean_length": 1692.140625, "completions/mean_terminated_length": 1255.5656433105469, "completions/min_length": 946.5, "completions/min_terminated_length": 946.5, "epoch": 0.023, "grad_norm": 0.276924192905426, "kl": 0.0005559921264648438, "learning_rate": 2.25e-07, "loss": 0.1048, "num_tokens": 5390989.0, "reward": 0.181640625, "reward_std": 0.05335709825158119, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.10671419650316238, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 1975.25, "completions/max_terminated_length": 1924.75, 
"completions/mean_length": 1657.578125, "completions/mean_terminated_length": 1535.8635559082031, "completions/min_length": 1238.25, "completions/min_terminated_length": 1238.25, "epoch": 0.0235, "grad_norm": 0.3119513690471649, "kl": 0.0004444122314453125, "learning_rate": 2.3e-07, "loss": -0.0444, "num_tokens": 5511234.0, "reward": 0.208984375, "reward_std": 0.0643832329660654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.41796875, "rewards/tag_count_reward/std": 0.12876647524535656, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 1349.21875, "completions/mean_terminated_length": 1014.7500152587891, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.024, "grad_norm": 0.38019534945487976, "kl": 0.00057220458984375, "learning_rate": 2.3499999999999997e-07, "loss": 0.1032, "num_tokens": 5607328.0, "reward": 0.2109375, "reward_std": 0.06491155736148357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.1298231165856123, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 1876.96875, "completions/mean_terminated_length": 1182.0273132324219, "completions/min_length": 1438.5, "completions/min_terminated_length": 926.5, "epoch": 0.0245, "grad_norm": 0.24021805822849274, "kl": 0.0005464553833007812, "learning_rate": 2.4e-07, "loss": 0.0727, "num_tokens": 5736958.0, "reward": 0.19678297638893127, "reward_std": 0.15489555150270462, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012453990057110786, "rewards/penalized_accuracy_reward/std": 0.049815960228443146, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.1271837092936039, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 1875.5, "completions/max_terminated_length": 735.25, "completions/mean_length": 1646.75, "completions/mean_terminated_length": 494.3937683105469, "completions/min_length": 1341.25, "completions/min_terminated_length": 317.25, "epoch": 0.025, "grad_norm": 0.2609166204929352, "kl": 0.0005779266357421875, "learning_rate": 2.45e-07, "loss": 0.0361, "num_tokens": 5851230.0, "reward": 0.2065376564860344, "reward_std": 0.11430336721241474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012448515743017197, "rewards/penalized_accuracy_reward/std": 0.04979405924677849, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.08582130074501038, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1474.5, "completions/mean_length": 1902.359375, "completions/mean_terminated_length": 1243.4035949707031, "completions/min_length": 1572.75, "completions/min_terminated_length": 1060.75, "epoch": 0.0255, "grad_norm": 0.2003663033246994, "kl": 0.0005621910095214844, "learning_rate": 2.5e-07, "loss": 0.0567, "num_tokens": 5985077.0, "reward": 0.2198178619146347, "reward_std": 0.16222075559198856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024947993457317352, "rewards/penalized_accuracy_reward/std": 0.06817090511322021, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.06924767605960369, "step": 51 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.5, "completions/mean_length": 1899.96875, "completions/mean_terminated_length": 1463.71875, "completions/min_length": 1240.5, "completions/min_terminated_length": 1240.5, "epoch": 0.026, "grad_norm": 0.24062907695770264, "kl": 0.000514984130859375, "learning_rate": 2.55e-07, "loss": 0.1057, "num_tokens": 6115779.0, "reward": 0.16015625, "reward_std": 0.0500102024525404, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3203125, "rewards/tag_count_reward/std": 0.10002040676772594, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 2029.625, "completions/mean_terminated_length": 414.0, "completions/min_length": 1881.25, "completions/min_terminated_length": 345.25, "epoch": 0.0265, "grad_norm": 0.16029268503189087, "kl": 0.0005631446838378906, "learning_rate": 2.6e-07, "loss": 0.0102, "num_tokens": 6257419.0, "reward": 0.142578125, "reward_std": 0.02668476663529873, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28515625, "rewards/tag_count_reward/std": 0.053369538858532906, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1605.484375, "completions/mean_terminated_length": 1266.5028228759766, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.027, "grad_norm": 0.2430792599916458, "kl": 0.0004329681396484375, "learning_rate": 2.65e-07, "loss": 0.1059, "num_tokens": 
6372522.0, "reward": 0.23584596812725067, "reward_std": 0.15031002275645733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01245423499494791, "rewards/penalized_accuracy_reward/std": 0.04981693997979164, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.11740683205425739, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.5, "completions/mean_length": 1714.46875, "completions/mean_terminated_length": 1178.0470275878906, "completions/min_length": 773.5, "completions/min_terminated_length": 773.5, "epoch": 0.0275, "grad_norm": 0.26039227843284607, "kl": 0.0005517005920410156, "learning_rate": 2.7e-07, "loss": 0.155, "num_tokens": 6490792.0, "reward": 0.46805012226104736, "reward_std": 0.26175385899841785, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13734537363052368, "rewards/penalized_accuracy_reward/std": 0.09563512355089188, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.14995136857032776, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.25, "completions/mean_length": 1846.875, "completions/mean_terminated_length": 1546.5795593261719, "completions/min_length": 1121.25, "completions/min_terminated_length": 1121.25, "epoch": 0.028, "grad_norm": 0.20607280731201172, "kl": 0.0004391670227050781, "learning_rate": 2.75e-07, "loss": 0.1198, "num_tokens": 6616816.0, "reward": 0.2564138099551201, "reward_std": 0.29669356159865856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03738659247756004, "rewards/penalized_accuracy_reward/std": 0.11796759814023972, "rewards/tag_count_reward/mean": 0.36328125, 
"rewards/tag_count_reward/std": 0.14907719939947128, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1819.5, "completions/max_terminated_length": 1697.0, "completions/mean_length": 1487.71875, "completions/mean_terminated_length": 1279.5854797363281, "completions/min_length": 877.25, "completions/min_terminated_length": 877.25, "epoch": 0.0285, "grad_norm": 0.22957228124141693, "kl": 0.0005941390991210938, "learning_rate": 2.8e-07, "loss": 0.0939, "num_tokens": 6719502.0, "reward": 0.212890625, "reward_std": 0.04889973625540733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.09779947623610497, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1908.25, "completions/max_terminated_length": 884.25, "completions/mean_length": 1644.75, "completions/mean_terminated_length": 620.75, "completions/min_length": 1372.5, "completions/min_terminated_length": 348.5, "epoch": 0.029, "grad_norm": 0.169194757938385, "kl": 0.0005240440368652344, "learning_rate": 2.8499999999999997e-07, "loss": 0.0095, "num_tokens": 6832894.0, "reward": 0.21635404974222183, "reward_std": 0.11046509817242622, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012473898939788342, "rewards/penalized_accuracy_reward/std": 0.049895595759153366, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1909.0, "completions/mean_length": 1630.625, "completions/mean_terminated_length": 1423.3807983398438, "completions/min_length": 1116.0, "completions/min_terminated_length": 
1116.0, "epoch": 0.0295, "grad_norm": 0.2953999638557434, "kl": 0.0005402565002441406, "learning_rate": 2.9e-07, "loss": 0.0939, "num_tokens": 6944550.0, "reward": 0.26477157324552536, "reward_std": 0.183581187389791, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496391162276268, "rewards/penalized_accuracy_reward/std": 0.06821439415216446, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.13251501135528088, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1463.75, "completions/mean_length": 1707.453125, "completions/mean_terminated_length": 1112.1328125, "completions/min_length": 805.5, "completions/min_terminated_length": 805.5, "epoch": 0.03, "grad_norm": 0.2785547971725464, "kl": 0.0006418228149414062, "learning_rate": 2.95e-07, "loss": 0.1606, "num_tokens": 7064515.0, "reward": 0.1953125, "reward_std": 0.08365727588534355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.1673145592212677, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1936.5, "completions/max_terminated_length": 1423.0, "completions/mean_length": 1372.65625, "completions/mean_terminated_length": 804.1927185058594, "completions/min_length": 919.0, "completions/min_terminated_length": 407.0, "epoch": 0.0305, "grad_norm": 0.2923567593097687, "kl": 0.0005192756652832031, "learning_rate": 3e-07, "loss": 0.1518, "num_tokens": 7161869.0, "reward": 0.5377467200160027, "reward_std": 0.3277735151350498, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14973273500800133, "rewards/penalized_accuracy_reward/std": 
0.14540744572877884, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.1255886685103178, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1576.75, "completions/max_terminated_length": 1522.5, "completions/mean_length": 1307.609375, "completions/mean_terminated_length": 1120.6666870117188, "completions/min_length": 679.25, "completions/min_terminated_length": 679.25, "epoch": 0.031, "grad_norm": 0.3055476248264313, "kl": 0.000461578369140625, "learning_rate": 3.05e-07, "loss": 0.0901, "num_tokens": 7253556.0, "reward": 0.23828125, "reward_std": 0.05849713087081909, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.11699426174163818, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1538.75, "completions/mean_length": 1750.84375, "completions/mean_terminated_length": 1154.6180725097656, "completions/min_length": 958.5, "completions/min_terminated_length": 958.5, "epoch": 0.0315, "grad_norm": 0.26141586899757385, "kl": 0.0006251335144042969, "learning_rate": 3.1e-07, "loss": 0.1882, "num_tokens": 7376970.0, "reward": 0.19873236119747162, "reward_std": 0.1472194381058216, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012452119030058384, "rewards/penalized_accuracy_reward/std": 0.049808476120233536, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.13079290091991425, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.25, "completions/mean_length": 2015.671875, "completions/mean_terminated_length": 1273.375, 
"completions/min_length": 1633.5, "completions/min_terminated_length": 1121.5, "epoch": 0.032, "grad_norm": 0.21029052138328552, "kl": 0.0005297660827636719, "learning_rate": 3.15e-07, "loss": 0.042, "num_tokens": 7518037.0, "reward": 0.19238737225532532, "reward_std": 0.1811387501657009, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02490462362766266, "rewards/penalized_accuracy_reward/std": 0.06805238872766495, "rewards/tag_count_reward/mean": 0.28515625, "rewards/tag_count_reward/std": 0.09495466388761997, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1315.25, "completions/mean_length": 1643.84375, "completions/mean_terminated_length": 950.5502319335938, "completions/min_length": 1120.25, "completions/min_terminated_length": 608.25, "epoch": 0.0325, "grad_norm": 0.30236080288887024, "kl": 0.0005960464477539062, "learning_rate": 3.2e-07, "loss": 0.0917, "num_tokens": 7633451.0, "reward": 0.27987734228372574, "reward_std": 0.2707459591329098, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037399609573185444, "rewards/penalized_accuracy_reward/std": 0.11798049136996269, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.11480986513197422, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.25, "completions/mean_length": 1633.34375, "completions/mean_terminated_length": 1257.21875, "completions/min_length": 824.25, "completions/min_terminated_length": 824.25, "epoch": 0.033, "grad_norm": 0.24960391223430634, "kl": 0.000499725341796875, "learning_rate": 3.25e-07, "loss": 0.1465, "num_tokens": 7745105.0, "reward": 0.32976511120796204, "reward_std": 0.3264557532966137, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06234349496662617, "rewards/penalized_accuracy_reward/std": 0.13906018808484077, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.16608504205942154, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1875.75, "completions/mean_length": 1700.390625, "completions/mean_terminated_length": 1447.3140563964844, "completions/min_length": 1065.0, "completions/min_terminated_length": 1065.0, "epoch": 0.0335, "grad_norm": 0.26834070682525635, "kl": 0.0004520416259765625, "learning_rate": 3.3e-07, "loss": 0.0948, "num_tokens": 7863130.0, "reward": 0.32795046269893646, "reward_std": 0.22959410771727562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06241273134946823, "rewards/penalized_accuracy_reward/std": 0.09560903906822205, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.11365052498877048, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1796.25, "completions/max_terminated_length": 1257.75, "completions/mean_length": 1478.609375, "completions/mean_terminated_length": 870.8827819824219, "completions/min_length": 1152.5, "completions/min_terminated_length": 640.5, "epoch": 0.034, "grad_norm": 0.2746933400630951, "kl": 0.0005650520324707031, "learning_rate": 3.35e-07, "loss": 0.0362, "num_tokens": 7966769.0, "reward": 0.220703125, "reward_std": 0.049259018152952194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44140625, "rewards/tag_count_reward/std": 0.09851804003119469, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1304.5, "completions/mean_length": 1990.28125, "completions/mean_terminated_length": 1173.25, "completions/min_length": 1546.25, "completions/min_terminated_length": 1034.25, "epoch": 0.0345, "grad_norm": 0.2442578375339508, "kl": 0.0005135536193847656, "learning_rate": 3.4000000000000003e-07, "loss": 0.0429, "num_tokens": 8103603.0, "reward": 0.1753501333296299, "reward_std": 0.14317160099744797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012479754164814949, "rewards/penalized_accuracy_reward/std": 0.049919016659259796, "rewards/tag_count_reward/mean": 0.30078125, "rewards/tag_count_reward/std": 0.0972641110420227, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 1907.25, "completions/max_terminated_length": 1711.75, "completions/mean_length": 1570.796875, "completions/mean_terminated_length": 1249.0915222167969, "completions/min_length": 937.25, "completions/min_terminated_length": 937.25, "epoch": 0.035, "grad_norm": 0.35273581743240356, "kl": 0.0006546974182128906, "learning_rate": 3.45e-07, "loss": 0.0949, "num_tokens": 8214534.0, "reward": 0.201171875, "reward_std": 0.07806682772934437, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40234375, "rewards/tag_count_reward/std": 0.15613365732133389, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 1495.328125, "completions/mean_terminated_length": 895.318603515625, "completions/min_length": 1203.0, "completions/min_terminated_length": 691.0, "epoch": 0.0355, "grad_norm": 0.3790898621082306, "kl": 0.0006451606750488281, "learning_rate": 3.5e-07, "loss": 0.102, "num_tokens": 8320059.0, 
"reward": 0.205078125, "reward_std": 0.06656993553042412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.13313987478613853, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 936.5, "completions/mean_length": 1867.484375, "completions/mean_terminated_length": 720.3096618652344, "completions/min_length": 1494.25, "completions/min_terminated_length": 470.25, "epoch": 0.036, "grad_norm": 0.18929243087768555, "kl": 0.00047206878662109375, "learning_rate": 3.55e-07, "loss": 0.0656, "num_tokens": 8447370.0, "reward": 0.24071663618087769, "reward_std": 0.18897553719580173, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03735050559043884, "rewards/penalized_accuracy_reward/std": 0.0803011804819107, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.07585071213543415, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1311.75, "completions/mean_length": 1614.21875, "completions/mean_terminated_length": 975.0303192138672, "completions/min_length": 1255.25, "completions/min_terminated_length": 743.25, "epoch": 0.0365, "grad_norm": 0.28594115376472473, "kl": 0.0005755424499511719, "learning_rate": 3.6e-07, "loss": 0.0807, "num_tokens": 8561848.0, "reward": 0.21245772391557693, "reward_std": 0.12418534234166145, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012478861026465893, "rewards/penalized_accuracy_reward/std": 0.04991544410586357, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.08240052498877048, "step": 73 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1847.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1445.21875, "completions/mean_terminated_length": 1321.6871643066406, "completions/min_length": 831.75, "completions/min_terminated_length": 831.75, "epoch": 0.037, "grad_norm": 0.2602826654911041, "kl": 0.000492095947265625, "learning_rate": 3.65e-07, "loss": 0.1125, "num_tokens": 8664550.0, "reward": 0.2783743739128113, "reward_std": 0.23680835962295532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024929371662437916, "rewards/penalized_accuracy_reward/std": 0.09971749410033226, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.12774410098791122, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 1710.921875, "completions/mean_terminated_length": 1372.7927856445312, "completions/min_length": 869.25, "completions/min_terminated_length": 869.25, "epoch": 0.0375, "grad_norm": 0.3260379135608673, "kl": 0.0008144378662109375, "learning_rate": 3.7e-07, "loss": 0.1452, "num_tokens": 8784673.0, "reward": 0.19921875, "reward_std": 0.07470299489796162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.14940599165856838, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1803.75, "completions/max_terminated_length": 1747.75, "completions/mean_length": 1469.71875, "completions/mean_terminated_length": 1200.0335083007812, "completions/min_length": 729.75, "completions/min_terminated_length": 729.75, "epoch": 0.038, "grad_norm": 0.37297365069389343, "kl": 
0.0007028579711914062, "learning_rate": 3.75e-07, "loss": 0.0959, "num_tokens": 8886191.0, "reward": 0.2109375, "reward_std": 0.08358689583837986, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.16717379540205002, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 1930.90625, "completions/mean_terminated_length": 1162.8125, "completions/min_length": 1489.25, "completions/min_terminated_length": 977.25, "epoch": 0.0385, "grad_norm": 0.2381117343902588, "kl": 0.0005688667297363281, "learning_rate": 3.7999999999999996e-07, "loss": 0.058, "num_tokens": 9018265.0, "reward": 0.21391794830560684, "reward_std": 0.1725856065750122, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02492772415280342, "rewards/penalized_accuracy_reward/std": 0.06811551749706268, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.11091229319572449, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1882.75, "completions/max_terminated_length": 1012.0, "completions/mean_length": 1646.109375, "completions/mean_terminated_length": 685.1041870117188, "completions/min_length": 987.0, "completions/min_terminated_length": 475.0, "epoch": 0.039, "grad_norm": 0.19739216566085815, "kl": 0.0005846023559570312, "learning_rate": 3.8499999999999997e-07, "loss": 0.1069, "num_tokens": 9132864.0, "reward": 0.16796875, "reward_std": 0.036034777760505676, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3359375, 
"rewards/tag_count_reward/std": 0.07206955552101135, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1794.5, "completions/mean_length": 1422.8125, "completions/mean_terminated_length": 1236.9500274658203, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.0395, "grad_norm": 0.2987287640571594, "kl": 0.0004291534423828125, "learning_rate": 3.8999999999999997e-07, "loss": 0.1134, "num_tokens": 9232596.0, "reward": 0.2593488544225693, "reward_std": 0.15653171483427286, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486925348639488, "rewards/penalized_accuracy_reward/std": 0.04994770511984825, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.11327262595295906, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1775.5, "completions/mean_length": 1428.15625, "completions/mean_terminated_length": 1255.7805480957031, "completions/min_length": 736.75, "completions/min_terminated_length": 736.75, "epoch": 0.04, "grad_norm": 0.2934829592704773, "kl": 0.0005168914794921875, "learning_rate": 3.95e-07, "loss": 0.1178, "num_tokens": 9332638.0, "reward": 0.228515625, "reward_std": 0.04422798380255699, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.08845596946775913, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.5, "completions/mean_length": 1838.625, "completions/mean_terminated_length": 1329.5833435058594, "completions/min_length": 859.0, "completions/min_terminated_length": 
859.0, "epoch": 0.0405, "grad_norm": 0.25493645668029785, "kl": 0.00040435791015625, "learning_rate": 4e-07, "loss": 0.0941, "num_tokens": 9459718.0, "reward": 0.177734375, "reward_std": 0.0680323950946331, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.1360647901892662, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 1649.75, "completions/max_terminated_length": 945.5, "completions/mean_length": 1601.1875, "completions/mean_terminated_length": 906.4375, "completions/min_length": 1388.25, "completions/min_terminated_length": 876.25, "epoch": 0.041, "grad_norm": 0.3894990384578705, "kl": 0.0004935264587402344, "learning_rate": 4.05e-07, "loss": -0.0051, "num_tokens": 9575442.0, "reward": 0.16796875, "reward_std": 0.036034777760505676, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3359375, "rewards/tag_count_reward/std": 0.07206955552101135, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1391.5, "completions/mean_length": 1841.0625, "completions/mean_terminated_length": 1212.6923217773438, "completions/min_length": 1133.5, "completions/min_terminated_length": 1133.5, "epoch": 0.0415, "grad_norm": 0.21569542586803436, "kl": 0.0005879402160644531, "learning_rate": 4.0999999999999994e-07, "loss": 0.1046, "num_tokens": 9702198.0, "reward": 0.2637956291437149, "reward_std": 0.22446785867214203, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04986656457185745, "rewards/penalized_accuracy_reward/std": 0.08920406550168991, 
"rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.10814354941248894, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1400.5, "completions/mean_length": 1624.09375, "completions/mean_terminated_length": 892.5424194335938, "completions/min_length": 1099.25, "completions/min_terminated_length": 587.25, "epoch": 0.042, "grad_norm": 0.3125496804714203, "kl": 0.0006093978881835938, "learning_rate": 4.1499999999999994e-07, "loss": 0.12, "num_tokens": 9817388.0, "reward": 0.24328495562076569, "reward_std": 0.17240451090037823, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496279403567314, "rewards/penalized_accuracy_reward/std": 0.06821134686470032, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.0894539300352335, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1655.5, "completions/max_terminated_length": 1504.25, "completions/mean_length": 1222.984375, "completions/mean_terminated_length": 1094.3846893310547, "completions/min_length": 697.5, "completions/min_terminated_length": 697.5, "epoch": 0.0425, "grad_norm": 0.27532830834388733, "kl": 0.000438690185546875, "learning_rate": 4.1999999999999995e-07, "loss": 0.0795, "num_tokens": 9903387.0, "reward": 0.40740836411714554, "reward_std": 0.42155035585165024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07479792926460505, "rewards/penalized_accuracy_reward/std": 0.19846704229712486, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.16173567809164524, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1562.75, "completions/max_terminated_length": 1517.5, "completions/mean_length": 1246.390625, 
"completions/mean_terminated_length": 1179.7875061035156, "completions/min_length": 793.5, "completions/min_terminated_length": 793.5, "epoch": 0.043, "grad_norm": 0.33682435750961304, "kl": 0.00046825408935546875, "learning_rate": 4.2499999999999995e-07, "loss": 0.08, "num_tokens": 9989940.0, "reward": 0.4957877993583679, "reward_std": 0.2623061016201973, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12484701722860336, "rewards/penalized_accuracy_reward/std": 0.09987764060497284, "rewards/tag_count_reward/mean": 0.4921875, "rewards/tag_count_reward/std": 0.14896522462368011, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 1892.0, "completions/max_terminated_length": 1309.25, "completions/mean_length": 1718.8125, "completions/mean_terminated_length": 1022.125, "completions/min_length": 1323.25, "completions/min_terminated_length": 811.25, "epoch": 0.0435, "grad_norm": 0.16355371475219727, "kl": 0.0006418228149414062, "learning_rate": 4.2999999999999996e-07, "loss": 0.0334, "num_tokens": 10107528.0, "reward": 0.22376590967178345, "reward_std": 0.15250487625598907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024968892335891724, "rewards/penalized_accuracy_reward/std": 0.06822801381349564, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.06665603816509247, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1953.5, "completions/max_terminated_length": 1397.75, "completions/mean_length": 1685.40625, "completions/mean_terminated_length": 1099.2514343261719, "completions/min_length": 1280.5, "completions/min_terminated_length": 768.5, "epoch": 0.044, "grad_norm": 0.2657161056995392, "kl": 0.0005893707275390625, "learning_rate": 4.3499999999999996e-07, "loss": 0.0208, "num_tokens": 10225282.0, "reward": 
0.36606886982917786, "reward_std": 0.24278223142027855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08733130991458893, "rewards/penalized_accuracy_reward/std": 0.10227198898792267, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.09752252139151096, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 827.5, "completions/mean_length": 1967.359375, "completions/mean_terminated_length": 709.3333435058594, "completions/min_length": 1633.25, "completions/min_terminated_length": 609.25, "epoch": 0.0445, "grad_norm": 0.15643256902694702, "kl": 0.0005807876586914062, "learning_rate": 4.3999999999999997e-07, "loss": 0.0595, "num_tokens": 10359945.0, "reward": 0.32307395339012146, "reward_std": 0.22616350278258324, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731822669506073, "rewards/penalized_accuracy_reward/std": 0.10225667804479599, "rewards/tag_count_reward/mean": 0.296875, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1840.25, "completions/max_terminated_length": 1260.75, "completions/mean_length": 1420.28125, "completions/mean_terminated_length": 707.7916717529297, "completions/min_length": 775.5, "completions/min_terminated_length": 263.5, "epoch": 0.045, "grad_norm": 0.31921878457069397, "kl": 0.000690460205078125, "learning_rate": 4.45e-07, "loss": 0.1671, "num_tokens": 10458795.0, "reward": 0.2476242333650589, "reward_std": 0.13571038842201233, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248399168252945, "rewards/penalized_accuracy_reward/std": 0.0499359667301178, "rewards/tag_count_reward/mean": 0.4453125, 
"rewards/tag_count_reward/std": 0.1371676418930292, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1081.25, "completions/mean_length": 1678.875, "completions/mean_terminated_length": 741.8750152587891, "completions/min_length": 1059.75, "completions/min_terminated_length": 547.75, "epoch": 0.0455, "grad_norm": 0.36719128489494324, "kl": 0.0007715225219726562, "learning_rate": 4.5e-07, "loss": 0.1546, "num_tokens": 10577667.0, "reward": 0.166015625, "reward_std": 0.02668476663529873, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.053369538858532906, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.25, "completions/mean_length": 1705.265625, "completions/mean_terminated_length": 866.3500061035156, "completions/min_length": 1062.25, "completions/min_terminated_length": 550.25, "epoch": 0.046, "grad_norm": 0.2737264931201935, "kl": 0.0006999969482421875, "learning_rate": 4.55e-07, "loss": 0.0793, "num_tokens": 10697588.0, "reward": 0.18359375, "reward_std": 0.041011312045156956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.08202262595295906, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1740.75, "completions/max_terminated_length": 1095.0, "completions/mean_length": 1447.296875, "completions/mean_terminated_length": 786.6160888671875, "completions/min_length": 1074.5, "completions/min_terminated_length": 562.5, "epoch": 0.0465, "grad_norm": 
0.30761247873306274, "kl": 0.0005564689636230469, "learning_rate": 4.6e-07, "loss": 0.0804, "num_tokens": 10797767.0, "reward": 0.3336184471845627, "reward_std": 0.2346997670829296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06231703609228134, "rewards/penalized_accuracy_reward/std": 0.09546253830194473, "rewards/tag_count_reward/mean": 0.41796875, "rewards/tag_count_reward/std": 0.10346913151443005, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 1819.078125, "completions/mean_terminated_length": 1102.0166931152344, "completions/min_length": 1341.75, "completions/min_terminated_length": 829.75, "epoch": 0.047, "grad_norm": 0.28269627690315247, "kl": 0.0006771087646484375, "learning_rate": 4.65e-07, "loss": 0.0504, "num_tokens": 10927932.0, "reward": 0.181640625, "reward_std": 0.057104695588350296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.11420939117670059, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 1693.890625, "completions/mean_terminated_length": 992.9840087890625, "completions/min_length": 1122.75, "completions/min_terminated_length": 610.75, "epoch": 0.0475, "grad_norm": 0.25420916080474854, "kl": 0.0007114410400390625, "learning_rate": 4.6999999999999995e-07, "loss": 0.1053, "num_tokens": 11046293.0, "reward": 0.3467457890510559, "reward_std": 0.23846420645713806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07474008202552795, 
"rewards/penalized_accuracy_reward/std": 0.09965348988771439, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.09289801307022572, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 1567.46875, "completions/mean_terminated_length": 797.1205596923828, "completions/min_length": 971.75, "completions/min_terminated_length": 459.75, "epoch": 0.048, "grad_norm": 0.30752503871917725, "kl": 0.00081634521484375, "learning_rate": 4.7499999999999995e-07, "loss": 0.1082, "num_tokens": 11157539.0, "reward": 0.23194404691457748, "reward_std": 0.1510626170784235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012456399388611317, "rewards/penalized_accuracy_reward/std": 0.049825601279735565, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.15843890607357025, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1497.5, "completions/mean_length": 1953.15625, "completions/mean_terminated_length": 1220.7291870117188, "completions/min_length": 1470.5, "completions/min_terminated_length": 958.5, "epoch": 0.0485, "grad_norm": 0.24095728993415833, "kl": 0.0006098747253417969, "learning_rate": 4.8e-07, "loss": 0.0634, "num_tokens": 11290397.0, "reward": 0.2139211669564247, "reward_std": 0.1888051386922598, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024929331615567207, "rewards/penalized_accuracy_reward/std": 0.06811991333961487, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.13419455848634243, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1422.0, 
"completions/mean_length": 1671.25, "completions/mean_terminated_length": 897.0265197753906, "completions/min_length": 1129.25, "completions/min_terminated_length": 617.25, "epoch": 0.049, "grad_norm": 0.3216855227947235, "kl": 0.0007114410400390625, "learning_rate": 4.85e-07, "loss": 0.178, "num_tokens": 11408909.0, "reward": 0.19140625, "reward_std": 0.06298827938735485, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.12597656436264515, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1845.5, "completions/mean_length": 1814.0, "completions/mean_terminated_length": 1418.0319519042969, "completions/min_length": 904.5, "completions/min_terminated_length": 904.5, "epoch": 0.0495, "grad_norm": 0.23584382236003876, "kl": 0.0007009506225585938, "learning_rate": 4.9e-07, "loss": 0.1404, "num_tokens": 11535661.0, "reward": 0.28714369237422943, "reward_std": 0.3034210968762636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04982184711843729, "rewards/penalized_accuracy_reward/std": 0.13017740473151207, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13885539025068283, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1355.25, "completions/mean_length": 1930.8125, "completions/mean_terminated_length": 1086.5000305175781, "completions/min_length": 1323.75, "completions/min_terminated_length": 811.75, "epoch": 0.05, "grad_norm": 0.2153274565935135, "kl": 0.0007123947143554688, "learning_rate": 4.95e-07, "loss": 0.0697, "num_tokens": 11669153.0, "reward": 0.154296875, "reward_std": 0.05114922486245632, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.30859375, "rewards/tag_count_reward/std": 0.10229845158755779, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2046.75, "completions/max_terminated_length": 1844.5, "completions/mean_length": 1606.09375, "completions/mean_terminated_length": 1256.6268005371094, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.0505, "grad_norm": 0.2879039943218231, "kl": 0.0007524490356445312, "learning_rate": 5e-07, "loss": 0.1846, "num_tokens": 11781671.0, "reward": 0.2857328951358795, "reward_std": 0.21633771061897278, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03739769756793976, "rewards/penalized_accuracy_reward/std": 0.08040270954370499, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.14686176739633083, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.25, "completions/mean_length": 1681.578125, "completions/mean_terminated_length": 1268.8541717529297, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.051, "grad_norm": 0.28296342492103577, "kl": 0.0006866455078125, "learning_rate": 5.049999999999999e-07, "loss": 0.1416, "num_tokens": 11897372.0, "reward": 0.177734375, "reward_std": 0.05102001782506704, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.10204003937542439, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1992.0, 
"completions/max_terminated_length": 1836.5, "completions/mean_length": 1642.828125, "completions/mean_terminated_length": 1339.8521118164062, "completions/min_length": 1000.75, "completions/min_terminated_length": 1000.75, "epoch": 0.0515, "grad_norm": 0.3031728267669678, "kl": 0.0007085800170898438, "learning_rate": 5.1e-07, "loss": 0.0125, "num_tokens": 12010529.0, "reward": 0.23780875653028488, "reward_std": 0.16998455859720707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01245906576514244, "rewards/penalized_accuracy_reward/std": 0.04983626306056976, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.1512194164097309, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1496.5, "completions/mean_length": 1902.84375, "completions/mean_terminated_length": 1302.5568237304688, "completions/min_length": 1125.5, "completions/min_terminated_length": 1125.5, "epoch": 0.052, "grad_norm": 0.23680856823921204, "kl": 0.0007162094116210938, "learning_rate": 5.149999999999999e-07, "loss": 0.0769, "num_tokens": 12141543.0, "reward": 0.166015625, "reward_std": 0.056407444179058075, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.11281489208340645, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1717.5, "completions/mean_length": 1936.234375, "completions/mean_terminated_length": 1546.6805725097656, "completions/min_length": 1453.75, "completions/min_terminated_length": 1453.75, "epoch": 0.0525, "grad_norm": 0.2541963756084442, "kl": 0.0007419586181640625, "learning_rate": 5.2e-07, "loss": 0.0773, 
"num_tokens": 12278838.0, "reward": 0.158203125, "reward_std": 0.05336645990610123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.31640625, "rewards/tag_count_reward/std": 0.10673292353749275, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1729.5, "completions/max_terminated_length": 1549.25, "completions/mean_length": 1336.84375, "completions/mean_terminated_length": 1108.9673461914062, "completions/min_length": 782.75, "completions/min_terminated_length": 782.75, "epoch": 0.053, "grad_norm": 0.36853623390197754, "kl": 0.0010061264038085938, "learning_rate": 5.25e-07, "loss": 0.1636, "num_tokens": 12375852.0, "reward": 0.2726123034954071, "reward_std": 0.17165477201342583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497803047299385, "rewards/penalized_accuracy_reward/std": 0.06825298070907593, "rewards/tag_count_reward/mean": 0.4453125, "rewards/tag_count_reward/std": 0.13420547172427177, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1247.75, "completions/mean_length": 1904.734375, "completions/mean_terminated_length": 982.1041870117188, "completions/min_length": 1222.25, "completions/min_terminated_length": 710.25, "epoch": 0.0535, "grad_norm": 0.24676094949245453, "kl": 0.000934600830078125, "learning_rate": 5.3e-07, "loss": 0.1144, "num_tokens": 12507435.0, "reward": 0.15625, "reward_std": 0.048801276832818985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.09760255925357342, "step": 107 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.25, "completions/mean_length": 1753.703125, "completions/mean_terminated_length": 1431.2976379394531, "completions/min_length": 1038.5, "completions/min_terminated_length": 1038.5, "epoch": 0.054, "grad_norm": 0.2826627492904663, "kl": 0.0009002685546875, "learning_rate": 5.35e-07, "loss": 0.1181, "num_tokens": 12631208.0, "reward": 0.2761185020208359, "reward_std": 0.28201801143586636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037473312579095364, "rewards/penalized_accuracy_reward/std": 0.11822321638464928, "rewards/tag_count_reward/mean": 0.40234375, "rewards/tag_count_reward/std": 0.14982198737561703, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1874.75, "completions/max_terminated_length": 1592.75, "completions/mean_length": 1309.328125, "completions/mean_terminated_length": 1197.2437744140625, "completions/min_length": 966.25, "completions/min_terminated_length": 966.25, "epoch": 0.0545, "grad_norm": 0.33942097425460815, "kl": 0.0008554458618164062, "learning_rate": 5.4e-07, "loss": 0.0589, "num_tokens": 12722445.0, "reward": 0.3227277398109436, "reward_std": 0.22777720354497433, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0373404286801815, "rewards/penalized_accuracy_reward/std": 0.08027950674295425, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.17241756431758404, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1163.25, "completions/mean_length": 1800.984375, "completions/mean_terminated_length": 921.3937683105469, "completions/min_length": 1179.75, "completions/min_terminated_length": 667.75, "epoch": 0.055, "grad_norm": 
0.27312326431274414, "kl": 0.000896453857421875, "learning_rate": 5.45e-07, "loss": 0.1029, "num_tokens": 12847420.0, "reward": 0.17578125, "reward_std": 0.04885547794401646, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.09771095961332321, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.25, "completions/mean_length": 1504.328125, "completions/mean_terminated_length": 1197.0250091552734, "completions/min_length": 678.5, "completions/min_terminated_length": 678.5, "epoch": 0.0555, "grad_norm": 0.3251037001609802, "kl": 0.0009441375732421875, "learning_rate": 5.5e-07, "loss": 0.1559, "num_tokens": 12953393.0, "reward": 0.2109375, "reward_std": 0.07614279352128506, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.15228559263050556, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1860.75, "completions/max_terminated_length": 1697.0, "completions/mean_length": 1542.6875, "completions/mean_terminated_length": 1255.1317138671875, "completions/min_length": 781.5, "completions/min_terminated_length": 781.5, "epoch": 0.056, "grad_norm": 0.2510627508163452, "kl": 0.0008754730224609375, "learning_rate": 5.55e-07, "loss": 0.07, "num_tokens": 13059629.0, "reward": 0.23590326309204102, "reward_std": 0.1760062724351883, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482883408665657, "rewards/penalized_accuracy_reward/std": 0.04993153735995293, "rewards/tag_count_reward/mean": 
0.421875, "rewards/tag_count_reward/std": 0.17468148469924927, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1703.75, "completions/max_terminated_length": 1625.0, "completions/mean_length": 1467.671875, "completions/mean_terminated_length": 1236.0072326660156, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.0565, "grad_norm": 0.36335065960884094, "kl": 0.001041412353515625, "learning_rate": 5.6e-07, "loss": 0.0883, "num_tokens": 13162408.0, "reward": 0.4142707586288452, "reward_std": 0.2631514351814985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09971349686384201, "rewards/penalized_accuracy_reward/std": 0.10298368334770203, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.1256849728524685, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2026.75, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1642.203125, "completions/mean_terminated_length": 1217.2292175292969, "completions/min_length": 709.0, "completions/min_terminated_length": 709.0, "epoch": 0.057, "grad_norm": 0.3021674156188965, "kl": 0.00118255615234375, "learning_rate": 5.649999999999999e-07, "loss": 0.145, "num_tokens": 13276757.0, "reward": 0.47391974925994873, "reward_std": 0.33079166151583195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13735049776732922, "rewards/penalized_accuracy_reward/std": 0.1498541459441185, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.13800719380378723, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1936.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 1333.578125, "completions/mean_terminated_length": 999.4624481201172, 
"completions/min_length": 594.5, "completions/min_terminated_length": 594.5, "epoch": 0.0575, "grad_norm": 0.2522277534008026, "kl": 0.0009059906005859375, "learning_rate": 5.699999999999999e-07, "loss": 0.1999, "num_tokens": 13371690.0, "reward": 0.236328125, "reward_std": 0.07281853072345257, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.47265625, "rewards/tag_count_reward/std": 0.14563707076013088, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.25, "completions/mean_length": 1675.921875, "completions/mean_terminated_length": 1395.0375366210938, "completions/min_length": 855.5, "completions/min_terminated_length": 855.5, "epoch": 0.058, "grad_norm": 0.2691246271133423, "kl": 0.0008363723754882812, "learning_rate": 5.749999999999999e-07, "loss": 0.2099, "num_tokens": 13487621.0, "reward": 0.26859086006879807, "reward_std": 0.2283919844776392, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024920430034399033, "rewards/penalized_accuracy_reward/std": 0.0680956020951271, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.20291327126324177, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 1941.5, "completions/max_terminated_length": 973.25, "completions/mean_length": 1744.8125, "completions/mean_terminated_length": 752.7395935058594, "completions/min_length": 1135.25, "completions/min_terminated_length": 623.25, "epoch": 0.0585, "grad_norm": 0.27911967039108276, "kl": 0.0012674331665039062, "learning_rate": 5.8e-07, "loss": 0.0722, "num_tokens": 13607561.0, "reward": 0.5519682765007019, "reward_std": 0.13612132146954536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.18711695075035095, "rewards/penalized_accuracy_reward/std": 0.04989815503358841, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.09947281517088413, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1919.0, "completions/max_terminated_length": 1584.75, "completions/mean_length": 1428.34375, "completions/mean_terminated_length": 1037.321044921875, "completions/min_length": 740.25, "completions/min_terminated_length": 740.25, "epoch": 0.059, "grad_norm": 0.29136401414871216, "kl": 0.0011548995971679688, "learning_rate": 5.849999999999999e-07, "loss": 0.1123, "num_tokens": 13705439.0, "reward": 0.21875, "reward_std": 0.054907046258449554, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.10981409251689911, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1735.75, "completions/max_terminated_length": 1497.75, "completions/mean_length": 1112.234375, "completions/mean_terminated_length": 917.7038879394531, "completions/min_length": 380.75, "completions/min_terminated_length": 380.75, "epoch": 0.0595, "grad_norm": 0.3757966160774231, "kl": 0.001041412353515625, "learning_rate": 5.9e-07, "loss": 0.1234, "num_tokens": 13784878.0, "reward": 0.38591888546943665, "reward_std": 0.2516702562570572, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07479538023471832, "rewards/penalized_accuracy_reward/std": 0.09972727298736572, "rewards/tag_count_reward/mean": 0.47265625, "rewards/tag_count_reward/std": 0.13121474906802177, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1705.5, "completions/mean_length": 1789.28125, "completions/mean_terminated_length": 1337.0625, "completions/min_length": 800.75, "completions/min_terminated_length": 800.75, "epoch": 0.06, "grad_norm": 0.2664056420326233, "kl": 0.0010509490966796875, "learning_rate": 5.949999999999999e-07, "loss": 0.1295, "num_tokens": 13907312.0, "reward": 0.2373553365468979, "reward_std": 0.20769884809851646, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024927668273448944, "rewards/penalized_accuracy_reward/std": 0.06811536103487015, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.15502336621284485, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1888.25, "completions/max_terminated_length": 1723.5, "completions/mean_length": 1359.9375, "completions/mean_terminated_length": 1102.824691772461, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.0605, "grad_norm": 0.3865290880203247, "kl": 0.0012578964233398438, "learning_rate": 6e-07, "loss": 0.1137, "num_tokens": 14001708.0, "reward": 0.45919710397720337, "reward_std": 0.35656336322426796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09971573762595654, "rewards/penalized_accuracy_reward/std": 0.15202975273132324, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.1812620796263218, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2016.25, "completions/max_terminated_length": 1698.25, "completions/mean_length": 1667.28125, "completions/mean_terminated_length": 1315.6291809082031, "completions/min_length": 925.0, "completions/min_terminated_length": 925.0, "epoch": 0.061, "grad_norm": 0.2925189435482025, "kl": 0.001384735107421875, "learning_rate": 6.049999999999999e-07, 
"loss": 0.1145, "num_tokens": 14116638.0, "reward": 0.3818674832582474, "reward_std": 0.26422244496643543, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741812407970428, "rewards/penalized_accuracy_reward/std": 0.1023736372590065, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.1189503725618124, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 1484.40625, "completions/mean_terminated_length": 856.8036041259766, "completions/min_length": 970.75, "completions/min_terminated_length": 458.75, "epoch": 0.0615, "grad_norm": 0.28961825370788574, "kl": 0.00112152099609375, "learning_rate": 6.1e-07, "loss": 0.0994, "num_tokens": 14221976.0, "reward": 0.23046875, "reward_std": 0.06385000795125961, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.12770001962780952, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1946.25, "completions/max_terminated_length": 1748.75, "completions/mean_length": 1155.9375, "completions/mean_terminated_length": 1011.1549530029297, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.062, "grad_norm": 0.38905683159828186, "kl": 0.0024623870849609375, "learning_rate": 6.149999999999999e-07, "loss": 0.2792, "num_tokens": 14305076.0, "reward": 0.4171203672885895, "reward_std": 0.2723007798194885, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07477112114429474, "rewards/penalized_accuracy_reward/std": 0.0996948629617691, "rewards/tag_count_reward/mean": 0.53515625, 
"rewards/tag_count_reward/std": 0.16602574847638607, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1865.25, "completions/max_terminated_length": 1248.75, "completions/mean_length": 1526.46875, "completions/mean_terminated_length": 866.6156311035156, "completions/min_length": 975.75, "completions/min_terminated_length": 463.75, "epoch": 0.0625, "grad_norm": 0.36599811911582947, "kl": 0.0012531280517578125, "learning_rate": 6.2e-07, "loss": 0.0989, "num_tokens": 14413394.0, "reward": 0.2265625, "reward_std": 0.08170771412551403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.16341543197631836, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1652.75, "completions/mean_length": 1371.1875, "completions/mean_terminated_length": 1168.040267944336, "completions/min_length": 667.25, "completions/min_terminated_length": 667.25, "epoch": 0.063, "grad_norm": 0.24372759461402893, "kl": 0.001979827880859375, "learning_rate": 6.249999999999999e-07, "loss": 0.1035, "num_tokens": 14512254.0, "reward": 0.2553902715444565, "reward_std": 0.14846415258944035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01246076263487339, "rewards/penalized_accuracy_reward/std": 0.04984305053949356, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.1180882640182972, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1655.25, "completions/mean_length": 1671.765625, "completions/mean_terminated_length": 1193.7115631103516, "completions/min_length": 808.0, 
"completions/min_terminated_length": 808.0, "epoch": 0.0635, "grad_norm": 0.27513402700424194, "kl": 0.001079559326171875, "learning_rate": 6.3e-07, "loss": 0.2246, "num_tokens": 14628239.0, "reward": 0.19921875, "reward_std": 0.09336646273732185, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.1867329254746437, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2021.25, "completions/max_terminated_length": 1567.0, "completions/mean_length": 1312.046875, "completions/mean_terminated_length": 844.3979339599609, "completions/min_length": 513.5, "completions/min_terminated_length": 513.5, "epoch": 0.064, "grad_norm": 0.3371029794216156, "kl": 0.0018587112426757812, "learning_rate": 6.35e-07, "loss": 0.1525, "num_tokens": 14721282.0, "reward": 0.25150007754564285, "reward_std": 0.17879353649914265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012468787841498852, "rewards/penalized_accuracy_reward/std": 0.049875155091285706, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.17744236253201962, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1496.75, "completions/mean_length": 1630.65625, "completions/mean_terminated_length": 1149.375015258789, "completions/min_length": 702.75, "completions/min_terminated_length": 702.75, "epoch": 0.0645, "grad_norm": 0.2788827121257782, "kl": 0.00177001953125, "learning_rate": 6.4e-07, "loss": 0.1265, "num_tokens": 14833980.0, "reward": 0.236328125, "reward_std": 0.09686242416501045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.47265625, "rewards/tag_count_reward/std": 0.1937248520553112, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1416.5, "completions/mean_length": 1945.90625, "completions/mean_terminated_length": 1174.2124938964844, "completions/min_length": 1402.0, "completions/min_terminated_length": 890.0, "epoch": 0.065, "grad_norm": 0.18588171899318695, "kl": 0.0008668899536132812, "learning_rate": 6.45e-07, "loss": 0.0783, "num_tokens": 14967750.0, "reward": 0.20997636020183563, "reward_std": 0.17668692208826542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024910055100917816, "rewards/penalized_accuracy_reward/std": 0.06806723028421402, "rewards/tag_count_reward/mean": 0.3203125, "rewards/tag_count_reward/std": 0.09319132193922997, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 1581.71875, "completions/mean_terminated_length": 983.36669921875, "completions/min_length": 1004.5, "completions/min_terminated_length": 492.5, "epoch": 0.0655, "grad_norm": 0.21308617293834686, "kl": 0.0009641647338867188, "learning_rate": 6.5e-07, "loss": 0.1033, "num_tokens": 15078948.0, "reward": 0.46626120805740356, "reward_std": 0.25258435495197773, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12473215907812119, "rewards/penalized_accuracy_reward/std": 0.09978580474853516, "rewards/tag_count_reward/mean": 0.43359375, "rewards/tag_count_reward/std": 0.1161738969385624, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1915.75, "completions/max_terminated_length": 1798.5, "completions/mean_length": 
1498.96875, "completions/mean_terminated_length": 1296.0446472167969, "completions/min_length": 783.5, "completions/min_terminated_length": 783.5, "epoch": 0.066, "grad_norm": 0.2749808132648468, "kl": 0.001255035400390625, "learning_rate": 6.55e-07, "loss": 0.1157, "num_tokens": 15182786.0, "reward": 0.3341115340590477, "reward_std": 0.3238433711230755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049868266098201275, "rewards/penalized_accuracy_reward/std": 0.13030629977583885, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.15987918712198734, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1849.75, "completions/max_terminated_length": 1293.25, "completions/mean_length": 1415.875, "completions/mean_terminated_length": 802.9453735351562, "completions/min_length": 1018.75, "completions/min_terminated_length": 506.75, "epoch": 0.0665, "grad_norm": 0.3283509910106659, "kl": 0.0020961761474609375, "learning_rate": 6.6e-07, "loss": 0.0946, "num_tokens": 15283258.0, "reward": 0.23046875, "reward_std": 0.06775708682835102, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.1355141755193472, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.5, "completions/mean_length": 1591.515625, "completions/mean_terminated_length": 1235.6466064453125, "completions/min_length": 644.25, "completions/min_terminated_length": 644.25, "epoch": 0.067, "grad_norm": 0.2735710144042969, "kl": 0.002254486083984375, "learning_rate": 6.65e-07, "loss": 0.1083, "num_tokens": 15393243.0, "reward": 0.43547965586185455, "reward_std": 0.42237590439617634, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11227107793092728, "rewards/penalized_accuracy_reward/std": 0.18481221050024033, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.13269630074501038, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 1933.609375, "completions/mean_terminated_length": 1126.7708435058594, "completions/min_length": 1201.75, "completions/min_terminated_length": 689.75, "epoch": 0.0675, "grad_norm": 0.23296479880809784, "kl": 0.001667022705078125, "learning_rate": 6.7e-07, "loss": 0.0812, "num_tokens": 15528162.0, "reward": 0.169921875, "reward_std": 0.07610558345913887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.15221116691827774, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.75, "completions/mean_length": 1369.984375, "completions/mean_terminated_length": 1119.4087677001953, "completions/min_length": 618.75, "completions/min_terminated_length": 618.75, "epoch": 0.068, "grad_norm": 0.30840855836868286, "kl": 0.001628875732421875, "learning_rate": 6.75e-07, "loss": 0.1525, "num_tokens": 15624273.0, "reward": 0.248046875, "reward_std": 0.10399307496845722, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.20798614993691444, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1378.75, "completions/mean_length": 1784.21875, "completions/mean_terminated_length": 862.4757080078125, "completions/min_length": 1036.25, "completions/min_terminated_length": 524.25, "epoch": 0.0685, "grad_norm": 0.2548421621322632, "kl": 0.001556396484375, "learning_rate": 6.800000000000001e-07, "loss": 0.1476, "num_tokens": 15748303.0, "reward": 0.181640625, "reward_std": 0.06215548701584339, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.12431098148226738, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1869.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 1403.15625, "completions/mean_terminated_length": 1227.5001525878906, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 0.069, "grad_norm": 0.38195690512657166, "kl": 0.001800537109375, "learning_rate": 6.85e-07, "loss": 0.0838, "num_tokens": 15849209.0, "reward": 0.44181180000305176, "reward_std": 0.47902366891503334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09981214161962271, "rewards/penalized_accuracy_reward/std": 0.2136572152376175, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.19208712875843048, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.5, "completions/mean_length": 1904.875, "completions/mean_terminated_length": 1643.71875, "completions/min_length": 1273.0, "completions/min_terminated_length": 1273.0, "epoch": 0.0695, "grad_norm": 0.26629582047462463, "kl": 0.0015468597412109375, "learning_rate": 6.9e-07, "loss": 0.0714, "num_tokens": 15981201.0, 
"reward": 0.173828125, "reward_std": 0.05513381212949753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.11026762798428535, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.5, "completions/mean_length": 1920.75, "completions/mean_terminated_length": 1359.3750305175781, "completions/min_length": 1013.75, "completions/min_terminated_length": 1013.75, "epoch": 0.07, "grad_norm": 0.2456037700176239, "kl": 0.0015077590942382812, "learning_rate": 6.949999999999999e-07, "loss": 0.1005, "num_tokens": 16114417.0, "reward": 0.1640625, "reward_std": 0.06704802066087723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.1340960431843996, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1897.75, "completions/max_terminated_length": 1851.25, "completions/mean_length": 1149.25, "completions/mean_terminated_length": 1072.0823059082031, "completions/min_length": 519.75, "completions/min_terminated_length": 519.75, "epoch": 0.0705, "grad_norm": 0.32963475584983826, "kl": 0.002094268798828125, "learning_rate": 7e-07, "loss": 0.1786, "num_tokens": 16195153.0, "reward": 0.35072392225265503, "reward_std": 0.20901594124734402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024971334263682365, "rewards/penalized_accuracy_reward/std": 0.06823467463254929, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.20470929145812988, "step": 141 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.75, "completions/mean_length": 1395.796875, "completions/mean_terminated_length": 1247.226318359375, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.071, "grad_norm": 0.2894320487976074, "kl": 0.0016689300537109375, "learning_rate": 7.049999999999999e-07, "loss": 0.1214, "num_tokens": 16293140.0, "reward": 0.28819409757852554, "reward_std": 0.2382927080616355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024956420995295048, "rewards/penalized_accuracy_reward/std": 0.09982568770647049, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.12119786068797112, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 1584.109375, "completions/mean_terminated_length": 1275.9847412109375, "completions/min_length": 661.25, "completions/min_terminated_length": 661.25, "epoch": 0.0715, "grad_norm": 0.2785470485687256, "kl": 0.002246856689453125, "learning_rate": 7.1e-07, "loss": 0.1735, "num_tokens": 16402955.0, "reward": 0.2534629926085472, "reward_std": 0.17556234635412693, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01247368287295103, "rewards/penalized_accuracy_reward/std": 0.04989473149180412, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.17207948118448257, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1946.25, "completions/max_terminated_length": 1784.25, "completions/mean_length": 1480.84375, "completions/mean_terminated_length": 1253.6375122070312, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 0.072, "grad_norm": 
0.3683241009712219, "kl": 0.003154754638671875, "learning_rate": 7.149999999999999e-07, "loss": 0.1427, "num_tokens": 16506017.0, "reward": 0.3054276555776596, "reward_std": 0.23148823902010918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037479449063539505, "rewards/penalized_accuracy_reward/std": 0.08057840168476105, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.19882777333259583, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1954.5, "completions/max_terminated_length": 1635.25, "completions/mean_length": 1220.78125, "completions/mean_terminated_length": 1013.6852874755859, "completions/min_length": 569.5, "completions/min_terminated_length": 569.5, "epoch": 0.0725, "grad_norm": 0.32292768359184265, "kl": 0.0015716552734375, "learning_rate": 7.2e-07, "loss": 0.1933, "num_tokens": 16593219.0, "reward": 0.3599045127630234, "reward_std": 0.2351843435317278, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03737413138151169, "rewards/penalized_accuracy_reward/std": 0.08035197108983994, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.17927220836281776, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1921.0, "completions/max_terminated_length": 1311.25, "completions/mean_length": 1549.578125, "completions/mean_terminated_length": 898.5312805175781, "completions/min_length": 1130.0, "completions/min_terminated_length": 618.0, "epoch": 0.073, "grad_norm": 0.24699978530406952, "kl": 0.0021305084228515625, "learning_rate": 7.249999999999999e-07, "loss": 0.1173, "num_tokens": 16700456.0, "reward": 0.205078125, "reward_std": 0.052845509722828865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.10569102317094803, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 1735.25, "completions/max_terminated_length": 1602.25, "completions/mean_length": 1485.828125, "completions/mean_terminated_length": 1252.8028869628906, "completions/min_length": 845.75, "completions/min_terminated_length": 845.75, "epoch": 0.0735, "grad_norm": 0.3038204610347748, "kl": 0.002033233642578125, "learning_rate": 7.3e-07, "loss": 0.0369, "num_tokens": 16804317.0, "reward": 0.24609375, "reward_std": 0.07592839002609253, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4921875, "rewards/tag_count_reward/std": 0.1518567819148302, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1312.25, "completions/mean_length": 1430.84375, "completions/mean_terminated_length": 808.4471845626831, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.074, "grad_norm": 0.2971806228160858, "kl": 0.002056121826171875, "learning_rate": 7.35e-07, "loss": 0.1785, "num_tokens": 16906755.0, "reward": 0.232421875, "reward_std": 0.09662540815770626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46484375, "rewards/tag_count_reward/std": 0.19325081631541252, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1963.25, "completions/max_terminated_length": 1717.0, "completions/mean_length": 1579.265625, "completions/mean_terminated_length": 1284.1846618652344, 
"completions/min_length": 844.5, "completions/min_terminated_length": 844.5, "epoch": 0.0745, "grad_norm": 0.2700496017932892, "kl": 0.0020580291748046875, "learning_rate": 7.4e-07, "loss": 0.1122, "num_tokens": 17016164.0, "reward": 0.43358784914016724, "reward_std": 0.26486230455338955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11230173707008362, "rewards/penalized_accuracy_reward/std": 0.10228895395994186, "rewards/tag_count_reward/mean": 0.41796875, "rewards/tag_count_reward/std": 0.1271854229271412, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1459.0, "completions/max_terminated_length": 1116.75, "completions/mean_length": 944.515625, "completions/mean_terminated_length": 827.7919769287109, "completions/min_length": 546.75, "completions/min_terminated_length": 546.75, "epoch": 0.075, "grad_norm": 0.3653740882873535, "kl": 0.0030193328857421875, "learning_rate": 7.45e-07, "loss": 0.0648, "num_tokens": 17084853.0, "reward": 0.3600284531712532, "reward_std": 0.3503040336072445, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374361015856266, "rewards/penalized_accuracy_reward/std": 0.149744413793087, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.20414938032627106, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1980.75, "completions/max_terminated_length": 1728.75, "completions/mean_length": 1380.4375, "completions/mean_terminated_length": 1106.1722869873047, "completions/min_length": 537.25, "completions/min_terminated_length": 537.25, "epoch": 0.0755, "grad_norm": 0.3014363646507263, "kl": 0.0023040771484375, "learning_rate": 7.5e-07, "loss": 0.1557, "num_tokens": 17180145.0, "reward": 0.2671721652150154, "reward_std": 0.192220414057374, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249233353883028, "rewards/penalized_accuracy_reward/std": 0.04996933415532112, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.19334780424833298, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1884.5, "completions/max_terminated_length": 1812.0, "completions/mean_length": 1517.5, "completions/mean_terminated_length": 1391.0812683105469, "completions/min_length": 980.75, "completions/min_terminated_length": 980.75, "epoch": 0.076, "grad_norm": 0.3830300271511078, "kl": 0.001800537109375, "learning_rate": 7.55e-07, "loss": 0.0148, "num_tokens": 17287041.0, "reward": 0.2109375, "reward_std": 0.06231795810163021, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.12463591620326042, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1566.3125, "completions/mean_terminated_length": 1148.4346160888672, "completions/min_length": 633.75, "completions/min_terminated_length": 633.75, "epoch": 0.0765, "grad_norm": 0.27950647473335266, "kl": 0.003780364990234375, "learning_rate": 7.599999999999999e-07, "loss": 0.1765, "num_tokens": 17396789.0, "reward": 0.2632182613015175, "reward_std": 0.18906253017485142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012468506582081318, "rewards/penalized_accuracy_reward/std": 0.04987403005361557, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.21531551703810692, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1107.25, 
"completions/max_terminated_length": 1041.75, "completions/mean_length": 911.890625, "completions/mean_terminated_length": 831.265625, "completions/min_length": 597.25, "completions/min_terminated_length": 597.25, "epoch": 0.077, "grad_norm": 1.0197837352752686, "kl": 0.0072307586669921875, "learning_rate": 7.65e-07, "loss": 0.0367, "num_tokens": 17462654.0, "reward": 0.30859375, "reward_std": 0.08285845257341862, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.16571690514683723, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2034.25, "completions/max_terminated_length": 1964.75, "completions/mean_length": 1548.875, "completions/mean_terminated_length": 1271.1875, "completions/min_length": 808.5, "completions/min_terminated_length": 808.5, "epoch": 0.0775, "grad_norm": 0.3156202733516693, "kl": 0.0028247833251953125, "learning_rate": 7.699999999999999e-07, "loss": 0.1616, "num_tokens": 17572150.0, "reward": 0.2265625, "reward_std": 0.09165986254811287, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.18331972509622574, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1790.5, "completions/mean_length": 1621.125, "completions/mean_terminated_length": 1211.6666870117188, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.078, "grad_norm": 0.2818986773490906, "kl": 0.0025787353515625, "learning_rate": 7.75e-07, "loss": 0.1764, "num_tokens": 17685406.0, "reward": 0.23785623162984848, "reward_std": 
0.17626497149467468, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482804246246815, "rewards/penalized_accuracy_reward/std": 0.04993121698498726, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.18840648978948593, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.5, "completions/mean_length": 1587.390625, "completions/mean_terminated_length": 1099.3950500488281, "completions/min_length": 685.25, "completions/min_terminated_length": 685.25, "epoch": 0.0785, "grad_norm": 0.27965351939201355, "kl": 0.003704071044921875, "learning_rate": 7.799999999999999e-07, "loss": 0.2333, "num_tokens": 17797495.0, "reward": 0.24369525909423828, "reward_std": 0.18748999200761318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012472628615796566, "rewards/penalized_accuracy_reward/std": 0.049890514463186264, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1860143579542637, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1872.25, "completions/max_terminated_length": 1692.25, "completions/mean_length": 1563.34375, "completions/mean_terminated_length": 1313.7448120117188, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 0.079, "grad_norm": 0.2732720971107483, "kl": 0.0025787353515625, "learning_rate": 7.85e-07, "loss": 0.1006, "num_tokens": 17915037.0, "reward": 0.3764696419239044, "reward_std": 0.2655080817639828, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07495356351137161, "rewards/penalized_accuracy_reward/std": 0.09993808716535568, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.17698714137077332, "step": 158 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1813.5, "completions/max_terminated_length": 1558.0, "completions/mean_length": 1026.203125, "completions/mean_terminated_length": 897.2958679199219, "completions/min_length": 539.25, "completions/min_terminated_length": 539.25, "epoch": 0.0795, "grad_norm": 0.35080721974372864, "kl": 0.0057544708251953125, "learning_rate": 7.9e-07, "loss": 0.0954, "num_tokens": 17990122.0, "reward": 0.5310700312256813, "reward_std": 0.5000581722706556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11221470218151808, "rewards/penalized_accuracy_reward/std": 0.22580703347921371, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.21189307793974876, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1701.890625, "completions/mean_terminated_length": 1166.0833740234375, "completions/min_length": 612.75, "completions/min_terminated_length": 612.75, "epoch": 0.08, "grad_norm": 0.23547233641147614, "kl": 0.0020904541015625, "learning_rate": 7.95e-07, "loss": 0.0771, "num_tokens": 18108227.0, "reward": 0.3471536338329315, "reward_std": 0.2671518735587597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06224869564175606, "rewards/penalized_accuracy_reward/std": 0.095357745885849, "rewards/tag_count_reward/mean": 0.4453125, "rewards/tag_count_reward/std": 0.20506444945931435, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1987.75, "completions/max_terminated_length": 1820.75, "completions/mean_length": 1464.640625, "completions/mean_terminated_length": 1161.8042907714844, "completions/min_length": 468.75, "completions/min_terminated_length": 468.75, "epoch": 0.0805, 
"grad_norm": 0.2914004325866699, "kl": 0.00415802001953125, "learning_rate": 8e-07, "loss": 0.1699, "num_tokens": 18211420.0, "reward": 0.3140275478363037, "reward_std": 0.19475942105054855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482525780797005, "rewards/penalized_accuracy_reward/std": 0.04993010312318802, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.24033771082758904, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.25, "completions/mean_length": 1619.6875, "completions/mean_terminated_length": 1398.8984680175781, "completions/min_length": 949.75, "completions/min_terminated_length": 949.75, "epoch": 0.081, "grad_norm": 0.24877753853797913, "kl": 0.0027866363525390625, "learning_rate": 8.05e-07, "loss": 0.1761, "num_tokens": 18323000.0, "reward": 0.232421875, "reward_std": 0.09987804852426052, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46484375, "rewards/tag_count_reward/std": 0.19975610822439194, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1892.5, "completions/mean_length": 1480.46875, "completions/mean_terminated_length": 1169.1864624023438, "completions/min_length": 548.75, "completions/min_terminated_length": 548.75, "epoch": 0.0815, "grad_norm": 0.24062517285346985, "kl": 0.002788543701171875, "learning_rate": 8.1e-07, "loss": 0.1943, "num_tokens": 18426838.0, "reward": 0.4056006819009781, "reward_std": 0.36471516638994217, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07487065251916647, "rewards/penalized_accuracy_reward/std": 
0.14548716321587563, "rewards/tag_count_reward/mean": 0.51171875, "rewards/tag_count_reward/std": 0.23399199172854424, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1856.75, "completions/max_terminated_length": 1784.75, "completions/mean_length": 1441.53125, "completions/mean_terminated_length": 1230.734375, "completions/min_length": 688.25, "completions/min_terminated_length": 688.25, "epoch": 0.082, "grad_norm": 0.27475443482398987, "kl": 0.0041866302490234375, "learning_rate": 8.149999999999999e-07, "loss": 0.11, "num_tokens": 18530616.0, "reward": 0.2769148200750351, "reward_std": 0.18498349748551846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480847537517548, "rewards/penalized_accuracy_reward/std": 0.04992339015007019, "rewards/tag_count_reward/mean": 0.50390625, "rewards/tag_count_reward/std": 0.19902971014380455, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.75, "completions/mean_length": 1647.796875, "completions/mean_terminated_length": 1215.2386779785156, "completions/min_length": 456.5, "completions/min_terminated_length": 456.5, "epoch": 0.0825, "grad_norm": 0.21334032714366913, "kl": 0.00247955322265625, "learning_rate": 8.199999999999999e-07, "loss": 0.1998, "num_tokens": 18644587.0, "reward": 0.33616212010383606, "reward_std": 0.27161361649632454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04991700500249863, "rewards/penalized_accuracy_reward/std": 0.08929425477981567, "rewards/tag_count_reward/mean": 0.47265625, "rewards/tag_count_reward/std": 0.2178027704358101, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1703.75, "completions/max_terminated_length": 1667.75, "completions/mean_length": 1471.390625, 
"completions/mean_terminated_length": 1227.40625, "completions/min_length": 776.5, "completions/min_terminated_length": 776.5, "epoch": 0.083, "grad_norm": 0.38718321919441223, "kl": 0.00545501708984375, "learning_rate": 8.249999999999999e-07, "loss": 0.1208, "num_tokens": 18746660.0, "reward": 0.41879934072494507, "reward_std": 0.28887110762298107, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08732935786247253, "rewards/penalized_accuracy_reward/std": 0.10226975381374359, "rewards/tag_count_reward/mean": 0.48828125, "rewards/tag_count_reward/std": 0.1779608093202114, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1807.25, "completions/max_terminated_length": 1419.5, "completions/mean_length": 1123.453125, "completions/mean_terminated_length": 972.1384429931641, "completions/min_length": 582.5, "completions/min_terminated_length": 582.5, "epoch": 0.0835, "grad_norm": 0.411729097366333, "kl": 0.005779266357421875, "learning_rate": 8.299999999999999e-07, "loss": 0.179, "num_tokens": 18828033.0, "reward": 0.3218102902173996, "reward_std": 0.19224550388753414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012467646040022373, "rewards/penalized_accuracy_reward/std": 0.04987058416008949, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.2370493747293949, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.75, "completions/mean_length": 1704.34375, "completions/mean_terminated_length": 1399.2291870117188, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.084, "grad_norm": 0.2760254740715027, "kl": 0.00384521484375, "learning_rate": 8.349999999999999e-07, "loss": 0.1377, "num_tokens": 18945479.0, "reward": 0.28430207818746567, 
"reward_std": 0.2327622827142477, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024963537231087685, "rewards/penalized_accuracy_reward/std": 0.06821337342262268, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.23482176288962364, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1699.75, "completions/mean_length": 1687.359375, "completions/mean_terminated_length": 1206.9878540039062, "completions/min_length": 759.5, "completions/min_terminated_length": 759.5, "epoch": 0.0845, "grad_norm": 0.22638848423957825, "kl": 0.0034332275390625, "learning_rate": 8.399999999999999e-07, "loss": 0.1367, "num_tokens": 19064190.0, "reward": 0.23046875, "reward_std": 0.12507586739957333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4609375, "rewards/tag_count_reward/std": 0.25015174224972725, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1732.25, "completions/max_terminated_length": 1309.25, "completions/mean_length": 922.578125, "completions/mean_terminated_length": 828.6453704833984, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.085, "grad_norm": 0.3714368939399719, "kl": 0.00711822509765625, "learning_rate": 8.45e-07, "loss": 0.0994, "num_tokens": 19131747.0, "reward": 0.4069758653640747, "reward_std": 0.2454173006117344, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037472307682037354, "rewards/penalized_accuracy_reward/std": 0.08056304603815079, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.20624671503901482, "step": 170 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.359375, "completions/max_length": 1741.25, "completions/max_terminated_length": 1197.0, "completions/mean_length": 1187.203125, "completions/mean_terminated_length": 544.1161956787109, "completions/min_length": 716.25, "completions/min_terminated_length": 204.25, "epoch": 0.0855, "grad_norm": 0.4808145761489868, "kl": 0.00655364990234375, "learning_rate": 8.499999999999999e-07, "loss": 0.1826, "num_tokens": 19216592.0, "reward": 0.28515625, "reward_std": 0.10761208459734917, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.21522418037056923, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1682.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 1062.078125, "completions/mean_terminated_length": 958.4031524658203, "completions/min_length": 613.25, "completions/min_terminated_length": 613.25, "epoch": 0.086, "grad_norm": 0.3941885530948639, "kl": 0.00701141357421875, "learning_rate": 8.55e-07, "loss": 0.0262, "num_tokens": 19292725.0, "reward": 0.451317198574543, "reward_std": 0.3328010216355324, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04987734742462635, "rewards/penalized_accuracy_reward/std": 0.13629087060689926, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.22543026134371758, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1413.25, "completions/mean_length": 1905.40625, "completions/mean_terminated_length": 1161.5555725097656, "completions/min_length": 1319.0, "completions/min_terminated_length": 807.0, "epoch": 0.0865, "grad_norm": 0.21743524074554443, "kl": 
0.00298309326171875, "learning_rate": 8.599999999999999e-07, "loss": 0.0754, "num_tokens": 19430351.0, "reward": 0.189453125, "reward_std": 0.0776875913143158, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.1553751826286316, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 1575.671875, "completions/mean_terminated_length": 1209.8083801269531, "completions/min_length": 804.5, "completions/min_terminated_length": 804.5, "epoch": 0.087, "grad_norm": 0.2772214412689209, "kl": 0.00408172607421875, "learning_rate": 8.65e-07, "loss": 0.1379, "num_tokens": 19541338.0, "reward": 0.35011133551597595, "reward_std": 0.26436256617307663, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037360358983278275, "rewards/penalized_accuracy_reward/std": 0.08032236993312836, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.2791518606245518, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.25, "completions/mean_length": 1307.671875, "completions/mean_terminated_length": 1047.3551788330078, "completions/min_length": 450.5, "completions/min_terminated_length": 450.5, "epoch": 0.0875, "grad_norm": 0.2925030291080475, "kl": 0.004718780517578125, "learning_rate": 8.699999999999999e-07, "loss": 0.1638, "num_tokens": 19633141.0, "reward": 0.6544082015752792, "reward_std": 0.6043625138700008, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16216504201292992, "rewards/penalized_accuracy_reward/std": 0.26312603056430817, 
"rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.30698979645967484, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2018.5, "completions/max_terminated_length": 1807.25, "completions/mean_length": 1296.203125, "completions/mean_terminated_length": 1167.3385467529297, "completions/min_length": 585.75, "completions/min_terminated_length": 585.75, "epoch": 0.088, "grad_norm": 0.31008630990982056, "kl": 0.00592041015625, "learning_rate": 8.75e-07, "loss": 0.1253, "num_tokens": 19725714.0, "reward": 0.7451330125331879, "reward_std": 0.3549316469579935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.23877744749188423, "rewards/penalized_accuracy_reward/std": 0.14223450049757957, "rewards/tag_count_reward/mean": 0.53515625, "rewards/tag_count_reward/std": 0.1955747827887535, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2035.5, "completions/max_terminated_length": 1790.25, "completions/mean_length": 1449.9375, "completions/mean_terminated_length": 1059.4107360839844, "completions/min_length": 570.25, "completions/min_terminated_length": 570.25, "epoch": 0.0885, "grad_norm": 0.27184951305389404, "kl": 0.004734039306640625, "learning_rate": 8.799999999999999e-07, "loss": 0.1647, "num_tokens": 19825214.0, "reward": 0.3755781501531601, "reward_std": 0.2558440584689379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03739845007658005, "rewards/penalized_accuracy_reward/std": 0.08040428906679153, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.24985554441809654, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.25, "completions/mean_length": 1136.203125, 
"completions/mean_terminated_length": 962.530403137207, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.089, "grad_norm": 0.3713781237602234, "kl": 0.00867462158203125, "learning_rate": 8.85e-07, "loss": 0.2498, "num_tokens": 19908091.0, "reward": 0.53424072265625, "reward_std": 0.3175358548760414, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.087432861328125, "rewards/penalized_accuracy_reward/std": 0.10239089280366898, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.24137605354189873, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1964.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1363.53125, "completions/mean_terminated_length": 1148.5391998291016, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.0895, "grad_norm": 0.2737806737422943, "kl": 0.004940032958984375, "learning_rate": 8.9e-07, "loss": 0.1523, "num_tokens": 20003613.0, "reward": 0.33312833309173584, "reward_std": 0.22119545377790928, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496260590851307, "rewards/penalized_accuracy_reward/std": 0.06821084022521973, "rewards/tag_count_reward/mean": 0.56640625, "rewards/tag_count_reward/std": 0.21909063681960106, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1563.75, "completions/mean_length": 1200.0, "completions/mean_terminated_length": 980.5269470214844, "completions/min_length": 423.5, "completions/min_terminated_length": 423.5, "epoch": 0.09, "grad_norm": 0.4590481221675873, "kl": 0.009883880615234375, "learning_rate": 8.95e-07, "loss": 0.2111, "num_tokens": 20088461.0, "reward": 0.49707064032554626, "reward_std": 0.399056077003479, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08740250580012798, "rewards/penalized_accuracy_reward/std": 0.14982453361153603, "rewards/tag_count_reward/mean": 0.64453125, "rewards/tag_count_reward/std": 0.2766445428133011, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.5, "completions/mean_length": 1373.34375, "completions/mean_terminated_length": 1069.9113464355469, "completions/min_length": 485.5, "completions/min_terminated_length": 485.5, "epoch": 0.0905, "grad_norm": 0.2885509431362152, "kl": 0.0055999755859375, "learning_rate": 9e-07, "loss": 0.2371, "num_tokens": 20184115.0, "reward": 0.3932991027832031, "reward_std": 0.2802636418491602, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746986389160156, "rewards/penalized_accuracy_reward/std": 0.08055779337882996, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.2847995422780514, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.75, "completions/mean_length": 1579.03125, "completions/mean_terminated_length": 1220.3045349121094, "completions/min_length": 825.75, "completions/min_terminated_length": 825.75, "epoch": 0.091, "grad_norm": 0.5139794945716858, "kl": 0.0074615478515625, "learning_rate": 9.05e-07, "loss": 0.1901, "num_tokens": 20296181.0, "reward": 0.40749672055244446, "reward_std": 0.31171809881925583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07484211027622223, "rewards/penalized_accuracy_reward/std": 0.09978951513767242, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.25379306077957153, "step": 182 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1656.5, "completions/mean_length": 1523.390625, "completions/mean_terminated_length": 1054.175033569336, "completions/min_length": 580.75, "completions/min_terminated_length": 580.75, "epoch": 0.0915, "grad_norm": 0.292393296957016, "kl": 0.004425048828125, "learning_rate": 9.1e-07, "loss": 0.1171, "num_tokens": 20406014.0, "reward": 0.5628294050693512, "reward_std": 0.3595346547663212, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14957876410335302, "rewards/penalized_accuracy_reward/std": 0.1453448310494423, "rewards/tag_count_reward/mean": 0.52734375, "rewards/tag_count_reward/std": 0.24343108385801315, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1534.25, "completions/mean_length": 1418.53125, "completions/mean_terminated_length": 997.6583633422852, "completions/min_length": 624.75, "completions/min_terminated_length": 624.75, "epoch": 0.092, "grad_norm": 0.3611016869544983, "kl": 0.00853729248046875, "learning_rate": 9.15e-07, "loss": 0.3404, "num_tokens": 20506192.0, "reward": 0.3199012652039528, "reward_std": 0.1938423588871956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489695101976395, "rewards/penalized_accuracy_reward/std": 0.04995878413319588, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.22971272096037865, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1379.0, "completions/max_terminated_length": 1334.5, "completions/mean_length": 964.4375, "completions/mean_terminated_length": 749.8638610839844, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.0925, "grad_norm": 0.5784435868263245, "kl": 
0.0134735107421875, "learning_rate": 9.2e-07, "loss": 0.0462, "num_tokens": 20576492.0, "reward": 0.6778072118759155, "reward_std": 0.20784076675772667, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17484110593795776, "rewards/penalized_accuracy_reward/std": 0.06825102120637894, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.21652427315711975, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1966.0, "completions/max_terminated_length": 1688.25, "completions/mean_length": 1278.828125, "completions/mean_terminated_length": 1066.9510803222656, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.093, "grad_norm": 0.45320242643356323, "kl": 0.00629425048828125, "learning_rate": 9.25e-07, "loss": 0.1515, "num_tokens": 20667585.0, "reward": 0.34471653401851654, "reward_std": 0.2285569440573454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02489732950925827, "rewards/penalized_accuracy_reward/std": 0.06803245842456818, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.23127005994319916, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1833.75, "completions/max_terminated_length": 1279.25, "completions/mean_length": 1287.6875, "completions/mean_terminated_length": 747.8616333007812, "completions/min_length": 835.5, "completions/min_terminated_length": 323.5, "epoch": 0.0935, "grad_norm": 0.26031494140625, "kl": 0.005245208740234375, "learning_rate": 9.3e-07, "loss": 0.0364, "num_tokens": 20756989.0, "reward": 0.379628986120224, "reward_std": 0.22798337787389755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374707356095314, "rewards/penalized_accuracy_reward/std": 
0.08055967092514038, "rewards/tag_count_reward/mean": 0.609375, "rewards/tag_count_reward/std": 0.18777992948889732, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1989.25, "completions/max_terminated_length": 1522.5, "completions/mean_length": 1254.5, "completions/mean_terminated_length": 1047.3046417236328, "completions/min_length": 571.5, "completions/min_terminated_length": 571.5, "epoch": 0.094, "grad_norm": 0.26525816321372986, "kl": 0.006336212158203125, "learning_rate": 9.35e-07, "loss": 0.1652, "num_tokens": 20845021.0, "reward": 0.5773632228374481, "reward_std": 0.6327668204903603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1373144192621112, "rewards/penalized_accuracy_reward/std": 0.2942996621131897, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.21332840621471405, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1792.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 1350.1875, "completions/mean_terminated_length": 1041.9094848632812, "completions/min_length": 452.5, "completions/min_terminated_length": 452.5, "epoch": 0.0945, "grad_norm": 0.29812803864479065, "kl": 0.00811004638671875, "learning_rate": 9.399999999999999e-07, "loss": 0.1798, "num_tokens": 20944153.0, "reward": 0.3751583993434906, "reward_std": 0.2516368478536606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0498838871717453, "rewards/penalized_accuracy_reward/std": 0.08923505246639252, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.20309358090162277, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1524.28125, 
"completions/mean_terminated_length": 1307.8786010742188, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.095, "grad_norm": 0.22715386748313904, "kl": 0.004901885986328125, "learning_rate": 9.45e-07, "loss": 0.1562, "num_tokens": 21052059.0, "reward": 0.30859375, "reward_std": 0.14489952102303505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.2897990569472313, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2025.75, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1178.828125, "completions/mean_terminated_length": 951.0472106933594, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.0955, "grad_norm": 0.3521711230278015, "kl": 0.00656890869140625, "learning_rate": 9.499999999999999e-07, "loss": 0.1955, "num_tokens": 21136064.0, "reward": 0.4069746881723404, "reward_std": 0.32097572833299637, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747171629220247, "rewards/penalized_accuracy_reward/std": 0.11821791902184486, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.26693839952349663, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1478.0, "completions/max_terminated_length": 1309.5, "completions/mean_length": 826.84375, "completions/mean_terminated_length": 798.3948211669922, "completions/min_length": 402.75, "completions/min_terminated_length": 402.75, "epoch": 0.096, "grad_norm": 0.6751384139060974, "kl": 0.01691436767578125, "learning_rate": 9.55e-07, "loss": -0.0176, "num_tokens": 21196886.0, "reward": 0.49912434816360474, "reward_std": 0.34702117554843426, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07475748844444752, "rewards/penalized_accuracy_reward/std": 0.1452994979918003, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.22779318317770958, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1958.5, "completions/max_terminated_length": 1450.5, "completions/mean_length": 1248.15625, "completions/mean_terminated_length": 967.2127685546875, "completions/min_length": 531.25, "completions/min_terminated_length": 531.25, "epoch": 0.0965, "grad_norm": 0.40492069721221924, "kl": 0.00968170166015625, "learning_rate": 9.6e-07, "loss": 0.1545, "num_tokens": 21287520.0, "reward": 0.401055246591568, "reward_std": 0.2648246381431818, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374416820704937, "rewards/penalized_accuracy_reward/std": 0.08049721270799637, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.23647373914718628, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1715.0, "completions/max_terminated_length": 1510.75, "completions/mean_length": 1269.921875, "completions/mean_terminated_length": 1035.171875, "completions/min_length": 714.5, "completions/min_terminated_length": 714.5, "epoch": 0.097, "grad_norm": 0.341296523809433, "kl": 0.00730133056640625, "learning_rate": 9.649999999999999e-07, "loss": 0.0719, "num_tokens": 21375627.0, "reward": 0.3565404489636421, "reward_std": 0.23007201962172985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494991198182106, "rewards/penalized_accuracy_reward/std": 0.06817616522312164, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.18743937648832798, "step": 194 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.40625, "completions/max_length": 1882.25, "completions/max_terminated_length": 1764.5, "completions/mean_length": 1428.796875, "completions/mean_terminated_length": 1312.3303833007812, "completions/min_length": 828.25, "completions/min_terminated_length": 828.25, "epoch": 0.0975, "grad_norm": 0.3552434742450714, "kl": 0.006587982177734375, "learning_rate": 9.7e-07, "loss": 0.0565, "num_tokens": 21477598.0, "reward": 0.38692332804203033, "reward_std": 0.26839711144566536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04990697652101517, "rewards/penalized_accuracy_reward/std": 0.08927632123231888, "rewards/tag_count_reward/mean": 0.57421875, "rewards/tag_count_reward/std": 0.22524796426296234, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1676.25, "completions/mean_length": 1371.390625, "completions/mean_terminated_length": 1080.870849609375, "completions/min_length": 357.25, "completions/min_terminated_length": 357.25, "epoch": 0.098, "grad_norm": 0.3601851463317871, "kl": 0.01062774658203125, "learning_rate": 9.75e-07, "loss": 0.1532, "num_tokens": 21574439.0, "reward": 0.37918415665626526, "reward_std": 0.28313941694796085, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994363710284233, "rewards/penalized_accuracy_reward/std": 0.0893418937921524, "rewards/tag_count_reward/mean": 0.55859375, "rewards/tag_count_reward/std": 0.24444390088319778, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1810.5, "completions/max_terminated_length": 1486.25, "completions/mean_length": 1273.390625, "completions/mean_terminated_length": 1097.2159881591797, "completions/min_length": 677.25, "completions/min_terminated_length": 677.25, "epoch": 0.0985, "grad_norm": 
0.379241406917572, "kl": 0.008068084716796875, "learning_rate": 9.8e-07, "loss": 0.1458, "num_tokens": 21668512.0, "reward": 0.328125, "reward_std": 0.111423060297966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.2228461243212223, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.5, "completions/mean_length": 1496.453125, "completions/mean_terminated_length": 1037.5375366210938, "completions/min_length": 474.25, "completions/min_terminated_length": 474.25, "epoch": 0.099, "grad_norm": 0.27730974555015564, "kl": 0.006710052490234375, "learning_rate": 9.849999999999999e-07, "loss": 0.2784, "num_tokens": 21772013.0, "reward": 0.265625, "reward_std": 0.13597221858799458, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.27194443717598915, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1819.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 1255.0625, "completions/mean_terminated_length": 891.1489715576172, "completions/min_length": 327.5, "completions/min_terminated_length": 327.5, "epoch": 0.0995, "grad_norm": 0.36719828844070435, "kl": 0.00637054443359375, "learning_rate": 9.9e-07, "loss": 0.2092, "num_tokens": 21861265.0, "reward": 0.3203125, "reward_std": 0.11343466117978096, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 
0.22686932235956192, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 1267.5, "completions/mean_terminated_length": 809.1520156860352, "completions/min_length": 365.75, "completions/min_terminated_length": 365.75, "epoch": 0.1, "grad_norm": 0.3793827295303345, "kl": 0.005157470703125, "learning_rate": 9.95e-07, "loss": 0.2571, "num_tokens": 21956033.0, "reward": 0.30078125, "reward_std": 0.12788406386971474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.2557681314647198, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1451.25, "completions/max_terminated_length": 1379.75, "completions/mean_length": 1030.234375, "completions/mean_terminated_length": 929.0291748046875, "completions/min_length": 474.75, "completions/min_terminated_length": 474.75, "epoch": 0.1005, "grad_norm": 0.34132617712020874, "kl": 0.00867462158203125, "learning_rate": 1e-06, "loss": 0.1349, "num_tokens": 22030112.0, "reward": 0.4323435425758362, "reward_std": 0.27051245607435703, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746083378791809, "rewards/penalized_accuracy_reward/std": 0.08053838461637497, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.2565223462879658, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1684.5, "completions/mean_length": 1269.296875, "completions/mean_terminated_length": 1010.5666961669922, "completions/min_length": 548.25, "completions/min_terminated_length": 548.25, "epoch": 0.101, "grad_norm": 
0.33228105306625366, "kl": 0.0115966796875, "learning_rate": 9.999993146109795e-07, "loss": 0.1926, "num_tokens": 22118515.0, "reward": 0.35268695652484894, "reward_std": 0.24598660320043564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024976294487714767, "rewards/penalized_accuracy_reward/std": 0.06824823468923569, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.2641993835568428, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1404.75, "completions/max_terminated_length": 1376.75, "completions/mean_length": 653.09375, "completions/mean_terminated_length": 635.3104248046875, "completions/min_length": 236.25, "completions/min_terminated_length": 236.25, "epoch": 0.1015, "grad_norm": 0.5000954270362854, "kl": 0.0145111083984375, "learning_rate": 9.999972584460056e-07, "loss": 0.024, "num_tokens": 22169321.0, "reward": 0.41950538754463196, "reward_std": 0.18606950528919697, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012487064115703106, "rewards/penalized_accuracy_reward/std": 0.049948256462812424, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.22263088449835777, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1652.5, "completions/max_terminated_length": 1617.5, "completions/mean_length": 1166.03125, "completions/mean_terminated_length": 1027.1509094238281, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 0.102, "grad_norm": 0.2610733211040497, "kl": 0.0077056884765625, "learning_rate": 9.99993831511342e-07, "loss": 0.1612, "num_tokens": 22251899.0, "reward": 0.36092042922973633, "reward_std": 0.1976094339042902, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.012491466477513313, "rewards/penalized_accuracy_reward/std": 0.04996586591005325, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.23463811352849007, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1971.0, "completions/max_terminated_length": 1510.5, "completions/mean_length": 858.328125, "completions/mean_terminated_length": 799.7073211669922, "completions/min_length": 375.5, "completions/min_terminated_length": 375.5, "epoch": 0.1025, "grad_norm": 0.36990928649902344, "kl": 0.0081024169921875, "learning_rate": 9.999890338174275e-07, "loss": 0.1228, "num_tokens": 22316416.0, "reward": 1.012734591960907, "reward_std": 0.6390398778021336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3120313584804535, "rewards/penalized_accuracy_reward/std": 0.29152287542819977, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.24691515415906906, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1638.5, "completions/max_terminated_length": 1282.0, "completions/mean_length": 1098.4375, "completions/mean_terminated_length": 811.5498657226562, "completions/min_length": 524.75, "completions/min_terminated_length": 524.75, "epoch": 0.103, "grad_norm": 0.5252552032470703, "kl": 0.011077880859375, "learning_rate": 9.99982865378877e-07, "loss": 0.1688, "num_tokens": 22399244.0, "reward": 0.275390625, "reward_std": 0.10217466577887535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.20434933342039585, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1691.75, "completions/max_terminated_length": 1213.25, 
"completions/mean_length": 1148.390625, "completions/mean_terminated_length": 813.3055725097656, "completions/min_length": 539.75, "completions/min_terminated_length": 539.75, "epoch": 0.1035, "grad_norm": 0.3766385018825531, "kl": 0.0098724365234375, "learning_rate": 9.999753262144804e-07, "loss": 0.2045, "num_tokens": 22482325.0, "reward": 0.29296875, "reward_std": 0.1172355618327856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.2344711311161518, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1826.5, "completions/mean_length": 1628.125, "completions/mean_terminated_length": 1234.1641693115234, "completions/min_length": 644.25, "completions/min_terminated_length": 644.25, "epoch": 0.104, "grad_norm": 0.2645423412322998, "kl": 0.00494384765625, "learning_rate": 9.999664163472034e-07, "loss": 0.1829, "num_tokens": 22593645.0, "reward": 0.3575945794582367, "reward_std": 0.31268296018242836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04989103972911835, "rewards/penalized_accuracy_reward/std": 0.08924780786037445, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.3083356097340584, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.25, "completions/mean_length": 1301.46875, "completions/mean_terminated_length": 793.1201400756836, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.1045, "grad_norm": 0.33448272943496704, "kl": 0.00582122802734375, "learning_rate": 9.999561358041868e-07, "loss": 0.3334, "num_tokens": 22690139.0, "reward": 
0.3814338147640228, "reward_std": 0.2817476373165846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03739658743143082, "rewards/penalized_accuracy_reward/std": 0.080400250852108, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.2827734872698784, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1824.75, "completions/max_terminated_length": 1481.75, "completions/mean_length": 1069.15625, "completions/mean_terminated_length": 845.8541717529297, "completions/min_length": 340.75, "completions/min_terminated_length": 340.75, "epoch": 0.105, "grad_norm": 0.3587760031223297, "kl": 0.00800323486328125, "learning_rate": 9.99944484616747e-07, "loss": 0.1663, "num_tokens": 22765557.0, "reward": 0.620009183883667, "reward_std": 0.5126272048801184, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14984834380447865, "rewards/penalized_accuracy_reward/std": 0.2336450219154358, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.2114214226603508, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1862.75, "completions/max_terminated_length": 1624.25, "completions/mean_length": 1178.796875, "completions/mean_terminated_length": 1011.9986724853516, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.1055, "grad_norm": 0.2883901000022888, "kl": 0.0064239501953125, "learning_rate": 9.99931462820376e-07, "loss": 0.1076, "num_tokens": 22848552.0, "reward": 0.4551694095134735, "reward_std": 0.3211580589413643, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049850333482027054, "rewards/penalized_accuracy_reward/std": 0.13024071604013443, "rewards/tag_count_reward/mean": 0.7109375, 
"rewards/tag_count_reward/std": 0.2186996005475521, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1360.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 915.5, "completions/mean_terminated_length": 841.7784118652344, "completions/min_length": 336.5, "completions/min_terminated_length": 336.5, "epoch": 0.106, "grad_norm": 0.431331068277359, "kl": 0.01154327392578125, "learning_rate": 9.999170704547398e-07, "loss": -0.0142, "num_tokens": 22916584.0, "reward": 0.42102065682411194, "reward_std": 0.23310044966638088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496344968676567, "rewards/penalized_accuracy_reward/std": 0.06821312755346298, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.2482161857187748, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1697.5, "completions/mean_length": 1570.75, "completions/mean_terminated_length": 970.9187622070312, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.1065, "grad_norm": 0.2520676255226135, "kl": 0.00534820556640625, "learning_rate": 9.999013075636804e-07, "loss": 0.2869, "num_tokens": 23024824.0, "reward": 0.30232710391283035, "reward_std": 0.24239635467529297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491676025092602, "rewards/penalized_accuracy_reward/std": 0.049966707825660706, "rewards/tag_count_reward/mean": 0.5546875, "rewards/tag_count_reward/std": 0.3186086267232895, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1764.25, "completions/max_terminated_length": 1420.25, "completions/mean_length": 1195.515625, "completions/mean_terminated_length": 1036.6619567871094, 
"completions/min_length": 622.25, "completions/min_terminated_length": 622.25, "epoch": 0.107, "grad_norm": 0.3345262110233307, "kl": 0.01030731201171875, "learning_rate": 9.998841741952141e-07, "loss": 0.1155, "num_tokens": 23108601.0, "reward": 0.4240236207842827, "reward_std": 0.3567482568323612, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049902429804205894, "rewards/penalized_accuracy_reward/std": 0.13635940849781036, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.24564043059945107, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 1520.484375, "completions/mean_terminated_length": 951.1166687011719, "completions/min_length": 523.25, "completions/min_terminated_length": 523.25, "epoch": 0.1075, "grad_norm": 0.27261942625045776, "kl": 0.006443023681640625, "learning_rate": 9.998656704015323e-07, "loss": 0.2158, "num_tokens": 23215352.0, "reward": 0.2421875, "reward_std": 0.10811270773410797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.2162254210561514, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1933.25, "completions/max_terminated_length": 1691.5, "completions/mean_length": 1296.40625, "completions/mean_terminated_length": 998.3584442138672, "completions/min_length": 416.5, "completions/min_terminated_length": 416.5, "epoch": 0.108, "grad_norm": 0.36814162135124207, "kl": 0.0085906982421875, "learning_rate": 9.998457962390008e-07, "loss": 0.1963, "num_tokens": 23307378.0, "reward": 0.34331846237182617, "reward_std": 0.22249481081962585, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012479545548558235, "rewards/penalized_accuracy_reward/std": 0.04991818591952324, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.28611917793750763, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 1332.5625, "completions/mean_terminated_length": 901.7310943603516, "completions/min_length": 446.5, "completions/min_terminated_length": 446.5, "epoch": 0.1085, "grad_norm": 0.3440226912498474, "kl": 0.01078033447265625, "learning_rate": 9.998245517681593e-07, "loss": 0.3113, "num_tokens": 23400422.0, "reward": 0.302734375, "reward_std": 0.14336878806352615, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.2867375798523426, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 1472.859375, "completions/mean_terminated_length": 1185.4214630126953, "completions/min_length": 563.5, "completions/min_terminated_length": 563.5, "epoch": 0.109, "grad_norm": 0.28158628940582275, "kl": 0.00545501708984375, "learning_rate": 9.998019370537227e-07, "loss": 0.1519, "num_tokens": 23502429.0, "reward": 0.28125, "reward_std": 0.1410377211868763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.2820754423737526, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1865.25, "completions/max_terminated_length": 
1561.0, "completions/mean_length": 1034.46875, "completions/mean_terminated_length": 900.5000457763672, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.1095, "grad_norm": 0.35891857743263245, "kl": 0.0078582763671875, "learning_rate": 9.997779521645791e-07, "loss": 0.1006, "num_tokens": 23579211.0, "reward": 0.341796875, "reward_std": 0.12201843038201332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.24403686448931694, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1564.5, "completions/mean_length": 1499.0625, "completions/mean_terminated_length": 1190.2625274658203, "completions/min_length": 901.25, "completions/min_terminated_length": 901.25, "epoch": 0.11, "grad_norm": 0.32783108949661255, "kl": 0.00742340087890625, "learning_rate": 9.997525971737909e-07, "loss": 0.1834, "num_tokens": 23684063.0, "reward": 0.29644108563661575, "reward_std": 0.19655951112508774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012478355318307877, "rewards/penalized_accuracy_reward/std": 0.049913424998521805, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.23530085384845734, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1952.75, "completions/max_terminated_length": 1715.25, "completions/mean_length": 1272.546875, "completions/mean_terminated_length": 1112.9547729492188, "completions/min_length": 384.5, "completions/min_terminated_length": 384.5, "epoch": 0.1105, "grad_norm": 0.3169563114643097, "kl": 0.0059814453125, "learning_rate": 9.997258721585931e-07, "loss": 0.2299, "num_tokens": 23777346.0, "reward": 
0.376953125, "reward_std": 0.1366262473165989, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.2732525020837784, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1931.75, "completions/max_terminated_length": 1848.5, "completions/mean_length": 1490.9375, "completions/mean_terminated_length": 1161.8819580078125, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.111, "grad_norm": 0.42406514286994934, "kl": 0.014904022216796875, "learning_rate": 9.99697777200395e-07, "loss": 0.1604, "num_tokens": 23888686.0, "reward": 0.47137194871902466, "reward_std": 0.3750978112220764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08724846504628658, "rewards/penalized_accuracy_reward/std": 0.1496107392013073, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.25351135432720184, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1481.5, "completions/max_terminated_length": 1362.25, "completions/mean_length": 1110.109375, "completions/mean_terminated_length": 978.4000244140625, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.1115, "grad_norm": 0.40029430389404297, "kl": 0.00960540771484375, "learning_rate": 9.996683123847795e-07, "loss": 0.0765, "num_tokens": 23968357.0, "reward": 0.44336578249931335, "reward_std": 0.30848387256264687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049807894974946976, "rewards/penalized_accuracy_reward/std": 0.13011159747838974, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.18281427025794983, "step": 223 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 1463.828125, "completions/mean_terminated_length": 1007.4988250732422, "completions/min_length": 499.25, "completions/min_terminated_length": 499.25, "epoch": 0.112, "grad_norm": 0.31347936391830444, "kl": 0.00572967529296875, "learning_rate": 9.996374778015007e-07, "loss": 0.2664, "num_tokens": 24072378.0, "reward": 0.39638160169124603, "reward_std": 0.301539184525609, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062448613345623016, "rewards/penalized_accuracy_reward/std": 0.09566399455070496, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.2502913065254688, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2036.75, "completions/max_terminated_length": 1955.25, "completions/mean_length": 1393.671875, "completions/mean_terminated_length": 1163.1099243164062, "completions/min_length": 618.5, "completions/min_terminated_length": 618.5, "epoch": 0.1125, "grad_norm": 0.3007999658584595, "kl": 0.00720977783203125, "learning_rate": 9.996052735444862e-07, "loss": 0.1231, "num_tokens": 24169637.0, "reward": 0.3510977476835251, "reward_std": 0.22193579375743866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012462938204407692, "rewards/penalized_accuracy_reward/std": 0.049851756542921066, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.2857937589287758, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2007.5, "completions/max_terminated_length": 1670.0, "completions/mean_length": 1109.484375, "completions/mean_terminated_length": 999.2719573974609, "completions/min_length": 565.5, "completions/min_terminated_length": 
565.5, "epoch": 0.113, "grad_norm": 0.34143391251564026, "kl": 0.00732421875, "learning_rate": 9.99571699711836e-07, "loss": 0.098, "num_tokens": 24249220.0, "reward": 0.6800155341625214, "reward_std": 0.3956412263214588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16227338835597038, "rewards/penalized_accuracy_reward/std": 0.1637037917971611, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.23152604699134827, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1902.25, "completions/max_terminated_length": 1616.75, "completions/mean_length": 1462.0, "completions/mean_terminated_length": 1036.1135559082031, "completions/min_length": 454.75, "completions/min_terminated_length": 454.75, "epoch": 0.1135, "grad_norm": 0.29742729663848877, "kl": 0.008319854736328125, "learning_rate": 9.995367564058216e-07, "loss": 0.1795, "num_tokens": 24350836.0, "reward": 0.271484375, "reward_std": 0.11780629679560661, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.23561260476708412, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1935.5, "completions/max_terminated_length": 1592.75, "completions/mean_length": 1388.984375, "completions/mean_terminated_length": 1014.6486663818359, "completions/min_length": 614.75, "completions/min_terminated_length": 614.75, "epoch": 0.114, "grad_norm": 0.28713229298591614, "kl": 0.006793975830078125, "learning_rate": 9.995004437328865e-07, "loss": 0.2081, "num_tokens": 24450691.0, "reward": 0.4583692103624344, "reward_std": 0.43363262712955475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.0748877264559269, "rewards/penalized_accuracy_reward/std": 0.16100379079580307, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.27328263223171234, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1885.0, "completions/max_terminated_length": 1230.75, "completions/mean_length": 1278.828125, "completions/mean_terminated_length": 710.9498596191406, "completions/min_length": 801.5, "completions/min_terminated_length": 289.5, "epoch": 0.1145, "grad_norm": 0.2621670365333557, "kl": 0.00696563720703125, "learning_rate": 9.994627618036452e-07, "loss": 0.1072, "num_tokens": 24539784.0, "reward": 0.38193752616643906, "reward_std": 0.2344514187425375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024953141808509827, "rewards/penalized_accuracy_reward/std": 0.0998125709593296, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.18074768409132957, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1539.0, "completions/max_terminated_length": 1340.5, "completions/mean_length": 943.921875, "completions/mean_terminated_length": 767.0468902587891, "completions/min_length": 379.5, "completions/min_terminated_length": 379.5, "epoch": 0.115, "grad_norm": 0.4587479829788208, "kl": 0.010223388671875, "learning_rate": 9.994237107328838e-07, "loss": 0.2114, "num_tokens": 24610003.0, "reward": 0.5169970691204071, "reward_std": 0.3103814907371998, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490478456020355, "rewards/penalized_accuracy_reward/std": 0.09987307339906693, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.259022518992424, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1556.25, "completions/mean_length": 1303.609375, "completions/mean_terminated_length": 963.5005035400391, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.1155, "grad_norm": 0.30326250195503235, "kl": 0.00676727294921875, "learning_rate": 9.993832906395582e-07, "loss": 0.2347, "num_tokens": 24702746.0, "reward": 0.4176335260272026, "reward_std": 0.3579166308045387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06233238708227873, "rewards/penalized_accuracy_reward/std": 0.13910287246108055, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.22546176984906197, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1928.25, "completions/max_terminated_length": 1616.25, "completions/mean_length": 1191.390625, "completions/mean_terminated_length": 966.3244171142578, "completions/min_length": 478.5, "completions/min_terminated_length": 478.5, "epoch": 0.116, "grad_norm": 0.3553161323070526, "kl": 0.0091705322265625, "learning_rate": 9.993415016467952e-07, "loss": 0.2119, "num_tokens": 24788963.0, "reward": 0.4783135801553726, "reward_std": 0.3536135330796242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062398976646363735, "rewards/penalized_accuracy_reward/std": 0.139177106320858, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.2602023631334305, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 1343.515625, "completions/mean_terminated_length": 992.9696655273438, "completions/min_length": 450.25, "completions/min_terminated_length": 450.25, "epoch": 0.1165, "grad_norm": 0.3408110439777374, "kl": 0.00791168212890625, "learning_rate": 
9.992983438818915e-07, "loss": 0.2517, "num_tokens": 24883620.0, "reward": 0.3624609112739563, "reward_std": 0.2606295868754387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498045563697815, "rewards/penalized_accuracy_reward/std": 0.06825960427522659, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.28069160878658295, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1783.0, "completions/max_terminated_length": 1404.25, "completions/mean_length": 1402.859375, "completions/mean_terminated_length": 1059.2692260742188, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 0.117, "grad_norm": 0.34663674235343933, "kl": 0.00830841064453125, "learning_rate": 9.992538174763127e-07, "loss": 0.0502, "num_tokens": 24987707.0, "reward": 0.32532578706741333, "reward_std": 0.27893694676458836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024967581033706665, "rewards/penalized_accuracy_reward/std": 0.09987032040953636, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.206074096262455, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1774.75, "completions/mean_length": 1335.21875, "completions/mean_terminated_length": 1127.0631408691406, "completions/min_length": 594.25, "completions/min_terminated_length": 594.25, "epoch": 0.1175, "grad_norm": 0.2652917504310608, "kl": 0.00603485107421875, "learning_rate": 9.992079225656944e-07, "loss": 0.1665, "num_tokens": 25081369.0, "reward": 0.4778081402182579, "reward_std": 0.366770276799798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07484157104045153, "rewards/penalized_accuracy_reward/std": 
0.14538579434156418, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.2622602842748165, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1384.546875, "completions/mean_terminated_length": 1161.9368896484375, "completions/min_length": 562.75, "completions/min_terminated_length": 562.75, "epoch": 0.118, "grad_norm": 0.3013145625591278, "kl": 0.00720977783203125, "learning_rate": 9.9916065928984e-07, "loss": 0.2085, "num_tokens": 25177612.0, "reward": 0.3839438557624817, "reward_std": 0.2544761821627617, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024979744106531143, "rewards/penalized_accuracy_reward/std": 0.0682576596736908, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.2886027321219444, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1545.75, "completions/max_terminated_length": 1321.75, "completions/mean_length": 1121.34375, "completions/mean_terminated_length": 872.25, "completions/min_length": 400.25, "completions/min_terminated_length": 400.25, "epoch": 0.1185, "grad_norm": 0.4426823854446411, "kl": 0.00821685791015625, "learning_rate": 9.991120277927223e-07, "loss": 0.148, "num_tokens": 25257826.0, "reward": 0.36328125, "reward_std": 0.12154676765203476, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.24309353530406952, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1760.25, "completions/mean_length": 1244.3125, "completions/mean_terminated_length": 
972.1125183105469, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.119, "grad_norm": 0.3399312496185303, "kl": 0.0085906982421875, "learning_rate": 9.990620282224806e-07, "loss": 0.1684, "num_tokens": 25348198.0, "reward": 0.6316221505403519, "reward_std": 0.48579951748251915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14979545027017593, "rewards/penalized_accuracy_reward/std": 0.1924196183681488, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.28309283778071404, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1931.75, "completions/max_terminated_length": 1510.0, "completions/mean_length": 1232.796875, "completions/mean_terminated_length": 1012.7812805175781, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.1195, "grad_norm": 0.33279597759246826, "kl": 0.00870513916015625, "learning_rate": 9.990106607314225e-07, "loss": 0.2658, "num_tokens": 25438121.0, "reward": 0.4668958783149719, "reward_std": 0.29174844175577164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04985418915748596, "rewards/penalized_accuracy_reward/std": 0.08918201923370361, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.3009119816124439, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1163.484375, "completions/mean_terminated_length": 896.4447326660156, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "epoch": 0.12, "grad_norm": 0.3400592505931854, "kl": 0.0095062255859375, "learning_rate": 9.989579254760224e-07, "loss": 0.2664, "num_tokens": 25521976.0, "reward": 0.439786359667778, "reward_std": 0.3660985790193081, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049971312284469604, "rewards/penalized_accuracy_reward/std": 0.13654762506484985, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.2816978357732296, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1991.25, "completions/max_terminated_length": 1479.25, "completions/mean_length": 1121.671875, "completions/mean_terminated_length": 874.4010467529297, "completions/min_length": 383.75, "completions/min_terminated_length": 383.75, "epoch": 0.1205, "grad_norm": 0.3213726282119751, "kl": 0.01314544677734375, "learning_rate": 9.989038226169207e-07, "loss": 0.2352, "num_tokens": 25603779.0, "reward": 0.37258509546518326, "reward_std": 0.23022194392979145, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012464423663914204, "rewards/penalized_accuracy_reward/std": 0.04985769838094711, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.3058083839714527, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1757.0, "completions/max_terminated_length": 1351.5, "completions/mean_length": 994.1875, "completions/mean_terminated_length": 747.93408203125, "completions/min_length": 394.5, "completions/min_terminated_length": 394.5, "epoch": 0.121, "grad_norm": 0.3732205033302307, "kl": 0.0094146728515625, "learning_rate": 9.988483523189248e-07, "loss": 0.2513, "num_tokens": 25677983.0, "reward": 0.38631023466587067, "reward_std": 0.21719501167535782, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491055764257908, "rewards/penalized_accuracy_reward/std": 0.04996422305703163, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.27208245918154716, "step": 242 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.75, "completions/mean_length": 1206.75, "completions/mean_terminated_length": 922.8461608886719, "completions/min_length": 368.5, "completions/min_terminated_length": 368.5, "epoch": 0.1215, "grad_norm": 0.35542330145835876, "kl": 0.009490966796875, "learning_rate": 9.98791514751006e-07, "loss": 0.2974, "num_tokens": 25763167.0, "reward": 0.4705250859260559, "reward_std": 0.3745540976524353, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06241097953170538, "rewards/penalized_accuracy_reward/std": 0.13922728598117828, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.2980343624949455, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.5, "completions/mean_length": 1487.125, "completions/mean_terminated_length": 1125.422119140625, "completions/min_length": 476.5, "completions/min_terminated_length": 476.5, "epoch": 0.122, "grad_norm": 0.25591400265693665, "kl": 0.00614166259765625, "learning_rate": 9.98733310086302e-07, "loss": 0.1468, "num_tokens": 25867671.0, "reward": 0.41763947904109955, "reward_std": 0.3687479067593813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06233536824584007, "rewards/penalized_accuracy_reward/std": 0.1485375165939331, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.2803327105939388, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1840.25, "completions/max_terminated_length": 1161.5, "completions/mean_length": 1405.71875, "completions/mean_terminated_length": 697.7320098876953, "completions/min_length": 833.75, "completions/min_terminated_length": 321.75, "epoch": 0.1225, 
"grad_norm": 0.23165962100028992, "kl": 0.00818634033203125, "learning_rate": 9.98673738502114e-07, "loss": 0.158, "num_tokens": 25968837.0, "reward": 0.3507068455219269, "reward_std": 0.2225376833230257, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496280148625374, "rewards/penalized_accuracy_reward/std": 0.06821136921644211, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.2179103083908558, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1715.25, "completions/max_terminated_length": 1124.75, "completions/mean_length": 967.546875, "completions/mean_terminated_length": 716.439826965332, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.123, "grad_norm": 0.5526963472366333, "kl": 0.012969970703125, "learning_rate": 9.986128001799076e-07, "loss": 0.2675, "num_tokens": 26041112.0, "reward": 0.3515625, "reward_std": 0.14131099358201027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.28262199461460114, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1762.0, "completions/max_terminated_length": 1322.5, "completions/mean_length": 1308.0625, "completions/mean_terminated_length": 914.4851379394531, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.1235, "grad_norm": 0.3229832053184509, "kl": 0.00679779052734375, "learning_rate": 9.985504953053113e-07, "loss": 0.2299, "num_tokens": 26133052.0, "reward": 0.314453125, "reward_std": 0.14737671241164207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.29475343972444534, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1494.5, "completions/mean_length": 1365.03125, "completions/mean_terminated_length": 981.5359802246094, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "epoch": 0.124, "grad_norm": 0.31268391013145447, "kl": 0.00714111328125, "learning_rate": 9.984868240681164e-07, "loss": 0.2413, "num_tokens": 26230158.0, "reward": 0.32575952261686325, "reward_std": 0.21573570370674133, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0124891372397542, "rewards/penalized_accuracy_reward/std": 0.0499565489590168, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.3038676492869854, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1841.5, "completions/max_terminated_length": 1539.5, "completions/mean_length": 1278.125, "completions/mean_terminated_length": 1057.3497619628906, "completions/min_length": 566.75, "completions/min_terminated_length": 566.75, "epoch": 0.1245, "grad_norm": 0.26246288418769836, "kl": 0.00762176513671875, "learning_rate": 9.98421786662277e-07, "loss": 0.0863, "num_tokens": 26319334.0, "reward": 0.454897865653038, "reward_std": 0.4478120878338814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06240987218916416, "rewards/penalized_accuracy_reward/std": 0.2179848812520504, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.26621733605861664, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1720.75, "completions/max_terminated_length": 1649.75, "completions/mean_length": 1303.8125, 
"completions/mean_terminated_length": 906.5843505859375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.125, "grad_norm": 0.37845170497894287, "kl": 0.01207733154296875, "learning_rate": 9.983553832859078e-07, "loss": 0.1756, "num_tokens": 26414682.0, "reward": 0.310546875, "reward_std": 0.12242008186876774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.24484017491340637, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1779.75, "completions/max_terminated_length": 1517.75, "completions/mean_length": 1158.171875, "completions/mean_terminated_length": 868.3980255126953, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.1255, "grad_norm": 0.3477992117404938, "kl": 0.0126190185546875, "learning_rate": 9.982876141412855e-07, "loss": 0.2856, "num_tokens": 26498693.0, "reward": 0.3203125, "reward_std": 0.1482603382319212, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.2965206876397133, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1797.5, "completions/max_terminated_length": 1391.0, "completions/mean_length": 1121.546875, "completions/mean_terminated_length": 927.3555450439453, "completions/min_length": 556.25, "completions/min_terminated_length": 556.25, "epoch": 0.126, "grad_norm": 0.37826013565063477, "kl": 0.009613037109375, "learning_rate": 9.982184794348462e-07, "loss": 0.2224, "num_tokens": 26580216.0, "reward": 0.41322118043899536, "reward_std": 0.22169899940490723, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496996708214283, "rewards/penalized_accuracy_reward/std": 0.06823094189167023, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.24095388501882553, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1919.5, "completions/max_terminated_length": 1682.25, "completions/mean_length": 1365.15625, "completions/mean_terminated_length": 1079.6754913330078, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.1265, "grad_norm": 0.2822503447532654, "kl": 0.006988525390625, "learning_rate": 9.981479793771866e-07, "loss": 0.1871, "num_tokens": 26677954.0, "reward": 0.6312463730573654, "reward_std": 0.5598792620003223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16230286471545696, "rewards/penalized_accuracy_reward/std": 0.2388230636715889, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.27723121643066406, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1314.5, "completions/mean_length": 1060.140625, "completions/mean_terminated_length": 771.7617797851562, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.127, "grad_norm": 0.3351059854030609, "kl": 0.0099334716796875, "learning_rate": 9.98076114183062e-07, "loss": 0.3445, "num_tokens": 26757179.0, "reward": 0.4308185577392578, "reward_std": 0.2507655657827854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497958578169346, "rewards/penalized_accuracy_reward/std": 0.06825722754001617, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.273227084428072, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.125, "completions/max_length": 1605.5, "completions/max_terminated_length": 1522.25, "completions/mean_length": 997.71875, "completions/mean_terminated_length": 890.6067199707031, "completions/min_length": 330.5, "completions/min_terminated_length": 330.5, "epoch": 0.1275, "grad_norm": 0.3680637776851654, "kl": 0.0140838623046875, "learning_rate": 9.98002884071386e-07, "loss": 0.0476, "num_tokens": 26835433.0, "reward": 0.7500898241996765, "reward_std": 0.23188265040516853, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17484959959983826, "rewards/penalized_accuracy_reward/std": 0.06825432181358337, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.24756275862455368, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1912.25, "completions/max_terminated_length": 1517.25, "completions/mean_length": 1007.5625, "completions/mean_terminated_length": 854.9792022705078, "completions/min_length": 420.25, "completions/min_terminated_length": 420.25, "epoch": 0.128, "grad_norm": 0.4183383584022522, "kl": 0.01627349853515625, "learning_rate": 9.979282892652304e-07, "loss": 0.2356, "num_tokens": 26914461.0, "reward": 0.6054488718509674, "reward_std": 0.3165430836379528, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1122947484254837, "rewards/penalized_accuracy_reward/std": 0.10228274762630463, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.2563386559486389, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1923.0, "completions/max_terminated_length": 1452.25, "completions/mean_length": 1238.96875, "completions/mean_terminated_length": 879.8662109375, "completions/min_length": 401.75, "completions/min_terminated_length": 401.75, "epoch": 0.1285, "grad_norm": 0.37681910395622253, "kl": 
0.01483154296875, "learning_rate": 9.97852329991824e-07, "loss": 0.3227, "num_tokens": 27003419.0, "reward": 0.35546875, "reward_std": 0.1428279560059309, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.2856559306383133, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1779.5, "completions/mean_length": 1196.53125, "completions/mean_terminated_length": 845.1613464355469, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.129, "grad_norm": 0.4643842577934265, "kl": 0.00843048095703125, "learning_rate": 9.977750064825519e-07, "loss": 0.3351, "num_tokens": 27087597.0, "reward": 0.337890625, "reward_std": 0.11889110505580902, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.23778222501277924, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1397.25, "completions/max_terminated_length": 921.75, "completions/mean_length": 581.640625, "completions/mean_terminated_length": 467.36299896240234, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.1295, "grad_norm": 0.5213297605514526, "kl": 0.0185089111328125, "learning_rate": 9.976963189729547e-07, "loss": 0.2099, "num_tokens": 27133206.0, "reward": 0.5197413265705109, "reward_std": 0.2709731422364712, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04990972951054573, "rewards/penalized_accuracy_reward/std": 0.08928137272596359, "rewards/tag_count_reward/mean": 
0.83984375, "rewards/tag_count_reward/std": 0.20091085322201252, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 1474.453125, "completions/mean_terminated_length": 1282.2145538330078, "completions/min_length": 712.25, "completions/min_terminated_length": 712.25, "epoch": 0.13, "grad_norm": 0.2803308367729187, "kl": 0.00724029541015625, "learning_rate": 9.976162677027284e-07, "loss": 0.1645, "num_tokens": 27234339.0, "reward": 0.33984375, "reward_std": 0.14140615239739418, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.28281231224536896, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1848.75, "completions/mean_length": 1207.703125, "completions/mean_terminated_length": 1045.2473449707031, "completions/min_length": 382.5, "completions/min_terminated_length": 382.5, "epoch": 0.1305, "grad_norm": 0.28739219903945923, "kl": 0.00893402099609375, "learning_rate": 9.975348529157229e-07, "loss": 0.2241, "num_tokens": 27321632.0, "reward": 0.5567076057195663, "reward_std": 0.4052836000919342, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123381769284606, "rewards/penalized_accuracy_reward/std": 0.15308412164449692, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.2850048989057541, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1805.5, "completions/mean_length": 1377.859375, "completions/mean_terminated_length": 1142.0906829833984, "completions/min_length": 569.75, 
"completions/min_terminated_length": 569.75, "epoch": 0.131, "grad_norm": 0.2762926518917084, "kl": 0.00759124755859375, "learning_rate": 9.974520748599421e-07, "loss": 0.2233, "num_tokens": 27419799.0, "reward": 0.6255488842725754, "reward_std": 0.39577963575720787, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1496885046362877, "rewards/penalized_accuracy_reward/std": 0.14544454962015152, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.27697187289595604, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1768.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 1031.5625, "completions/mean_terminated_length": 883.579345703125, "completions/min_length": 410.25, "completions/min_terminated_length": 410.25, "epoch": 0.1315, "grad_norm": 0.442341148853302, "kl": 0.0145263671875, "learning_rate": 9.973679337875418e-07, "loss": 0.1826, "num_tokens": 27495947.0, "reward": 0.48849794268608093, "reward_std": 0.34661831706762314, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049913025461137295, "rewards/penalized_accuracy_reward/std": 0.13038786873221397, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.24799222871661186, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.25, "completions/mean_length": 1061.109375, "completions/mean_terminated_length": 881.5881500244141, "completions/min_length": 545.5, "completions/min_terminated_length": 545.5, "epoch": 0.132, "grad_norm": 0.4407768249511719, "kl": 0.01361846923828125, "learning_rate": 9.972824299548309e-07, "loss": 0.2091, "num_tokens": 27573330.0, "reward": 0.461285337805748, "reward_std": 0.3401116505265236, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04997861199080944, "rewards/penalized_accuracy_reward/std": 0.1305629089474678, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.2639808803796768, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1914.5, "completions/max_terminated_length": 1620.0, "completions/mean_length": 1000.578125, "completions/mean_terminated_length": 886.4462127685547, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.1325, "grad_norm": 0.39872175455093384, "kl": 0.0101165771484375, "learning_rate": 9.971955636222684e-07, "loss": 0.2689, "num_tokens": 27645495.0, "reward": 0.44093358516693115, "reward_std": 0.19699210487306118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01245898101478815, "rewards/penalized_accuracy_reward/std": 0.0498359240591526, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.2576194703578949, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1683.75, "completions/mean_length": 1203.8125, "completions/mean_terminated_length": 1048.6100311279297, "completions/min_length": 400.75, "completions/min_terminated_length": 400.75, "epoch": 0.133, "grad_norm": 0.3775688707828522, "kl": 0.00656890869140625, "learning_rate": 9.971073350544644e-07, "loss": 0.2114, "num_tokens": 27733595.0, "reward": 0.44251590967178345, "reward_std": 0.24736519530415535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496889792382717, "rewards/penalized_accuracy_reward/std": 0.06822802126407623, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.27019958570599556, "step": 266 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.140625, "completions/max_length": 1714.75, "completions/max_terminated_length": 1379.25, "completions/mean_length": 1016.390625, "completions/mean_terminated_length": 838.7073059082031, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.1335, "grad_norm": 0.378361314535141, "kl": 0.0103302001953125, "learning_rate": 9.970177445201783e-07, "loss": 0.203, "num_tokens": 27809156.0, "reward": 0.3940378502011299, "reward_std": 0.20630540326237679, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012448612600564957, "rewards/penalized_accuracy_reward/std": 0.049794454127550125, "rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.2439829409122467, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1577.25, "completions/mean_length": 1160.765625, "completions/mean_terminated_length": 997.0696716308594, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.134, "grad_norm": 0.30473363399505615, "kl": 0.0128326416015625, "learning_rate": 9.969267922923188e-07, "loss": 0.2434, "num_tokens": 27891685.0, "reward": 0.3843087926506996, "reward_std": 0.19660573080182076, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012466897256672382, "rewards/penalized_accuracy_reward/std": 0.04986758902668953, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.2693367339670658, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1850.75, "completions/max_terminated_length": 1424.25, "completions/mean_length": 983.234375, "completions/mean_terminated_length": 754.9365539550781, "completions/min_length": 316.75, "completions/min_terminated_length": 316.75, "epoch": 0.1345, 
"grad_norm": 0.3929966688156128, "kl": 0.01397705078125, "learning_rate": 9.968344786479415e-07, "loss": 0.3076, "num_tokens": 27963396.0, "reward": 0.41521377861499786, "reward_std": 0.25323865562677383, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024989699944853783, "rewards/penalized_accuracy_reward/std": 0.06828486919403076, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.26039204001426697, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1602.5, "completions/mean_length": 1016.53125, "completions/mean_terminated_length": 880.2625427246094, "completions/min_length": 246.25, "completions/min_terminated_length": 246.25, "epoch": 0.135, "grad_norm": 0.39136967062950134, "kl": 0.01338958740234375, "learning_rate": 9.967408038682505e-07, "loss": 0.2366, "num_tokens": 28036950.0, "reward": 0.5864234417676926, "reward_std": 0.4141971655189991, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09985235333442688, "rewards/penalized_accuracy_reward/std": 0.16800567507743835, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.24255597591400146, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1805.75, "completions/max_terminated_length": 1603.75, "completions/mean_length": 1253.265625, "completions/mean_terminated_length": 1121.6510620117188, "completions/min_length": 644.75, "completions/min_terminated_length": 644.75, "epoch": 0.1355, "grad_norm": 0.317388117313385, "kl": 0.01053619384765625, "learning_rate": 9.96645768238595e-07, "loss": 0.1441, "num_tokens": 28125015.0, "reward": 0.4489828422665596, "reward_std": 0.35774436593055725, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.06238204799592495, "rewards/penalized_accuracy_reward/std": 0.14867394417524338, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.19127538800239563, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 1030.734375, "completions/mean_terminated_length": 841.3352737426758, "completions/min_length": 470.25, "completions/min_terminated_length": 470.25, "epoch": 0.136, "grad_norm": 0.5092541575431824, "kl": 0.01025390625, "learning_rate": 9.965493720484698e-07, "loss": 0.3428, "num_tokens": 28200822.0, "reward": 0.8399534821510315, "reward_std": 0.4857650641351938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22466424852609634, "rewards/penalized_accuracy_reward/std": 0.19790004193782806, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.2537056691944599, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1584.5, "completions/mean_length": 1028.625, "completions/mean_terminated_length": 807.5545349121094, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.1365, "grad_norm": 0.4197331964969635, "kl": 0.0173187255859375, "learning_rate": 9.964516155915151e-07, "loss": 0.4012, "num_tokens": 28275998.0, "reward": 0.380859375, "reward_std": 0.14287172071635723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.28574344143271446, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1934.0, "completions/max_terminated_length": 
1686.25, "completions/mean_length": 1206.9375, "completions/mean_terminated_length": 982.5388946533203, "completions/min_length": 441.5, "completions/min_terminated_length": 441.5, "epoch": 0.137, "grad_norm": 0.3574504852294922, "kl": 0.014892578125, "learning_rate": 9.963524991655133e-07, "loss": 0.2177, "num_tokens": 28362650.0, "reward": 0.4631509333848953, "reward_std": 0.2884857375174761, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993484169244766, "rewards/penalized_accuracy_reward/std": 0.0893261730670929, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.27640481665730476, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.75, "completions/max_terminated_length": 1258.75, "completions/mean_length": 864.734375, "completions/mean_terminated_length": 864.734375, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.1375, "grad_norm": 0.3815893828868866, "kl": 0.01055908203125, "learning_rate": 9.962520230723906e-07, "loss": -0.0062, "num_tokens": 28428153.0, "reward": 0.6105031967163086, "reward_std": 0.2764692734926939, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0874781683087349, "rewards/penalized_accuracy_reward/std": 0.10244394093751907, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.1661878488957882, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1411.25, "completions/max_terminated_length": 1182.75, "completions/mean_length": 810.328125, "completions/mean_terminated_length": 683.8098907470703, "completions/min_length": 296.25, "completions/min_terminated_length": 296.25, "epoch": 0.138, "grad_norm": 0.5872645974159241, "kl": 0.0235443115234375, "learning_rate": 9.961501876182148e-07, "loss": 0.1085, "num_tokens": 
28486990.0, "reward": 0.45567817986011505, "reward_std": 0.3536341264843941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740940522402525, "rewards/penalized_accuracy_reward/std": 0.1496376246213913, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.25765910372138023, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.75, "completions/mean_length": 1703.046875, "completions/mean_terminated_length": 1255.7621612548828, "completions/min_length": 879.5, "completions/min_terminated_length": 879.5, "epoch": 0.1385, "grad_norm": 0.27690964937210083, "kl": 0.00861358642578125, "learning_rate": 9.960469931131936e-07, "loss": 0.1882, "num_tokens": 28605425.0, "reward": 0.3977215811610222, "reward_std": 0.43128184974193573, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0748373493552208, "rewards/penalized_accuracy_reward/std": 0.15743892639875412, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.28916001319885254, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1818.25, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1200.75, "completions/mean_terminated_length": 1024.1515197753906, "completions/min_length": 456.5, "completions/min_terminated_length": 456.5, "epoch": 0.139, "grad_norm": 0.4134790599346161, "kl": 0.0117950439453125, "learning_rate": 9.959424398716763e-07, "loss": 0.0867, "num_tokens": 28690529.0, "reward": 0.4401395246386528, "reward_std": 0.3307597190141678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037452573888003826, "rewards/penalized_accuracy_reward/std": 0.11816658824682236, "rewards/tag_count_reward/mean": 0.73046875, 
"rewards/tag_count_reward/std": 0.24348846077919006, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1716.5, "completions/max_terminated_length": 1063.0, "completions/mean_length": 1033.875, "completions/mean_terminated_length": 628.965690612793, "completions/min_length": 195.75, "completions/min_terminated_length": 195.75, "epoch": 0.1395, "grad_norm": 0.47245198488235474, "kl": 0.015228271484375, "learning_rate": 9.958365282121496e-07, "loss": 0.4346, "num_tokens": 28767097.0, "reward": 0.3588646724820137, "reward_std": 0.21404887363314629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012440148741006851, "rewards/penalized_accuracy_reward/std": 0.049760594964027405, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.2837899178266525, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1712.0, "completions/mean_length": 1318.28125, "completions/mean_terminated_length": 921.5256652832031, "completions/min_length": 404.75, "completions/min_terminated_length": 404.75, "epoch": 0.14, "grad_norm": 0.3525916337966919, "kl": 0.0128936767578125, "learning_rate": 9.95729258457239e-07, "loss": 0.3652, "num_tokens": 28860491.0, "reward": 0.314453125, "reward_std": 0.15326131135225296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.3065226376056671, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.25, "completions/mean_length": 1322.890625, "completions/mean_terminated_length": 1069.6358337402344, "completions/min_length": 572.25, 
"completions/min_terminated_length": 572.25, "epoch": 0.1405, "grad_norm": 0.3976130485534668, "kl": 0.017181396484375, "learning_rate": 9.956206309337066e-07, "loss": 0.182, "num_tokens": 28956020.0, "reward": 0.3315424695611, "reward_std": 0.21648052521049976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012450922280550003, "rewards/penalized_accuracy_reward/std": 0.04980369284749031, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.2745247595012188, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1327.5, "completions/max_terminated_length": 1155.25, "completions/mean_length": 700.953125, "completions/mean_terminated_length": 609.6036987304688, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.141, "grad_norm": 0.4931604564189911, "kl": 0.01513671875, "learning_rate": 9.955106459724508e-07, "loss": 0.2408, "num_tokens": 29010849.0, "reward": 0.6216456890106201, "reward_std": 0.30452729389071465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988532960414886, "rewards/penalized_accuracy_reward/std": 0.10316114127635956, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.22115005552768707, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1945.25, "completions/max_terminated_length": 1626.25, "completions/mean_length": 917.578125, "completions/mean_terminated_length": 783.7149047851562, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.1415, "grad_norm": 1.2883323431015015, "kl": 0.0378570556640625, "learning_rate": 9.953993039085048e-07, "loss": 0.0075, "num_tokens": 29080486.0, "reward": 0.5727795958518982, "reward_std": 0.5117232501506805, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09986637160181999, "rewards/penalized_accuracy_reward/std": 0.2292521372437477, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.2786736860871315, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1804.25, "completions/mean_length": 1348.84375, "completions/mean_terminated_length": 1126.9280242919922, "completions/min_length": 608.75, "completions/min_terminated_length": 608.75, "epoch": 0.142, "grad_norm": 0.31036651134490967, "kl": 0.0120697021484375, "learning_rate": 9.952866050810363e-07, "loss": 0.2049, "num_tokens": 29176892.0, "reward": 0.36091582477092743, "reward_std": 0.20160202868282795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489160522818565, "rewards/penalized_accuracy_reward/std": 0.04995664581656456, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.26064201444387436, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1735.5, "completions/max_terminated_length": 1636.5, "completions/mean_length": 1365.59375, "completions/mean_terminated_length": 1096.7294921875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.1425, "grad_norm": 0.2728256285190582, "kl": 0.014678955078125, "learning_rate": 9.951725498333448e-07, "loss": 0.1875, "num_tokens": 29276834.0, "reward": 0.360914021730423, "reward_std": 0.22329798713326454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012488258071243763, "rewards/penalized_accuracy_reward/std": 0.04995303228497505, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.3206666111946106, "step": 285 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.15625, "completions/max_length": 1765.75, "completions/max_terminated_length": 1468.5, "completions/mean_length": 975.109375, "completions/mean_terminated_length": 805.8830108642578, "completions/min_length": 211.75, "completions/min_terminated_length": 211.75, "epoch": 0.143, "grad_norm": 0.4318779408931732, "kl": 0.01629638671875, "learning_rate": 9.950571385128625e-07, "loss": 0.2791, "num_tokens": 29347321.0, "reward": 0.375, "reward_std": 0.13439549691975117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.26879100129008293, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1914.25, "completions/max_terminated_length": 1557.0, "completions/mean_length": 1076.859375, "completions/mean_terminated_length": 894.8898620605469, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.1435, "grad_norm": 0.37838461995124817, "kl": 0.0152435302734375, "learning_rate": 9.949403714711526e-07, "loss": 0.2213, "num_tokens": 29423584.0, "reward": 0.6917309314012527, "reward_std": 0.4844302870333195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16227172315120697, "rewards/penalized_accuracy_reward/std": 0.20219706743955612, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.28298135474324226, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1972.25, "completions/max_terminated_length": 1786.5, "completions/mean_length": 1219.046875, "completions/mean_terminated_length": 929.5428161621094, "completions/min_length": 341.25, "completions/min_terminated_length": 341.25, "epoch": 0.144, "grad_norm": 0.35971125960350037, "kl": 
0.0196533203125, "learning_rate": 9.948222490639075e-07, "loss": 0.2425, "num_tokens": 29512595.0, "reward": 0.48969128727912903, "reward_std": 0.42821357771754265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07492377236485481, "rewards/penalized_accuracy_reward/std": 0.16108129918575287, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.28517700731754303, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1821.5, "completions/mean_length": 1311.421875, "completions/mean_terminated_length": 1072.7391357421875, "completions/min_length": 495.5, "completions/min_terminated_length": 495.5, "epoch": 0.1445, "grad_norm": 0.34197357296943665, "kl": 0.0189971923828125, "learning_rate": 9.947027716509488e-07, "loss": 0.3042, "num_tokens": 29606510.0, "reward": 0.3359375, "reward_std": 0.14454649575054646, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.2890929989516735, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1892.25, "completions/max_terminated_length": 1582.0, "completions/mean_length": 988.296875, "completions/mean_terminated_length": 790.4585418701172, "completions/min_length": 224.5, "completions/min_terminated_length": 224.5, "epoch": 0.145, "grad_norm": 0.3513518273830414, "kl": 0.016265869140625, "learning_rate": 9.94581939596225e-07, "loss": 0.3536, "num_tokens": 29678529.0, "reward": 0.4440329968929291, "reward_std": 0.2891542762517929, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744618222117424, "rewards/penalized_accuracy_reward/std": 0.08050688356161118, 
"rewards/tag_count_reward/mean": 0.73828125, "rewards/tag_count_reward/std": 0.28832413256168365, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1949.25, "completions/max_terminated_length": 1619.25, "completions/mean_length": 1116.015625, "completions/mean_terminated_length": 901.0040435791016, "completions/min_length": 329.25, "completions/min_terminated_length": 329.25, "epoch": 0.1455, "grad_norm": 0.37011778354644775, "kl": 0.0166168212890625, "learning_rate": 9.944597532678119e-07, "loss": 0.2147, "num_tokens": 29759714.0, "reward": 0.373046875, "reward_std": 0.11430288106203079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.22860577702522278, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1566.0, "completions/max_terminated_length": 1443.25, "completions/mean_length": 909.46875, "completions/mean_terminated_length": 854.9969024658203, "completions/min_length": 385.25, "completions/min_terminated_length": 385.25, "epoch": 0.146, "grad_norm": 0.5925641655921936, "kl": 0.024078369140625, "learning_rate": 9.943362130379101e-07, "loss": 0.1247, "num_tokens": 29827184.0, "reward": 0.5017515122890472, "reward_std": 0.28355447575449944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06239919736981392, "rewards/penalized_accuracy_reward/std": 0.09558829665184021, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.23812328651547432, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1744.25, "completions/max_terminated_length": 1594.75, "completions/mean_length": 1132.53125, "completions/mean_terminated_length": 1006.4190826416016, 
"completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "epoch": 0.1465, "grad_norm": 0.33831357955932617, "kl": 0.0169219970703125, "learning_rate": 9.942113192828444e-07, "loss": 0.1758, "num_tokens": 29913058.0, "reward": 0.6575291454792023, "reward_std": 0.543341189622879, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13735833019018173, "rewards/penalized_accuracy_reward/std": 0.24869035929441452, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.24355630576610565, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1475.5, "completions/max_terminated_length": 1292.25, "completions/mean_length": 720.46875, "completions/mean_terminated_length": 612.6207427978516, "completions/min_length": 148.25, "completions/min_terminated_length": 148.25, "epoch": 0.147, "grad_norm": 0.36496594548225403, "kl": 0.0198211669921875, "learning_rate": 9.940850723830632e-07, "loss": 0.0938, "num_tokens": 29967872.0, "reward": 0.6502358317375183, "reward_std": 0.4751713424921036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.112227289006114, "rewards/penalized_accuracy_reward/std": 0.2258315533399582, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.1661897748708725, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 1204.078125, "completions/mean_terminated_length": 880.1141204833984, "completions/min_length": 449.5, "completions/min_terminated_length": 449.5, "epoch": 0.1475, "grad_norm": 0.38443779945373535, "kl": 0.01751708984375, "learning_rate": 9.939574727231362e-07, "loss": 0.3288, "num_tokens": 30054405.0, "reward": 0.45721834897994995, "reward_std": 0.3091745525598526, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04989823326468468, "rewards/penalized_accuracy_reward/std": 0.08926067501306534, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.2945740297436714, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.5, "completions/mean_length": 1491.34375, "completions/mean_terminated_length": 1243.578857421875, "completions/min_length": 334.75, "completions/min_terminated_length": 334.75, "epoch": 0.148, "grad_norm": 0.2679460048675537, "kl": 0.01287841796875, "learning_rate": 9.93828520691754e-07, "loss": 0.1674, "num_tokens": 30160763.0, "reward": 0.38239888846874237, "reward_std": 0.22821490466594696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012488506734371185, "rewards/penalized_accuracy_reward/std": 0.04995403066277504, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.3011827692389488, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1659.25, "completions/max_terminated_length": 1371.0, "completions/mean_length": 1056.53125, "completions/mean_terminated_length": 761.6370391845703, "completions/min_length": 360.75, "completions/min_terminated_length": 360.75, "epoch": 0.1485, "grad_norm": 0.47203943133354187, "kl": 0.0202484130859375, "learning_rate": 9.93698216681727e-07, "loss": 0.2881, "num_tokens": 30237581.0, "reward": 0.522496372461319, "reward_std": 0.3009930197149515, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08742006123065948, "rewards/penalized_accuracy_reward/std": 0.10237589478492737, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.2042694091796875, "step": 297 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1904.75, "completions/max_terminated_length": 1518.0, "completions/mean_length": 1039.4375, "completions/mean_terminated_length": 796.3323059082031, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.149, "grad_norm": 0.4505983591079712, "kl": 0.016510009765625, "learning_rate": 9.93566561089984e-07, "loss": 0.2561, "num_tokens": 30315257.0, "reward": 0.40584462136030197, "reward_std": 0.2013820931315422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492623180150986, "rewards/penalized_accuracy_reward/std": 0.04997049272060394, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.23792365565896034, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1540.75, "completions/mean_length": 1309.03125, "completions/mean_terminated_length": 961.1727447509766, "completions/min_length": 472.25, "completions/min_terminated_length": 472.25, "epoch": 0.1495, "grad_norm": 0.36445268988609314, "kl": 0.0180511474609375, "learning_rate": 9.934335543175705e-07, "loss": 0.3053, "num_tokens": 30407563.0, "reward": 0.41517284512519836, "reward_std": 0.2811199724674225, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496923692524433, "rewards/penalized_accuracy_reward/std": 0.06822894513607025, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.33161159604787827, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2023.5, "completions/max_terminated_length": 1298.75, "completions/mean_length": 879.34375, "completions/mean_terminated_length": 734.1468811035156, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, 
"epoch": 0.15, "grad_norm": 0.6155036091804504, "kl": 0.0238037109375, "learning_rate": 9.932991967696482e-07, "loss": 0.4132, "num_tokens": 30475057.0, "reward": 0.423828125, "reward_std": 0.12478922307491302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.24957844614982605, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1431.84375, "completions/mean_terminated_length": 1185.2452697753906, "completions/min_length": 514.25, "completions/min_terminated_length": 514.25, "epoch": 0.1505, "grad_norm": 0.326754629611969, "kl": 0.01654052734375, "learning_rate": 9.931634888554935e-07, "loss": 0.1479, "num_tokens": 30577959.0, "reward": 0.6313088685274124, "reward_std": 0.5490754917263985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14963880088180304, "rewards/penalized_accuracy_reward/std": 0.2334039770066738, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.2799655832350254, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1802.5, "completions/max_terminated_length": 1432.0, "completions/mean_length": 898.015625, "completions/mean_terminated_length": 693.1679077148438, "completions/min_length": 266.5, "completions/min_terminated_length": 266.5, "epoch": 0.151, "grad_norm": 0.4117719829082489, "kl": 0.0194854736328125, "learning_rate": 9.930264309884964e-07, "loss": 0.208, "num_tokens": 30643240.0, "reward": 0.47925756871700287, "reward_std": 0.31107161194086075, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748034965246916, 
"rewards/penalized_accuracy_reward/std": 0.11825554817914963, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.24151154980063438, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1817.75, "completions/max_terminated_length": 1453.0, "completions/mean_length": 1053.84375, "completions/mean_terminated_length": 890.5673217773438, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.1515, "grad_norm": 0.42081937193870544, "kl": 0.022613525390625, "learning_rate": 9.928880235861588e-07, "loss": 0.1807, "num_tokens": 30721102.0, "reward": 0.6000053435564041, "reward_std": 0.39822197891771793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09980736579746008, "rewards/penalized_accuracy_reward/std": 0.15215244889259338, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.2757805921137333, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.75, "completions/mean_length": 1141.5, "completions/mean_terminated_length": 905.3779907226562, "completions/min_length": 453.25, "completions/min_terminated_length": 453.25, "epoch": 0.152, "grad_norm": 0.34556475281715393, "kl": 0.0180206298828125, "learning_rate": 9.927482670700936e-07, "loss": 0.2359, "num_tokens": 30801966.0, "reward": 0.3960720971226692, "reward_std": 0.20122775807976723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489174492657185, "rewards/penalized_accuracy_reward/std": 0.04995669797062874, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.2736573964357376, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1996.75, "completions/max_terminated_length": 1767.0, 
"completions/mean_length": 922.796875, "completions/mean_terminated_length": 786.4402618408203, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.1525, "grad_norm": 0.39405128359794617, "kl": 0.0165863037109375, "learning_rate": 9.926071618660237e-07, "loss": 0.1962, "num_tokens": 30872673.0, "reward": 0.7101728618144989, "reward_std": 0.42521383985877037, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.22753453627228737, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1844.25, "completions/max_terminated_length": 1727.25, "completions/mean_length": 1207.0, "completions/mean_terminated_length": 1019.3701629638672, "completions/min_length": 377.75, "completions/min_terminated_length": 377.75, "epoch": 0.153, "grad_norm": 0.4119655191898346, "kl": 0.0199127197265625, "learning_rate": 9.924647084037797e-07, "loss": 0.2738, "num_tokens": 30960161.0, "reward": 0.8576599061489105, "reward_std": 0.5191784426569939, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22472839057445526, "rewards/penalized_accuracy_reward/std": 0.2030363380908966, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.2639167532324791, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1764.75, "completions/max_terminated_length": 1065.75, "completions/mean_length": 1033.765625, "completions/mean_terminated_length": 657.5408172607422, "completions/min_length": 353.75, "completions/min_terminated_length": 353.75, "epoch": 0.1535, "grad_norm": 0.39391767978668213, "kl": 0.02215576171875, "learning_rate": 9.923209071172994e-07, "loss": 0.2176, "num_tokens": 31038514.0, "reward": 
0.40535636246204376, "reward_std": 0.22219975851476192, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494380623102188, "rewards/penalized_accuracy_reward/std": 0.06815947592258453, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.23959567956626415, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1316.25, "completions/mean_length": 1407.046875, "completions/mean_terminated_length": 731.5107421875, "completions/min_length": 780.5, "completions/min_terminated_length": 268.5, "epoch": 0.154, "grad_norm": 0.2800317704677582, "kl": 0.0179901123046875, "learning_rate": 9.921757584446268e-07, "loss": 0.0496, "num_tokens": 31138037.0, "reward": 0.3526327311992645, "reward_std": 0.18723677285015583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024949176236987114, "rewards/penalized_accuracy_reward/std": 0.06817413866519928, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.1920444518327713, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1538.25, "completions/max_terminated_length": 1411.25, "completions/mean_length": 798.125, "completions/mean_terminated_length": 720.3727722167969, "completions/min_length": 292.25, "completions/min_terminated_length": 292.25, "epoch": 0.1545, "grad_norm": 0.31493476033210754, "kl": 0.021484375, "learning_rate": 9.9202926282791e-07, "loss": 0.1681, "num_tokens": 31198109.0, "reward": 0.5432236343622208, "reward_std": 0.2406202033162117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993213713169098, "rewards/penalized_accuracy_reward/std": 0.0893213227391243, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 
0.17135445028543472, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 1210.3125, "completions/mean_terminated_length": 1058.0203552246094, "completions/min_length": 663.75, "completions/min_terminated_length": 663.75, "epoch": 0.155, "grad_norm": 0.372738778591156, "kl": 0.0184783935546875, "learning_rate": 9.918814207133997e-07, "loss": 0.1612, "num_tokens": 31284769.0, "reward": 0.4225013107061386, "reward_std": 0.3273412324488163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037422528490424156, "rewards/penalized_accuracy_reward/std": 0.11808561906218529, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.26583055034279823, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.25, "completions/mean_length": 1169.578125, "completions/mean_terminated_length": 941.2920227050781, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.1555, "grad_norm": 0.33263441920280457, "kl": 0.022216796875, "learning_rate": 9.917322325514487e-07, "loss": 0.1931, "num_tokens": 31369846.0, "reward": 0.7803092300891876, "reward_std": 0.6568419355899096, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1997249238193035, "rewards/penalized_accuracy_reward/std": 0.29108888655900955, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.274364173412323, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1637.75, "completions/max_terminated_length": 1399.25, "completions/mean_length": 913.546875, "completions/mean_terminated_length": 654.8062591552734, "completions/min_length": 286.0, 
"completions/min_terminated_length": 286.0, "epoch": 0.156, "grad_norm": 0.531750500202179, "kl": 0.024169921875, "learning_rate": 9.915816987965102e-07, "loss": 0.2279, "num_tokens": 31437737.0, "reward": 0.5697769895195961, "reward_std": 0.3802216462790966, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07492755725979805, "rewards/penalized_accuracy_reward/std": 0.15761756151914597, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.1815032958984375, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1984.75, "completions/max_terminated_length": 1772.0, "completions/mean_length": 1069.21875, "completions/mean_terminated_length": 863.5106048583984, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.1565, "grad_norm": 0.3558029532432556, "kl": 0.022125244140625, "learning_rate": 9.91429819907136e-07, "loss": 0.2044, "num_tokens": 31515223.0, "reward": 0.4811611920595169, "reward_std": 0.3106003515422344, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745558764785528, "rewards/penalized_accuracy_reward/std": 0.11818394064903259, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.22840199247002602, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1494.5, "completions/mean_length": 1300.796875, "completions/mean_terminated_length": 985.6458587646484, "completions/min_length": 476.25, "completions/min_terminated_length": 476.25, "epoch": 0.157, "grad_norm": 0.3073103725910187, "kl": 0.0160369873046875, "learning_rate": 9.912765963459756e-07, "loss": 0.2756, "num_tokens": 31610106.0, "reward": 0.4420397877693176, "reward_std": 0.2975297048687935, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037426140159368515, "rewards/penalized_accuracy_reward/std": 0.08046381175518036, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.31950972974300385, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1790.5, "completions/max_terminated_length": 1304.25, "completions/mean_length": 1058.71875, "completions/mean_terminated_length": 833.5673980712891, "completions/min_length": 399.5, "completions/min_terminated_length": 399.5, "epoch": 0.1575, "grad_norm": 0.5769878029823303, "kl": 0.0334014892578125, "learning_rate": 9.911220285797748e-07, "loss": 0.2396, "num_tokens": 31688280.0, "reward": 0.6882999986410141, "reward_std": 0.4060720819979906, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14981405436992645, "rewards/penalized_accuracy_reward/std": 0.16813339293003082, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.2781912609934807, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1346.75, "completions/mean_length": 816.453125, "completions/mean_terminated_length": 638.9416961669922, "completions/min_length": 253.75, "completions/min_terminated_length": 253.75, "epoch": 0.158, "grad_norm": 0.47582924365997314, "kl": 0.0219573974609375, "learning_rate": 9.909661170793733e-07, "loss": 0.4103, "num_tokens": 31749941.0, "reward": 0.4850916266441345, "reward_std": 0.25283058360219, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746769577264786, "rewards/penalized_accuracy_reward/std": 0.08055312931537628, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2625318616628647, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.0625, "completions/max_length": 1817.0, "completions/max_terminated_length": 1461.75, "completions/mean_length": 818.46875, "completions/mean_terminated_length": 741.6149139404297, "completions/min_length": 369.75, "completions/min_terminated_length": 369.75, "epoch": 0.1585, "grad_norm": 0.4387405812740326, "kl": 0.022613525390625, "learning_rate": 9.908088623197048e-07, "loss": 0.1275, "num_tokens": 31814067.0, "reward": 0.40625, "reward_std": 0.12414122931659222, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.24828246980905533, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1893.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 1378.78125, "completions/mean_terminated_length": 1284.1701049804688, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 0.159, "grad_norm": 0.228517547249794, "kl": 0.0118255615234375, "learning_rate": 9.906502647797945e-07, "loss": 0.1649, "num_tokens": 31909365.0, "reward": 0.40234375, "reward_std": 0.13917784206569195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2783556915819645, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1919.5, "completions/max_terminated_length": 1514.5, "completions/mean_length": 1048.296875, "completions/mean_terminated_length": 947.9295349121094, "completions/min_length": 368.75, "completions/min_terminated_length": 368.75, "epoch": 0.1595, "grad_norm": 0.3179612457752228, "kl": 0.015045166015625, "learning_rate": 9.904903249427582e-07, "loss": 0.1637, "num_tokens": 
31983544.0, "reward": 0.8040232062339783, "reward_std": 0.4756402261555195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19986316561698914, "rewards/penalized_accuracy_reward/std": 0.20641779899597168, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.23907538130879402, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 1041.921875, "completions/mean_terminated_length": 753.2850494384766, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.16, "grad_norm": 0.36994871497154236, "kl": 0.019989013671875, "learning_rate": 9.903290432958003e-07, "loss": 0.2725, "num_tokens": 32062627.0, "reward": 0.4787668436765671, "reward_std": 0.3622424155473709, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993029683828354, "rewards/penalized_accuracy_reward/std": 0.13039588928222656, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.30705421045422554, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2031.5, "completions/max_terminated_length": 1958.25, "completions/mean_length": 1400.53125, "completions/mean_terminated_length": 1281.4986877441406, "completions/min_length": 539.25, "completions/min_terminated_length": 539.25, "epoch": 0.1605, "grad_norm": 0.22296041250228882, "kl": 0.0147552490234375, "learning_rate": 9.901664203302124e-07, "loss": 0.113, "num_tokens": 32161125.0, "reward": 0.4249121844768524, "reward_std": 0.23156446032226086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02495609037578106, "rewards/penalized_accuracy_reward/std": 0.06819302588701248, "rewards/tag_count_reward/mean": 0.75, 
"rewards/tag_count_reward/std": 0.2269185334444046, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1758.0, "completions/max_terminated_length": 1602.75, "completions/mean_length": 864.40625, "completions/mean_terminated_length": 810.9290313720703, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.161, "grad_norm": 0.44859495759010315, "kl": 0.0213470458984375, "learning_rate": 9.900024565413727e-07, "loss": 0.0539, "num_tokens": 32226399.0, "reward": 0.5221330225467682, "reward_std": 0.34233908355236053, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743369039148092, "rewards/penalized_accuracy_reward/std": 0.1497347578406334, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.14509194903075695, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2043.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1145.90625, "completions/mean_terminated_length": 949.9805755615234, "completions/min_length": 354.75, "completions/min_terminated_length": 354.75, "epoch": 0.1615, "grad_norm": 0.33813247084617615, "kl": 0.019195556640625, "learning_rate": 9.89837152428743e-07, "loss": 0.1753, "num_tokens": 32308777.0, "reward": 0.4772491008043289, "reward_std": 0.27899706177413464, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745267540216446, "rewards/penalized_accuracy_reward/std": 0.08052083104848862, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2701104208827019, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 1660.5, "completions/max_terminated_length": 772.75, "completions/mean_length": 1335.796875, "completions/mean_terminated_length": 463.8958435058594, 
"completions/min_length": 841.0, "completions/min_terminated_length": 329.0, "epoch": 0.162, "grad_norm": 0.17641980946063995, "kl": 0.01837158203125, "learning_rate": 9.896705084958687e-07, "loss": 0.1007, "num_tokens": 32406828.0, "reward": 0.31598100066185, "reward_std": 0.13677173852920532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482686899602413, "rewards/penalized_accuracy_reward/std": 0.04993074759840965, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.1221349686384201, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1616.0, "completions/max_terminated_length": 1411.25, "completions/mean_length": 995.265625, "completions/mean_terminated_length": 762.3854370117188, "completions/min_length": 370.25, "completions/min_terminated_length": 370.25, "epoch": 0.1625, "grad_norm": 0.4617752432823181, "kl": 0.0262603759765625, "learning_rate": 9.895025252503755e-07, "loss": 0.2453, "num_tokens": 32479693.0, "reward": 0.37890625, "reward_std": 0.1143263541162014, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.2286527119576931, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.25, "completions/max_terminated_length": 935.25, "completions/mean_length": 581.078125, "completions/mean_terminated_length": 581.078125, "completions/min_length": 255.75, "completions/min_terminated_length": 255.75, "epoch": 0.163, "grad_norm": 0.5212326645851135, "kl": 0.028076171875, "learning_rate": 9.8933320320397e-07, "loss": 0.0085, "num_tokens": 32526098.0, "reward": 0.9871488213539124, "reward_std": 0.43143618293106556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.2748244106769562, "rewards/penalized_accuracy_reward/std": 0.1913631707429886, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.20928960293531418, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1706.5, "completions/max_terminated_length": 1181.75, "completions/mean_length": 917.828125, "completions/mean_terminated_length": 762.0534362792969, "completions/min_length": 463.5, "completions/min_terminated_length": 463.5, "epoch": 0.1635, "grad_norm": 0.5189905762672424, "kl": 0.023223876953125, "learning_rate": 9.891625428724364e-07, "loss": 0.2018, "num_tokens": 32595159.0, "reward": 0.47881144285202026, "reward_std": 0.34475470520555973, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995259828865528, "rewards/penalized_accuracy_reward/std": 0.13046515360474586, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.2261708676815033, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1782.5, "completions/max_terminated_length": 1527.5, "completions/mean_length": 1031.265625, "completions/mean_terminated_length": 853.7333526611328, "completions/min_length": 334.75, "completions/min_terminated_length": 334.75, "epoch": 0.164, "grad_norm": 0.3236292600631714, "kl": 0.0193634033203125, "learning_rate": 9.889905447756355e-07, "loss": 0.2413, "num_tokens": 32672792.0, "reward": 0.453824982047081, "reward_std": 0.2819632701575756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374593660235405, "rewards/penalized_accuracy_reward/std": 0.08053522557020187, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.2835954688489437, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 
1965.0, "completions/max_terminated_length": 1246.25, "completions/mean_length": 867.578125, "completions/mean_terminated_length": 681.7225112915039, "completions/min_length": 271.25, "completions/min_terminated_length": 271.25, "epoch": 0.1645, "grad_norm": 0.533996045589447, "kl": 0.0202789306640625, "learning_rate": 9.888172094375033e-07, "loss": 0.3645, "num_tokens": 32739021.0, "reward": 0.5099230706691742, "reward_std": 0.2831262294203043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0498834103345871, "rewards/penalized_accuracy_reward/std": 0.08923417329788208, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.260809239000082, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1450.5625, "completions/mean_terminated_length": 1009.3629150390625, "completions/min_length": 519.75, "completions/min_terminated_length": 519.75, "epoch": 0.165, "grad_norm": 0.27378952503204346, "kl": 0.018829345703125, "learning_rate": 9.886425373860496e-07, "loss": 0.2913, "num_tokens": 32841569.0, "reward": 0.3277202472090721, "reward_std": 0.2422868087887764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492935173213482, "rewards/penalized_accuracy_reward/std": 0.04997174069285393, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.33481983095407486, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1781.75, "completions/max_terminated_length": 1390.0, "completions/mean_length": 916.078125, "completions/mean_terminated_length": 684.3410949707031, "completions/min_length": 233.25, "completions/min_terminated_length": 233.25, "epoch": 0.1655, "grad_norm": 0.4044518768787384, "kl": 0.024566650390625, "learning_rate": 
9.88466529153356e-07, "loss": 0.2447, "num_tokens": 32913046.0, "reward": 0.41950997710227966, "reward_std": 0.19308701902627945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489364482462406, "rewards/penalized_accuracy_reward/std": 0.04995746165513992, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.2356334887444973, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1420.5, "completions/max_terminated_length": 1359.5, "completions/mean_length": 743.171875, "completions/mean_terminated_length": 627.5593719482422, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.166, "grad_norm": 0.5417068600654602, "kl": 0.02288818359375, "learning_rate": 9.882891852755732e-07, "loss": 0.1251, "num_tokens": 32971697.0, "reward": 0.43702176213264465, "reward_std": 0.1984130684286356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012456191703677177, "rewards/penalized_accuracy_reward/std": 0.04982476308941841, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.24502058327198029, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1750.75, "completions/max_terminated_length": 1357.0, "completions/mean_length": 876.828125, "completions/mean_terminated_length": 778.5583648681641, "completions/min_length": 352.5, "completions/min_terminated_length": 352.5, "epoch": 0.1665, "grad_norm": 0.5266909003257751, "kl": 0.02557373046875, "learning_rate": 9.881105062929221e-07, "loss": 0.2059, "num_tokens": 33036374.0, "reward": 0.4663611799478531, "reward_std": 0.18848804011940956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, 
"rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2315921187400818, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1469.078125, "completions/mean_terminated_length": 1059.082275390625, "completions/min_length": 287.75, "completions/min_terminated_length": 287.75, "epoch": 0.167, "grad_norm": 0.2787356674671173, "kl": 0.0140380859375, "learning_rate": 9.879304927496896e-07, "loss": 0.3139, "num_tokens": 33140075.0, "reward": 0.310546875, "reward_std": 0.16090013459324837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.3218002915382385, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1605.25, "completions/max_terminated_length": 1564.5, "completions/mean_length": 870.515625, "completions/mean_terminated_length": 780.9559783935547, "completions/min_length": 262.75, "completions/min_terminated_length": 262.75, "epoch": 0.1675, "grad_norm": 0.4333828389644623, "kl": 0.0224609375, "learning_rate": 9.877491451942284e-07, "loss": 0.1751, "num_tokens": 33204060.0, "reward": 0.44140625, "reward_std": 0.10103438794612885, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2020687758922577, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1706.25, "completions/max_terminated_length": 1234.5, "completions/mean_length": 864.0, "completions/mean_terminated_length": 790.3156585693359, "completions/min_length": 384.0, 
"completions/min_terminated_length": 384.0, "epoch": 0.168, "grad_norm": 0.41822779178619385, "kl": 0.0236663818359375, "learning_rate": 9.875664641789543e-07, "loss": 0.2003, "num_tokens": 33268108.0, "reward": 0.7429504096508026, "reward_std": 0.26111514307558537, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1497955173254013, "rewards/penalized_accuracy_reward/std": 0.08932093530893326, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.19033434055745602, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1722.0, "completions/max_terminated_length": 1570.25, "completions/mean_length": 1000.59375, "completions/mean_terminated_length": 914.3244323730469, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.1685, "grad_norm": 0.29420939087867737, "kl": 0.0170135498046875, "learning_rate": 9.873824502603459e-07, "loss": 0.1353, "num_tokens": 33340770.0, "reward": 0.49325424432754517, "reward_std": 0.27455418556928635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024947432801127434, "rewards/penalized_accuracy_reward/std": 0.09978973120450974, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.19180426001548767, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1754.0, "completions/max_terminated_length": 1180.25, "completions/mean_length": 834.421875, "completions/mean_terminated_length": 702.7084426879883, "completions/min_length": 274.75, "completions/min_terminated_length": 274.75, "epoch": 0.169, "grad_norm": 15.379351615905762, "kl": 0.1101531982421875, "learning_rate": 9.871971039989407e-07, "loss": 0.3606, "num_tokens": 33401501.0, "reward": 1.1035176813602448, "reward_std": 0.6781280189752579, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3369150906801224, "rewards/penalized_accuracy_reward/std": 0.30178678780794144, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.23831390962004662, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1565.75, "completions/mean_length": 1300.703125, "completions/mean_terminated_length": 894.8116760253906, "completions/min_length": 270.5, "completions/min_terminated_length": 270.5, "epoch": 0.1695, "grad_norm": 0.3310546278953552, "kl": 0.021728515625, "learning_rate": 9.870104259593362e-07, "loss": 0.3558, "num_tokens": 33493754.0, "reward": 0.505094438791275, "reward_std": 0.4119153097271919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07481284253299236, "rewards/penalized_accuracy_reward/std": 0.1454298011958599, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.32111945003271103, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1867.75, "completions/max_terminated_length": 1428.0, "completions/mean_length": 1029.15625, "completions/mean_terminated_length": 853.8363952636719, "completions/min_length": 409.5, "completions/min_terminated_length": 409.5, "epoch": 0.17, "grad_norm": 0.5573697090148926, "kl": 0.020050048828125, "learning_rate": 9.86822416710186e-07, "loss": 0.3101, "num_tokens": 33571588.0, "reward": 0.4694465100765228, "reward_std": 0.268723564222455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037457626312971115, "rewards/penalized_accuracy_reward/std": 0.0805315151810646, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.24687501415610313, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.171875, "completions/max_length": 1656.75, "completions/max_terminated_length": 1379.0, "completions/mean_length": 994.484375, "completions/mean_terminated_length": 817.1903076171875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.1705, "grad_norm": 4.075127124786377, "kl": 0.0405120849609375, "learning_rate": 9.866330768241983e-07, "loss": 0.204, "num_tokens": 33643443.0, "reward": 0.6197660565376282, "reward_std": 0.4402782805263996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09992208704352379, "rewards/penalized_accuracy_reward/std": 0.1787460893392563, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.23336345702409744, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1634.25, "completions/max_terminated_length": 1411.5, "completions/mean_length": 1027.03125, "completions/mean_terminated_length": 793.5417785644531, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.171, "grad_norm": 0.5400917530059814, "kl": 0.021453857421875, "learning_rate": 9.86442406878136e-07, "loss": 0.2628, "num_tokens": 33717461.0, "reward": 0.400390625, "reward_std": 0.12684408575296402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.25368817523121834, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1993.0, "completions/max_terminated_length": 1659.5, "completions/mean_length": 1377.734375, "completions/mean_terminated_length": 1017.0379486083984, "completions/min_length": 404.25, "completions/min_terminated_length": 404.25, "epoch": 0.1715, "grad_norm": 0.3339293599128723, "kl": 0.0155029296875, "learning_rate": 
9.862504074528126e-07, "loss": 0.2294, "num_tokens": 33818164.0, "reward": 0.3359375, "reward_std": 0.1264162715524435, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.2528325542807579, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1647.25, "completions/mean_length": 1498.546875, "completions/mean_terminated_length": 948.4895935058594, "completions/min_length": 331.5, "completions/min_terminated_length": 331.5, "epoch": 0.172, "grad_norm": 0.2625250220298767, "kl": 0.0164642333984375, "learning_rate": 9.860570791330911e-07, "loss": 0.2686, "num_tokens": 33927207.0, "reward": 0.5102511942386627, "reward_std": 0.4323755279183388, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09985216148197651, "rewards/penalized_accuracy_reward/std": 0.15224631130695343, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.3342108875513077, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.75, "completions/mean_length": 1241.984375, "completions/mean_terminated_length": 972.0872497558594, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.1725, "grad_norm": 0.3526162803173065, "kl": 0.021026611328125, "learning_rate": 9.85862422507884e-07, "loss": 0.3447, "num_tokens": 34015574.0, "reward": 0.3984375, "reward_std": 0.15251880884170532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 
0.30503762513399124, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1708.75, "completions/max_terminated_length": 1234.5, "completions/mean_length": 904.75, "completions/mean_terminated_length": 786.3610305786133, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.173, "grad_norm": 0.5567873120307922, "kl": 0.0198211669921875, "learning_rate": 9.856664381701483e-07, "loss": 0.254, "num_tokens": 34082902.0, "reward": 0.7830678522586823, "reward_std": 0.23413977399468422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17473703622817993, "rewards/penalized_accuracy_reward/std": 0.06821063160896301, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.22186454385519028, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 829.375, "completions/mean_terminated_length": 727.5726470947266, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.1735, "grad_norm": 0.4571398198604584, "kl": 0.030120849609375, "learning_rate": 9.854691267168871e-07, "loss": 0.3328, "num_tokens": 34144174.0, "reward": 0.5660885274410248, "reward_std": 0.29314589500427246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06234113872051239, "rewards/penalized_accuracy_reward/std": 0.09549939632415771, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2397993542253971, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1936.5, "completions/max_terminated_length": 1394.25, "completions/mean_length": 960.421875, "completions/mean_terminated_length": 790.4302215576172, "completions/min_length": 426.5, 
"completions/min_terminated_length": 426.5, "epoch": 0.174, "grad_norm": 0.4734967350959778, "kl": 0.0227203369140625, "learning_rate": 9.852704887491445e-07, "loss": 0.2444, "num_tokens": 34215401.0, "reward": 0.7524872422218323, "reward_std": 0.3582138121128082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16237643733620644, "rewards/penalized_accuracy_reward/std": 0.13929202780127525, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.22137392684817314, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1653.25, "completions/mean_length": 929.796875, "completions/mean_terminated_length": 799.6903228759766, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.1745, "grad_norm": 0.44127157330513, "kl": 0.0167999267578125, "learning_rate": 9.850705248720068e-07, "loss": 0.402, "num_tokens": 34284508.0, "reward": 0.4296875, "reward_std": 0.11973594687879086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2394719012081623, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1999.25, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1170.84375, "completions/mean_terminated_length": 997.7322998046875, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.175, "grad_norm": 0.3368399739265442, "kl": 0.023468017578125, "learning_rate": 9.848692356945981e-07, "loss": 0.2354, "num_tokens": 34369170.0, "reward": 0.6139007210731506, "reward_std": 0.3251902684569359, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.09991911798715591, "rewards/penalized_accuracy_reward/std": 0.10319606214761734, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.26233692094683647, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1858.0, "completions/max_terminated_length": 1616.5, "completions/mean_length": 1255.28125, "completions/mean_terminated_length": 996.1727905273438, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.1755, "grad_norm": 0.19285409152507782, "kl": 0.0177001953125, "learning_rate": 9.846666218300807e-07, "loss": 0.1116, "num_tokens": 34457108.0, "reward": 0.4019397795200348, "reward_std": 0.18450137972831726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.20439685136079788, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1608.25, "completions/mean_length": 922.734375, "completions/mean_terminated_length": 746.3815307617188, "completions/min_length": 155.5, "completions/min_terminated_length": 155.5, "epoch": 0.176, "grad_norm": 0.5148526430130005, "kl": 0.0205078125, "learning_rate": 9.844626838956513e-07, "loss": 0.427, "num_tokens": 34527635.0, "reward": 0.423828125, "reward_std": 0.12486949190497398, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.24973899871110916, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1538.25, 
"completions/mean_length": 1051.15625, "completions/mean_terminated_length": 778.5857391357422, "completions/min_length": 273.75, "completions/min_terminated_length": 273.75, "epoch": 0.1765, "grad_norm": 0.31485024094581604, "kl": 0.02227783203125, "learning_rate": 9.8425742251254e-07, "loss": 0.3509, "num_tokens": 34603469.0, "reward": 0.601986289024353, "reward_std": 0.4417789578437805, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09982126578688622, "rewards/penalized_accuracy_reward/std": 0.16801873594522476, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2786950096487999, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1449.75, "completions/mean_length": 1001.265625, "completions/mean_terminated_length": 776.8075866699219, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.177, "grad_norm": 0.41327348351478577, "kl": 0.027984619140625, "learning_rate": 9.84050838306009e-07, "loss": 0.2561, "num_tokens": 34677262.0, "reward": 0.7445936501026154, "reward_std": 0.5644468795508146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623358828946948, "rewards/penalized_accuracy_reward/std": 0.24792751669883728, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.2715100906789303, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1493.75, "completions/max_terminated_length": 1177.5, "completions/mean_length": 640.828125, "completions/mean_terminated_length": 506.6167755126953, "completions/min_length": 186.5, "completions/min_terminated_length": 186.5, "epoch": 0.1775, "grad_norm": 0.5931155681610107, "kl": 0.022491455078125, "learning_rate": 9.838429319053495e-07, "loss": 0.3353, "num_tokens": 
34727155.0, "reward": 0.785490095615387, "reward_std": 0.42113256826996803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16227630153298378, "rewards/penalized_accuracy_reward/std": 0.180375337600708, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.14620334655046463, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2041.75, "completions/max_terminated_length": 1474.5, "completions/mean_length": 829.46875, "completions/mean_terminated_length": 662.884880065918, "completions/min_length": 261.5, "completions/min_terminated_length": 261.5, "epoch": 0.178, "grad_norm": 0.3624888062477112, "kl": 0.02557373046875, "learning_rate": 9.836337039438803e-07, "loss": 0.2789, "num_tokens": 34791153.0, "reward": 0.7925818264484406, "reward_std": 0.6162402369081974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18730651028454304, "rewards/penalized_accuracy_reward/std": 0.2704627588391304, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.2673354819417, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1809.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 841.921875, "completions/mean_terminated_length": 803.4031524658203, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.1785, "grad_norm": 0.39758428931236267, "kl": 0.0211944580078125, "learning_rate": 9.83423155058946e-07, "loss": 0.1572, "num_tokens": 34853484.0, "reward": 0.8933538198471069, "reward_std": 0.4208607990294695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21230190806090832, "rewards/penalized_accuracy_reward/std": 0.18636619299650192, "rewards/tag_count_reward/mean": 0.9375, 
"rewards/tag_count_reward/std": 0.16292952932417393, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1738.5, "completions/max_terminated_length": 1581.0, "completions/mean_length": 958.203125, "completions/mean_terminated_length": 784.9029693603516, "completions/min_length": 234.75, "completions/min_terminated_length": 234.75, "epoch": 0.179, "grad_norm": 0.39929601550102234, "kl": 0.026031494140625, "learning_rate": 9.832112858919155e-07, "loss": 0.1895, "num_tokens": 34923897.0, "reward": 0.7985345721244812, "reward_std": 0.5457305945456028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18735324684530497, "rewards/penalized_accuracy_reward/std": 0.23922216892242432, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.2594398185610771, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1860.0, "completions/max_terminated_length": 1426.5, "completions/mean_length": 714.0625, "completions/mean_terminated_length": 632.2683372497559, "completions/min_length": 220.5, "completions/min_terminated_length": 220.5, "epoch": 0.1795, "grad_norm": 0.3909948766231537, "kl": 0.026397705078125, "learning_rate": 9.829980970881784e-07, "loss": 0.2266, "num_tokens": 34976125.0, "reward": 0.9046447277069092, "reward_std": 0.4412313550710678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2247833013534546, "rewards/penalized_accuracy_reward/std": 0.18928585946559906, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17445408925414085, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1846.75, "completions/max_terminated_length": 1345.0, "completions/mean_length": 871.078125, "completions/mean_terminated_length": 671.1304016113281, 
"completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.18, "grad_norm": 0.4765426814556122, "kl": 0.030548095703125, "learning_rate": 9.82783589297145e-07, "loss": 0.4441, "num_tokens": 35040146.0, "reward": 0.48156580328941345, "reward_std": 0.24600520357489586, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024962592869997025, "rewards/penalized_accuracy_reward/std": 0.06821079552173615, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.2328270897269249, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1402.5, "completions/max_terminated_length": 1359.75, "completions/mean_length": 807.15625, "completions/mean_terminated_length": 739.2604217529297, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.1805, "grad_norm": 0.5620487332344055, "kl": 0.024688720703125, "learning_rate": 9.825677631722435e-07, "loss": 0.1509, "num_tokens": 35102748.0, "reward": 0.6258056163787842, "reward_std": 0.3508812449872494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731686975806952, "rewards/penalized_accuracy_reward/std": 0.1497042328119278, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.13100920617580414, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.25, "completions/mean_length": 1053.640625, "completions/mean_terminated_length": 798.1398315429688, "completions/min_length": 251.25, "completions/min_terminated_length": 251.25, "epoch": 0.181, "grad_norm": 0.31405192613601685, "kl": 0.0204010009765625, "learning_rate": 9.823506193709174e-07, "loss": 0.2609, "num_tokens": 35179141.0, "reward": 0.6596240252256393, "reward_std": 0.4840147979557514, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12473388761281967, "rewards/penalized_accuracy_reward/std": 0.1910778060555458, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.28007062897086143, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1576.75, "completions/max_terminated_length": 1463.25, "completions/mean_length": 1018.859375, "completions/mean_terminated_length": 896.3812561035156, "completions/min_length": 483.75, "completions/min_terminated_length": 483.75, "epoch": 0.1815, "grad_norm": 0.25500619411468506, "kl": 0.0202484130859375, "learning_rate": 9.821321585546243e-07, "loss": 0.0002, "num_tokens": 35253772.0, "reward": 0.9292384088039398, "reward_std": 0.41362107917666435, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2497754693031311, "rewards/penalized_accuracy_reward/std": 0.18289786577224731, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.12175257876515388, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 1202.015625, "completions/mean_terminated_length": 884.1750335693359, "completions/min_length": 356.5, "completions/min_terminated_length": 356.5, "epoch": 0.182, "grad_norm": 0.3217252194881439, "kl": 0.024932861328125, "learning_rate": 9.81912381388834e-07, "loss": 0.3935, "num_tokens": 35338733.0, "reward": 0.380859375, "reward_std": 0.15770643204450607, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.31541287526488304, "step": 364 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.125, "completions/max_length": 1752.75, "completions/max_terminated_length": 1679.0, "completions/mean_length": 967.015625, "completions/mean_terminated_length": 832.4836578369141, "completions/min_length": 330.5, "completions/min_terminated_length": 330.5, "epoch": 0.1825, "grad_norm": 0.25474077463150024, "kl": 0.024261474609375, "learning_rate": 9.816912885430258e-07, "loss": 0.1074, "num_tokens": 35410510.0, "reward": 0.46244844794273376, "reward_std": 0.1793423518538475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012474221177399158, "rewards/penalized_accuracy_reward/std": 0.049896884709596634, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.21481787413358688, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1392.0, "completions/max_terminated_length": 1205.5, "completions/mean_length": 841.46875, "completions/mean_terminated_length": 630.3350830078125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.183, "grad_norm": 0.5077491998672485, "kl": 0.023590087890625, "learning_rate": 9.814688806906868e-07, "loss": 0.2663, "num_tokens": 35475420.0, "reward": 0.9312933087348938, "reward_std": 0.421863067895174, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2498263269662857, "rewards/penalized_accuracy_reward/std": 0.16820167750120163, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.18312866985797882, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1768.5, "completions/max_terminated_length": 1360.0, "completions/mean_length": 975.921875, "completions/mean_terminated_length": 683.1752166748047, "completions/min_length": 304.5, "completions/min_terminated_length": 304.5, "epoch": 0.1835, "grad_norm": 
0.45801204442977905, "kl": 0.025115966796875, "learning_rate": 9.812451585093098e-07, "loss": 0.4108, "num_tokens": 35547255.0, "reward": 0.6406060457229614, "reward_std": 0.3242396917194128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1122952252626419, "rewards/penalized_accuracy_reward/std": 0.10228299349546432, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.259125417098403, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2036.5, "completions/max_terminated_length": 1346.75, "completions/mean_length": 663.671875, "completions/mean_terminated_length": 571.5648880004883, "completions/min_length": 241.25, "completions/min_terminated_length": 241.25, "epoch": 0.184, "grad_norm": 0.4788001477718353, "kl": 0.0252227783203125, "learning_rate": 9.810201226803917e-07, "loss": 0.4679, "num_tokens": 35597490.0, "reward": 0.5031205266714096, "reward_std": 0.2162759006023407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024997767060995102, "rewards/penalized_accuracy_reward/std": 0.06830690056085587, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1989448443055153, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1442.25, "completions/max_terminated_length": 1280.75, "completions/mean_length": 1057.59375, "completions/mean_terminated_length": 846.9531555175781, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.1845, "grad_norm": 0.4160602390766144, "kl": 0.0360565185546875, "learning_rate": 9.807937738894303e-07, "loss": 0.1485, "num_tokens": 35674472.0, "reward": 0.3828125, "reward_std": 0.10347630828619003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.20695261657238007, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1997.25, "completions/max_terminated_length": 1673.75, "completions/mean_length": 1340.15625, "completions/mean_terminated_length": 1084.1458740234375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.185, "grad_norm": 0.2580617368221283, "kl": 0.0161285400390625, "learning_rate": 9.805661128259235e-07, "loss": 0.2186, "num_tokens": 35770242.0, "reward": 0.453842356801033, "reward_std": 0.2757526356726885, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746805340051651, "rewards/penalized_accuracy_reward/std": 0.08055389672517776, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.25432364270091057, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1730.5, "completions/max_terminated_length": 1453.75, "completions/mean_length": 1192.828125, "completions/mean_terminated_length": 893.0437622070312, "completions/min_length": 457.75, "completions/min_terminated_length": 457.75, "epoch": 0.1855, "grad_norm": 0.31302666664123535, "kl": 0.0201416015625, "learning_rate": 9.80337140183366e-07, "loss": 0.1413, "num_tokens": 35860839.0, "reward": 0.4210141599178314, "reward_std": 0.22862399742007256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496020309627056, "rewards/penalized_accuracy_reward/std": 0.06820429861545563, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.20192100293934345, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1892.5, "completions/max_terminated_length": 1632.5, 
"completions/mean_length": 1017.890625, "completions/mean_terminated_length": 941.6055603027344, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.186, "grad_norm": 0.33351022005081177, "kl": 0.02301025390625, "learning_rate": 9.801068566592483e-07, "loss": 0.16, "num_tokens": 35935024.0, "reward": 0.5926762819290161, "reward_std": 0.426901463419199, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08735375851392746, "rewards/penalized_accuracy_reward/std": 0.1697739139199257, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.25413989275693893, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1723.5, "completions/mean_length": 1120.28125, "completions/mean_terminated_length": 922.4655303955078, "completions/min_length": 345.25, "completions/min_terminated_length": 345.25, "epoch": 0.1865, "grad_norm": 0.3198533356189728, "kl": 0.02099609375, "learning_rate": 9.798752629550546e-07, "loss": 0.3459, "num_tokens": 36014578.0, "reward": 0.396484375, "reward_std": 0.14051313139498234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.2810262702405453, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 1205.671875, "completions/mean_terminated_length": 951.4192962646484, "completions/min_length": 360.5, "completions/min_terminated_length": 360.5, "epoch": 0.187, "grad_norm": 0.7657854557037354, "kl": 0.032012939453125, "learning_rate": 9.796423597762588e-07, "loss": 0.3028, "num_tokens": 36102941.0, "reward": 0.5152035057544708, 
"reward_std": 0.3209625408053398, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062289249151945114, "rewards/penalized_accuracy_reward/std": 0.09541989117860794, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.2969161905348301, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1601.25, "completions/max_terminated_length": 1404.0, "completions/mean_length": 694.484375, "completions/mean_terminated_length": 609.9470520019531, "completions/min_length": 245.25, "completions/min_terminated_length": 245.25, "epoch": 0.1875, "grad_norm": 0.37187856435775757, "kl": 0.02655029296875, "learning_rate": 9.794081478323245e-07, "loss": 0.187, "num_tokens": 36157036.0, "reward": 0.6917310357093811, "reward_std": 0.2641747146844864, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11246708035469055, "rewards/penalized_accuracy_reward/std": 0.1024395301938057, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.11859130859375, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2007.25, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1262.34375, "completions/mean_terminated_length": 1128.63232421875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.188, "grad_norm": 0.2789295017719269, "kl": 0.021148681640625, "learning_rate": 9.791726278367021e-07, "loss": 0.1471, "num_tokens": 36251442.0, "reward": 0.5148528218269348, "reward_std": 0.291807709261775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.074809230864048, "rewards/penalized_accuracy_reward/std": 0.09974570572376251, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.22106319665908813, "step": 
376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1840.25, "completions/max_terminated_length": 1625.75, "completions/mean_length": 912.8125, "completions/mean_terminated_length": 815.6375579833984, "completions/min_length": 377.5, "completions/min_terminated_length": 377.5, "epoch": 0.1885, "grad_norm": 0.4080560505390167, "kl": 0.02825927734375, "learning_rate": 9.78935800506826e-07, "loss": 0.2465, "num_tokens": 36318118.0, "reward": 0.7161418497562408, "reward_std": 0.44065922871232033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13736779615283012, "rewards/penalized_accuracy_reward/std": 0.1837097629904747, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2286243699491024, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1673.0, "completions/max_terminated_length": 1569.25, "completions/mean_length": 1088.8125, "completions/mean_terminated_length": 871.617919921875, "completions/min_length": 395.5, "completions/min_terminated_length": 395.5, "epoch": 0.189, "grad_norm": 0.25363174080848694, "kl": 0.0242919921875, "learning_rate": 9.786976665641138e-07, "loss": 0.0839, "num_tokens": 36397274.0, "reward": 0.44642361998558044, "reward_std": 0.23347372561693192, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024969618767499924, "rewards/penalized_accuracy_reward/std": 0.06823001056909561, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.23094814270734787, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1673.75, "completions/mean_length": 1075.28125, "completions/mean_terminated_length": 911.1981201171875, "completions/min_length": 234.25, "completions/min_terminated_length": 234.25, 
"epoch": 0.1895, "grad_norm": 0.349229097366333, "kl": 0.028289794921875, "learning_rate": 9.784582267339622e-07, "loss": 0.3129, "num_tokens": 36475756.0, "reward": 0.4561818540096283, "reward_std": 0.25576847046613693, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024965934455394745, "rewards/penalized_accuracy_reward/std": 0.06821992248296738, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.28045066073536873, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.5, "completions/mean_length": 1398.328125, "completions/mean_terminated_length": 1074.3740844726562, "completions/min_length": 370.75, "completions/min_terminated_length": 370.75, "epoch": 0.19, "grad_norm": 0.27051886916160583, "kl": 0.024444580078125, "learning_rate": 9.78217481745747e-07, "loss": 0.2874, "num_tokens": 36576001.0, "reward": 0.37066351622343063, "reward_std": 0.23774508014321327, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480194680392742, "rewards/penalized_accuracy_reward/std": 0.04992077499628067, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.326906681060791, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1618.25, "completions/max_terminated_length": 1522.25, "completions/mean_length": 676.96875, "completions/mean_terminated_length": 656.1791687011719, "completions/min_length": 263.75, "completions/min_terminated_length": 263.75, "epoch": 0.1905, "grad_norm": 0.5354393124580383, "kl": 0.03009033203125, "learning_rate": 9.779754323328192e-07, "loss": 0.2155, "num_tokens": 36626495.0, "reward": 0.6815788149833679, "reward_std": 0.2871679849922657, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.11227378249168396, "rewards/penalized_accuracy_reward/std": 0.10226354002952576, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1879417859017849, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1757.0, "completions/max_terminated_length": 1134.25, "completions/mean_length": 766.375, "completions/mean_terminated_length": 630.8383331298828, "completions/min_length": 161.75, "completions/min_terminated_length": 161.75, "epoch": 0.191, "grad_norm": 0.5013362765312195, "kl": 0.03118896484375, "learning_rate": 9.777320792325025e-07, "loss": 0.4084, "num_tokens": 36684551.0, "reward": 0.6679511368274689, "reward_std": 0.3040498048067093, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11229588091373444, "rewards/penalized_accuracy_reward/std": 0.10228365659713745, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.19896498695015907, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1701.75, "completions/max_terminated_length": 1313.0, "completions/mean_length": 957.828125, "completions/mean_terminated_length": 675.8381652832031, "completions/min_length": 315.75, "completions/min_terminated_length": 315.75, "epoch": 0.1915, "grad_norm": 0.46387505531311035, "kl": 0.030364990234375, "learning_rate": 9.774874231860935e-07, "loss": 0.3617, "num_tokens": 36753244.0, "reward": 0.41755613684654236, "reward_std": 0.2168395034968853, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489001266658306, "rewards/penalized_accuracy_reward/std": 0.049956005066633224, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.26062823832035065, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, 
"completions/max_length": 2048.0, "completions/max_terminated_length": 1482.5, "completions/mean_length": 1233.921875, "completions/mean_terminated_length": 950.5471954345703, "completions/min_length": 383.75, "completions/min_terminated_length": 383.75, "epoch": 0.192, "grad_norm": 0.34084564447402954, "kl": 0.035736083984375, "learning_rate": 9.772414649388568e-07, "loss": 0.2563, "num_tokens": 36841191.0, "reward": 0.42096760869026184, "reward_std": 0.2710782326757908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02493692748248577, "rewards/penalized_accuracy_reward/std": 0.0681406706571579, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.3164914473891258, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1729.0, "completions/max_terminated_length": 1560.5, "completions/mean_length": 1114.296875, "completions/mean_terminated_length": 826.1607971191406, "completions/min_length": 395.5, "completions/min_terminated_length": 395.5, "epoch": 0.1925, "grad_norm": 0.24213790893554688, "kl": 0.0223388671875, "learning_rate": 9.769942052400235e-07, "loss": 0.0987, "num_tokens": 36925818.0, "reward": 0.6020061075687408, "reward_std": 0.48478836566209793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09983117785304785, "rewards/penalized_accuracy_reward/std": 0.21962810307741165, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.25864382088184357, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1773.5, "completions/max_terminated_length": 1306.25, "completions/mean_length": 764.890625, "completions/mean_terminated_length": 652.2114715576172, "completions/min_length": 212.25, "completions/min_terminated_length": 212.25, "epoch": 0.193, "grad_norm": 0.39338091015815735, "kl": 
0.023284912109375, "learning_rate": 9.767456448427896e-07, "loss": 0.2819, "num_tokens": 36982419.0, "reward": 0.8563853204250336, "reward_std": 0.5278625823557377, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1996770203113556, "rewards/penalized_accuracy_reward/std": 0.24432269483804703, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17722850292921066, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1544.75, "completions/max_terminated_length": 1407.0, "completions/mean_length": 832.59375, "completions/mean_terminated_length": 696.1250152587891, "completions/min_length": 211.75, "completions/min_terminated_length": 211.75, "epoch": 0.1935, "grad_norm": 1.5530911684036255, "kl": 0.04547119140625, "learning_rate": 9.764957845043135e-07, "loss": 0.2959, "num_tokens": 37042985.0, "reward": 0.7754029706120491, "reward_std": 0.3330027088522911, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17481087520718575, "rewards/penalized_accuracy_reward/std": 0.13043774664402008, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.19732841849327087, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1605.5, "completions/max_terminated_length": 1289.75, "completions/mean_length": 860.8125, "completions/mean_terminated_length": 731.9138641357422, "completions/min_length": 323.5, "completions/min_terminated_length": 323.5, "epoch": 0.194, "grad_norm": 0.37838828563690186, "kl": 0.027557373046875, "learning_rate": 9.76244624985713e-07, "loss": 0.2632, "num_tokens": 37106109.0, "reward": 0.7348591983318329, "reward_std": 0.278171019628644, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16235147416591644, 
"rewards/penalized_accuracy_reward/std": 0.08054892718791962, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2606709599494934, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1279.75, "completions/mean_length": 1133.5, "completions/mean_terminated_length": 655.4381103515625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.1945, "grad_norm": 0.4000803828239441, "kl": 0.02886962890625, "learning_rate": 9.759921670520634e-07, "loss": 0.4074, "num_tokens": 37187405.0, "reward": 0.4034825563430786, "reward_std": 0.2777159623801708, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024983465671539307, "rewards/penalized_accuracy_reward/std": 0.068267822265625, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.320572629570961, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1756.75, "completions/mean_length": 1106.890625, "completions/mean_terminated_length": 862.7812042236328, "completions/min_length": 330.75, "completions/min_terminated_length": 330.75, "epoch": 0.195, "grad_norm": 0.37377768754959106, "kl": 0.029327392578125, "learning_rate": 9.757384114723953e-07, "loss": 0.2998, "num_tokens": 37268470.0, "reward": 0.515366718173027, "reward_std": 0.37367450445890427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06237085722386837, "rewards/penalized_accuracy_reward/std": 0.13917142525315285, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.3039017394185066, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1984.0, "completions/max_terminated_length": 1551.25, 
"completions/mean_length": 1168.796875, "completions/mean_terminated_length": 855.0027465820312, "completions/min_length": 338.75, "completions/min_terminated_length": 338.75, "epoch": 0.1955, "grad_norm": 0.30112215876579285, "kl": 0.029022216796875, "learning_rate": 9.754833590196926e-07, "loss": 0.1804, "num_tokens": 37352937.0, "reward": 0.7030850946903229, "reward_std": 0.49125947430729866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16208942979574203, "rewards/penalized_accuracy_reward/std": 0.1914246305823326, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.2878299094736576, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1589.25, "completions/max_terminated_length": 1440.0, "completions/mean_length": 699.515625, "completions/mean_terminated_length": 587.5385589599609, "completions/min_length": 200.25, "completions/min_terminated_length": 200.25, "epoch": 0.196, "grad_norm": 0.961937427520752, "kl": 0.035064697265625, "learning_rate": 9.752270104708888e-07, "loss": 0.3236, "num_tokens": 37407098.0, "reward": 0.435546875, "reward_std": 0.09502226486802101, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.19004452973604202, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1587.25, "completions/max_terminated_length": 1331.5, "completions/mean_length": 779.046875, "completions/mean_terminated_length": 654.0889511108398, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.1965, "grad_norm": 0.4454479515552521, "kl": 0.02850341796875, "learning_rate": 9.749693666068663e-07, "loss": 0.3505, "num_tokens": 37465501.0, "reward": 0.4375, 
"reward_std": 0.1122790053486824, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2245580106973648, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1911.0, "completions/max_terminated_length": 1731.0, "completions/mean_length": 908.53125, "completions/mean_terminated_length": 797.4304046630859, "completions/min_length": 243.25, "completions/min_terminated_length": 243.25, "epoch": 0.197, "grad_norm": 0.3762248158454895, "kl": 0.028106689453125, "learning_rate": 9.747104282124531e-07, "loss": 0.1535, "num_tokens": 37538367.0, "reward": 0.4375, "reward_std": 0.10715573467314243, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.21431147679686546, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1559.5, "completions/max_terminated_length": 1123.5, "completions/mean_length": 605.59375, "completions/mean_terminated_length": 508.8294143676758, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1975, "grad_norm": 0.4452160894870758, "kl": 0.03326416015625, "learning_rate": 9.744501960764203e-07, "loss": 0.1195, "num_tokens": 37588101.0, "reward": 0.6588678658008575, "reward_std": 0.3373238369822502, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09994175005704165, "rewards/penalized_accuracy_reward/std": 0.1523689553141594, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.15635622665286064, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, 
"completions/max_length": 2048.0, "completions/max_terminated_length": 1484.75, "completions/mean_length": 1265.6875, "completions/mean_terminated_length": 871.9218368530273, "completions/min_length": 329.5, "completions/min_terminated_length": 329.5, "epoch": 0.198, "grad_norm": 0.4315130114555359, "kl": 0.0244293212890625, "learning_rate": 9.741886709914803e-07, "loss": 0.4774, "num_tokens": 37680529.0, "reward": 0.341796875, "reward_std": 0.16656196489930153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.33312392979860306, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1692.75, "completions/max_terminated_length": 1653.0, "completions/mean_length": 1000.203125, "completions/mean_terminated_length": 811.4005737304688, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.1985, "grad_norm": 0.4443352520465851, "kl": 0.029083251953125, "learning_rate": 9.739258537542835e-07, "loss": 0.1996, "num_tokens": 37754734.0, "reward": 0.4140625, "reward_std": 0.11408073082566261, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.22816146165132523, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1877.5, "completions/max_terminated_length": 1555.0, "completions/mean_length": 917.796875, "completions/mean_terminated_length": 764.9126281738281, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.199, "grad_norm": 0.3494514226913452, "kl": 0.026153564453125, "learning_rate": 9.73661745165417e-07, "loss": 0.2456, "num_tokens": 
37823153.0, "reward": 0.6445996910333633, "reward_std": 0.4579969085752964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11233891174197197, "rewards/penalized_accuracy_reward/std": 0.18492664396762848, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.2444920390844345, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1527.0, "completions/max_terminated_length": 1514.75, "completions/mean_length": 760.21875, "completions/mean_terminated_length": 743.3614654541016, "completions/min_length": 290.5, "completions/min_terminated_length": 290.5, "epoch": 0.1995, "grad_norm": 0.29085057973861694, "kl": 0.02374267578125, "learning_rate": 9.733963460294015e-07, "loss": 0.0847, "num_tokens": 37880191.0, "reward": 0.7780061364173889, "reward_std": 0.5089131742715836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14974526315927505, "rewards/penalized_accuracy_reward/std": 0.24525703489780426, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10986490920186043, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1465.75, "completions/max_terminated_length": 1122.25, "completions/mean_length": 997.9375, "completions/mean_terminated_length": 721.4147796630859, "completions/min_length": 317.25, "completions/min_terminated_length": 317.25, "epoch": 0.2, "grad_norm": 0.3456246852874756, "kl": 0.023895263671875, "learning_rate": 9.731296571546885e-07, "loss": 0.2521, "num_tokens": 37952299.0, "reward": 0.618888258934021, "reward_std": 0.2902064435184002, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1248738095164299, "rewards/penalized_accuracy_reward/std": 0.0998990535736084, "rewards/tag_count_reward/mean": 0.73828125, 
"rewards/tag_count_reward/std": 0.23533941060304642, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1423.5, "completions/mean_length": 1073.5, "completions/mean_terminated_length": 774.2444610595703, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2005, "grad_norm": 0.4671430289745331, "kl": 0.03173828125, "learning_rate": 9.728616793536587e-07, "loss": 0.4191, "num_tokens": 38033659.0, "reward": 0.392578125, "reward_std": 0.14482364989817142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.28964731469750404, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 1383.25, "completions/mean_terminated_length": 797.6359405517578, "completions/min_length": 865.5, "completions/min_terminated_length": 353.5, "epoch": 0.201, "grad_norm": 0.23378674685955048, "kl": 0.01812744140625, "learning_rate": 9.72592413442619e-07, "loss": 0.1157, "num_tokens": 38132955.0, "reward": 0.7643474340438843, "reward_std": 0.46956757083535194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19955650717020035, "rewards/penalized_accuracy_reward/std": 0.20447589457035065, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.18227580189704895, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.25, "completions/mean_length": 882.515625, "completions/mean_terminated_length": 695.1437530517578, "completions/min_length": 226.25, "completions/min_terminated_length": 
226.25, "epoch": 0.2015, "grad_norm": 0.3581865429878235, "kl": 0.0221710205078125, "learning_rate": 9.723218602418e-07, "loss": 0.2705, "num_tokens": 38198732.0, "reward": 0.5505516231060028, "reward_std": 0.3611280806362629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06238518748432398, "rewards/penalized_accuracy_reward/std": 0.1391843445599079, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2575160972774029, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1765.5, "completions/max_terminated_length": 1500.5, "completions/mean_length": 915.21875, "completions/mean_terminated_length": 798.5250091552734, "completions/min_length": 305.25, "completions/min_terminated_length": 305.25, "epoch": 0.202, "grad_norm": 0.45589587092399597, "kl": 0.023468017578125, "learning_rate": 9.720500205753538e-07, "loss": 0.1688, "num_tokens": 38269866.0, "reward": 0.5999012589454651, "reward_std": 0.3067239746451378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09975531697273254, "rewards/penalized_accuracy_reward/std": 0.10302691161632538, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.24832112342119217, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1526.25, "completions/max_terminated_length": 1368.75, "completions/mean_length": 762.09375, "completions/mean_terminated_length": 581.5982208251953, "completions/min_length": 216.5, "completions/min_terminated_length": 216.5, "epoch": 0.2025, "grad_norm": 5.440563201904297, "kl": 0.067657470703125, "learning_rate": 9.717768952713511e-07, "loss": 0.2189, "num_tokens": 38327920.0, "reward": 0.4756682813167572, "reward_std": 0.21896108612418175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.024943511933088303, "rewards/penalized_accuracy_reward/std": 0.06815867871046066, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.21762223541736603, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1614.5, "completions/max_terminated_length": 1309.5, "completions/mean_length": 1021.625, "completions/mean_terminated_length": 695.2885589599609, "completions/min_length": 313.25, "completions/min_terminated_length": 313.25, "epoch": 0.203, "grad_norm": 0.3881336748600006, "kl": 0.027008056640625, "learning_rate": 9.71502485161779e-07, "loss": 0.1574, "num_tokens": 38403000.0, "reward": 0.5593084990978241, "reward_std": 0.2915287874639034, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08727143704891205, "rewards/penalized_accuracy_reward/std": 0.10220200568437576, "rewards/tag_count_reward/mean": 0.76953125, "rewards/tag_count_reward/std": 0.21350136399269104, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1967.0, "completions/max_terminated_length": 1605.5, "completions/mean_length": 1171.15625, "completions/mean_terminated_length": 778.7755737304688, "completions/min_length": 237.25, "completions/min_terminated_length": 237.25, "epoch": 0.2035, "grad_norm": 0.3171648681163788, "kl": 0.02606201171875, "learning_rate": 9.71226791082538e-07, "loss": 0.2645, "num_tokens": 38487346.0, "reward": 0.4479658529162407, "reward_std": 0.34174855425953865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745949361473322, "rewards/penalized_accuracy_reward/std": 0.11817550286650658, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.30054721236228943, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 
2048.0, "completions/max_terminated_length": 1909.75, "completions/mean_length": 1356.15625, "completions/mean_terminated_length": 1111.5057678222656, "completions/min_length": 493.25, "completions/min_terminated_length": 493.25, "epoch": 0.204, "grad_norm": 0.2614351212978363, "kl": 0.02105712890625, "learning_rate": 9.709498138734403e-07, "loss": 0.2338, "num_tokens": 38583756.0, "reward": 0.4513728618621826, "reward_std": 0.35927730798721313, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04990518279373646, "rewards/penalized_accuracy_reward/std": 0.13041264563798904, "rewards/tag_count_reward/mean": 0.703125, "rewards/tag_count_reward/std": 0.3097936362028122, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1766.0, "completions/max_terminated_length": 1561.75, "completions/mean_length": 872.640625, "completions/mean_terminated_length": 720.5052185058594, "completions/min_length": 251.25, "completions/min_terminated_length": 251.25, "epoch": 0.2045, "grad_norm": 0.5293733477592468, "kl": 0.03131103515625, "learning_rate": 9.706715543782064e-07, "loss": 0.2654, "num_tokens": 38650421.0, "reward": 0.6848854273557663, "reward_std": 0.37688885629177094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13736458960920572, "rewards/penalized_accuracy_reward/std": 0.1497371755540371, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2339005321264267, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1703.75, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1195.4375, "completions/mean_terminated_length": 855.9974822998047, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.205, "grad_norm": 0.30791258811950684, "kl": 0.023223876953125, "learning_rate": 
9.703920134444632e-07, "loss": 0.3037, "num_tokens": 38738161.0, "reward": 0.5454665422439575, "reward_std": 0.33980661630630493, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988170117139816, "rewards/penalized_accuracy_reward/std": 0.1031573936343193, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.2890830934047699, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1708.25, "completions/max_terminated_length": 1378.0, "completions/mean_length": 823.421875, "completions/mean_terminated_length": 730.0473480224609, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.2055, "grad_norm": 0.32209038734436035, "kl": 0.031036376953125, "learning_rate": 9.701111919237408e-07, "loss": 0.2335, "num_tokens": 38800380.0, "reward": 0.4453125, "reward_std": 0.1013990230858326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2027980461716652, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1434.25, "completions/mean_length": 1033.9375, "completions/mean_terminated_length": 850.650016784668, "completions/min_length": 374.75, "completions/min_terminated_length": 374.75, "epoch": 0.206, "grad_norm": 0.3960455060005188, "kl": 0.027984619140625, "learning_rate": 9.698290906714702e-07, "loss": 0.3225, "num_tokens": 38873864.0, "reward": 0.7557931691408157, "reward_std": 0.5089286454021931, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17477159202098846, "rewards/penalized_accuracy_reward/std": 0.20300429314374924, "rewards/tag_count_reward/mean": 0.8125, 
"rewards/tag_count_reward/std": 0.2609628140926361, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1922.75, "completions/max_terminated_length": 1752.25, "completions/mean_length": 1070.390625, "completions/mean_terminated_length": 865.0769500732422, "completions/min_length": 350.5, "completions/min_terminated_length": 350.5, "epoch": 0.2065, "grad_norm": 0.3236480951309204, "kl": 0.023773193359375, "learning_rate": 9.695457105469804e-07, "loss": 0.2042, "num_tokens": 38951761.0, "reward": 0.5927964150905609, "reward_std": 0.3733537979424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741382509469986, "rewards/penalized_accuracy_reward/std": 0.14985162764787674, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.21603353135287762, "step": 413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2020.0, "completions/max_terminated_length": 1768.25, "completions/mean_length": 1105.875, "completions/mean_terminated_length": 982.4654083251953, "completions/min_length": 352.75, "completions/min_terminated_length": 352.75, "epoch": 0.207, "grad_norm": 0.3288894593715668, "kl": 0.02911376953125, "learning_rate": 9.69261052413497e-07, "loss": 0.2157, "num_tokens": 39031705.0, "reward": 0.659538060426712, "reward_std": 0.30347491055727005, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12469089776277542, "rewards/penalized_accuracy_reward/std": 0.09975288063287735, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2452417127788067, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1906.75, "completions/max_terminated_length": 1634.25, "completions/mean_length": 988.9375, "completions/mean_terminated_length": 752.6017761230469, "completions/min_length": 
338.5, "completions/min_terminated_length": 338.5, "epoch": 0.2075, "grad_norm": 0.36007705330848694, "kl": 0.0279541015625, "learning_rate": 9.689751171381377e-07, "loss": 0.2508, "num_tokens": 39103269.0, "reward": 0.7062208205461502, "reward_std": 0.5385962128639221, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372900903224945, "rewards/penalized_accuracy_reward/std": 0.24104924499988556, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.24828975647687912, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1760.75, "completions/max_terminated_length": 1466.75, "completions/mean_length": 1261.25, "completions/mean_terminated_length": 880.0221252441406, "completions/min_length": 591.5, "completions/min_terminated_length": 591.5, "epoch": 0.208, "grad_norm": 0.31072908639907837, "kl": 0.0226287841796875, "learning_rate": 9.68687905591911e-07, "loss": 0.2452, "num_tokens": 39196629.0, "reward": 0.34765625, "reward_std": 0.13561241701245308, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.27122484520077705, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1536.5, "completions/mean_length": 1238.046875, "completions/mean_terminated_length": 800.6479187011719, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.2085, "grad_norm": 0.38012444972991943, "kl": 0.0347900390625, "learning_rate": 9.683994186497132e-07, "loss": 0.3612, "num_tokens": 39286088.0, "reward": 0.5033224374055862, "reward_std": 0.34706101939082146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.07490340620279312, "rewards/penalized_accuracy_reward/std": 0.09987132251262665, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.3248573876917362, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1594.25, "completions/mean_length": 762.78125, "completions/mean_terminated_length": 628.5513153076172, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.209, "grad_norm": 0.6349743604660034, "kl": 0.0380859375, "learning_rate": 9.681096571903252e-07, "loss": 0.5377, "num_tokens": 39343434.0, "reward": 0.45703125, "reward_std": 0.11322538927197456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22645078226923943, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1866.25, "completions/max_terminated_length": 1435.75, "completions/mean_length": 992.75, "completions/mean_terminated_length": 713.6596298217773, "completions/min_length": 286.25, "completions/min_terminated_length": 286.25, "epoch": 0.2095, "grad_norm": 0.3428170680999756, "kl": 0.024139404296875, "learning_rate": 9.67818622096411e-07, "loss": 0.3782, "num_tokens": 39416314.0, "reward": 0.4521973431110382, "reward_std": 0.2584071718156338, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024926796555519104, "rewards/penalized_accuracy_reward/std": 0.06811299920082092, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2580215558409691, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1875.5, "completions/max_terminated_length": 
1419.25, "completions/mean_length": 1194.375, "completions/mean_terminated_length": 810.7370758056641, "completions/min_length": 384.5, "completions/min_terminated_length": 384.5, "epoch": 0.21, "grad_norm": 0.3572635054588318, "kl": 0.03814697265625, "learning_rate": 9.67526314254514e-07, "loss": 0.2157, "num_tokens": 39504050.0, "reward": 0.6398703157901764, "reward_std": 0.40730760991573334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13731797970831394, "rewards/penalized_accuracy_reward/std": 0.14975717291235924, "rewards/tag_count_reward/mean": 0.73046875, "rewards/tag_count_reward/std": 0.2892516627907753, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1979.25, "completions/max_terminated_length": 1676.25, "completions/mean_length": 1061.390625, "completions/mean_terminated_length": 814.3823089599609, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.2105, "grad_norm": 0.30255386233329773, "kl": 0.025604248046875, "learning_rate": 9.672327345550543e-07, "loss": 0.1321, "num_tokens": 39580571.0, "reward": 0.600222036242485, "reward_std": 0.4227828234434128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09991570562124252, "rewards/penalized_accuracy_reward/std": 0.1787346825003624, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.23327182978391647, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1650.5, "completions/mean_length": 1081.609375, "completions/mean_terminated_length": 905.4429016113281, "completions/min_length": 395.25, "completions/min_terminated_length": 395.25, "epoch": 0.211, "grad_norm": 0.3869238495826721, "kl": 0.033203125, "learning_rate": 9.669378838923267e-07, "loss": 0.2744, "num_tokens": 
39659746.0, "reward": 0.7692378461360931, "reward_std": 0.5143285058438778, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17465798556804657, "rewards/penalized_accuracy_reward/std": 0.20453858375549316, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.2819667160511017, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2000.25, "completions/max_terminated_length": 1663.75, "completions/mean_length": 1000.359375, "completions/mean_terminated_length": 895.6854858398438, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.2115, "grad_norm": 0.3456454277038574, "kl": 0.0211181640625, "learning_rate": 9.666417631644976e-07, "loss": 0.1852, "num_tokens": 39736777.0, "reward": 0.5103371441364288, "reward_std": 0.3020637705922127, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037395138293504715, "rewards/penalized_accuracy_reward/std": 0.11798861622810364, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.21588897705078125, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1716.75, "completions/max_terminated_length": 1190.0, "completions/mean_length": 789.859375, "completions/mean_terminated_length": 644.7145767211914, "completions/min_length": 197.25, "completions/min_terminated_length": 197.25, "epoch": 0.212, "grad_norm": 0.5325435996055603, "kl": 0.031768798828125, "learning_rate": 9.66344373273602e-07, "loss": 0.4201, "num_tokens": 39793968.0, "reward": 0.43359375, "reward_std": 0.11157581396400928, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 
0.22315163351595402, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 1330.15625, "completions/mean_terminated_length": 919.4500274658203, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.2125, "grad_norm": 0.3549953103065491, "kl": 0.0226287841796875, "learning_rate": 9.66045715125541e-07, "loss": 0.2157, "num_tokens": 39890362.0, "reward": 0.5756122767925262, "reward_std": 0.3913433887064457, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12472020089626312, "rewards/penalized_accuracy_reward/std": 0.15211819857358932, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.23645170032978058, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1580.75, "completions/mean_length": 1080.609375, "completions/mean_terminated_length": 759.7526702880859, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.213, "grad_norm": 0.36675506830215454, "kl": 0.029998779296875, "learning_rate": 9.657457896300791e-07, "loss": 0.4029, "num_tokens": 39967505.0, "reward": 0.37109375, "reward_std": 0.1577535904943943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.3155071847140789, "step": 426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.25, "completions/mean_length": 1429.921875, "completions/mean_terminated_length": 953.3062591552734, "completions/min_length": 537.5, "completions/min_terminated_length": 537.5, "epoch": 
0.2135, "grad_norm": 0.31131941080093384, "kl": 0.025634765625, "learning_rate": 9.654445977008414e-07, "loss": 0.2219, "num_tokens": 40070924.0, "reward": 0.45298346877098083, "reward_std": 0.3121415078639984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242923066020012, "rewards/penalized_accuracy_reward/std": 0.0956343337893486, "rewards/tag_count_reward/mean": 0.65625, "rewards/tag_count_reward/std": 0.2723150812089443, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1825.75, "completions/max_terminated_length": 1709.75, "completions/mean_length": 1170.671875, "completions/mean_terminated_length": 882.0717926025391, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.214, "grad_norm": 0.3652254343032837, "kl": 0.034759521484375, "learning_rate": 9.651421402553108e-07, "loss": 0.2636, "num_tokens": 40158423.0, "reward": 0.359375, "reward_std": 0.14537077397108078, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.29074155166745186, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1707.5, "completions/max_terminated_length": 1273.25, "completions/mean_length": 834.78125, "completions/mean_terminated_length": 710.295280456543, "completions/min_length": 418.25, "completions/min_terminated_length": 418.25, "epoch": 0.2145, "grad_norm": 0.4027421176433563, "kl": 0.0286865234375, "learning_rate": 9.648384182148252e-07, "loss": 0.2501, "num_tokens": 40220841.0, "reward": 0.6758059859275818, "reward_std": 0.2792086023837328, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123170554637909, 
"rewards/penalized_accuracy_reward/std": 0.10230289399623871, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.18433516845107079, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1652.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 917.1875, "completions/mean_terminated_length": 725.9352722167969, "completions/min_length": 325.5, "completions/min_terminated_length": 325.5, "epoch": 0.215, "grad_norm": 0.5000618100166321, "kl": 0.034698486328125, "learning_rate": 9.645334325045745e-07, "loss": 0.1372, "num_tokens": 40288389.0, "reward": 0.4791986644268036, "reward_std": 0.24917936325073242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037450890988111496, "rewards/penalized_accuracy_reward/std": 0.0805170014500618, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.2393437847495079, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2027.0, "completions/max_terminated_length": 1595.5, "completions/mean_length": 924.703125, "completions/mean_terminated_length": 758.0197143554688, "completions/min_length": 273.5, "completions/min_terminated_length": 273.5, "epoch": 0.2155, "grad_norm": 0.5540999174118042, "kl": 0.02764892578125, "learning_rate": 9.64227184053598e-07, "loss": 0.3782, "num_tokens": 40355298.0, "reward": 0.5736332386732101, "reward_std": 0.2863515578210354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490255683660507, "rewards/penalized_accuracy_reward/std": 0.09987016022205353, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.238564595580101, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1655.25, 
"completions/mean_length": 1194.578125, "completions/mean_terminated_length": 852.4980621337891, "completions/min_length": 398.75, "completions/min_terminated_length": 398.75, "epoch": 0.216, "grad_norm": 2.1682419776916504, "kl": 0.062103271484375, "learning_rate": 9.63919673794782e-07, "loss": 0.4346, "num_tokens": 40439591.0, "reward": 0.37109375, "reward_std": 0.16048385202884674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.3209677189588547, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1675.75, "completions/mean_length": 1163.75, "completions/mean_terminated_length": 856.9556274414062, "completions/min_length": 316.75, "completions/min_terminated_length": 316.75, "epoch": 0.2165, "grad_norm": 0.294877290725708, "kl": 0.025054931640625, "learning_rate": 9.636109026648554e-07, "loss": 0.3065, "num_tokens": 40524551.0, "reward": 0.4498949646949768, "reward_std": 0.28856784477829933, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374474860727787, "rewards/penalized_accuracy_reward/std": 0.08050968497991562, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.3220038563013077, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1448.25, "completions/max_terminated_length": 1294.0, "completions/mean_length": 981.390625, "completions/mean_terminated_length": 849.6316986083984, "completions/min_length": 631.25, "completions/min_terminated_length": 631.25, "epoch": 0.217, "grad_norm": 0.39499393105506897, "kl": 0.02655029296875, "learning_rate": 9.633008716043892e-07, "loss": 0.1959, "num_tokens": 40594864.0, "reward": 0.5766027122735977, 
"reward_std": 0.3730950728058815, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0998248066753149, "rewards/penalized_accuracy_reward/std": 0.15222786739468575, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.18771917559206486, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1625.5, "completions/max_terminated_length": 1330.75, "completions/mean_length": 1113.375, "completions/mean_terminated_length": 885.2675628662109, "completions/min_length": 362.75, "completions/min_terminated_length": 362.75, "epoch": 0.2175, "grad_norm": 0.466012567281723, "kl": 0.026641845703125, "learning_rate": 9.629895815577915e-07, "loss": 0.2076, "num_tokens": 40673752.0, "reward": 0.7578972429037094, "reward_std": 0.3447970859706402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17484704591333866, "rewards/penalized_accuracy_reward/std": 0.13039272651076317, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.21924801915884018, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1447.25, "completions/mean_length": 1159.453125, "completions/mean_terminated_length": 757.1557846069336, "completions/min_length": 299.5, "completions/min_terminated_length": 299.5, "epoch": 0.218, "grad_norm": 0.6680520176887512, "kl": 0.03436279296875, "learning_rate": 9.626770334733058e-07, "loss": 0.4444, "num_tokens": 40757333.0, "reward": 0.3671875, "reward_std": 0.15393556281924248, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.30787112936377525, "step": 436 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.203125, "completions/max_length": 1534.0, "completions/max_terminated_length": 1072.5, "completions/mean_length": 868.859375, "completions/mean_terminated_length": 649.5758972167969, "completions/min_length": 342.75, "completions/min_terminated_length": 342.75, "epoch": 0.2185, "grad_norm": 0.4799414873123169, "kl": 0.0259246826171875, "learning_rate": 9.623632283030077e-07, "loss": 0.2828, "num_tokens": 40820908.0, "reward": 0.7926861941814423, "reward_std": 0.4901247061789036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18735872209072113, "rewards/penalized_accuracy_reward/std": 0.2023119404911995, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.1991802491247654, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1736.0, "completions/mean_length": 1344.84375, "completions/mean_terminated_length": 1135.1968994140625, "completions/min_length": 716.75, "completions/min_terminated_length": 716.75, "epoch": 0.219, "grad_norm": 0.31321409344673157, "kl": 0.0264739990234375, "learning_rate": 9.620481670028026e-07, "loss": 0.2958, "num_tokens": 40917842.0, "reward": 0.6306563019752502, "reward_std": 0.33766594156622887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12489844858646393, "rewards/penalized_accuracy_reward/std": 0.09991876780986786, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.3062558099627495, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1396.75, "completions/max_terminated_length": 970.0, "completions/mean_length": 708.40625, "completions/mean_terminated_length": 536.0798797607422, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2195, 
"grad_norm": 0.4679841697216034, "kl": 0.04095458984375, "learning_rate": 9.617318505324212e-07, "loss": 0.1839, "num_tokens": 40972828.0, "reward": 0.8619801998138428, "reward_std": 0.4970371685922146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2122400924563408, "rewards/penalized_accuracy_reward/std": 0.2021806612610817, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.23239529132843018, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 503.84375, "completions/mean_terminated_length": 503.84375, "completions/min_length": 266.5, "completions/min_terminated_length": 266.5, "epoch": 0.22, "grad_norm": 0.5141251683235168, "kl": 0.0303955078125, "learning_rate": 9.614142798554186e-07, "loss": 0.1411, "num_tokens": 41014642.0, "reward": 0.6861908137798309, "reward_std": 0.31635836511850357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09993134532123804, "rewards/penalized_accuracy_reward/std": 0.15237163752317429, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09649410098791122, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2012.75, "completions/max_terminated_length": 1773.0, "completions/mean_length": 975.265625, "completions/mean_terminated_length": 730.0933227539062, "completions/min_length": 217.75, "completions/min_terminated_length": 217.75, "epoch": 0.2205, "grad_norm": 0.44786131381988525, "kl": 0.04852294921875, "learning_rate": 9.610954559391704e-07, "loss": 0.2987, "num_tokens": 41089171.0, "reward": 0.5506681799888611, "reward_std": 0.37887803465127945, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06244347058236599, 
"rewards/penalized_accuracy_reward/std": 0.1488097533583641, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.23361961916089058, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1762.25, "completions/max_terminated_length": 1742.5, "completions/mean_length": 1050.828125, "completions/mean_terminated_length": 795.6684112548828, "completions/min_length": 228.75, "completions/min_terminated_length": 228.75, "epoch": 0.221, "grad_norm": 0.31348466873168945, "kl": 0.024810791015625, "learning_rate": 9.607753797548691e-07, "loss": 0.1287, "num_tokens": 41166392.0, "reward": 0.655846118927002, "reward_std": 0.4201808273792267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12479805573821068, "rewards/penalized_accuracy_reward/std": 0.17134232074022293, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.2169011253863573, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 982.46875, "completions/mean_terminated_length": 837.205078125, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.2215, "grad_norm": 0.3584749400615692, "kl": 0.0214080810546875, "learning_rate": 9.604540522775227e-07, "loss": 0.2627, "num_tokens": 41236006.0, "reward": 0.464347705245018, "reward_std": 0.20539111271500587, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012447291985154152, "rewards/penalized_accuracy_reward/std": 0.04978916794061661, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.259061973541975, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1352.5, "completions/max_terminated_length": 1337.25, 
"completions/mean_length": 909.625, "completions/mean_terminated_length": 798.8125, "completions/min_length": 376.75, "completions/min_terminated_length": 376.75, "epoch": 0.222, "grad_norm": 0.4456019699573517, "kl": 0.042327880859375, "learning_rate": 9.601314744859504e-07, "loss": 0.0153, "num_tokens": 41303982.0, "reward": 0.5388699471950531, "reward_std": 0.3533320985734463, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0624037254601717, "rewards/penalized_accuracy_reward/std": 0.148702472448349, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.1419338323175907, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1901.25, "completions/max_terminated_length": 1621.5, "completions/mean_length": 942.78125, "completions/mean_terminated_length": 856.5138854980469, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.2225, "grad_norm": 0.3707921504974365, "kl": 0.023651123046875, "learning_rate": 9.598076473627796e-07, "loss": 0.1666, "num_tokens": 41372000.0, "reward": 0.9236186593770981, "reward_std": 0.5785801559686661, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.23719994351267815, "rewards/penalized_accuracy_reward/std": 0.25913015753030777, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.1897125169634819, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1881.25, "completions/max_terminated_length": 1572.5, "completions/mean_length": 1007.9375, "completions/mean_terminated_length": 792.5811157226562, "completions/min_length": 366.75, "completions/min_terminated_length": 366.75, "epoch": 0.223, "grad_norm": 0.45830240845680237, "kl": 0.029205322265625, "learning_rate": 9.594825718944444e-07, "loss": 0.2918, "num_tokens": 41446972.0, 
"reward": 0.5002773106098175, "reward_std": 0.2988961674273014, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994334280490875, "rewards/penalized_accuracy_reward/std": 0.08934137970209122, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.3021021708846092, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1759.5, "completions/max_terminated_length": 1291.25, "completions/mean_length": 792.53125, "completions/mean_terminated_length": 712.3263549804688, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.2235, "grad_norm": 0.5028854608535767, "kl": 0.02984619140625, "learning_rate": 9.59156249071181e-07, "loss": 0.3317, "num_tokens": 41508670.0, "reward": 0.4453125, "reward_std": 0.1090052630752325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2180105298757553, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1751.75, "completions/max_terminated_length": 1396.0, "completions/mean_length": 957.75, "completions/mean_terminated_length": 761.8963623046875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.224, "grad_norm": 0.3664565086364746, "kl": 0.02972412109375, "learning_rate": 9.588286798870248e-07, "loss": 0.303, "num_tokens": 41579310.0, "reward": 0.419921875, "reward_std": 0.11295298486948013, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.225905979052186, "step": 448 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.0625, "completions/max_length": 1681.25, "completions/max_terminated_length": 1611.75, "completions/mean_length": 912.03125, "completions/mean_terminated_length": 848.9159698486328, "completions/min_length": 321.25, "completions/min_terminated_length": 321.25, "epoch": 0.2245, "grad_norm": 0.29374292492866516, "kl": 0.032501220703125, "learning_rate": 9.58499865339809e-07, "loss": 0.1402, "num_tokens": 41647088.0, "reward": 0.4780885875225067, "reward_std": 0.17358159087598324, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012481792829930782, "rewards/penalized_accuracy_reward/std": 0.04992717504501343, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.18164785578846931, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1902.5, "completions/max_terminated_length": 1351.5, "completions/mean_length": 749.046875, "completions/mean_terminated_length": 578.8802261352539, "completions/min_length": 256.25, "completions/min_terminated_length": 256.25, "epoch": 0.225, "grad_norm": 0.4321780502796173, "kl": 0.035186767578125, "learning_rate": 9.581698064311592e-07, "loss": 0.3905, "num_tokens": 41703587.0, "reward": 0.435546875, "reward_std": 0.10439721681177616, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.20879444107413292, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1849.75, "completions/max_terminated_length": 1345.5, "completions/mean_length": 1119.421875, "completions/mean_terminated_length": 885.2424621582031, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "epoch": 0.2255, "grad_norm": 0.3323290944099426, "kl": 
0.0197296142578125, "learning_rate": 9.578385041664925e-07, "loss": 0.2204, "num_tokens": 41789918.0, "reward": 0.7975766211748123, "reward_std": 0.5768356919288635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1995695624500513, "rewards/penalized_accuracy_reward/std": 0.24419691413640976, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.21700285375118256, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.75, "completions/mean_length": 1025.28125, "completions/mean_terminated_length": 855.6998291015625, "completions/min_length": 386.75, "completions/min_terminated_length": 386.75, "epoch": 0.226, "grad_norm": 0.48869588971138, "kl": 0.0294036865234375, "learning_rate": 9.575059595550127e-07, "loss": 0.2971, "num_tokens": 41864256.0, "reward": 0.5783930420875549, "reward_std": 0.31277999468147755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09974340349435806, "rewards/penalized_accuracy_reward/std": 0.10301460325717926, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.2386866696178913, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 1978.0, "completions/max_terminated_length": 1594.25, "completions/mean_length": 1492.3125, "completions/mean_terminated_length": 1045.1753845214844, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.2265, "grad_norm": 0.39390888810157776, "kl": 0.0230560302734375, "learning_rate": 9.571721736097088e-07, "loss": 0.0585, "num_tokens": 41975188.0, "reward": 0.317952923476696, "reward_std": 0.24423889070749283, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492086738348007, 
"rewards/penalized_accuracy_reward/std": 0.04996835067868233, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.3044489622116089, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1737.75, "completions/max_terminated_length": 1350.75, "completions/mean_length": 824.265625, "completions/mean_terminated_length": 620.4919891357422, "completions/min_length": 220.5, "completions/min_terminated_length": 220.5, "epoch": 0.227, "grad_norm": 0.5511372089385986, "kl": 0.035369873046875, "learning_rate": 9.568371473473503e-07, "loss": 0.3257, "num_tokens": 42035093.0, "reward": 0.6887825429439545, "reward_std": 0.45815616101026535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13736002147197723, "rewards/penalized_accuracy_reward/std": 0.1917218565940857, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.23162559047341347, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1685.75, "completions/max_terminated_length": 1321.0, "completions/mean_length": 950.234375, "completions/mean_terminated_length": 681.5085296630859, "completions/min_length": 181.5, "completions/min_terminated_length": 181.5, "epoch": 0.2275, "grad_norm": 0.5129905939102173, "kl": 0.044677734375, "learning_rate": 9.565008817884854e-07, "loss": 0.2987, "num_tokens": 42105492.0, "reward": 0.5100703239440918, "reward_std": 0.3630083464086056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049957030452787876, "rewards/penalized_accuracy_reward/std": 0.13050727918744087, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2666747123003006, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1318.5, "completions/max_terminated_length": 1070.75, 
"completions/mean_length": 564.203125, "completions/mean_terminated_length": 542.8448028564453, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.228, "grad_norm": 0.4943152964115143, "kl": 0.033233642578125, "learning_rate": 9.561633779574372e-07, "loss": 0.2116, "num_tokens": 42149345.0, "reward": 0.8530125021934509, "reward_std": 0.44692953675985336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18724843859672546, "rewards/penalized_accuracy_reward/std": 0.20547406375408173, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11065824143588543, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1505.75, "completions/mean_length": 1111.703125, "completions/mean_terminated_length": 833.9873199462891, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.2285, "grad_norm": 0.3858027458190918, "kl": 0.02685546875, "learning_rate": 9.55824636882301e-07, "loss": 0.363, "num_tokens": 42228142.0, "reward": 0.3984375, "reward_std": 0.15426568128168583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.30853137001395226, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1811.25, "completions/max_terminated_length": 1792.0, "completions/mean_length": 1005.15625, "completions/mean_terminated_length": 965.0601196289062, "completions/min_length": 378.5, "completions/min_terminated_length": 378.5, "epoch": 0.229, "grad_norm": 0.3331598937511444, "kl": 0.0278472900390625, "learning_rate": 9.554846595949413e-07, "loss": 0.2084, "num_tokens": 42300424.0, "reward": 0.787621945142746, 
"reward_std": 0.3914083205163479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16236566007137299, "rewards/penalized_accuracy_reward/std": 0.16391247510910034, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17184823006391525, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1768.25, "completions/max_terminated_length": 1464.25, "completions/mean_length": 1143.9375, "completions/mean_terminated_length": 907.0265045166016, "completions/min_length": 409.75, "completions/min_terminated_length": 409.75, "epoch": 0.2295, "grad_norm": 0.28830838203430176, "kl": 0.0251007080078125, "learning_rate": 9.55143447130987e-07, "loss": 0.1908, "num_tokens": 42385956.0, "reward": 0.7749680429697037, "reward_std": 0.33458903804421425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.174593398347497, "rewards/penalized_accuracy_reward/std": 0.1302623301744461, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2030156198889017, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1689.25, "completions/max_terminated_length": 1610.25, "completions/mean_length": 851.296875, "completions/mean_terminated_length": 782.0620422363281, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.23, "grad_norm": 0.41608354449272156, "kl": 0.028228759765625, "learning_rate": 9.54801000529831e-07, "loss": 0.1713, "num_tokens": 42449111.0, "reward": 0.831436276435852, "reward_std": 0.500624056905508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18720251321792603, "rewards/penalized_accuracy_reward/std": 0.2259065881371498, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18303454853594303, 
"step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1323.5, "completions/mean_terminated_length": 1174.524429321289, "completions/min_length": 761.25, "completions/min_terminated_length": 761.25, "epoch": 0.2305, "grad_norm": 0.3803155720233917, "kl": 0.02618408203125, "learning_rate": 9.54457320834625e-07, "loss": 0.225, "num_tokens": 42543367.0, "reward": 0.36090368404984474, "reward_std": 0.2091497015208006, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012483092024922371, "rewards/penalized_accuracy_reward/std": 0.049932368099689484, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.21856993064284325, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1580.75, "completions/mean_length": 1353.75, "completions/mean_terminated_length": 1086.1344909667969, "completions/min_length": 567.25, "completions/min_terminated_length": 567.25, "epoch": 0.231, "grad_norm": 0.34130391478538513, "kl": 0.032867431640625, "learning_rate": 9.54112409092277e-07, "loss": 0.2695, "num_tokens": 42639703.0, "reward": 0.376953125, "reward_std": 0.14595923945307732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.29191848635673523, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1378.75, "completions/mean_length": 1400.0, "completions/mean_terminated_length": 807.70556640625, "completions/min_length": 355.5, "completions/min_terminated_length": 355.5, "epoch": 0.2315, "grad_norm": 
0.2781061828136444, "kl": 0.029144287109375, "learning_rate": 9.537662663534477e-07, "loss": 0.1723, "num_tokens": 42739303.0, "reward": 0.4988299608230591, "reward_std": 0.45309434831142426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08730561099946499, "rewards/penalized_accuracy_reward/std": 0.19539431110024452, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.28521962836384773, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1855.75, "completions/max_terminated_length": 1412.0, "completions/mean_length": 903.34375, "completions/mean_terminated_length": 802.9633331298828, "completions/min_length": 314.5, "completions/min_terminated_length": 314.5, "epoch": 0.232, "grad_norm": 0.5950891375541687, "kl": 0.0279541015625, "learning_rate": 9.534188936725483e-07, "loss": 0.3139, "num_tokens": 42805821.0, "reward": 1.0252284705638885, "reward_std": 0.6646209508180618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2870282679796219, "rewards/penalized_accuracy_reward/std": 0.3052823692560196, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20251959562301636, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1918.75, "completions/max_terminated_length": 1726.75, "completions/mean_length": 999.796875, "completions/mean_terminated_length": 846.3947296142578, "completions/min_length": 302.25, "completions/min_terminated_length": 302.25, "epoch": 0.2325, "grad_norm": 0.38951560854911804, "kl": 0.026123046875, "learning_rate": 9.530702921077358e-07, "loss": 0.3248, "num_tokens": 42878752.0, "reward": 0.427734375, "reward_std": 0.12955566681921482, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.25911133736371994, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1580.5, "completions/mean_length": 1223.828125, "completions/mean_terminated_length": 965.0674133300781, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.233, "grad_norm": 0.38255178928375244, "kl": 0.030548095703125, "learning_rate": 9.527204627209112e-07, "loss": 0.3107, "num_tokens": 42966869.0, "reward": 0.39996086061000824, "reward_std": 0.21615727618336678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480432167649269, "rewards/penalized_accuracy_reward/std": 0.049921728670597076, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.3018973842263222, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1522.5, "completions/mean_length": 1250.484375, "completions/mean_terminated_length": 989.8575210571289, "completions/min_length": 568.5, "completions/min_terminated_length": 568.5, "epoch": 0.2335, "grad_norm": 0.5170333385467529, "kl": 0.03448486328125, "learning_rate": 9.523694065777156e-07, "loss": 0.3332, "num_tokens": 43055524.0, "reward": 0.7613286674022675, "reward_std": 0.4574275203049183, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18730496242642403, "rewards/penalized_accuracy_reward/std": 0.16975539177656174, "rewards/tag_count_reward/mean": 0.7734375, "rewards/tag_count_reward/std": 0.29692745208740234, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1517.75, "completions/max_terminated_length": 1314.25, "completions/mean_length": 
1009.25, "completions/mean_terminated_length": 866.6562652587891, "completions/min_length": 485.5, "completions/min_terminated_length": 485.5, "epoch": 0.234, "grad_norm": 0.36454808712005615, "kl": 0.0287628173828125, "learning_rate": 9.520171247475268e-07, "loss": 0.0304, "num_tokens": 43129892.0, "reward": 0.5598839968442917, "reward_std": 0.33036769926548004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07486387528479099, "rewards/penalized_accuracy_reward/std": 0.1455094926059246, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.16711556911468506, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1687.25, "completions/mean_length": 1331.296875, "completions/mean_terminated_length": 1092.2577514648438, "completions/min_length": 396.25, "completions/min_terminated_length": 396.25, "epoch": 0.2345, "grad_norm": 0.3107975721359253, "kl": 0.028045654296875, "learning_rate": 9.516636183034564e-07, "loss": 0.2163, "num_tokens": 43225831.0, "reward": 0.42301490902900696, "reward_std": 0.2664007879793644, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498401142656803, "rewards/penalized_accuracy_reward/std": 0.06826931983232498, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.30644822865724564, "step": 469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1720.5, "completions/max_terminated_length": 1492.75, "completions/mean_length": 1120.984375, "completions/mean_terminated_length": 895.51806640625, "completions/min_length": 307.5, "completions/min_terminated_length": 307.5, "epoch": 0.235, "grad_norm": 0.3575640618801117, "kl": 0.0294189453125, "learning_rate": 9.513088883223463e-07, "loss": 0.1809, "num_tokens": 43305990.0, "reward": 
0.392578125, "reward_std": 0.1392986923456192, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.278597392141819, "step": 470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.75, "completions/mean_length": 1089.546875, "completions/mean_terminated_length": 810.7654113769531, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.2355, "grad_norm": 0.36710309982299805, "kl": 0.02655029296875, "learning_rate": 9.509529358847654e-07, "loss": 0.4061, "num_tokens": 43384985.0, "reward": 0.4636413902044296, "reward_std": 0.2948217839002609, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748475760221481, "rewards/penalized_accuracy_reward/std": 0.08058980852365494, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.3136255294084549, "step": 471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1592.25, "completions/max_terminated_length": 1307.0, "completions/mean_length": 857.1875, "completions/mean_terminated_length": 764.4739227294922, "completions/min_length": 372.75, "completions/min_terminated_length": 372.75, "epoch": 0.236, "grad_norm": 0.5168818235397339, "kl": 0.03045654296875, "learning_rate": 9.505957620750069e-07, "loss": 0.202, "num_tokens": 43446277.0, "reward": 0.7638278901576996, "reward_std": 0.47502472437918186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1621873825788498, "rewards/penalized_accuracy_reward/std": 0.20207467675209045, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2112439814954996, "step": 472 }, 
{ "clip_ratio": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 1540.078125, "completions/mean_terminated_length": 1141.5731201171875, "completions/min_length": 499.75, "completions/min_terminated_length": 499.75, "epoch": 0.2365, "grad_norm": 0.30937460064888, "kl": 0.0207366943359375, "learning_rate": 9.502373679810839e-07, "loss": 0.2331, "num_tokens": 43555018.0, "reward": 0.333984375, "reward_std": 0.15568212792277336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.3113642632961273, "step": 473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.5, "completions/mean_length": 1111.703125, "completions/mean_terminated_length": 942.0870513916016, "completions/min_length": 348.5, "completions/min_terminated_length": 348.5, "epoch": 0.237, "grad_norm": 0.35363584756851196, "kl": 0.031585693359375, "learning_rate": 9.49877754694727e-07, "loss": 0.2592, "num_tokens": 43635591.0, "reward": 0.5603957772254944, "reward_std": 0.3667305577546358, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242445111274719, "rewards/penalized_accuracy_reward/std": 0.1487545445561409, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.25384292379021645, "step": 474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.75, "completions/mean_length": 1057.328125, "completions/mean_terminated_length": 866.5067596435547, "completions/min_length": 308.75, "completions/min_terminated_length": 308.75, "epoch": 0.2375, "grad_norm": 
0.31713545322418213, "kl": 0.0286865234375, "learning_rate": 9.495169233113806e-07, "loss": 0.3308, "num_tokens": 43710348.0, "reward": 0.44686421751976013, "reward_std": 0.21363577619194984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249460969120264, "rewards/penalized_accuracy_reward/std": 0.04997844249010086, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.28053389489650726, "step": 475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1542.25, "completions/max_terminated_length": 1146.0, "completions/mean_length": 709.71875, "completions/mean_terminated_length": 648.8305206298828, "completions/min_length": 298.25, "completions/min_terminated_length": 298.25, "epoch": 0.238, "grad_norm": 0.41687923669815063, "kl": 0.028717041015625, "learning_rate": 9.491548749301997e-07, "loss": 0.168, "num_tokens": 43764138.0, "reward": 0.7275604456663132, "reward_std": 0.3774439934641123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13721772097051144, "rewards/penalized_accuracy_reward/std": 0.17039547115564346, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1463961210101843, "step": 476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1422.25, "completions/max_terminated_length": 1221.0, "completions/mean_length": 780.453125, "completions/mean_terminated_length": 663.8757629394531, "completions/min_length": 290.25, "completions/min_terminated_length": 290.25, "epoch": 0.2385, "grad_norm": 0.3906925320625305, "kl": 0.029266357421875, "learning_rate": 9.487916106540465e-07, "loss": 0.1924, "num_tokens": 43827735.0, "reward": 0.7676266729831696, "reward_std": 0.42826220393180847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.16213364899158478, "rewards/penalized_accuracy_reward/std": 0.180245041847229, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2007351517677307, "step": 477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1498.75, "completions/mean_length": 1002.890625, "completions/mean_terminated_length": 722.9907379150391, "completions/min_length": 314.75, "completions/min_terminated_length": 314.75, "epoch": 0.239, "grad_norm": 0.3514080047607422, "kl": 0.035888671875, "learning_rate": 9.484271315894871e-07, "loss": 0.2396, "num_tokens": 43901056.0, "reward": 0.7077384293079376, "reward_std": 0.5901942290365696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14976764470338821, "rewards/penalized_accuracy_reward/std": 0.2510628327727318, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.2613547705113888, "step": 478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1350.25, "completions/mean_length": 781.265625, "completions/mean_terminated_length": 583.2026138305664, "completions/min_length": 253.75, "completions/min_terminated_length": 253.75, "epoch": 0.2395, "grad_norm": 0.5105019211769104, "kl": 0.03900146484375, "learning_rate": 9.480614388467877e-07, "loss": 0.4599, "num_tokens": 43960721.0, "reward": 0.4507528394460678, "reward_std": 0.1978018879890442, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485794723033905, "rewards/penalized_accuracy_reward/std": 0.04994317889213562, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.26342107728123665, "step": 479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1731.0, 
"completions/max_terminated_length": 1308.5, "completions/mean_length": 1190.375, "completions/mean_terminated_length": 817.9823150634766, "completions/min_length": 444.5, "completions/min_terminated_length": 444.5, "epoch": 0.24, "grad_norm": 0.36634913086891174, "kl": 0.0222625732421875, "learning_rate": 9.47694533539912e-07, "loss": 0.3194, "num_tokens": 44044617.0, "reward": 0.6105945110321045, "reward_std": 0.3287196382880211, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12463319301605225, "rewards/penalized_accuracy_reward/std": 0.09970657527446747, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.2586130052804947, "step": 480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1720.25, "completions/max_terminated_length": 1332.75, "completions/mean_length": 817.0, "completions/mean_terminated_length": 697.9732208251953, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.2405, "grad_norm": 0.49183744192123413, "kl": 0.04119873046875, "learning_rate": 9.473264167865171e-07, "loss": 0.1903, "num_tokens": 44107033.0, "reward": 0.5006636679172516, "reward_std": 0.2572653293609619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744120895862579, "rewards/penalized_accuracy_reward/std": 0.08049635589122772, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.21844712644815445, "step": 481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1814.0, "completions/max_terminated_length": 1434.25, "completions/mean_length": 966.375, "completions/mean_terminated_length": 781.6823272705078, "completions/min_length": 310.5, "completions/min_terminated_length": 310.5, "epoch": 0.241, "grad_norm": 0.39228013157844543, "kl": 0.027740478515625, "learning_rate": 
9.469570897079504e-07, "loss": 0.2606, "num_tokens": 44178001.0, "reward": 0.76955446600914, "reward_std": 0.3516606502234936, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17481629457324743, "rewards/penalized_accuracy_reward/std": 0.13050509616732597, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.23439474403858185, "step": 482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1703.75, "completions/max_terminated_length": 1383.25, "completions/mean_length": 978.296875, "completions/mean_terminated_length": 757.9997253417969, "completions/min_length": 385.25, "completions/min_terminated_length": 385.25, "epoch": 0.2415, "grad_norm": 0.44730135798454285, "kl": 0.0347900390625, "learning_rate": 9.465865534292464e-07, "loss": 0.223, "num_tokens": 44250884.0, "reward": 0.7440678477287292, "reward_std": 0.22784298658370972, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17476829886436462, "rewards/penalized_accuracy_reward/std": 0.06822273135185242, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.2452499233186245, "step": 483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1105.5, "completions/mean_length": 943.46875, "completions/mean_terminated_length": 663.6743774414062, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.242, "grad_norm": 0.5081470012664795, "kl": 0.02935791015625, "learning_rate": 9.462148090791228e-07, "loss": 0.3787, "num_tokens": 44320354.0, "reward": 0.45620231330394745, "reward_std": 0.2908473573625088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024976153858006, "rewards/penalized_accuracy_reward/std": 0.099904615432024, 
"rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.30130699649453163, "step": 484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1774.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 897.5, "completions/mean_terminated_length": 773.2146148681641, "completions/min_length": 292.25, "completions/min_terminated_length": 292.25, "epoch": 0.2425, "grad_norm": 0.382176011800766, "kl": 0.02459716796875, "learning_rate": 9.458418577899774e-07, "loss": 0.2617, "num_tokens": 44387794.0, "reward": 0.7981224060058594, "reward_std": 0.5840234123170376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.187147150747478, "rewards/penalized_accuracy_reward/std": 0.2528454177081585, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.24688541144132614, "step": 485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1673.25, "completions/mean_length": 1253.578125, "completions/mean_terminated_length": 924.4028015136719, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.243, "grad_norm": 0.4133341908454895, "kl": 0.0255126953125, "learning_rate": 9.454677006978842e-07, "loss": 0.3947, "num_tokens": 44476119.0, "reward": 0.3828125, "reward_std": 0.1584600694477558, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.3169201537966728, "step": 486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.25, "completions/mean_length": 1570.578125, "completions/mean_terminated_length": 1036.4615478515625, "completions/min_length": 
606.25, "completions/min_terminated_length": 606.25, "epoch": 0.2435, "grad_norm": 0.25319382548332214, "kl": 0.0250244140625, "learning_rate": 9.450923389425911e-07, "loss": 0.2739, "num_tokens": 44586860.0, "reward": 0.296875, "reward_std": 0.16998343169689178, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.33996687084436417, "step": 487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1809.5, "completions/max_terminated_length": 1397.25, "completions/mean_length": 1045.890625, "completions/mean_terminated_length": 785.0317535400391, "completions/min_length": 275.75, "completions/min_terminated_length": 275.75, "epoch": 0.244, "grad_norm": 0.3120822608470917, "kl": 0.031158447265625, "learning_rate": 9.44715773667515e-07, "loss": 0.2621, "num_tokens": 44664741.0, "reward": 0.46008534729480743, "reward_std": 0.2379237338900566, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024964548647403717, "rewards/penalized_accuracy_reward/std": 0.06821613758802414, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.25535421818494797, "step": 488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2023.5, "completions/max_terminated_length": 1712.0, "completions/mean_length": 1281.75, "completions/mean_terminated_length": 883.7125091552734, "completions/min_length": 355.5, "completions/min_terminated_length": 355.5, "epoch": 0.2445, "grad_norm": 0.28489625453948975, "kl": 0.0264892578125, "learning_rate": 9.443380060197385e-07, "loss": 0.3235, "num_tokens": 44756229.0, "reward": 0.405411034822464, "reward_std": 0.26873770356178284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.024971140548586845, "rewards/penalized_accuracy_reward/std": 0.0682341530919075, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.31307991221547127, "step": 489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1751.5, "completions/max_terminated_length": 1474.5, "completions/mean_length": 1081.734375, "completions/mean_terminated_length": 827.4236145019531, "completions/min_length": 321.75, "completions/min_terminated_length": 321.75, "epoch": 0.245, "grad_norm": 0.38721421360969543, "kl": 0.031646728515625, "learning_rate": 9.43959037150008e-07, "loss": 0.2182, "num_tokens": 44835028.0, "reward": 0.39453125, "reward_std": 0.13045918196439743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.26091836765408516, "step": 490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1679.5, "completions/mean_length": 1246.9375, "completions/mean_terminated_length": 990.9790344238281, "completions/min_length": 356.25, "completions/min_terminated_length": 356.25, "epoch": 0.2455, "grad_norm": 0.3524274230003357, "kl": 0.031463623046875, "learning_rate": 9.43578868212728e-07, "loss": 0.3004, "num_tokens": 44925072.0, "reward": 0.45583102107048035, "reward_std": 0.29378190636634827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748582303524017, "rewards/penalized_accuracy_reward/std": 0.0805920958518982, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.2917652949690819, "step": 491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1844.75, 
"completions/max_terminated_length": 1716.75, "completions/mean_length": 1235.40625, "completions/mean_terminated_length": 1005.7792053222656, "completions/min_length": 287.75, "completions/min_terminated_length": 287.75, "epoch": 0.246, "grad_norm": 0.301017701625824, "kl": 0.0236358642578125, "learning_rate": 9.431975003659594e-07, "loss": 0.1963, "num_tokens": 45013994.0, "reward": 0.47334152460098267, "reward_std": 0.34122368320822716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037452008575201035, "rewards/penalized_accuracy_reward/std": 0.11814846098423004, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.27646573446691036, "step": 492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 1072.25, "completions/mean_terminated_length": 984.9715118408203, "completions/min_length": 421.25, "completions/min_terminated_length": 421.25, "epoch": 0.2465, "grad_norm": 0.39848411083221436, "kl": 0.02752685546875, "learning_rate": 9.428149347714143e-07, "loss": 0.2247, "num_tokens": 45092826.0, "reward": 0.86065673828125, "reward_std": 0.2660638578236103, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.199859619140625, "rewards/penalized_accuracy_reward/std": 0.09974831342697144, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17540178075432777, "step": 493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1886.0, "completions/max_terminated_length": 1641.75, "completions/mean_length": 1382.171875, "completions/mean_terminated_length": 1189.3686828613281, "completions/min_length": 611.25, "completions/min_terminated_length": 611.25, "epoch": 0.247, "grad_norm": 0.31201329827308655, "kl": 0.0253753662109375, "learning_rate": 
9.424311725944543e-07, "loss": 0.0635, "num_tokens": 45190981.0, "reward": 0.7816786468029022, "reward_std": 0.528480377048254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19943305850028992, "rewards/penalized_accuracy_reward/std": 0.2253837063908577, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.23772894591093063, "step": 494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1720.25, "completions/max_terminated_length": 1099.75, "completions/mean_length": 943.046875, "completions/mean_terminated_length": 582.0591049194336, "completions/min_length": 220.25, "completions/min_terminated_length": 220.25, "epoch": 0.2475, "grad_norm": 0.4039519727230072, "kl": 0.032379150390625, "learning_rate": 9.420462150040852e-07, "loss": 0.4237, "num_tokens": 45261512.0, "reward": 0.6210158467292786, "reward_std": 0.3422751910984516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11226574331521988, "rewards/penalized_accuracy_reward/std": 0.10225635021924973, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.29782552644610405, "step": 495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1399.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 613.375, "completions/mean_terminated_length": 517.5416793823242, "completions/min_length": 206.5, "completions/min_terminated_length": 206.5, "epoch": 0.248, "grad_norm": 0.7555497288703918, "kl": 0.047821044921875, "learning_rate": 9.416600631729548e-07, "loss": 0.2694, "num_tokens": 45308656.0, "reward": 0.5779440253973007, "reward_std": 0.2609270680695772, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06240951642394066, "rewards/penalized_accuracy_reward/std": 0.09560411423444748, 
"rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.13943768665194511, "step": 496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1765.0, "completions/max_terminated_length": 1474.5, "completions/mean_length": 1028.796875, "completions/mean_terminated_length": 753.5709228515625, "completions/min_length": 271.75, "completions/min_terminated_length": 271.75, "epoch": 0.2485, "grad_norm": 0.4158158600330353, "kl": 0.031463623046875, "learning_rate": 9.412727182773486e-07, "loss": 0.3315, "num_tokens": 45388419.0, "reward": 0.5212394744157791, "reward_std": 0.3168007954955101, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06237754970788956, "rewards/penalized_accuracy_reward/std": 0.09555520862340927, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.28549718484282494, "step": 497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1498.5, "completions/max_terminated_length": 1365.5, "completions/mean_length": 1026.078125, "completions/mean_terminated_length": 809.8756103515625, "completions/min_length": 374.75, "completions/min_terminated_length": 374.75, "epoch": 0.249, "grad_norm": 0.37349170446395874, "kl": 0.032135009765625, "learning_rate": 9.408841814971861e-07, "loss": 0.1741, "num_tokens": 45463080.0, "reward": 0.8154351562261581, "reward_std": 0.6153025776147842, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19970976188778877, "rewards/penalized_accuracy_reward/std": 0.282634012401104, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.22471417486667633, "step": 498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1838.75, "completions/max_terminated_length": 1464.75, "completions/mean_length": 812.875, 
"completions/mean_terminated_length": 686.5808334350586, "completions/min_length": 288.75, "completions/min_terminated_length": 288.75, "epoch": 0.2495, "grad_norm": 0.5707184672355652, "kl": 0.03717041015625, "learning_rate": 9.404944540160177e-07, "loss": 0.1888, "num_tokens": 45525152.0, "reward": 0.5279818624258041, "reward_std": 0.31017621606588364, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037428426556289196, "rewards/penalized_accuracy_reward/std": 0.11808685958385468, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20455737598240376, "step": 499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1856.75, "completions/max_terminated_length": 1654.25, "completions/mean_length": 1154.578125, "completions/mean_terminated_length": 1024.2352447509766, "completions/min_length": 344.25, "completions/min_terminated_length": 344.25, "epoch": 0.25, "grad_norm": 0.37422168254852295, "kl": 0.025238037109375, "learning_rate": 9.401035370210212e-07, "loss": 0.0563, "num_tokens": 45607013.0, "reward": 0.634712815284729, "reward_std": 0.42718055844306946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1122782826423645, "rewards/penalized_accuracy_reward/std": 0.18028707802295685, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2186926994472742, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1805.75, "completions/max_terminated_length": 1501.75, "completions/mean_length": 1133.390625, "completions/mean_terminated_length": 903.8963165283203, "completions/min_length": 355.5, "completions/min_terminated_length": 355.5, "epoch": 0.2505, "grad_norm": 0.47794246673583984, "kl": 0.030181884765625, "learning_rate": 9.397114317029974e-07, "loss": 0.2477, "num_tokens": 45688494.0, "reward": 0.408203125, 
"reward_std": 0.13666973635554314, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.27333947643637657, "step": 501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1939.75, "completions/max_terminated_length": 1665.25, "completions/mean_length": 1127.171875, "completions/mean_terminated_length": 882.7284545898438, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.251, "grad_norm": 0.2843780815601349, "kl": 0.033477783203125, "learning_rate": 9.393181392563669e-07, "loss": 0.1958, "num_tokens": 45768825.0, "reward": 0.4097044765949249, "reward_std": 0.17487933859229088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012469430454075336, "rewards/penalized_accuracy_reward/std": 0.049877721816301346, "rewards/tag_count_reward/mean": 0.76953125, "rewards/tag_count_reward/std": 0.2170758917927742, "step": 502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1756.0, "completions/max_terminated_length": 1472.25, "completions/mean_length": 1156.046875, "completions/mean_terminated_length": 826.1505889892578, "completions/min_length": 334.5, "completions/min_terminated_length": 334.5, "epoch": 0.2515, "grad_norm": 0.3844212293624878, "kl": 0.03192138671875, "learning_rate": 9.38923660879167e-07, "loss": 0.318, "num_tokens": 45854652.0, "reward": 0.43413424491882324, "reward_std": 0.283625278621912, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03737962245941162, "rewards/penalized_accuracy_reward/std": 0.08036378026008606, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.2863175645470619, "step": 503 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1676.5, "completions/mean_length": 1230.125, "completions/mean_terminated_length": 994.0015258789062, "completions/min_length": 472.5, "completions/min_terminated_length": 472.5, "epoch": 0.252, "grad_norm": 0.3334697186946869, "kl": 0.031463623046875, "learning_rate": 9.385279977730472e-07, "loss": 0.2125, "num_tokens": 45941460.0, "reward": 0.7074456810951233, "reward_std": 0.49733349680900574, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16231658682227135, "rewards/penalized_accuracy_reward/std": 0.19878628104925156, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.2683328092098236, "step": 504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.75, "completions/mean_length": 1296.90625, "completions/mean_terminated_length": 1083.245864868164, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.2525, "grad_norm": 0.326815664768219, "kl": 0.023956298828125, "learning_rate": 9.381311511432658e-07, "loss": 0.1972, "num_tokens": 46033902.0, "reward": 0.40583018958568573, "reward_std": 0.22187922336161137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248540636152029, "rewards/penalized_accuracy_reward/std": 0.04994162917137146, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.2952496148645878, "step": 505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1251.71875, "completions/mean_terminated_length": 795.0975494384766, "completions/min_length": 255.75, "completions/min_terminated_length": 255.75, "epoch": 
0.253, "grad_norm": 0.3715771734714508, "kl": 0.02862548828125, "learning_rate": 9.377331221986866e-07, "loss": 0.3007, "num_tokens": 46122076.0, "reward": 0.5223662108182907, "reward_std": 0.46167295053601265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08735498040914536, "rewards/penalized_accuracy_reward/std": 0.16978903859853745, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.3334159329533577, "step": 506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1820.25, "completions/max_terminated_length": 1416.0, "completions/mean_length": 892.609375, "completions/mean_terminated_length": 674.1229248046875, "completions/min_length": 248.25, "completions/min_terminated_length": 248.25, "epoch": 0.2535, "grad_norm": 0.5326547026634216, "kl": 0.04052734375, "learning_rate": 9.373339121517746e-07, "loss": 0.3628, "num_tokens": 46187811.0, "reward": 0.7095315754413605, "reward_std": 0.4275118038058281, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14968765899538994, "rewards/penalized_accuracy_reward/std": 0.1679331213235855, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.25450707972049713, "step": 507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 1261.9375, "completions/mean_terminated_length": 931.9236297607422, "completions/min_length": 268.25, "completions/min_terminated_length": 268.25, "epoch": 0.254, "grad_norm": 0.34714245796203613, "kl": 0.028533935546875, "learning_rate": 9.36933522218593e-07, "loss": 0.3158, "num_tokens": 46275935.0, "reward": 0.37109375, "reward_std": 0.15496515110135078, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.30993032455444336, "step": 508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1728.25, "completions/max_terminated_length": 1601.0, "completions/mean_length": 1089.09375, "completions/mean_terminated_length": 967.6533203125, "completions/min_length": 452.5, "completions/min_terminated_length": 452.5, "epoch": 0.2545, "grad_norm": 0.3069932758808136, "kl": 0.02203369140625, "learning_rate": 9.36531953618799e-07, "loss": 0.1748, "num_tokens": 46355941.0, "reward": 0.6215924769639969, "reward_std": 0.41713543981313705, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.099858732894063, "rewards/penalized_accuracy_reward/std": 0.16802143305540085, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.20211800187826157, "step": 509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.75, "completions/mean_length": 1276.578125, "completions/mean_terminated_length": 972.5306701660156, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.255, "grad_norm": 0.31015947461128235, "kl": 0.027618408203125, "learning_rate": 9.361292075756401e-07, "loss": 0.309, "num_tokens": 46446650.0, "reward": 0.46556177735328674, "reward_std": 0.28321193158626556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746838495135307, "rewards/penalized_accuracy_reward/std": 0.08055461198091507, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.280989333987236, "step": 510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1994.0, "completions/max_terminated_length": 1612.5, "completions/mean_length": 852.15625, 
"completions/mean_terminated_length": 724.86669921875, "completions/min_length": 282.75, "completions/min_terminated_length": 282.75, "epoch": 0.2555, "grad_norm": 0.5047662258148193, "kl": 0.036773681640625, "learning_rate": 9.357252853159505e-07, "loss": 0.3326, "num_tokens": 46509604.0, "reward": 0.44140625, "reward_std": 0.11557747982442379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.23115497082471848, "step": 511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1981.0, "completions/max_terminated_length": 1638.25, "completions/mean_length": 965.28125, "completions/mean_terminated_length": 890.3431854248047, "completions/min_length": 345.5, "completions/min_terminated_length": 345.5, "epoch": 0.256, "grad_norm": 0.37943658232688904, "kl": 0.031646728515625, "learning_rate": 9.353201880701477e-07, "loss": 0.1243, "num_tokens": 46582390.0, "reward": 0.5967921018600464, "reward_std": 0.29339590296149254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07476324588060379, "rewards/penalized_accuracy_reward/std": 0.09968438744544983, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.22371861711144447, "step": 512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1678.0, "completions/max_terminated_length": 1360.25, "completions/mean_length": 1064.265625, "completions/mean_terminated_length": 827.9748382568359, "completions/min_length": 377.5, "completions/min_terminated_length": 377.5, "epoch": 0.2565, "grad_norm": 0.4883468449115753, "kl": 0.035797119140625, "learning_rate": 9.34913917072228e-07, "loss": 0.2107, "num_tokens": 46658215.0, "reward": 0.49834489822387695, "reward_std": 0.29271428659558296, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995369538664818, "rewards/penalized_accuracy_reward/std": 0.08935988694429398, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.2590084411203861, "step": 513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1317.75, "completions/max_terminated_length": 1106.75, "completions/mean_length": 770.25, "completions/mean_terminated_length": 659.8312530517578, "completions/min_length": 275.5, "completions/min_terminated_length": 275.5, "epoch": 0.257, "grad_norm": 0.3525479733943939, "kl": 0.031494140625, "learning_rate": 9.345064735597633e-07, "loss": 0.0857, "num_tokens": 46714279.0, "reward": 0.447265625, "reward_std": 0.07194410637021065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1438882127404213, "step": 514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1792.5, "completions/max_terminated_length": 1520.0, "completions/mean_length": 956.21875, "completions/mean_terminated_length": 833.4271087646484, "completions/min_length": 414.5, "completions/min_terminated_length": 414.5, "epoch": 0.2575, "grad_norm": 0.5444963574409485, "kl": 0.0384521484375, "learning_rate": 9.340978587738972e-07, "loss": 0.2155, "num_tokens": 46783797.0, "reward": 0.443359375, "reward_std": 0.08519715070724487, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.1703943032771349, "step": 515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1594.0, 
"completions/max_terminated_length": 1515.75, "completions/mean_length": 707.796875, "completions/mean_terminated_length": 651.9615478515625, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.258, "grad_norm": 0.5597754120826721, "kl": 0.037322998046875, "learning_rate": 9.336880739593415e-07, "loss": 0.3075, "num_tokens": 46836360.0, "reward": 0.6778936088085175, "reward_std": 0.37276544608175755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11238430254161358, "rewards/penalized_accuracy_reward/std": 0.15311409533023834, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.17116425558924675, "step": 516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1678.0, "completions/max_terminated_length": 1446.5, "completions/mean_length": 899.828125, "completions/mean_terminated_length": 799.1102600097656, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.2585, "grad_norm": 0.41781342029571533, "kl": 0.033843994140625, "learning_rate": 9.332771203643714e-07, "loss": 0.0983, "num_tokens": 46902941.0, "reward": 0.4967881441116333, "reward_std": 0.29596026986837387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745657205581665, "rewards/penalized_accuracy_reward/std": 0.11815827339887619, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.23062750324606895, "step": 517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1981.75, "completions/max_terminated_length": 1568.25, "completions/mean_length": 1360.375, "completions/mean_terminated_length": 955.68603515625, "completions/min_length": 436.25, "completions/min_terminated_length": 436.25, "epoch": 0.259, "grad_norm": 0.3262256681919098, "kl": 0.0308074951171875, "learning_rate": 
9.328649992408231e-07, "loss": 0.2975, "num_tokens": 47001893.0, "reward": 0.34765625, "reward_std": 0.14509744197130203, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6953125, "rewards/tag_count_reward/std": 0.29019488394260406, "step": 518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1539.5, "completions/max_terminated_length": 1492.25, "completions/mean_length": 917.21875, "completions/mean_terminated_length": 759.0511474609375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.2595, "grad_norm": 0.5477613806724548, "kl": 0.032806396484375, "learning_rate": 9.324517118440888e-07, "loss": 0.2242, "num_tokens": 47069059.0, "reward": 0.444865383207798, "reward_std": 0.18145395442843437, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012471754103899002, "rewards/penalized_accuracy_reward/std": 0.04988702014088631, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.21440255641937256, "step": 519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1711.5, "completions/max_terminated_length": 1545.0, "completions/mean_length": 857.484375, "completions/mean_terminated_length": 777.5367126464844, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.26, "grad_norm": 0.5140591263771057, "kl": 0.0372314453125, "learning_rate": 9.320372594331137e-07, "loss": 0.257, "num_tokens": 47131250.0, "reward": 0.443359375, "reward_std": 0.10435576364398003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 
0.20871153101325035, "step": 520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1555.25, "completions/max_terminated_length": 1237.75, "completions/mean_length": 823.984375, "completions/mean_terminated_length": 665.6603088378906, "completions/min_length": 270.5, "completions/min_terminated_length": 270.5, "epoch": 0.2605, "grad_norm": 0.4067757725715637, "kl": 0.026519775390625, "learning_rate": 9.316216432703916e-07, "loss": 0.1634, "num_tokens": 47192881.0, "reward": 0.5026784837245941, "reward_std": 0.24872096441686153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747205436229706, "rewards/penalized_accuracy_reward/std": 0.08056250214576721, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.22008750587701797, "step": 521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1437.75, "completions/max_terminated_length": 1268.0, "completions/mean_length": 1053.8125, "completions/mean_terminated_length": 867.21875, "completions/min_length": 593.25, "completions/min_terminated_length": 593.25, "epoch": 0.261, "grad_norm": 0.3643133342266083, "kl": 0.025390625, "learning_rate": 9.312048646219617e-07, "loss": 0.0333, "num_tokens": 47270005.0, "reward": 0.7208437919616699, "reward_std": 0.2328283842653036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16217970848083496, "rewards/penalized_accuracy_reward/std": 0.0804637148976326, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.17741476371884346, "step": 522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1677.75, "completions/max_terminated_length": 1526.5, "completions/mean_length": 990.125, "completions/mean_terminated_length": 817.0480804443359, "completions/min_length": 397.75, 
"completions/min_terminated_length": 397.75, "epoch": 0.2615, "grad_norm": 0.5309759974479675, "kl": 0.033721923828125, "learning_rate": 9.307869247574038e-07, "loss": 0.1282, "num_tokens": 47343485.0, "reward": 0.5139300376176834, "reward_std": 0.3408522456884384, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049933766946196556, "rewards/penalized_accuracy_reward/std": 0.13039683550596237, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.23876836150884628, "step": 523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1699.25, "completions/max_terminated_length": 1314.25, "completions/mean_length": 842.296875, "completions/mean_terminated_length": 685.6369171142578, "completions/min_length": 333.5, "completions/min_terminated_length": 333.5, "epoch": 0.262, "grad_norm": 0.477525919675827, "kl": 0.0443115234375, "learning_rate": 9.303678249498352e-07, "loss": 0.1044, "num_tokens": 47406848.0, "reward": 0.46052899956703186, "reward_std": 0.18323558196425438, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491059489548206, "rewards/penalized_accuracy_reward/std": 0.04996423423290253, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.21321270056068897, "step": 524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1272.75, "completions/mean_length": 1105.015625, "completions/mean_terminated_length": 752.5571594238281, "completions/min_length": 340.5, "completions/min_terminated_length": 340.5, "epoch": 0.2625, "grad_norm": 0.4064708650112152, "kl": 0.0474853515625, "learning_rate": 9.299475664759068e-07, "loss": 0.3448, "num_tokens": 47488849.0, "reward": 0.432764895260334, "reward_std": 0.3027823232114315, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024976197630167007, "rewards/penalized_accuracy_reward/std": 0.09990479052066803, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.30612613260746, "step": 525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 1033.71875, "completions/mean_terminated_length": 723.1461181640625, "completions/min_length": 279.5, "completions/min_terminated_length": 279.5, "epoch": 0.263, "grad_norm": 0.37642067670822144, "kl": 0.044677734375, "learning_rate": 9.295261506157985e-07, "loss": 0.3265, "num_tokens": 47567455.0, "reward": 0.6672378182411194, "reward_std": 0.5929142348468304, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1373298466205597, "rewards/penalized_accuracy_reward/std": 0.24865684658288956, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.320106141269207, "step": 526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1874.25, "completions/max_terminated_length": 1648.25, "completions/mean_length": 1089.390625, "completions/mean_terminated_length": 905.0355377197266, "completions/min_length": 305.75, "completions/min_terminated_length": 305.75, "epoch": 0.2635, "grad_norm": 0.29783502221107483, "kl": 0.035858154296875, "learning_rate": 9.291035786532163e-07, "loss": 0.1693, "num_tokens": 47646216.0, "reward": 0.6486319452524185, "reward_std": 0.4284169152379036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11240191012620926, "rewards/penalized_accuracy_reward/std": 0.1850258708000183, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.22765622287988663, "step": 527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.171875, "completions/max_length": 2024.25, "completions/max_terminated_length": 1770.75, "completions/mean_length": 1123.671875, "completions/mean_terminated_length": 927.2775573730469, "completions/min_length": 289.75, "completions/min_terminated_length": 289.75, "epoch": 0.264, "grad_norm": 0.30349528789520264, "kl": 0.02496337890625, "learning_rate": 9.286798518753878e-07, "loss": 0.1988, "num_tokens": 47726211.0, "reward": 0.5256581604480743, "reward_std": 0.3281827587634325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049938454292714596, "rewards/penalized_accuracy_reward/std": 0.13048017770051956, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.21925553865730762, "step": 528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1860.0, "completions/max_terminated_length": 1756.5, "completions/mean_length": 948.6875, "completions/mean_terminated_length": 884.6830444335938, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2645, "grad_norm": 0.3988053798675537, "kl": 0.0284423828125, "learning_rate": 9.282549715730579e-07, "loss": 0.2258, "num_tokens": 47797199.0, "reward": 0.6236276626586914, "reward_std": 0.2754148729145527, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0872044488787651, "rewards/penalized_accuracy_reward/std": 0.1021234318614006, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.1808114591985941, "step": 529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.25, "completions/mean_length": 1257.546875, "completions/mean_terminated_length": 996.3360748291016, "completions/min_length": 456.25, "completions/min_terminated_length": 456.25, "epoch": 0.265, "grad_norm": 0.3021189868450165, "kl": 
0.030029296875, "learning_rate": 9.278289390404859e-07, "loss": 0.293, "num_tokens": 47887458.0, "reward": 0.41755639761686325, "reward_std": 0.22985536605119705, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489136308431625, "rewards/penalized_accuracy_reward/std": 0.0499565452337265, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.31415246427059174, "step": 530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1979.0, "completions/max_terminated_length": 1598.25, "completions/mean_length": 827.203125, "completions/mean_terminated_length": 707.5708618164062, "completions/min_length": 210.5, "completions/min_terminated_length": 210.5, "epoch": 0.2655, "grad_norm": 0.6387527585029602, "kl": 0.0513916015625, "learning_rate": 9.274017555754407e-07, "loss": 0.3095, "num_tokens": 47953087.0, "reward": 0.4819595217704773, "reward_std": 0.185433778911829, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012464134022593498, "rewards/penalized_accuracy_reward/std": 0.04985653981566429, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18958897329866886, "step": 531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 1010.21875, "completions/mean_terminated_length": 811.2769775390625, "completions/min_length": 235.25, "completions/min_terminated_length": 235.25, "epoch": 0.266, "grad_norm": 0.48001179099082947, "kl": 0.041412353515625, "learning_rate": 9.269734224791974e-07, "loss": 0.4061, "num_tokens": 48029453.0, "reward": 0.416015625, "reward_std": 0.12994812615215778, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.25989625975489616, "step": 532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1298.75, "completions/max_terminated_length": 1141.5, "completions/mean_length": 848.421875, "completions/mean_terminated_length": 649.7427520751953, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2665, "grad_norm": 0.7189418077468872, "kl": 0.040679931640625, "learning_rate": 9.265439410565328e-07, "loss": 0.2495, "num_tokens": 48091608.0, "reward": 0.5369065999984741, "reward_std": 0.28574617207050323, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06239861249923706, "rewards/penalized_accuracy_reward/std": 0.09558742493391037, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.21046720445156097, "step": 533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1497.75, "completions/max_terminated_length": 1368.5, "completions/mean_length": 832.03125, "completions/mean_terminated_length": 652.9151916503906, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.267, "grad_norm": 0.3371395170688629, "kl": 0.039794921875, "learning_rate": 9.261133126157217e-07, "loss": 0.1281, "num_tokens": 48153290.0, "reward": 0.746853768825531, "reward_std": 0.43303800746798515, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1497940681874752, "rewards/penalized_accuracy_reward/std": 0.19244197010993958, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.14379572309553623, "step": 534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.5, "completions/mean_length": 1243.78125, 
"completions/mean_terminated_length": 862.5403442382812, "completions/min_length": 442.25, "completions/min_terminated_length": 442.25, "epoch": 0.2675, "grad_norm": 0.45407357811927795, "kl": 0.041107177734375, "learning_rate": 9.256815384685328e-07, "loss": 0.3562, "num_tokens": 48243212.0, "reward": 0.38192318379879, "reward_std": 0.26265045814216137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494596689939499, "rewards/penalized_accuracy_reward/std": 0.0681653544306755, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.3049377463757992, "step": 535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1915.5, "completions/max_terminated_length": 1428.0, "completions/mean_length": 955.625, "completions/mean_terminated_length": 783.2093963623047, "completions/min_length": 358.25, "completions/min_terminated_length": 358.25, "epoch": 0.268, "grad_norm": 0.32955053448677063, "kl": 0.037139892578125, "learning_rate": 9.252486199302256e-07, "loss": 0.2768, "num_tokens": 48313268.0, "reward": 0.423828125, "reward_std": 0.10449730046093464, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.20899460464715958, "step": 536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1603.25, "completions/max_terminated_length": 1172.0, "completions/mean_length": 664.421875, "completions/mean_terminated_length": 596.4500045776367, "completions/min_length": 209.5, "completions/min_terminated_length": 209.5, "epoch": 0.2685, "grad_norm": 0.5418621897697449, "kl": 0.0460205078125, "learning_rate": 9.248145583195447e-07, "loss": 0.2938, "num_tokens": 48366735.0, "reward": 0.9293870031833649, "reward_std": 0.6596960537135601, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.23715443164110184, "rewards/penalized_accuracy_reward/std": 0.30204296857118607, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.19090185686945915, "step": 537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1644.25, "completions/mean_length": 1052.671875, "completions/mean_terminated_length": 889.6717681884766, "completions/min_length": 415.5, "completions/min_terminated_length": 415.5, "epoch": 0.269, "grad_norm": 0.35101640224456787, "kl": 0.026458740234375, "learning_rate": 9.243793549587171e-07, "loss": 0.2735, "num_tokens": 48443242.0, "reward": 0.5334556847810745, "reward_std": 0.35072289034724236, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049930961802601814, "rewards/penalized_accuracy_reward/std": 0.13643736392259598, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2483838126063347, "step": 538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1961.75, "completions/max_terminated_length": 1382.75, "completions/mean_length": 777.015625, "completions/mean_terminated_length": 639.7136993408203, "completions/min_length": 320.25, "completions/min_terminated_length": 320.25, "epoch": 0.2695, "grad_norm": 0.46965816617012024, "kl": 0.04736328125, "learning_rate": 9.239430111734476e-07, "loss": 0.1918, "num_tokens": 48502459.0, "reward": 0.5702212005853653, "reward_std": 0.37132715806365013, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06245434284210205, "rewards/penalized_accuracy_reward/std": 0.14881933480501175, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2309735342860222, "step": 539 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1594.25, "completions/max_terminated_length": 1335.5, "completions/mean_length": 872.46875, "completions/mean_terminated_length": 777.0861053466797, "completions/min_length": 322.5, "completions/min_terminated_length": 322.5, "epoch": 0.27, "grad_norm": 0.39487001299858093, "kl": 0.029998779296875, "learning_rate": 9.235055282929153e-07, "loss": 0.1379, "num_tokens": 48566201.0, "reward": 0.7507734447717667, "reward_std": 0.5775187909603119, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14980078116059303, "rewards/penalized_accuracy_reward/std": 0.2654493451118469, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.15551739931106567, "step": 540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1558.0, "completions/max_terminated_length": 1279.25, "completions/mean_length": 803.59375, "completions/mean_terminated_length": 652.5760498046875, "completions/min_length": 294.5, "completions/min_terminated_length": 294.5, "epoch": 0.2705, "grad_norm": 0.5140396952629089, "kl": 0.037353515625, "learning_rate": 9.230669076497687e-07, "loss": 0.2879, "num_tokens": 48626303.0, "reward": 0.47419071197509766, "reward_std": 0.18698296695947647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485983781516552, "rewards/penalized_accuracy_reward/std": 0.04994393512606621, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.17419016361236572, "step": 541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1345.5, "completions/mean_length": 1386.71875, "completions/mean_terminated_length": 868.8958740234375, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.271, 
"grad_norm": 0.29457977414131165, "kl": 0.0299072265625, "learning_rate": 9.226271505801224e-07, "loss": 0.2292, "num_tokens": 48726077.0, "reward": 0.41115593910217285, "reward_std": 0.24104278534650803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024913907051086426, "rewards/penalized_accuracy_reward/std": 0.06807775795459747, "rewards/tag_count_reward/mean": 0.72265625, "rewards/tag_count_reward/std": 0.2551500126719475, "step": 542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1520.75, "completions/mean_length": 1229.9375, "completions/mean_terminated_length": 857.9327087402344, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.2715, "grad_norm": 0.3240607678890228, "kl": 0.0294189453125, "learning_rate": 9.221862584235526e-07, "loss": 0.2764, "num_tokens": 48814073.0, "reward": 0.4996182322502136, "reward_std": 0.3874186612665653, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06230911519378424, "rewards/penalized_accuracy_reward/std": 0.13897734135389328, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.31972454488277435, "step": 543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.75, "completions/mean_length": 1055.78125, "completions/mean_terminated_length": 911.5617523193359, "completions/min_length": 352.25, "completions/min_terminated_length": 352.25, "epoch": 0.272, "grad_norm": 0.41857942938804626, "kl": 0.03460693359375, "learning_rate": 9.217442325230936e-07, "loss": 0.2393, "num_tokens": 48891355.0, "reward": 0.48340076208114624, "reward_std": 0.24978534132242203, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.02490350417792797, "rewards/penalized_accuracy_reward/std": 0.0680493414402008, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.26915430650115013, "step": 544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1974.5, "completions/max_terminated_length": 1714.5, "completions/mean_length": 999.90625, "completions/mean_terminated_length": 853.401123046875, "completions/min_length": 354.5, "completions/min_terminated_length": 354.5, "epoch": 0.2725, "grad_norm": 0.46337562799453735, "kl": 0.04034423828125, "learning_rate": 9.213010742252327e-07, "loss": 0.3569, "num_tokens": 48965333.0, "reward": 0.435546875, "reward_std": 0.12286523915827274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2457304783165455, "step": 545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 1091.390625, "completions/mean_terminated_length": 879.6553802490234, "completions/min_length": 366.5, "completions/min_terminated_length": 366.5, "epoch": 0.273, "grad_norm": 0.4740728437900543, "kl": 0.03759765625, "learning_rate": 9.208567848799069e-07, "loss": 0.3382, "num_tokens": 49046878.0, "reward": 0.44290006160736084, "reward_std": 0.20702584832906723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012465653009712696, "rewards/penalized_accuracy_reward/std": 0.049862612038850784, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.27031252905726433, "step": 546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 
593.328125, "completions/mean_terminated_length": 593.328125, "completions/min_length": 167.25, "completions/min_terminated_length": 167.25, "epoch": 0.2735, "grad_norm": 0.5176637172698975, "kl": 0.032684326171875, "learning_rate": 9.204113658404989e-07, "loss": 0.1284, "num_tokens": 49097091.0, "reward": 0.5284564942121506, "reward_std": 0.23795261979103088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024970432743430138, "rewards/penalized_accuracy_reward/std": 0.09988173469901085, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1285141110420227, "step": 547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1647.0, "completions/max_terminated_length": 1567.5, "completions/mean_length": 1038.78125, "completions/mean_terminated_length": 896.9447174072266, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.274, "grad_norm": 0.3655906021595001, "kl": 0.03533935546875, "learning_rate": 9.199648184638318e-07, "loss": 0.1868, "num_tokens": 49171445.0, "reward": 0.6294483840465546, "reward_std": 0.30885397642850876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988044202327728, "rewards/penalized_accuracy_reward/std": 0.10315614193677902, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.21640067547559738, "step": 548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1638.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 887.6875, "completions/mean_terminated_length": 646.5630798339844, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.2745, "grad_norm": 0.2664050757884979, "kl": 0.03057861328125, "learning_rate": 9.195171441101668e-07, "loss": 0.3305, "num_tokens": 49237585.0, "reward": 
0.581220418214798, "reward_std": 0.2921970635652542, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07478989660739899, "rewards/penalized_accuracy_reward/std": 0.09971990436315536, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.2136894129216671, "step": 549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1242.0, "completions/max_terminated_length": 1198.75, "completions/mean_length": 648.078125, "completions/mean_terminated_length": 629.8156433105469, "completions/min_length": 197.25, "completions/min_terminated_length": 197.25, "epoch": 0.275, "grad_norm": 0.2869943678379059, "kl": 0.037750244140625, "learning_rate": 9.190683441431974e-07, "loss": 0.0605, "num_tokens": 49288294.0, "reward": 0.5690198540687561, "reward_std": 0.18442301452159882, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743961453437805, "rewards/penalized_accuracy_reward/std": 0.08049276471138, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1851.25, "completions/max_terminated_length": 1091.25, "completions/mean_length": 977.421875, "completions/mean_terminated_length": 540.8500061035156, "completions/min_length": 234.75, "completions/min_terminated_length": 234.75, "epoch": 0.2755, "grad_norm": 0.6040741801261902, "kl": 0.04510498046875, "learning_rate": 9.186184199300463e-07, "loss": 0.3465, "num_tokens": 49363041.0, "reward": 0.3828125, "reward_std": 0.10777534916996956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.21555070951581, "step": 551 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1690.75, "completions/max_terminated_length": 1538.25, "completions/mean_length": 869.765625, "completions/mean_terminated_length": 722.0666961669922, "completions/min_length": 224.25, "completions/min_terminated_length": 224.25, "epoch": 0.276, "grad_norm": 0.5180755853652954, "kl": 0.04302978515625, "learning_rate": 9.181673728412605e-07, "loss": 0.3068, "num_tokens": 49427842.0, "reward": 0.439453125, "reward_std": 0.11344381235539913, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.22688763216137886, "step": 552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1549.25, "completions/max_terminated_length": 1196.0, "completions/mean_length": 740.5625, "completions/mean_terminated_length": 625.2427978515625, "completions/min_length": 190.75, "completions/min_terminated_length": 190.75, "epoch": 0.2765, "grad_norm": 0.5234861969947815, "kl": 0.033203125, "learning_rate": 9.177152042508077e-07, "loss": 0.3013, "num_tokens": 49484630.0, "reward": 0.7754766792058945, "reward_std": 0.3573654778301716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1621524030342698, "rewards/penalized_accuracy_reward/std": 0.1391945779323578, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.1860179379582405, "step": 553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1902.0, "completions/max_terminated_length": 1506.75, "completions/mean_length": 929.09375, "completions/mean_terminated_length": 797.6757659912109, "completions/min_length": 346.5, "completions/min_terminated_length": 346.5, "epoch": 0.277, "grad_norm": 0.4482908844947815, "kl": 
0.035980224609375, "learning_rate": 9.17261915536072e-07, "loss": 0.2371, "num_tokens": 49554540.0, "reward": 0.7699340432882309, "reward_std": 0.4501419849693775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623107548803091, "rewards/penalized_accuracy_reward/std": 0.19538898766040802, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2343991994857788, "step": 554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1721.75, "completions/max_terminated_length": 1628.75, "completions/mean_length": 897.390625, "completions/mean_terminated_length": 880.198974609375, "completions/min_length": 415.25, "completions/min_terminated_length": 415.25, "epoch": 0.2775, "grad_norm": 0.3967706859111786, "kl": 0.033203125, "learning_rate": 9.168075080778494e-07, "loss": -0.0109, "num_tokens": 49621285.0, "reward": 0.5914964526891708, "reward_std": 0.256313256919384, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06234978884458542, "rewards/penalized_accuracy_reward/std": 0.09551263600587845, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1305759735405445, "step": 555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 1065.53125, "completions/mean_terminated_length": 850.8928833007812, "completions/min_length": 292.25, "completions/min_terminated_length": 292.25, "epoch": 0.278, "grad_norm": 0.37875887751579285, "kl": 0.0361328125, "learning_rate": 9.163519832603436e-07, "loss": 0.3397, "num_tokens": 49698615.0, "reward": 0.4410072863101959, "reward_std": 0.22077508829534054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012495825998485088, 
"rewards/penalized_accuracy_reward/std": 0.04998330399394035, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.2909424193203449, "step": 556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1695.75, "completions/max_terminated_length": 1236.5, "completions/mean_length": 874.703125, "completions/mean_terminated_length": 573.1159210205078, "completions/min_length": 191.25, "completions/min_terminated_length": 191.25, "epoch": 0.2785, "grad_norm": 0.4375259578227997, "kl": 0.035247802734375, "learning_rate": 9.158953424711624e-07, "loss": 0.3514, "num_tokens": 49765300.0, "reward": 0.611794650554657, "reward_std": 0.32479309663176537, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09984263777732849, "rewards/penalized_accuracy_reward/std": 0.10311712324619293, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.2715224865823984, "step": 557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 1192.6875, "completions/mean_terminated_length": 944.0756530761719, "completions/min_length": 347.75, "completions/min_terminated_length": 347.75, "epoch": 0.279, "grad_norm": 0.4607226252555847, "kl": 0.029144287109375, "learning_rate": 9.154375871013128e-07, "loss": 0.3096, "num_tokens": 49855696.0, "reward": 0.42725494503974915, "reward_std": 0.21611913666129112, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012455599382519722, "rewards/penalized_accuracy_reward/std": 0.049822401255369186, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.28817062079906464, "step": 558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1640.0, 
"completions/mean_length": 1353.75, "completions/mean_terminated_length": 1014.1983489990234, "completions/min_length": 491.75, "completions/min_terminated_length": 491.75, "epoch": 0.2795, "grad_norm": 0.3172443211078644, "kl": 0.032623291015625, "learning_rate": 9.149787185451969e-07, "loss": 0.335, "num_tokens": 49950576.0, "reward": 0.3671875, "reward_std": 0.16776007413864136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.3355201631784439, "step": 559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1411.5, "completions/mean_length": 1109.296875, "completions/mean_terminated_length": 771.0823669433594, "completions/min_length": 314.75, "completions/min_terminated_length": 314.75, "epoch": 0.28, "grad_norm": 0.42934271693229675, "kl": 0.031768798828125, "learning_rate": 9.145187382006081e-07, "loss": 0.3572, "num_tokens": 50030723.0, "reward": 0.4155249744653702, "reward_std": 0.22773858159780502, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012449988164007664, "rewards/penalized_accuracy_reward/std": 0.049799952656030655, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.2981189675629139, "step": 560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1746.0, "completions/max_terminated_length": 1463.75, "completions/mean_length": 799.109375, "completions/mean_terminated_length": 608.2406311035156, "completions/min_length": 227.5, "completions/min_terminated_length": 227.5, "epoch": 0.2805, "grad_norm": 0.5636959075927734, "kl": 0.04888916015625, "learning_rate": 9.140576474687263e-07, "loss": 0.1378, "num_tokens": 50091706.0, "reward": 0.49322402477264404, 
"reward_std": 0.2506713457405567, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024932319298386574, "rewards/penalized_accuracy_reward/std": 0.0997292809188366, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.22563419491052628, "step": 561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1901.75, "completions/max_terminated_length": 1426.5, "completions/mean_length": 1073.109375, "completions/mean_terminated_length": 894.7687530517578, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "epoch": 0.281, "grad_norm": 0.43828460574150085, "kl": 0.02703857421875, "learning_rate": 9.135954477541137e-07, "loss": 0.2707, "num_tokens": 50170449.0, "reward": 0.48346276581287384, "reward_std": 0.23032421246170998, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02493450790643692, "rewards/penalized_accuracy_reward/std": 0.06813406199216843, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2461865283548832, "step": 562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1770.25, "completions/max_terminated_length": 1111.75, "completions/mean_length": 1179.28125, "completions/mean_terminated_length": 614.1651916503906, "completions/min_length": 248.25, "completions/min_terminated_length": 248.25, "epoch": 0.2815, "grad_norm": 0.42256563901901245, "kl": 0.03741455078125, "learning_rate": 9.131321404647109e-07, "loss": 0.3269, "num_tokens": 50258099.0, "reward": 0.4455718994140625, "reward_std": 0.2845580652356148, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993438348174095, "rewards/penalized_accuracy_reward/std": 0.08932536095380783, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 
0.2558863088488579, "step": 563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1700.5, "completions/max_terminated_length": 1437.0, "completions/mean_length": 778.234375, "completions/mean_terminated_length": 655.2624206542969, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.282, "grad_norm": 0.537639319896698, "kl": 0.040557861328125, "learning_rate": 9.126677270118322e-07, "loss": 0.275, "num_tokens": 50316530.0, "reward": 0.4800582230091095, "reward_std": 0.17811370268464088, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490048073232174, "rewards/penalized_accuracy_reward/std": 0.049960192292928696, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.21374599263072014, "step": 564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1727.25, "completions/max_terminated_length": 1682.5, "completions/mean_length": 1072.859375, "completions/mean_terminated_length": 958.3068237304688, "completions/min_length": 393.25, "completions/min_terminated_length": 393.25, "epoch": 0.2825, "grad_norm": 0.35869404673576355, "kl": 0.03338623046875, "learning_rate": 9.122022088101613e-07, "loss": 0.1764, "num_tokens": 50396697.0, "reward": 0.47575022280216217, "reward_std": 0.2370310313999653, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024984486401081085, "rewards/penalized_accuracy_reward/std": 0.06827061623334885, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.21847054921090603, "step": 565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1469.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 859.265625, "completions/mean_terminated_length": 745.9012451171875, "completions/min_length": 287.75, 
"completions/min_terminated_length": 287.75, "epoch": 0.283, "grad_norm": 0.2847440242767334, "kl": 0.03033447265625, "learning_rate": 9.117355872777477e-07, "loss": 0.0558, "num_tokens": 50462378.0, "reward": 0.5452117174863815, "reward_std": 0.32017600908875465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994961339980364, "rewards/penalized_accuracy_reward/std": 0.13047274947166443, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.16447614133358002, "step": 566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1744.5, "completions/mean_length": 1106.6875, "completions/mean_terminated_length": 993.5274505615234, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.2835, "grad_norm": 0.4350048899650574, "kl": 0.041290283203125, "learning_rate": 9.112678638360015e-07, "loss": 0.2635, "num_tokens": 50545030.0, "reward": 0.4624919593334198, "reward_std": 0.20015317387878895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012495980598032475, "rewards/penalized_accuracy_reward/std": 0.0499839261174202, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2485373169183731, "step": 567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 1077.75, "completions/mean_terminated_length": 893.0250549316406, "completions/min_length": 318.25, "completions/min_terminated_length": 318.25, "epoch": 0.284, "grad_norm": 15041.6416015625, "kl": 87.01889038085938, "learning_rate": 9.107990399096893e-07, "loss": 3.7566, "num_tokens": 50622870.0, "reward": 0.5966266989707947, "reward_std": 0.3773827403783798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.08737585041671991, "rewards/penalized_accuracy_reward/std": 0.14981801062822342, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.26047012582421303, "step": 568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1854.75, "completions/max_terminated_length": 1649.0, "completions/mean_length": 988.59375, "completions/mean_terminated_length": 843.698974609375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.2845, "grad_norm": 0.28968942165374756, "kl": 0.035980224609375, "learning_rate": 9.103291169269299e-07, "loss": 0.1666, "num_tokens": 50696748.0, "reward": 0.6564937829971313, "reward_std": 0.29131171107292175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11242657899856567, "rewards/penalized_accuracy_reward/std": 0.10240264236927032, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.20025964826345444, "step": 569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1726.0, "completions/max_terminated_length": 1167.75, "completions/mean_length": 891.015625, "completions/mean_terminated_length": 664.9770126342773, "completions/min_length": 240.5, "completions/min_terminated_length": 240.5, "epoch": 0.285, "grad_norm": 0.4642666280269623, "kl": 0.04315185546875, "learning_rate": 9.098580963191907e-07, "loss": 0.3042, "num_tokens": 50763181.0, "reward": 0.9340077191591263, "reward_std": 0.6234997622668743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.26192574948072433, "rewards/penalized_accuracy_reward/std": 0.2782973349094391, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.21911673620343208, "step": 570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, 
"completions/max_length": 1857.25, "completions/max_terminated_length": 1521.0, "completions/mean_length": 809.0625, "completions/mean_terminated_length": 748.2165374755859, "completions/min_length": 295.75, "completions/min_terminated_length": 295.75, "epoch": 0.2855, "grad_norm": 1.1934202909469604, "kl": 0.05841064453125, "learning_rate": 9.093859795212817e-07, "loss": 0.1502, "num_tokens": 50823409.0, "reward": 0.49759966135025024, "reward_std": 0.14762506261467934, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012471708469092846, "rewards/penalized_accuracy_reward/std": 0.04988683760166168, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.12984733283519745, "step": 571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1908.5, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1104.671875, "completions/mean_terminated_length": 938.1469421386719, "completions/min_length": 401.5, "completions/min_terminated_length": 401.5, "epoch": 0.286, "grad_norm": 0.4511258006095886, "kl": 0.0361328125, "learning_rate": 9.089127679713529e-07, "loss": 0.1203, "num_tokens": 50901980.0, "reward": 0.5026211738586426, "reward_std": 0.2596193328499794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744340315461159, "rewards/penalized_accuracy_reward/std": 0.08050090819597244, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.26295091211795807, "step": 572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1796.5, "completions/max_terminated_length": 1319.75, "completions/mean_length": 1057.140625, "completions/mean_terminated_length": 785.9899139404297, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.2865, "grad_norm": 0.6515030860900879, "kl": 
0.039093017578125, "learning_rate": 9.084384631108882e-07, "loss": 0.2389, "num_tokens": 50982229.0, "reward": 0.3984375, "reward_std": 0.10886681452393532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.21773363836109638, "step": 573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1580.5, "completions/mean_length": 1104.53125, "completions/mean_terminated_length": 865.8319854736328, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.287, "grad_norm": 0.43272244930267334, "kl": 0.0355224609375, "learning_rate": 9.079630663847031e-07, "loss": 0.4006, "num_tokens": 51063111.0, "reward": 0.400390625, "reward_std": 0.15591760352253914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.3118352144956589, "step": 574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1665.25, "completions/max_terminated_length": 1207.0, "completions/mean_length": 648.8125, "completions/mean_terminated_length": 603.1010513305664, "completions/min_length": 254.75, "completions/min_terminated_length": 254.75, "epoch": 0.2875, "grad_norm": 0.4941796660423279, "kl": 0.0328369140625, "learning_rate": 9.074865792409381e-07, "loss": 0.0855, "num_tokens": 51114779.0, "reward": 0.6244037300348282, "reward_std": 0.32485349103808403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07489717565476894, "rewards/penalized_accuracy_reward/std": 0.14556249603629112, "rewards/tag_count_reward/mean": 0.94921875, 
"rewards/tag_count_reward/std": 0.1333485022187233, "step": 575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1804.25, "completions/max_terminated_length": 1379.75, "completions/mean_length": 930.46875, "completions/mean_terminated_length": 780.4833374023438, "completions/min_length": 349.25, "completions/min_terminated_length": 349.25, "epoch": 0.288, "grad_norm": 0.4994284212589264, "kl": 0.0374755859375, "learning_rate": 9.070090031310558e-07, "loss": 0.1766, "num_tokens": 51182009.0, "reward": 0.5102041661739349, "reward_std": 0.306219682097435, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037328651174902916, "rewards/penalized_accuracy_reward/std": 0.11778053641319275, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.23528573662042618, "step": 576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1872.25, "completions/max_terminated_length": 1624.0, "completions/mean_length": 915.015625, "completions/mean_terminated_length": 783.0158233642578, "completions/min_length": 235.75, "completions/min_terminated_length": 235.75, "epoch": 0.2885, "grad_norm": 0.4576295018196106, "kl": 0.030059814453125, "learning_rate": 9.065303395098358e-07, "loss": 0.3879, "num_tokens": 51248602.0, "reward": 0.4375, "reward_std": 0.12381584197282791, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.24763169139623642, "step": 577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1662.75, "completions/max_terminated_length": 1420.0, "completions/mean_length": 799.359375, "completions/mean_terminated_length": 636.9851837158203, "completions/min_length": 250.5, 
"completions/min_terminated_length": 250.5, "epoch": 0.289, "grad_norm": 0.5753300786018372, "kl": 0.040679931640625, "learning_rate": 9.060505898353705e-07, "loss": 0.3334, "num_tokens": 51310849.0, "reward": 0.8346112072467804, "reward_std": 0.4122166484594345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1995321661233902, "rewards/penalized_accuracy_reward/std": 0.16096988320350647, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.20786115154623985, "step": 578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1444.25, "completions/max_terminated_length": 1236.5, "completions/mean_length": 920.453125, "completions/mean_terminated_length": 744.0972290039062, "completions/min_length": 283.5, "completions/min_terminated_length": 283.5, "epoch": 0.2895, "grad_norm": 0.5280103087425232, "kl": 0.0472412109375, "learning_rate": 9.055697555690607e-07, "loss": 0.1225, "num_tokens": 51378990.0, "reward": 0.4487980678677559, "reward_std": 0.19275202602148056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248497236520052, "rewards/penalized_accuracy_reward/std": 0.04993989318609238, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.23983510583639145, "step": 579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2010.25, "completions/max_terminated_length": 1630.0, "completions/mean_length": 789.71875, "completions/mean_terminated_length": 681.942138671875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.29, "grad_norm": 0.47236305475234985, "kl": 0.037078857421875, "learning_rate": 9.050878381756107e-07, "loss": 0.3526, "num_tokens": 51436956.0, "reward": 0.4609375, "reward_std": 0.09177451208233833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18354902788996696, "step": 580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1782.75, "completions/max_terminated_length": 1529.25, "completions/mean_length": 908.578125, "completions/mean_terminated_length": 844.4486846923828, "completions/min_length": 282.75, "completions/min_terminated_length": 282.75, "epoch": 0.2905, "grad_norm": 0.4663105010986328, "kl": 0.037109375, "learning_rate": 9.046048391230247e-07, "loss": 0.2324, "num_tokens": 51505153.0, "reward": 0.6200970709323883, "reward_std": 0.4025311302393675, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08739228919148445, "rewards/penalized_accuracy_reward/std": 0.16382846236228943, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21527612209320068, "step": 581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 1066.421875, "completions/mean_terminated_length": 921.3791809082031, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.291, "grad_norm": 0.43481671810150146, "kl": 0.029937744140625, "learning_rate": 9.041207598826017e-07, "loss": 0.1889, "num_tokens": 51583708.0, "reward": 0.6640016436576843, "reward_std": 0.4332228936254978, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11227425932884216, "rewards/penalized_accuracy_reward/std": 0.18483060598373413, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.23830724135041237, "step": 582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1587.5, 
"completions/max_terminated_length": 1354.75, "completions/mean_length": 903.640625, "completions/mean_terminated_length": 791.407470703125, "completions/min_length": 338.75, "completions/min_terminated_length": 338.75, "epoch": 0.2915, "grad_norm": 0.45867443084716797, "kl": 0.040863037109375, "learning_rate": 9.036356019289309e-07, "loss": 0.2622, "num_tokens": 51654773.0, "reward": 0.4453125, "reward_std": 0.10917102172970772, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21834204345941544, "step": 583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1237.5, "completions/mean_length": 802.765625, "completions/mean_terminated_length": 579.7192459106445, "completions/min_length": 209.75, "completions/min_terminated_length": 209.75, "epoch": 0.292, "grad_norm": 0.6033048033714294, "kl": 0.037261962890625, "learning_rate": 9.031493667398872e-07, "loss": 0.5532, "num_tokens": 51713846.0, "reward": 0.4375, "reward_std": 0.12954327277839184, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2590865455567837, "step": 584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1660.75, "completions/max_terminated_length": 1634.5, "completions/mean_length": 893.046875, "completions/mean_terminated_length": 863.2854461669922, "completions/min_length": 350.75, "completions/min_terminated_length": 350.75, "epoch": 0.2925, "grad_norm": 0.3732563853263855, "kl": 0.028656005859375, "learning_rate": 9.026620557966279e-07, "loss": 0.0947, "num_tokens": 51781273.0, "reward": 0.48786433041095734, 
"reward_std": 0.17530784010887146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486852705478668, "rewards/penalized_accuracy_reward/std": 0.04994741082191467, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.16480516269803047, "step": 585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 1045.046875, "completions/mean_terminated_length": 715.7019348144531, "completions/min_length": 291.5, "completions/min_terminated_length": 291.5, "epoch": 0.293, "grad_norm": 0.47843629121780396, "kl": 0.041748046875, "learning_rate": 9.021736705835862e-07, "loss": 0.2999, "num_tokens": 51858332.0, "reward": 0.5443861782550812, "reward_std": 0.42359884455800056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07492746412754059, "rewards/penalized_accuracy_reward/std": 0.1576133891940117, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.3019050769507885, "step": 586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1754.75, "completions/max_terminated_length": 1656.0, "completions/mean_length": 975.53125, "completions/mean_terminated_length": 746.5454559326172, "completions/min_length": 272.5, "completions/min_terminated_length": 272.5, "epoch": 0.2935, "grad_norm": 0.33707401156425476, "kl": 0.04034423828125, "learning_rate": 9.016842125884684e-07, "loss": 0.3134, "num_tokens": 51932126.0, "reward": 0.423828125, "reward_std": 0.11893029883503914, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.23786060512065887, "step": 587 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.078125, "completions/max_length": 1653.25, "completions/max_terminated_length": 1508.75, "completions/mean_length": 812.90625, "completions/mean_terminated_length": 737.3802337646484, "completions/min_length": 296.75, "completions/min_terminated_length": 296.75, "epoch": 0.294, "grad_norm": 0.746476411819458, "kl": 0.0369873046875, "learning_rate": 9.011936833022484e-07, "loss": 0.3312, "num_tokens": 51992264.0, "reward": 0.447265625, "reward_std": 0.09388333931565285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1877666898071766, "step": 588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1756.25, "completions/mean_length": 1220.40625, "completions/mean_terminated_length": 1032.5709991455078, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.2945, "grad_norm": 0.3236791491508484, "kl": 0.0302734375, "learning_rate": 9.007020842191634e-07, "loss": 0.1367, "num_tokens": 52079250.0, "reward": 0.785160630941391, "reward_std": 0.4885980188846588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2508617453277111, "step": 589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1450.5, "completions/max_terminated_length": 1345.0, "completions/mean_length": 878.109375, "completions/mean_terminated_length": 727.9486846923828, "completions/min_length": 341.5, "completions/min_terminated_length": 341.5, "epoch": 0.295, "grad_norm": 0.45927149057388306, "kl": 0.0347900390625, "learning_rate": 
9.002094168367095e-07, "loss": 0.1806, "num_tokens": 52148361.0, "reward": 0.427734375, "reward_std": 0.09848404303193092, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.19696808606386185, "step": 590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1826.0, "completions/max_terminated_length": 1747.25, "completions/mean_length": 921.359375, "completions/mean_terminated_length": 822.8900909423828, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2955, "grad_norm": 0.3046923875808716, "kl": 0.037628173828125, "learning_rate": 8.997156826556369e-07, "loss": 0.0865, "num_tokens": 52216960.0, "reward": 0.4609375, "reward_std": 0.07073409110307693, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.14146818220615387, "step": 591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1709.25, "completions/max_terminated_length": 1438.75, "completions/mean_length": 836.984375, "completions/mean_terminated_length": 714.4937591552734, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.296, "grad_norm": 0.4469590187072754, "kl": 0.0302734375, "learning_rate": 8.992208831799456e-07, "loss": 0.1742, "num_tokens": 52280527.0, "reward": 0.504909336566925, "reward_std": 0.21406473591923714, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024915602058172226, "rewards/penalized_accuracy_reward/std": 0.06808238476514816, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 
0.16945893317461014, "step": 592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1580.0, "completions/max_terminated_length": 1329.75, "completions/mean_length": 854.625, "completions/mean_terminated_length": 751.8392944335938, "completions/min_length": 388.75, "completions/min_terminated_length": 388.75, "epoch": 0.2965, "grad_norm": 0.3415658473968506, "kl": 0.02801513671875, "learning_rate": 8.987250199168808e-07, "loss": 0.1268, "num_tokens": 52348023.0, "reward": 0.6183881461620331, "reward_std": 0.24876977875828743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07481906563043594, "rewards/penalized_accuracy_reward/std": 0.0997588038444519, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14459824562072754, "step": 593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 957.328125, "completions/mean_terminated_length": 850.1440887451172, "completions/min_length": 263.75, "completions/min_terminated_length": 263.75, "epoch": 0.297, "grad_norm": 0.4812287390232086, "kl": 0.036102294921875, "learning_rate": 8.982280943769278e-07, "loss": 0.1497, "num_tokens": 52417132.0, "reward": 0.5819331705570221, "reward_std": 0.3456512354314327, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062450957484543324, "rewards/penalized_accuracy_reward/std": 0.13933026790618896, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22529706731438637, "step": 594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1845.75, "completions/max_terminated_length": 1311.75, "completions/mean_length": 935.796875, "completions/mean_terminated_length": 755.6238250732422, "completions/min_length": 333.25, 
"completions/min_terminated_length": 333.25, "epoch": 0.2975, "grad_norm": 0.5023441910743713, "kl": 0.048553466796875, "learning_rate": 8.977301080738079e-07, "loss": 0.4067, "num_tokens": 52486575.0, "reward": 0.596894383430481, "reward_std": 0.3091595992445946, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07481438666582108, "rewards/penalized_accuracy_reward/std": 0.09975256025791168, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.21930895745754242, "step": 595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1770.25, "completions/max_terminated_length": 1402.75, "completions/mean_length": 909.109375, "completions/mean_terminated_length": 748.6622314453125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.298, "grad_norm": 0.37006640434265137, "kl": 0.0313720703125, "learning_rate": 8.97231062524474e-07, "loss": 0.2837, "num_tokens": 52551878.0, "reward": 0.6792820394039154, "reward_std": 0.3026893138885498, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1247972697019577, "rewards/penalized_accuracy_reward/std": 0.09983798861503601, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.231594055891037, "step": 596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1436.5, "completions/mean_length": 939.65625, "completions/mean_terminated_length": 822.1262359619141, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.2985, "grad_norm": 0.498913049697876, "kl": 0.03955078125, "learning_rate": 8.967309592491052e-07, "loss": 0.2925, "num_tokens": 52620640.0, "reward": 0.5567715764045715, "reward_std": 0.33078255131840706, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.04987016413360834, "rewards/penalized_accuracy_reward/std": 0.1303049623966217, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21412638202309608, "step": 597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1970.25, "completions/max_terminated_length": 1618.0, "completions/mean_length": 921.828125, "completions/mean_terminated_length": 842.0878295898438, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.299, "grad_norm": 0.44612449407577515, "kl": 0.0372314453125, "learning_rate": 8.962297997711027e-07, "loss": 0.2782, "num_tokens": 52689205.0, "reward": 0.554930254817009, "reward_std": 0.3296592365950346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049926068633794785, "rewards/penalized_accuracy_reward/std": 0.1304454691708088, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.22119063138961792, "step": 598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1795.25, "completions/mean_length": 1165.265625, "completions/mean_terminated_length": 987.20458984375, "completions/min_length": 425.25, "completions/min_terminated_length": 425.25, "epoch": 0.2995, "grad_norm": 0.39814692735671997, "kl": 0.03240966796875, "learning_rate": 8.957275856170855e-07, "loss": 0.2302, "num_tokens": 52774998.0, "reward": 0.520247146487236, "reward_std": 0.31033239141106606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746731486171484, "rewards/penalized_accuracy_reward/std": 0.1182023286819458, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2211344614624977, "step": 599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 
1930.5, "completions/max_terminated_length": 1557.75, "completions/mean_length": 834.578125, "completions/mean_terminated_length": 769.3000183105469, "completions/min_length": 319.5, "completions/min_terminated_length": 319.5, "epoch": 0.3, "grad_norm": 0.7652565836906433, "kl": 0.04925537109375, "learning_rate": 8.952243183168848e-07, "loss": 0.381, "num_tokens": 52839915.0, "reward": 0.5108701288700104, "reward_std": 0.2194131650030613, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496631070971489, "rewards/penalized_accuracy_reward/std": 0.06822095066308975, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.19332484155893326, "step": 600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1868.25, "completions/max_terminated_length": 1690.0, "completions/mean_length": 1199.015625, "completions/mean_terminated_length": 1025.5013427734375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.3005, "grad_norm": 0.335091233253479, "kl": 0.032989501953125, "learning_rate": 8.9471999940354e-07, "loss": 0.2819, "num_tokens": 52924972.0, "reward": 0.404296875, "reward_std": 0.14041507057845592, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.28083014860749245, "step": 601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 739.8125, "completions/mean_terminated_length": 739.8125, "completions/min_length": 214.25, "completions/min_terminated_length": 214.25, "epoch": 0.301, "grad_norm": 0.6523067951202393, "kl": 0.044219970703125, "learning_rate": 8.942146304132943e-07, "loss": 0.0771, "num_tokens": 
52980864.0, "reward": 0.5573646426200867, "reward_std": 0.21278144046664238, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037471383810043335, "rewards/penalized_accuracy_reward/std": 0.08056104928255081, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11663510836660862, "step": 602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1645.0, "completions/max_terminated_length": 1393.25, "completions/mean_length": 900.484375, "completions/mean_terminated_length": 837.1347961425781, "completions/min_length": 362.25, "completions/min_terminated_length": 362.25, "epoch": 0.3015, "grad_norm": 0.43400126695632935, "kl": 0.03900146484375, "learning_rate": 8.937082128855891e-07, "loss": 0.1434, "num_tokens": 53047535.0, "reward": 0.758455291390419, "reward_std": 0.5371468290686607, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14973545633256435, "rewards/penalized_accuracy_reward/std": 0.24533718451857567, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.178252836689353, "step": 603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1772.5, "completions/max_terminated_length": 1031.25, "completions/mean_length": 835.125, "completions/mean_terminated_length": 593.768798828125, "completions/min_length": 309.75, "completions/min_terminated_length": 309.75, "epoch": 0.302, "grad_norm": 0.4432345926761627, "kl": 0.040679931640625, "learning_rate": 8.932007483630596e-07, "loss": 0.3003, "num_tokens": 53110311.0, "reward": 0.4507412537932396, "reward_std": 0.19330283999443054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480001896619797, "rewards/penalized_accuracy_reward/std": 0.049920011311769485, "rewards/tag_count_reward/mean": 0.8515625, 
"rewards/tag_count_reward/std": 0.24217836558818817, "step": 604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1921.25, "completions/max_terminated_length": 1640.0, "completions/mean_length": 882.609375, "completions/mean_terminated_length": 748.7308044433594, "completions/min_length": 300.5, "completions/min_terminated_length": 300.5, "epoch": 0.3025, "grad_norm": 0.3916875422000885, "kl": 0.036834716796875, "learning_rate": 8.926922383915315e-07, "loss": 0.3123, "num_tokens": 53175294.0, "reward": 0.5412374883890152, "reward_std": 0.3337825257331133, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049915626645088196, "rewards/penalized_accuracy_reward/std": 0.13043814897537231, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.23303577676415443, "step": 605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1919.25, "completions/max_terminated_length": 1627.0, "completions/mean_length": 1090.125, "completions/mean_terminated_length": 904.0766296386719, "completions/min_length": 316.75, "completions/min_terminated_length": 316.75, "epoch": 0.303, "grad_norm": 0.3405042588710785, "kl": 0.03302001953125, "learning_rate": 8.921826845200138e-07, "loss": 0.2057, "num_tokens": 53252694.0, "reward": 0.5697108805179596, "reward_std": 0.387961033731699, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07489450555294752, "rewards/penalized_accuracy_reward/std": 0.14557360112667084, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.22834060341119766, "step": 606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2000.25, "completions/max_terminated_length": 1813.5, "completions/mean_length": 1134.21875, "completions/mean_terminated_length": 842.063720703125, 
"completions/min_length": 338.75, "completions/min_terminated_length": 338.75, "epoch": 0.3035, "grad_norm": 0.3357657492160797, "kl": 0.03802490234375, "learning_rate": 8.916720883006963e-07, "loss": 0.1347, "num_tokens": 53335204.0, "reward": 0.6250461786985397, "reward_std": 0.48930710554122925, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11232777871191502, "rewards/penalized_accuracy_reward/std": 0.21798338741064072, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.20184841752052307, "step": 607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1395.25, "completions/max_terminated_length": 1287.5, "completions/mean_length": 764.8125, "completions/mean_terminated_length": 692.3645935058594, "completions/min_length": 260.25, "completions/min_terminated_length": 260.25, "epoch": 0.304, "grad_norm": 0.4030363857746124, "kl": 0.0328369140625, "learning_rate": 8.911604512889434e-07, "loss": 0.2146, "num_tokens": 53393912.0, "reward": 0.8700506389141083, "reward_std": 0.4495784230530262, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19967375695705414, "rewards/penalized_accuracy_reward/std": 0.20461168885231018, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.10063419491052628, "step": 608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 1031.359375, "completions/mean_terminated_length": 815.64013671875, "completions/min_length": 310.25, "completions/min_terminated_length": 310.25, "epoch": 0.3045, "grad_norm": 0.39655882120132446, "kl": 0.0364990234375, "learning_rate": 8.906477750432903e-07, "loss": 0.2659, "num_tokens": 53470543.0, "reward": 0.48510295152664185, "reward_std": 0.33077528327703476, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747335262596607, "rewards/penalized_accuracy_reward/std": 0.11822248250246048, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.28692178428173065, "step": 609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1473.5, "completions/max_terminated_length": 1442.25, "completions/mean_length": 838.5625, "completions/mean_terminated_length": 827.6395874023438, "completions/min_length": 287.75, "completions/min_terminated_length": 287.75, "epoch": 0.305, "grad_norm": 0.4367918372154236, "kl": 0.031524658203125, "learning_rate": 8.901340611254378e-07, "loss": 0.0906, "num_tokens": 53532451.0, "reward": 0.6149891912937164, "reward_std": 0.30586008355021477, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062377408146858215, "rewards/penalized_accuracy_reward/std": 0.1391153372824192, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06822281517088413, "step": 610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1873.75, "completions/max_terminated_length": 1555.25, "completions/mean_length": 1115.75, "completions/mean_terminated_length": 1007.7811279296875, "completions/min_length": 386.5, "completions/min_terminated_length": 386.5, "epoch": 0.3055, "grad_norm": 0.38622307777404785, "kl": 0.026458740234375, "learning_rate": 8.896193111002475e-07, "loss": 0.2125, "num_tokens": 53611619.0, "reward": 0.453125, "reward_std": 0.09693676605820656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19387353584170341, "step": 611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.171875, "completions/max_length": 1478.5, "completions/max_terminated_length": 1299.0, "completions/mean_length": 887.734375, "completions/mean_terminated_length": 687.8461608886719, "completions/min_length": 260.75, "completions/min_terminated_length": 260.75, "epoch": 0.306, "grad_norm": 0.31019556522369385, "kl": 0.030364990234375, "learning_rate": 8.891035265357371e-07, "loss": 0.1494, "num_tokens": 53680322.0, "reward": 0.46248772740364075, "reward_std": 0.16391568258404732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249386090785265, "rewards/penalized_accuracy_reward/std": 0.0499754436314106, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.15686860121786594, "step": 612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1432.75, "completions/mean_length": 1017.578125, "completions/mean_terminated_length": 880.2109527587891, "completions/min_length": 327.5, "completions/min_terminated_length": 327.5, "epoch": 0.3065, "grad_norm": 0.42505040764808655, "kl": 0.028106689453125, "learning_rate": 8.88586709003076e-07, "loss": 0.3377, "num_tokens": 53754295.0, "reward": 0.42578125, "reward_std": 0.11807363107800484, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.23614726588129997, "step": 613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 1269.4375, "completions/mean_terminated_length": 898.6923522949219, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.307, "grad_norm": 0.3787683844566345, "kl": 0.03045654296875, "learning_rate": 
8.8806886007658e-07, "loss": 0.3492, "num_tokens": 53846435.0, "reward": 0.359375, "reward_std": 0.14993701875209808, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.29987405240535736, "step": 614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1557.75, "completions/max_terminated_length": 1366.75, "completions/mean_length": 939.984375, "completions/mean_terminated_length": 730.8020935058594, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.3075, "grad_norm": 0.23092368245124817, "kl": 0.03271484375, "learning_rate": 8.875499813337067e-07, "loss": 0.1855, "num_tokens": 53914610.0, "reward": 0.431640625, "reward_std": 0.08577096834778786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.17154193669557571, "step": 615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1705.0, "completions/max_terminated_length": 1407.75, "completions/mean_length": 1056.40625, "completions/mean_terminated_length": 783.9494171142578, "completions/min_length": 404.25, "completions/min_terminated_length": 404.25, "epoch": 0.308, "grad_norm": 0.43035027384757996, "kl": 0.04547119140625, "learning_rate": 8.87030074355051e-07, "loss": 0.2977, "num_tokens": 53991820.0, "reward": 0.8055772185325623, "reward_std": 0.45160458981990814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19966360554099083, "rewards/penalized_accuracy_reward/std": 0.17851387709379196, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 
0.23001603037118912, "step": 616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1467.0, "completions/max_terminated_length": 1296.25, "completions/mean_length": 901.0, "completions/mean_terminated_length": 723.0536041259766, "completions/min_length": 367.75, "completions/min_terminated_length": 367.75, "epoch": 0.3085, "grad_norm": 0.42341506481170654, "kl": 0.02850341796875, "learning_rate": 8.865091407243394e-07, "loss": 0.226, "num_tokens": 54058684.0, "reward": 0.956275999546051, "reward_std": 0.3804323263466358, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2623176723718643, "rewards/penalized_accuracy_reward/std": 0.14989428594708443, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.17587247677147388, "step": 617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1754.5, "completions/mean_length": 1167.828125, "completions/mean_terminated_length": 1037.8653259277344, "completions/min_length": 435.75, "completions/min_terminated_length": 435.75, "epoch": 0.309, "grad_norm": 0.37345874309539795, "kl": 0.02777099609375, "learning_rate": 8.859871820284261e-07, "loss": 0.2301, "num_tokens": 54142497.0, "reward": 0.5450503677129745, "reward_std": 0.28543636947870255, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049868933856487274, "rewards/penalized_accuracy_reward/std": 0.08920828253030777, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.24883094802498817, "step": 618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 1205.953125, "completions/mean_terminated_length": 1046.3090362548828, "completions/min_length": 500.75, 
"completions/min_terminated_length": 500.75, "epoch": 0.3095, "grad_norm": 0.3037377595901489, "kl": 0.028564453125, "learning_rate": 8.85464199857288e-07, "loss": 0.1601, "num_tokens": 54228286.0, "reward": 0.5334607660770416, "reward_std": 0.3476554434746504, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993349872529507, "rewards/penalized_accuracy_reward/std": 0.13644429296255112, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.25441059842705727, "step": 619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1691.75, "completions/max_terminated_length": 1629.25, "completions/mean_length": 885.046875, "completions/mean_terminated_length": 782.6041870117188, "completions/min_length": 279.25, "completions/min_terminated_length": 279.25, "epoch": 0.31, "grad_norm": 0.4097415506839752, "kl": 0.03314208984375, "learning_rate": 8.849401958040192e-07, "loss": 0.2565, "num_tokens": 54292609.0, "reward": 0.7161563783884048, "reward_std": 0.3971068933606148, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12467975355684757, "rewards/penalized_accuracy_reward/std": 0.17113158106803894, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14788984507322311, "step": 620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1697.25, "completions/mean_length": 1120.984375, "completions/mean_terminated_length": 965.2140655517578, "completions/min_length": 446.75, "completions/min_terminated_length": 446.75, "epoch": 0.3105, "grad_norm": 0.4559696912765503, "kl": 0.02960205078125, "learning_rate": 8.844151714648274e-07, "loss": 0.2362, "num_tokens": 54372048.0, "reward": 0.6294115781784058, "reward_std": 0.3161335811018944, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09986203908920288, "rewards/penalized_accuracy_reward/std": 0.1031372919678688, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2637249082326889, "step": 621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.25, "completions/mean_length": 1072.671875, "completions/mean_terminated_length": 846.07568359375, "completions/min_length": 323.25, "completions/min_terminated_length": 323.25, "epoch": 0.311, "grad_norm": 0.4277040362358093, "kl": 0.039642333984375, "learning_rate": 8.838891284390273e-07, "loss": 0.3465, "num_tokens": 54451723.0, "reward": 0.419921875, "reward_std": 0.1454715896397829, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.2909431792795658, "step": 622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1910.5, "completions/max_terminated_length": 1419.5, "completions/mean_length": 723.625, "completions/mean_terminated_length": 663.377555847168, "completions/min_length": 262.25, "completions/min_terminated_length": 262.25, "epoch": 0.3115, "grad_norm": 0.4175041913986206, "kl": 0.0372314453125, "learning_rate": 8.833620683290375e-07, "loss": 0.2759, "num_tokens": 54506211.0, "reward": 1.001018300652504, "reward_std": 0.7308917194604874, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622279077768326, "rewards/penalized_accuracy_reward/std": 0.359072670340538, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16395078226923943, "step": 623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1219.5, 
"completions/max_terminated_length": 1137.5, "completions/mean_length": 555.765625, "completions/mean_terminated_length": 521.8169708251953, "completions/min_length": 181.5, "completions/min_terminated_length": 181.5, "epoch": 0.312, "grad_norm": 0.52876216173172, "kl": 0.03857421875, "learning_rate": 8.828339927403745e-07, "loss": 0.1159, "num_tokens": 54550436.0, "reward": 0.6112006902694702, "reward_std": 0.22892003692686558, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062436286360025406, "rewards/penalized_accuracy_reward/std": 0.09564511477947235, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07525964826345444, "step": 624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1559.75, "completions/max_terminated_length": 1453.0, "completions/mean_length": 785.78125, "completions/mean_terminated_length": 746.0513458251953, "completions/min_length": 288.75, "completions/min_terminated_length": 288.75, "epoch": 0.3125, "grad_norm": 0.4276594817638397, "kl": 0.0369873046875, "learning_rate": 8.823049032816478e-07, "loss": 0.1625, "num_tokens": 54609334.0, "reward": 0.47265625, "reward_std": 0.06924767419695854, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13849535211920738, "step": 625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1545.75, "completions/max_terminated_length": 1458.0, "completions/mean_length": 901.484375, "completions/mean_terminated_length": 770.421875, "completions/min_length": 435.5, "completions/min_terminated_length": 435.5, "epoch": 0.313, "grad_norm": 0.35870763659477234, "kl": 0.037078857421875, "learning_rate": 8.817748015645558e-07, "loss": 0.1433, "num_tokens": 
54677093.0, "reward": 0.7045192420482635, "reward_std": 0.40748805925250053, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12472054734826088, "rewards/penalized_accuracy_reward/std": 0.18269464373588562, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.09751351922750473, "step": 626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1374.25, "completions/max_terminated_length": 930.25, "completions/mean_length": 721.96875, "completions/mean_terminated_length": 506.8777389526367, "completions/min_length": 240.25, "completions/min_terminated_length": 240.25, "epoch": 0.3135, "grad_norm": 0.5172901749610901, "kl": 0.048583984375, "learning_rate": 8.812436892038805e-07, "loss": 0.3382, "num_tokens": 54733219.0, "reward": 0.5202065408229828, "reward_std": 0.24126176163554192, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374470129609108, "rewards/penalized_accuracy_reward/std": 0.08050870150327682, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.16048868745565414, "step": 627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1181.25, "completions/mean_length": 1071.890625, "completions/mean_terminated_length": 740.6762008666992, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 0.314, "grad_norm": 0.48264992237091064, "kl": 0.036895751953125, "learning_rate": 8.807115678174819e-07, "loss": 0.3659, "num_tokens": 54812684.0, "reward": 0.5693257451057434, "reward_std": 0.32196369767189026, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873972475528717, "rewards/penalized_accuracy_reward/std": 0.1023491844534874, "rewards/tag_count_reward/mean": 0.7890625, 
"rewards/tag_count_reward/std": 0.26619137078523636, "step": 628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1900.25, "completions/max_terminated_length": 1657.0, "completions/mean_length": 776.609375, "completions/mean_terminated_length": 621.0577087402344, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3145, "grad_norm": 0.5324896574020386, "kl": 0.05810546875, "learning_rate": 8.801784390262943e-07, "loss": 0.304, "num_tokens": 54874163.0, "reward": 0.6049021482467651, "reward_std": 0.28287386521697044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491201907396317, "rewards/penalized_accuracy_reward/std": 0.09988279640674591, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2044307142496109, "step": 629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1933.5, "completions/max_terminated_length": 1498.25, "completions/mean_length": 900.84375, "completions/mean_terminated_length": 743.9531402587891, "completions/min_length": 225.5, "completions/min_terminated_length": 225.5, "epoch": 0.315, "grad_norm": 0.32549354434013367, "kl": 0.03570556640625, "learning_rate": 8.796443044543203e-07, "loss": 0.2047, "num_tokens": 54940649.0, "reward": 0.5432431399822235, "reward_std": 0.3187037371098995, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994187783449888, "rewards/penalized_accuracy_reward/std": 0.1304815523326397, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.19791140407323837, "step": 630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1986.75, "completions/max_terminated_length": 1739.5, "completions/mean_length": 911.4375, "completions/mean_terminated_length": 774.1842651367188, "completions/min_length": 
265.5, "completions/min_terminated_length": 265.5, "epoch": 0.3155, "grad_norm": 0.42144152522087097, "kl": 0.036529541015625, "learning_rate": 8.791091657286267e-07, "loss": 0.2554, "num_tokens": 55009221.0, "reward": 0.46636103093624115, "reward_std": 0.18011178076267242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012477392330765724, "rewards/penalized_accuracy_reward/std": 0.0499095693230629, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2418053150177002, "step": 631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1588.75, "completions/max_terminated_length": 1455.0, "completions/mean_length": 871.578125, "completions/mean_terminated_length": 758.6488189697266, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.316, "grad_norm": 0.34393173456192017, "kl": 0.0372314453125, "learning_rate": 8.785730244793386e-07, "loss": 0.2538, "num_tokens": 55074010.0, "reward": 0.6277958452701569, "reward_std": 0.44380518421530724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08733541797846556, "rewards/penalized_accuracy_reward/std": 0.1953153796494007, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.17864800989627838, "step": 632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1963.0, "completions/max_terminated_length": 1563.5, "completions/mean_length": 913.5625, "completions/mean_terminated_length": 823.456428527832, "completions/min_length": 296.75, "completions/min_terminated_length": 296.75, "epoch": 0.3165, "grad_norm": 0.4044509530067444, "kl": 0.043212890625, "learning_rate": 8.780358823396352e-07, "loss": 0.186, "num_tokens": 55143566.0, "reward": 0.6491726636886597, "reward_std": 0.2947415728121996, "rewards/format_reward/mean": 0.015625, 
"rewards/format_reward/std": 0.0625, "rewards/penalized_accuracy_reward/mean": 0.08728164434432983, "rewards/penalized_accuracy_reward/std": 0.10221382230520248, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.17260181531310081, "step": 633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1599.75, "completions/max_terminated_length": 1499.75, "completions/mean_length": 799.953125, "completions/mean_terminated_length": 778.7104187011719, "completions/min_length": 251.75, "completions/min_terminated_length": 251.75, "epoch": 0.317, "grad_norm": 0.4251973032951355, "kl": 0.032928466796875, "learning_rate": 8.774977409457447e-07, "loss": -0.2209, "num_tokens": 55205179.0, "reward": 0.7069846093654633, "reward_std": 0.37017929553985596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11228136718273163, "rewards/penalized_accuracy_reward/std": 0.17039788514375687, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11894455552101135, "step": 634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1649.5, "completions/mean_length": 1207.015625, "completions/mean_terminated_length": 943.2183685302734, "completions/min_length": 482.25, "completions/min_terminated_length": 482.25, "epoch": 0.3175, "grad_norm": 0.33479395508766174, "kl": 0.028106689453125, "learning_rate": 8.769586019369391e-07, "loss": 0.3163, "num_tokens": 55292060.0, "reward": 0.6439521908760071, "reward_std": 0.33914145454764366, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12471046298742294, "rewards/penalized_accuracy_reward/std": 0.09976840764284134, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.29791542887687683, "step": 635 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 584.9375, "completions/mean_terminated_length": 584.9375, "completions/min_length": 260.5, "completions/min_terminated_length": 260.5, "epoch": 0.318, "grad_norm": 0.6023445129394531, "kl": 0.0411376953125, "learning_rate": 8.764184669555293e-07, "loss": 0.1973, "num_tokens": 55339064.0, "reward": 0.5441095530986786, "reward_std": 0.15997862815856934, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024984467774629593, "rewards/penalized_accuracy_reward/std": 0.06827056407928467, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2029.75, "completions/max_terminated_length": 1711.75, "completions/mean_length": 1062.921875, "completions/mean_terminated_length": 895.3437652587891, "completions/min_length": 455.25, "completions/min_terminated_length": 455.25, "epoch": 0.3185, "grad_norm": 0.40889623761177063, "kl": 0.035614013671875, "learning_rate": 8.758773376468604e-07, "loss": 0.275, "num_tokens": 55415843.0, "reward": 0.6848480552434921, "reward_std": 0.4547341614961624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12465058639645576, "rewards/penalized_accuracy_reward/std": 0.18892636895179749, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.23797086998820305, "step": 637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1823.25, "completions/max_terminated_length": 1363.25, "completions/mean_length": 1105.3125, "completions/mean_terminated_length": 833.2991180419922, "completions/min_length": 369.25, "completions/min_terminated_length": 369.25, "epoch": 0.319, "grad_norm": 0.2552751898765564, 
"kl": 0.043731689453125, "learning_rate": 8.753352156593055e-07, "loss": 0.1403, "num_tokens": 55497783.0, "reward": 0.7423095107078552, "reward_std": 0.5213407203555107, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16217038873583078, "rewards/penalized_accuracy_reward/std": 0.23269116133451462, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.22746434807777405, "step": 638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.25, "completions/mean_length": 1213.125, "completions/mean_terminated_length": 881.1054382324219, "completions/min_length": 254.75, "completions/min_terminated_length": 254.75, "epoch": 0.3195, "grad_norm": 3.8428902626037598, "kl": 0.109405517578125, "learning_rate": 8.747921026442629e-07, "loss": 0.417, "num_tokens": 55584767.0, "reward": 0.392578125, "reward_std": 0.16452470794320107, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.3290494233369827, "step": 639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1925.0, "completions/max_terminated_length": 1651.75, "completions/mean_length": 1031.34375, "completions/mean_terminated_length": 884.1114807128906, "completions/min_length": 435.75, "completions/min_terminated_length": 435.75, "epoch": 0.32, "grad_norm": 0.36546197533607483, "kl": 0.02911376953125, "learning_rate": 8.742480002561487e-07, "loss": 0.1708, "num_tokens": 55659157.0, "reward": 0.752230703830719, "reward_std": 0.5759753882884979, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16224816255271435, "rewards/penalized_accuracy_reward/std": 
0.2598419263958931, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.23655857890844345, "step": 640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1575.0, "completions/max_terminated_length": 1240.5, "completions/mean_length": 745.03125, "completions/mean_terminated_length": 586.7211608886719, "completions/min_length": 255.25, "completions/min_terminated_length": 255.25, "epoch": 0.3205, "grad_norm": 63.92862319946289, "kl": 0.84197998046875, "learning_rate": 8.737029101523929e-07, "loss": 0.3091, "num_tokens": 55716695.0, "reward": 0.45858001708984375, "reward_std": 0.17700516991317272, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012493135407567024, "rewards/penalized_accuracy_reward/std": 0.049972545355558395, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.20839229598641396, "step": 641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.25, "completions/mean_length": 1173.0625, "completions/mean_terminated_length": 875.6727447509766, "completions/min_length": 242.25, "completions/min_terminated_length": 242.25, "epoch": 0.321, "grad_norm": 0.4779733717441559, "kl": 0.04083251953125, "learning_rate": 8.731568339934348e-07, "loss": 0.4063, "num_tokens": 55800635.0, "reward": 0.396484375, "reward_std": 0.14884895831346512, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.29769792407751083, "step": 642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2001.0, "completions/max_terminated_length": 1508.5, "completions/mean_length": 846.515625, "completions/mean_terminated_length": 
775.8961639404297, "completions/min_length": 408.5, "completions/min_terminated_length": 408.5, "epoch": 0.3215, "grad_norm": 0.45955801010131836, "kl": 0.038299560546875, "learning_rate": 8.726097734427172e-07, "loss": 0.2164, "num_tokens": 55866764.0, "reward": 0.49558551609516144, "reward_std": 0.1613723263144493, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012441194616258144, "rewards/penalized_accuracy_reward/std": 0.049764782190322876, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16209635883569717, "step": 643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1725.5, "completions/max_terminated_length": 1324.25, "completions/mean_length": 910.59375, "completions/mean_terminated_length": 669.8367767333984, "completions/min_length": 355.5, "completions/min_terminated_length": 355.5, "epoch": 0.322, "grad_norm": 0.6651291251182556, "kl": 0.0513916015625, "learning_rate": 8.72061730166681e-07, "loss": 0.3674, "num_tokens": 55934674.0, "reward": 0.7428238093852997, "reward_std": 0.2695072181522846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16242754459381104, "rewards/penalized_accuracy_reward/std": 0.08058663457632065, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.23092791624367237, "step": 644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1639.0, "completions/max_terminated_length": 1466.5, "completions/mean_length": 1070.859375, "completions/mean_terminated_length": 791.9750061035156, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.3225, "grad_norm": 0.451645165681839, "kl": 0.039703369140625, "learning_rate": 8.715127058347614e-07, "loss": 0.1928, "num_tokens": 56014041.0, "reward": 0.4350516349077225, "reward_std": 0.19016793370246887, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012447690591216087, "rewards/penalized_accuracy_reward/std": 0.04979076236486435, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.23560860008001328, "step": 645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1957.0, "completions/max_terminated_length": 1658.75, "completions/mean_length": 1189.359375, "completions/mean_terminated_length": 969.9859313964844, "completions/min_length": 565.75, "completions/min_terminated_length": 565.75, "epoch": 0.323, "grad_norm": 0.3978725075721741, "kl": 0.0409393310546875, "learning_rate": 8.709627021193816e-07, "loss": 0.2527, "num_tokens": 56100432.0, "reward": 0.404296875, "reward_std": 0.11217692121863365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.2243538573384285, "step": 646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1883.5, "completions/max_terminated_length": 1425.75, "completions/mean_length": 843.53125, "completions/mean_terminated_length": 543.2364120483398, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3235, "grad_norm": 0.3415524363517761, "kl": 0.052734375, "learning_rate": 8.704117206959484e-07, "loss": 0.2896, "num_tokens": 56165426.0, "reward": 0.416015625, "reward_std": 0.09222675487399101, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.18445350974798203, "step": 647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1679.25, 
"completions/max_terminated_length": 1302.75, "completions/mean_length": 1055.765625, "completions/mean_terminated_length": 819.6672821044922, "completions/min_length": 364.75, "completions/min_terminated_length": 364.75, "epoch": 0.324, "grad_norm": 0.5403870940208435, "kl": 0.0435791015625, "learning_rate": 8.698597632428466e-07, "loss": 0.176, "num_tokens": 56241955.0, "reward": 0.4404217302799225, "reward_std": 0.22341754846274853, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024898363277316093, "rewards/penalized_accuracy_reward/std": 0.06803528219461441, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.22139518707990646, "step": 648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1283.5, "completions/mean_length": 753.59375, "completions/mean_terminated_length": 622.1369400024414, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.3245, "grad_norm": 0.6642272472381592, "kl": 0.034820556640625, "learning_rate": 8.693068314414344e-07, "loss": 0.3033, "num_tokens": 56303433.0, "reward": 0.7183797210454941, "reward_std": 0.5319165773689747, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1248148512095213, "rewards/penalized_accuracy_reward/std": 0.24431907385587692, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1868420448154211, "step": 649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1818.5, "completions/max_terminated_length": 1495.5, "completions/mean_length": 955.59375, "completions/mean_terminated_length": 820.0337677001953, "completions/min_length": 386.5, "completions/min_terminated_length": 386.5, "epoch": 0.325, "grad_norm": 0.5052257776260376, "kl": 0.02783203125, "learning_rate": 8.687529269760379e-07, 
"loss": 0.2601, "num_tokens": 56372671.0, "reward": 0.4761258512735367, "reward_std": 0.19926531985402107, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012476989068090916, "rewards/penalized_accuracy_reward/std": 0.04990795999765396, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.19889883697032928, "step": 650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1892.25, "completions/max_terminated_length": 1671.0, "completions/mean_length": 1000.421875, "completions/mean_terminated_length": 830.3982696533203, "completions/min_length": 341.5, "completions/min_terminated_length": 341.5, "epoch": 0.3255, "grad_norm": 0.3504140079021454, "kl": 0.029022216796875, "learning_rate": 8.681980515339463e-07, "loss": 0.2016, "num_tokens": 56444554.0, "reward": 0.716090053319931, "reward_std": 0.45094114914536476, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13734189420938492, "rewards/penalized_accuracy_reward/std": 0.19165977835655212, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.21047796308994293, "step": 651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1704.5, "completions/max_terminated_length": 1414.5, "completions/mean_length": 789.28125, "completions/mean_terminated_length": 684.2816162109375, "completions/min_length": 255.25, "completions/min_terminated_length": 255.25, "epoch": 0.326, "grad_norm": 0.35127702355384827, "kl": 0.03619384765625, "learning_rate": 8.676422068054064e-07, "loss": 0.2581, "num_tokens": 56502780.0, "reward": 0.8312894403934479, "reward_std": 0.3692151606082916, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18712907657027245, "rewards/penalized_accuracy_reward/std": 0.1486886963248253, 
"rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19039630144834518, "step": 652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1709.0, "completions/max_terminated_length": 1418.0, "completions/mean_length": 713.5625, "completions/mean_terminated_length": 596.4467926025391, "completions/min_length": 257.5, "completions/min_terminated_length": 257.5, "epoch": 0.3265, "grad_norm": 0.5727707147598267, "kl": 0.038421630859375, "learning_rate": 8.670853944836176e-07, "loss": 0.2223, "num_tokens": 56559216.0, "reward": 0.6087927222251892, "reward_std": 0.268830019980669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0749041810631752, "rewards/penalized_accuracy_reward/std": 0.09987225383520126, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.13817108422517776, "step": 653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1757.75, "completions/max_terminated_length": 1438.75, "completions/mean_length": 765.25, "completions/mean_terminated_length": 706.0687866210938, "completions/min_length": 351.25, "completions/min_terminated_length": 351.25, "epoch": 0.327, "grad_norm": 0.6504042744636536, "kl": 0.045074462890625, "learning_rate": 8.665276162647267e-07, "loss": 0.2286, "num_tokens": 56619792.0, "reward": 0.470703125, "reward_std": 0.08527654223144054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17055309563875198, "step": 654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1609.5, "completions/max_terminated_length": 1355.5, "completions/mean_length": 736.75, "completions/mean_terminated_length": 713.6239624023438, 
"completions/min_length": 296.5, "completions/min_terminated_length": 296.5, "epoch": 0.3275, "grad_norm": 0.6054185628890991, "kl": 0.0382080078125, "learning_rate": 8.659688738478231e-07, "loss": 0.1733, "num_tokens": 56673504.0, "reward": 0.7695435881614685, "reward_std": 0.4114902764558792, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13867804035544395, "rewards/penalized_accuracy_reward/std": 0.19336096197366714, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1613.75, "completions/max_terminated_length": 1468.5, "completions/mean_length": 624.1875, "completions/mean_terminated_length": 578.3325958251953, "completions/min_length": 227.75, "completions/min_terminated_length": 227.75, "epoch": 0.328, "grad_norm": 0.5894107222557068, "kl": 0.057373046875, "learning_rate": 8.654091689349329e-07, "loss": 0.2346, "num_tokens": 56725884.0, "reward": 0.5206549316644669, "reward_std": 0.20357568934559822, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024975907057523727, "rewards/penalized_accuracy_reward/std": 0.06824716925621033, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13416270911693573, "step": 656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1924.5, "completions/max_terminated_length": 1612.25, "completions/mean_length": 1059.9375, "completions/mean_terminated_length": 935.9113311767578, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.3285, "grad_norm": 0.3330658972263336, "kl": 0.0277099609375, "learning_rate": 8.648485032310144e-07, "loss": 0.2377, "num_tokens": 56802856.0, "reward": 0.8483972549438477, "reward_std": 0.5084436498582363, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19958925247192383, "rewards/penalized_accuracy_reward/std": 0.2257288619875908, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.19544945657253265, "step": 657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1659.25, "completions/max_terminated_length": 1468.25, "completions/mean_length": 766.21875, "completions/mean_terminated_length": 728.8010559082031, "completions/min_length": 304.25, "completions/min_terminated_length": 304.25, "epoch": 0.329, "grad_norm": 0.45431119203567505, "kl": 0.041168212890625, "learning_rate": 8.642868784439527e-07, "loss": 0.2061, "num_tokens": 56862278.0, "reward": 0.756956547498703, "reward_std": 0.3494044691324234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13726733438670635, "rewards/penalized_accuracy_reward/std": 0.14976096525788307, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1265372931957245, "step": 658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1520.5, "completions/max_terminated_length": 1162.0, "completions/mean_length": 675.640625, "completions/mean_terminated_length": 570.6697998046875, "completions/min_length": 245.75, "completions/min_terminated_length": 245.75, "epoch": 0.3295, "grad_norm": 0.7085420489311218, "kl": 0.04437255859375, "learning_rate": 8.63724296284554e-07, "loss": 0.2157, "num_tokens": 56914383.0, "reward": 0.5167614221572876, "reward_std": 0.2581435889005661, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0249822698533535, "rewards/penalized_accuracy_reward/std": 0.0999290868639946, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1648966744542122, "step": 659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.0625, "completions/max_length": 1388.75, "completions/max_terminated_length": 1288.0, "completions/mean_length": 728.15625, "completions/mean_terminated_length": 660.2839965820312, "completions/min_length": 314.5, "completions/min_terminated_length": 314.5, "epoch": 0.33, "grad_norm": 0.38744863867759705, "kl": 0.03436279296875, "learning_rate": 8.631607584665413e-07, "loss": 0.182, "num_tokens": 56970537.0, "reward": 0.7470422238111496, "reward_std": 0.42454393208026886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1371929869055748, "rewards/penalized_accuracy_reward/std": 0.19148292392492294, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14933442324399948, "step": 660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2022.25, "completions/max_terminated_length": 1652.75, "completions/mean_length": 908.75, "completions/mean_terminated_length": 701.2291870117188, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3305, "grad_norm": 0.6051314473152161, "kl": 0.037384033203125, "learning_rate": 8.625962667065487e-07, "loss": 0.3355, "num_tokens": 57036457.0, "reward": 0.6123197227716446, "reward_std": 0.4526142403483391, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08740985859185457, "rewards/penalized_accuracy_reward/std": 0.19551756605505943, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.22436867654323578, "step": 661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1789.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 837.796875, "completions/mean_terminated_length": 687.9733734130859, "completions/min_length": 236.75, "completions/min_terminated_length": 236.75, "epoch": 0.331, "grad_norm": 0.4679225981235504, "kl": 
0.03619384765625, "learning_rate": 8.620308227241157e-07, "loss": 0.2541, "num_tokens": 57101468.0, "reward": 0.7047668546438217, "reward_std": 0.5010490156710148, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12484437879174948, "rewards/penalized_accuracy_reward/std": 0.2204299010336399, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.19427300989627838, "step": 662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1850.0, "completions/max_terminated_length": 1336.25, "completions/mean_length": 787.234375, "completions/mean_terminated_length": 706.7960052490234, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.3315, "grad_norm": 4.4539313316345215, "kl": 0.083251953125, "learning_rate": 8.614644282416831e-07, "loss": 0.225, "num_tokens": 57162955.0, "reward": 0.49177730083465576, "reward_std": 0.17845379747450352, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490208260715008, "rewards/penalized_accuracy_reward/std": 0.04996083304286003, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18383773788809776, "step": 663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2024.5, "completions/max_terminated_length": 1601.25, "completions/mean_length": 750.65625, "completions/mean_terminated_length": 678.8931732177734, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.332, "grad_norm": 0.5116720199584961, "kl": 0.032135009765625, "learning_rate": 8.608970849845862e-07, "loss": 0.2907, "num_tokens": 57219765.0, "reward": 0.7722051441669464, "reward_std": 0.4483204819262028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": 
null, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16549422964453697, "step": 664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1311.5, "completions/mean_length": 912.984375, "completions/mean_terminated_length": 748.3647308349609, "completions/min_length": 259.25, "completions/min_terminated_length": 259.25, "epoch": 0.3325, "grad_norm": 0.6298213005065918, "kl": 0.03839111328125, "learning_rate": 8.603287946810513e-07, "loss": 0.4499, "num_tokens": 57290052.0, "reward": 0.570084810256958, "reward_std": 0.29257190600037575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062386155128479004, "rewards/penalized_accuracy_reward/std": 0.09556835889816284, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.23886634036898613, "step": 665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1712.5, "completions/max_terminated_length": 1518.75, "completions/mean_length": 769.125, "completions/mean_terminated_length": 604.8575592041016, "completions/min_length": 203.75, "completions/min_terminated_length": 203.75, "epoch": 0.333, "grad_norm": 0.606941819190979, "kl": 0.043792724609375, "learning_rate": 8.597595590621892e-07, "loss": 0.3595, "num_tokens": 57349004.0, "reward": 0.4453125, "reward_std": 0.11179974302649498, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.22359948605298996, "step": 666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1184.5, "completions/max_terminated_length": 1137.0, "completions/mean_length": 777.0, "completions/mean_terminated_length": 723.1363830566406, 
"completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.3335, "grad_norm": 0.3617597818374634, "kl": 0.025634765625, "learning_rate": 8.591893798619903e-07, "loss": 0.1225, "num_tokens": 57407948.0, "reward": 0.5108873322606087, "reward_std": 0.23254762589931488, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024974919855594635, "rewards/penalized_accuracy_reward/std": 0.09989968314766884, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.13063044100999832, "step": 667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1662.25, "completions/max_terminated_length": 1171.75, "completions/mean_length": 803.03125, "completions/mean_terminated_length": 680.8388061523438, "completions/min_length": 264.25, "completions/min_terminated_length": 264.25, "epoch": 0.334, "grad_norm": 0.4217642843723297, "kl": 0.033935546875, "learning_rate": 8.586182588173194e-07, "loss": 0.1601, "num_tokens": 57467550.0, "reward": 0.7935260087251663, "reward_std": 0.4768768046051264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16238800436258316, "rewards/penalized_accuracy_reward/std": 0.21815772727131844, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1593499705195427, "step": 668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1472.75, "completions/max_terminated_length": 1372.5, "completions/mean_length": 993.640625, "completions/mean_terminated_length": 802.5866546630859, "completions/min_length": 261.75, "completions/min_terminated_length": 261.75, "epoch": 0.3345, "grad_norm": 0.39722251892089844, "kl": 0.02838134765625, "learning_rate": 8.580461976679099e-07, "loss": 0.1295, "num_tokens": 57540807.0, "reward": 0.7633879333734512, "reward_std": 0.484527800232172, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17466270923614502, "rewards/penalized_accuracy_reward/std": 0.19780827313661575, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.23038379102945328, "step": 669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1445.25, "completions/max_terminated_length": 1170.0, "completions/mean_length": 697.90625, "completions/mean_terminated_length": 638.8296203613281, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.335, "grad_norm": 0.5433564782142639, "kl": 0.036834716796875, "learning_rate": 8.574731981563597e-07, "loss": 0.1886, "num_tokens": 57595473.0, "reward": 0.9200735092163086, "reward_std": 0.46785217337310314, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2246851772069931, "rewards/penalized_accuracy_reward/std": 0.2030104249715805, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12366268411278725, "step": 670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1932.75, "completions/max_terminated_length": 1770.75, "completions/mean_length": 860.609375, "completions/mean_terminated_length": 763.7973480224609, "completions/min_length": 294.75, "completions/min_terminated_length": 294.75, "epoch": 0.3355, "grad_norm": 0.5174123644828796, "kl": 0.03924560546875, "learning_rate": 8.568992620281243e-07, "loss": 0.3471, "num_tokens": 57658520.0, "reward": 0.4781147390604019, "reward_std": 0.1938327830284834, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012494870461523533, "rewards/penalized_accuracy_reward/std": 0.04997948184609413, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2300429493188858, "step": 671 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.203125, "completions/max_length": 1911.25, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1127.328125, "completions/mean_terminated_length": 931.9107666015625, "completions/min_length": 367.25, "completions/min_terminated_length": 367.25, "epoch": 0.336, "grad_norm": 0.33028411865234375, "kl": 0.0255126953125, "learning_rate": 8.56324391031513e-07, "loss": 0.2982, "num_tokens": 57737069.0, "reward": 0.6273581385612488, "reward_std": 0.3071911633014679, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09981187433004379, "rewards/penalized_accuracy_reward/std": 0.10308531671762466, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.2241397500038147, "step": 672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1861.5, "completions/max_terminated_length": 1416.5, "completions/mean_length": 875.34375, "completions/mean_terminated_length": 753.3124389648438, "completions/min_length": 353.75, "completions/min_terminated_length": 353.75, "epoch": 0.3365, "grad_norm": 0.880089282989502, "kl": 0.064178466796875, "learning_rate": 8.557485869176825e-07, "loss": 0.3171, "num_tokens": 57802019.0, "reward": 0.7296172827482224, "reward_std": 0.4109017439186573, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13726957701146603, "rewards/penalized_accuracy_reward/std": 0.17052824795246124, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.16614429652690887, "step": 673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1407.0, "completions/mean_length": 1043.4375, "completions/mean_terminated_length": 781.1030578613281, "completions/min_length": 354.5, "completions/min_terminated_length": 354.5, "epoch": 0.337, "grad_norm": 
0.5259400010108948, "kl": 0.03564453125, "learning_rate": 8.551718514406318e-07, "loss": 0.3691, "num_tokens": 57879743.0, "reward": 0.412109375, "reward_std": 0.13682801835238934, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.27365603670477867, "step": 674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1696.25, "completions/max_terminated_length": 1516.25, "completions/mean_length": 793.671875, "completions/mean_terminated_length": 690.0673217773438, "completions/min_length": 264.5, "completions/min_terminated_length": 264.5, "epoch": 0.3375, "grad_norm": 0.3714185059070587, "kl": 0.052001953125, "learning_rate": 8.545941863571973e-07, "loss": 0.1187, "num_tokens": 57938794.0, "reward": 0.48590974509716034, "reward_std": 0.1481589376926422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486121617257595, "rewards/penalized_accuracy_reward/std": 0.04994449391961098, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.14585772156715393, "step": 675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.25, "completions/mean_length": 1133.515625, "completions/mean_terminated_length": 959.9372863769531, "completions/min_length": 350.5, "completions/min_terminated_length": 350.5, "epoch": 0.338, "grad_norm": 0.3675209879875183, "kl": 0.0308837890625, "learning_rate": 8.540155934270471e-07, "loss": 0.3457, "num_tokens": 58019979.0, "reward": 0.6714324653148651, "reward_std": 0.3174782171845436, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12477873265743256, "rewards/penalized_accuracy_reward/std": 
0.09982305765151978, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.24581274390220642, "step": 676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1636.75, "completions/max_terminated_length": 1350.25, "completions/mean_length": 1082.109375, "completions/mean_terminated_length": 791.982177734375, "completions/min_length": 385.75, "completions/min_terminated_length": 385.75, "epoch": 0.3385, "grad_norm": 0.24614571034908295, "kl": 0.03759765625, "learning_rate": 8.534360744126753e-07, "loss": 0.1962, "num_tokens": 58097714.0, "reward": 0.42535632848739624, "reward_std": 0.1962723396718502, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482849881052971, "rewards/penalized_accuracy_reward/std": 0.049931399524211884, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.24598412215709686, "step": 677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1296.25, "completions/max_terminated_length": 970.0, "completions/mean_length": 725.046875, "completions/mean_terminated_length": 514.8437652587891, "completions/min_length": 191.75, "completions/min_terminated_length": 191.75, "epoch": 0.339, "grad_norm": 0.5067505240440369, "kl": 0.05023193359375, "learning_rate": 8.528556310793979e-07, "loss": 0.2556, "num_tokens": 58154149.0, "reward": 0.431640625, "reward_std": 0.09686268866062164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.1937253773212433, "step": 678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1345.75, "completions/max_terminated_length": 1345.75, "completions/mean_length": 650.703125, "completions/mean_terminated_length": 
650.703125, "completions/min_length": 302.25, "completions/min_terminated_length": 302.25, "epoch": 0.3395, "grad_norm": 0.3210277855396271, "kl": 0.03076171875, "learning_rate": 8.522742651953456e-07, "loss": 0.0292, "num_tokens": 58204658.0, "reward": 1.0411759465932846, "reward_std": 0.6009402126073837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2744942158460617, "rewards/penalized_accuracy_reward/std": 0.3009553626179695, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.042695630341768265, "step": 679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1431.5, "completions/max_terminated_length": 1259.25, "completions/mean_length": 811.078125, "completions/mean_terminated_length": 697.7305297851562, "completions/min_length": 249.5, "completions/min_terminated_length": 249.5, "epoch": 0.34, "grad_norm": 0.26940983533859253, "kl": 0.04058837890625, "learning_rate": 8.516919785314595e-07, "loss": 0.1355, "num_tokens": 58266471.0, "reward": 0.6048239320516586, "reward_std": 0.26059041917324066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07487291097640991, "rewards/penalized_accuracy_reward/std": 0.09983056038618088, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.15822411328554153, "step": 680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1704.75, "completions/max_terminated_length": 1412.0, "completions/mean_length": 981.03125, "completions/mean_terminated_length": 819.5326385498047, "completions/min_length": 268.75, "completions/min_terminated_length": 268.75, "epoch": 0.3405, "grad_norm": 0.37998315691947937, "kl": 0.036865234375, "learning_rate": 8.511087728614862e-07, "loss": 0.1405, "num_tokens": 58337337.0, "reward": 0.6218252182006836, "reward_std": 0.4110981784760952, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0872797966003418, "rewards/penalized_accuracy_reward/std": 0.1696445345878601, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.19387998431921005, "step": 681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1656.25, "completions/max_terminated_length": 1644.0, "completions/mean_length": 886.71875, "completions/mean_terminated_length": 834.2375183105469, "completions/min_length": 278.5, "completions/min_terminated_length": 278.5, "epoch": 0.341, "grad_norm": 0.3063591420650482, "kl": 0.0330810546875, "learning_rate": 8.50524649961971e-07, "loss": 0.0233, "num_tokens": 58405015.0, "reward": 0.6184279173612595, "reward_std": 0.3932286500930786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07483895774930716, "rewards/penalized_accuracy_reward/std": 0.18910017609596252, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.07905694842338562, "step": 682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1661.75, "completions/max_terminated_length": 1393.0, "completions/mean_length": 997.5, "completions/mean_terminated_length": 717.3846282958984, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3415, "grad_norm": 0.42122411727905273, "kl": 0.04205322265625, "learning_rate": 8.499396116122535e-07, "loss": 0.3415, "num_tokens": 58476935.0, "reward": 0.7857148051261902, "reward_std": 0.5164125896990299, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1887558326125145, "rewards/penalized_accuracy_reward/std": 0.20375166088342667, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.2685704994946718, "step": 683 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.0625, "completions/max_length": 1906.0, "completions/max_terminated_length": 1563.75, "completions/mean_length": 848.078125, "completions/mean_terminated_length": 769.0040435791016, "completions/min_length": 322.75, "completions/min_terminated_length": 322.75, "epoch": 0.342, "grad_norm": 0.4293214678764343, "kl": 0.033538818359375, "learning_rate": 8.493536595944622e-07, "loss": 0.2721, "num_tokens": 58539772.0, "reward": 0.48046875, "reward_std": 0.06327171996235847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12654344737529755, "step": 684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1925.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 831.546875, "completions/mean_terminated_length": 665.5482330322266, "completions/min_length": 188.5, "completions/min_terminated_length": 188.5, "epoch": 0.3425, "grad_norm": 0.5194879770278931, "kl": 0.055877685546875, "learning_rate": 8.487667956935087e-07, "loss": 0.3499, "num_tokens": 58603663.0, "reward": 0.47805511951446533, "reward_std": 0.18040170893073082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012465055100619793, "rewards/penalized_accuracy_reward/std": 0.04986022040247917, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20321232452988625, "step": 685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1900.25, "completions/max_terminated_length": 1498.75, "completions/mean_length": 886.625, "completions/mean_terminated_length": 701.8313598632812, "completions/min_length": 285.5, "completions/min_terminated_length": 285.5, "epoch": 0.343, "grad_norm": 0.4069730043411255, "kl": 
0.0377197265625, "learning_rate": 8.481790216970819e-07, "loss": 0.3304, "num_tokens": 58670663.0, "reward": 0.7008166909217834, "reward_std": 0.38246266916394234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12482238933444023, "rewards/penalized_accuracy_reward/std": 0.15222448110580444, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20257513411343098, "step": 686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1677.0, "completions/max_terminated_length": 1518.25, "completions/mean_length": 869.25, "completions/mean_terminated_length": 808.8645935058594, "completions/min_length": 381.25, "completions/min_terminated_length": 381.25, "epoch": 0.3435, "grad_norm": 0.433769553899765, "kl": 0.036407470703125, "learning_rate": 8.475903393956433e-07, "loss": 0.1513, "num_tokens": 58738231.0, "reward": 0.8622609078884125, "reward_std": 0.3857553955167532, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19968517497181892, "rewards/penalized_accuracy_reward/std": 0.16101034730672836, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19334407150745392, "step": 687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1135.46875, "completions/mean_terminated_length": 915.1333770751953, "completions/min_length": 388.25, "completions/min_terminated_length": 388.25, "epoch": 0.344, "grad_norm": 0.4379492998123169, "kl": 0.040130615234375, "learning_rate": 8.470007505824215e-07, "loss": 0.2796, "num_tokens": 58821605.0, "reward": 0.44100069999694824, "reward_std": 0.21210768446326256, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492536567151546, 
"rewards/penalized_accuracy_reward/std": 0.049970149993896484, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.2800755612552166, "step": 688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1649.25, "completions/max_terminated_length": 1027.5, "completions/mean_length": 959.59375, "completions/mean_terminated_length": 404.48231506347656, "completions/min_length": 698.0, "completions/min_terminated_length": 186.0, "epoch": 0.3445, "grad_norm": 0.8357529044151306, "kl": 0.03369140625, "learning_rate": 8.464102570534061e-07, "loss": 0.1885, "num_tokens": 58892635.0, "reward": 0.5693359971046448, "reward_std": 0.25034601986408234, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08740238100290298, "rewards/penalized_accuracy_reward/std": 0.10235520452260971, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.125, "step": 689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1584.0, "completions/max_terminated_length": 1377.25, "completions/mean_length": 866.671875, "completions/mean_terminated_length": 792.5718994140625, "completions/min_length": 368.5, "completions/min_terminated_length": 368.5, "epoch": 0.345, "grad_norm": 0.3848258852958679, "kl": 0.036956787109375, "learning_rate": 8.458188606073431e-07, "loss": 0.0777, "num_tokens": 58958022.0, "reward": 0.654945582151413, "reward_std": 0.34780529141426086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09993372671306133, "rewards/penalized_accuracy_reward/std": 0.15235214680433273, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.14904648810625076, "step": 690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1711.75, "completions/max_terminated_length": 1333.0, 
"completions/mean_length": 652.5, "completions/mean_terminated_length": 506.88765716552734, "completions/min_length": 225.75, "completions/min_terminated_length": 225.75, "epoch": 0.3455, "grad_norm": 0.582502007484436, "kl": 0.05517578125, "learning_rate": 8.452265630457282e-07, "loss": 0.3923, "num_tokens": 59009878.0, "reward": 0.4878714829683304, "reward_std": 0.1708313785493374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490427121520042, "rewards/penalized_accuracy_reward/std": 0.04996171593666077, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1418159231543541, "step": 691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1955.25, "completions/max_terminated_length": 1540.25, "completions/mean_length": 1016.984375, "completions/mean_terminated_length": 745.2650756835938, "completions/min_length": 362.25, "completions/min_terminated_length": 362.25, "epoch": 0.346, "grad_norm": 0.45393088459968567, "kl": 0.04840087890625, "learning_rate": 8.446333661728028e-07, "loss": 0.3264, "num_tokens": 59083445.0, "reward": 0.421875, "reward_std": 0.1067611537873745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2135223187506199, "step": 692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1776.25, "completions/max_terminated_length": 1639.0, "completions/mean_length": 951.046875, "completions/mean_terminated_length": 843.1053924560547, "completions/min_length": 364.5, "completions/min_terminated_length": 364.5, "epoch": 0.3465, "grad_norm": 0.4357074797153473, "kl": 0.03717041015625, "learning_rate": 8.440392717955475e-07, "loss": 0.0153, "num_tokens": 59151848.0, "reward": 0.6779531240463257, 
"reward_std": 0.41389285027980804, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11241406202316284, "rewards/penalized_accuracy_reward/std": 0.1804790124297142, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19286471977829933, "step": 693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1656.25, "completions/max_terminated_length": 1424.5, "completions/mean_length": 783.546875, "completions/mean_terminated_length": 650.3937530517578, "completions/min_length": 250.5, "completions/min_terminated_length": 250.5, "epoch": 0.347, "grad_norm": 0.49429696798324585, "kl": 0.03607177734375, "learning_rate": 8.434442817236765e-07, "loss": 0.0583, "num_tokens": 59211307.0, "reward": 0.5858272165060043, "reward_std": 0.38220982253551483, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06244486849755049, "rewards/penalized_accuracy_reward/std": 0.18045112490653992, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.125, "step": 694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1968.5, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1372.953125, "completions/mean_terminated_length": 1046.0302124023438, "completions/min_length": 396.25, "completions/min_terminated_length": 396.25, "epoch": 0.3475, "grad_norm": 0.354573130607605, "kl": 0.03131103515625, "learning_rate": 8.428483977696328e-07, "loss": 0.2308, "num_tokens": 59309688.0, "reward": 0.37452371418476105, "reward_std": 0.20858509838581085, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012457170523703098, "rewards/penalized_accuracy_reward/std": 0.049828678369522095, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.244492769241333, "step": 
695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1897.5, "completions/max_terminated_length": 1533.0, "completions/mean_length": 909.984375, "completions/mean_terminated_length": 739.2975921630859, "completions/min_length": 337.75, "completions/min_terminated_length": 337.75, "epoch": 0.348, "grad_norm": 0.6514567732810974, "kl": 0.04864501953125, "learning_rate": 8.422516217485825e-07, "loss": 0.269, "num_tokens": 59382071.0, "reward": 0.443359375, "reward_std": 0.1034887284040451, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2069774568080902, "step": 696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1195.25, "completions/max_terminated_length": 961.5, "completions/mean_length": 554.328125, "completions/mean_terminated_length": 485.49400329589844, "completions/min_length": 212.25, "completions/min_terminated_length": 212.25, "epoch": 0.3485, "grad_norm": 0.22992578148841858, "kl": 0.04144287109375, "learning_rate": 8.416539554784089e-07, "loss": 0.0761, "num_tokens": 59426412.0, "reward": 0.5054126381874084, "reward_std": 0.1114024743437767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012471946887671947, "rewards/penalized_accuracy_reward/std": 0.04988778755068779, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.0752599686384201, "step": 697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1684.75, "completions/max_terminated_length": 1277.75, "completions/mean_length": 802.9375, "completions/mean_terminated_length": 712.459831237793, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.349, "grad_norm": 
0.39386603236198425, "kl": 0.038665771484375, "learning_rate": 8.410554007797068e-07, "loss": 0.183, "num_tokens": 59486792.0, "reward": 0.5358857214450836, "reward_std": 0.2203049622476101, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747411072254181, "rewards/penalized_accuracy_reward/std": 0.08056692034006119, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.16486985608935356, "step": 698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1714.0, "completions/max_terminated_length": 1448.75, "completions/mean_length": 644.71875, "completions/mean_terminated_length": 601.1833419799805, "completions/min_length": 287.75, "completions/min_terminated_length": 287.75, "epoch": 0.3495, "grad_norm": 0.6006714701652527, "kl": 0.047119140625, "learning_rate": 8.404559594757777e-07, "loss": 0.2513, "num_tokens": 59539206.0, "reward": 1.1043650805950165, "reward_std": 0.5781332179903984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.31194816529750824, "rewards/penalized_accuracy_reward/std": 0.2728019803762436, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10415080189704895, "step": 699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1734.25, "completions/max_terminated_length": 1399.25, "completions/mean_length": 934.875, "completions/mean_terminated_length": 810.5841064453125, "completions/min_length": 413.25, "completions/min_terminated_length": 413.25, "epoch": 0.35, "grad_norm": 0.37930890917778015, "kl": 0.029571533203125, "learning_rate": 8.398556333926239e-07, "loss": 0.189, "num_tokens": 59606590.0, "reward": 0.7370415329933167, "reward_std": 0.4189099036157131, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14977076649665833, 
"rewards/penalized_accuracy_reward/std": 0.168015718460083, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.23206470161676407, "step": 700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1250.703125, "completions/mean_terminated_length": 1109.1659240722656, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.3505, "grad_norm": 0.27972474694252014, "kl": 0.027008056640625, "learning_rate": 8.392544243589427e-07, "loss": 0.1206, "num_tokens": 59695515.0, "reward": 0.4585680067539215, "reward_std": 0.1945447325706482, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012487126514315605, "rewards/penalized_accuracy_reward/std": 0.04994850978255272, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.23745574057102203, "step": 701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1819.0, "completions/max_terminated_length": 1608.25, "completions/mean_length": 857.296875, "completions/mean_terminated_length": 736.337760925293, "completions/min_length": 320.25, "completions/min_terminated_length": 320.25, "epoch": 0.351, "grad_norm": 1.5092288255691528, "kl": 0.05670166015625, "learning_rate": 8.38652334206121e-07, "loss": 0.1973, "num_tokens": 59760990.0, "reward": 0.9853394031524658, "reward_std": 0.3554669916629791, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622009366750717, "rewards/penalized_accuracy_reward/std": 0.1497531719505787, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1505199372768402, "step": 702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1717.25, "completions/max_terminated_length": 1511.25, 
"completions/mean_length": 834.546875, "completions/mean_terminated_length": 749.6250152587891, "completions/min_length": 256.5, "completions/min_terminated_length": 256.5, "epoch": 0.3515, "grad_norm": 0.5232129693031311, "kl": 0.03558349609375, "learning_rate": 8.3804936476823e-07, "loss": 0.1161, "num_tokens": 59823553.0, "reward": 0.520514726638794, "reward_std": 0.18799709156155586, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024905800819396973, "rewards/penalized_accuracy_reward/std": 0.06805562227964401, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1824.75, "completions/max_terminated_length": 1342.25, "completions/mean_length": 890.765625, "completions/mean_terminated_length": 675.7343826293945, "completions/min_length": 321.5, "completions/min_terminated_length": 321.5, "epoch": 0.352, "grad_norm": 0.5017415881156921, "kl": 0.049560546875, "learning_rate": 8.374455178820189e-07, "loss": 0.3379, "num_tokens": 59893442.0, "reward": 0.4375, "reward_std": 0.10194796323776245, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2038959302008152, "step": 704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2046.75, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1071.234375, "completions/mean_terminated_length": 983.5122985839844, "completions/min_length": 295.5, "completions/min_terminated_length": 295.5, "epoch": 0.3525, "grad_norm": 0.397815465927124, "kl": 0.0322265625, "learning_rate": 8.368407953869103e-07, "loss": 0.2404, "num_tokens": 59970001.0, "reward": 0.7467325329780579, "reward_std": 
0.27028490975499153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14973346889019012, "rewards/penalized_accuracy_reward/std": 0.08928399533033371, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2176983542740345, "step": 705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1541.5, "completions/mean_length": 911.34375, "completions/mean_terminated_length": 780.8212738037109, "completions/min_length": 277.75, "completions/min_terminated_length": 277.75, "epoch": 0.353, "grad_norm": 0.506496787071228, "kl": 0.035064697265625, "learning_rate": 8.362351991249937e-07, "loss": 0.231, "num_tokens": 60036151.0, "reward": 0.8449334800243378, "reward_std": 0.4906987249851227, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19981049001216888, "rewards/penalized_accuracy_reward/std": 0.20472953468561172, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.23377006873488426, "step": 706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1589.5, "completions/mean_length": 1031.125, "completions/mean_terminated_length": 854.460205078125, "completions/min_length": 403.75, "completions/min_terminated_length": 403.75, "epoch": 0.3535, "grad_norm": 0.4337652325630188, "kl": 0.0372314453125, "learning_rate": 8.356287309410204e-07, "loss": 0.3637, "num_tokens": 60112447.0, "reward": 0.6582225561141968, "reward_std": 0.324313523247838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11231440305709839, "rewards/penalized_accuracy_reward/std": 0.1023005023598671, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.26829610764980316, "step": 707 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1709.0, "completions/max_terminated_length": 1448.25, "completions/mean_length": 661.609375, "completions/mean_terminated_length": 574.7149047851562, "completions/min_length": 233.5, "completions/min_terminated_length": 233.5, "epoch": 0.354, "grad_norm": 0.6895096898078918, "kl": 0.05029296875, "learning_rate": 8.350213926823974e-07, "loss": 0.3215, "num_tokens": 60164934.0, "reward": 1.0119882822036743, "reward_std": 0.4280601777136326, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.27454882860183716, "rewards/penalized_accuracy_reward/std": 0.18279554694890976, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19698673486709595, "step": 708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1969.75, "completions/max_terminated_length": 1934.5, "completions/mean_length": 1263.484375, "completions/mean_terminated_length": 1008.0649108886719, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.3545, "grad_norm": 0.30543816089630127, "kl": 0.034027099609375, "learning_rate": 8.344131861991828e-07, "loss": 0.1234, "num_tokens": 60253861.0, "reward": 0.5002773702144623, "reward_std": 0.3319804444909096, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049943375401198864, "rewards/penalized_accuracy_reward/std": 0.13043496757745743, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.21038984507322311, "step": 709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1560.5, "completions/max_terminated_length": 1428.0, "completions/mean_length": 804.09375, "completions/mean_terminated_length": 787.3572998046875, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 
0.355, "grad_norm": 0.3301846981048584, "kl": 0.02801513671875, "learning_rate": 8.338041133440788e-07, "loss": 0.1562, "num_tokens": 60313211.0, "reward": 1.1145488023757935, "reward_std": 0.4833357483148575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.312157217413187, "rewards/penalized_accuracy_reward/std": 0.2386995255947113, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.049575019627809525, "step": 710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1857.5, "completions/max_terminated_length": 1681.5, "completions/mean_length": 1014.40625, "completions/mean_terminated_length": 880.9212341308594, "completions/min_length": 379.75, "completions/min_terminated_length": 379.75, "epoch": 0.3555, "grad_norm": 0.27762818336486816, "kl": 0.026031494140625, "learning_rate": 8.331941759724268e-07, "loss": 0.2238, "num_tokens": 60387413.0, "reward": 0.5201459676027298, "reward_std": 0.24589870125055313, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374167300760746, "rewards/penalized_accuracy_reward/std": 0.08044358342885971, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21544159948825836, "step": 711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 1284.25, "completions/mean_terminated_length": 1104.375259399414, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.356, "grad_norm": 0.3857506215572357, "kl": 0.032073974609375, "learning_rate": 8.325833759422021e-07, "loss": 0.292, "num_tokens": 60481445.0, "reward": 0.7079275250434875, "reward_std": 0.3121926672756672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.14986220002174377, "rewards/penalized_accuracy_reward/std": 0.08936057239770889, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.27952801063656807, "step": 712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1618.25, "completions/max_terminated_length": 1404.25, "completions/mean_length": 789.859375, "completions/mean_terminated_length": 733.5410919189453, "completions/min_length": 227.75, "completions/min_terminated_length": 227.75, "epoch": 0.3565, "grad_norm": 0.4364882707595825, "kl": 0.038848876953125, "learning_rate": 8.319717151140072e-07, "loss": 0.113, "num_tokens": 60540284.0, "reward": 0.8433334678411484, "reward_std": 0.35913945361971855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1872917152941227, "rewards/penalized_accuracy_reward/std": 0.14878134429454803, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15591933205723763, "step": 713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1837.25, "completions/max_terminated_length": 1393.5, "completions/mean_length": 810.578125, "completions/mean_terminated_length": 736.1605987548828, "completions/min_length": 336.25, "completions/min_terminated_length": 336.25, "epoch": 0.357, "grad_norm": 0.5321258306503296, "kl": 0.035797119140625, "learning_rate": 8.313591953510673e-07, "loss": 0.2808, "num_tokens": 60604529.0, "reward": 0.8394942581653595, "reward_std": 0.4730181973427534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18732525408267975, "rewards/penalized_accuracy_reward/std": 0.20228997617959976, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17558613047003746, "step": 714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1538.0, "completions/mean_length": 961.375, "completions/mean_terminated_length": 787.3586578369141, "completions/min_length": 303.5, "completions/min_terminated_length": 303.5, "epoch": 0.3575, "grad_norm": 0.556026816368103, "kl": 0.040679931640625, "learning_rate": 8.307458185192238e-07, "loss": 0.3702, "num_tokens": 60676841.0, "reward": 0.49918271601200104, "reward_std": 0.2367298435419798, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498198300600052, "rewards/penalized_accuracy_reward/std": 0.06826377660036087, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24712799862027168, "step": 715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1767.5, "completions/max_terminated_length": 1575.25, "completions/mean_length": 961.40625, "completions/mean_terminated_length": 846.1800537109375, "completions/min_length": 380.25, "completions/min_terminated_length": 380.25, "epoch": 0.358, "grad_norm": 0.4573288857936859, "kl": 0.034912109375, "learning_rate": 8.301315864869289e-07, "loss": 0.2313, "num_tokens": 60746307.0, "reward": 0.5315123200416565, "reward_std": 0.2789682950824499, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993584007024765, "rewards/penalized_accuracy_reward/std": 0.08932796120643616, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.23886724933981895, "step": 716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1550.25, "completions/max_terminated_length": 1323.0, "completions/mean_length": 844.125, "completions/mean_terminated_length": 675.0800628662109, "completions/min_length": 249.5, "completions/min_terminated_length": 249.5, "epoch": 0.3585, "grad_norm": 0.35286834836006165, "kl": 0.0430908203125, "learning_rate": 
8.295165011252396e-07, "loss": 0.1775, "num_tokens": 60807675.0, "reward": 0.6141887903213501, "reward_std": 0.2846856154501438, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08736782521009445, "rewards/penalized_accuracy_reward/std": 0.10231483727693558, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.1800437942147255, "step": 717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1751.25, "completions/max_terminated_length": 1346.25, "completions/mean_length": 787.234375, "completions/mean_terminated_length": 716.6888809204102, "completions/min_length": 378.75, "completions/min_terminated_length": 378.75, "epoch": 0.359, "grad_norm": 0.4124862849712372, "kl": 0.028533935546875, "learning_rate": 8.289005643078131e-07, "loss": 0.1985, "num_tokens": 60865786.0, "reward": 0.6396755576133728, "reward_std": 0.27992378547787666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0874159038066864, "rewards/penalized_accuracy_reward/std": 0.10237110406160355, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18409235030412674, "step": 718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1987.25, "completions/max_terminated_length": 1724.75, "completions/mean_length": 868.546875, "completions/mean_terminated_length": 710.9063720703125, "completions/min_length": 268.75, "completions/min_terminated_length": 268.75, "epoch": 0.3595, "grad_norm": 0.49609068036079407, "kl": 0.04388427734375, "learning_rate": 8.282837779108993e-07, "loss": 0.3592, "num_tokens": 60930253.0, "reward": 0.5929546803236008, "reward_std": 0.3937194421887398, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07479764893651009, "rewards/penalized_accuracy_reward/std": 
0.15734750777482986, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.23571450635790825, "step": 719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1571.5, "completions/max_terminated_length": 1448.25, "completions/mean_length": 913.15625, "completions/mean_terminated_length": 782.8898315429688, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.36, "grad_norm": 0.398825079202652, "kl": 0.038818359375, "learning_rate": 8.276661438133368e-07, "loss": 0.2401, "num_tokens": 60996535.0, "reward": 0.9923827350139618, "reward_std": 0.3821817860007286, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2745116800069809, "rewards/penalized_accuracy_reward/std": 0.15218711644411087, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2095179334282875, "step": 720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1929.0, "completions/max_terminated_length": 1694.75, "completions/mean_length": 1292.09375, "completions/mean_terminated_length": 1055.387451171875, "completions/min_length": 568.75, "completions/min_terminated_length": 568.75, "epoch": 0.3605, "grad_norm": 0.34802237153053284, "kl": 0.038299560546875, "learning_rate": 8.270476638965461e-07, "loss": 0.1504, "num_tokens": 61093773.0, "reward": 0.48306146264076233, "reward_std": 0.32143428549170494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037429168820381165, "rewards/penalized_accuracy_reward/std": 0.11806512251496315, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.2736465558409691, "step": 721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1959.75, "completions/max_terminated_length": 1647.25, "completions/mean_length": 1077.0625, 
"completions/mean_terminated_length": 937.6395874023438, "completions/min_length": 295.75, "completions/min_terminated_length": 295.75, "epoch": 0.361, "grad_norm": 0.46330246329307556, "kl": 0.047821044921875, "learning_rate": 8.264283400445243e-07, "loss": 0.3169, "num_tokens": 61173601.0, "reward": 0.4375, "reward_std": 0.11287032812833786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.22574065625667572, "step": 722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1918.0, "completions/max_terminated_length": 1419.25, "completions/mean_length": 802.71875, "completions/mean_terminated_length": 689.9444961547852, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3615, "grad_norm": 0.6362351775169373, "kl": 0.041259765625, "learning_rate": 8.258081741438394e-07, "loss": 0.516, "num_tokens": 61232847.0, "reward": 0.7181322574615479, "reward_std": 0.27850089594721794, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12469111382961273, "rewards/penalized_accuracy_reward/std": 0.09975293278694153, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15799006074666977, "step": 723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 881.203125, "completions/mean_terminated_length": 653.5304260253906, "completions/min_length": 285.5, "completions/min_terminated_length": 285.5, "epoch": 0.362, "grad_norm": 0.4797056317329407, "kl": 0.04949951171875, "learning_rate": 8.25187168083624e-07, "loss": 0.4341, "num_tokens": 61300716.0, "reward": 0.6643298864364624, "reward_std": 0.40079159289598465, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11243838164955378, "rewards/penalized_accuracy_reward/std": 0.15318092331290245, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2707347422838211, "step": 724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1902.0, "completions/max_terminated_length": 1652.25, "completions/mean_length": 1141.390625, "completions/mean_terminated_length": 955.1925811767578, "completions/min_length": 431.5, "completions/min_terminated_length": 431.5, "epoch": 0.3625, "grad_norm": 0.45697611570358276, "kl": 0.03863525390625, "learning_rate": 8.245653237555705e-07, "loss": 0.2034, "num_tokens": 61384629.0, "reward": 0.7019585967063904, "reward_std": 0.2903725728392601, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.137112095952034, "rewards/penalized_accuracy_reward/std": 0.09547270089387894, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.19885431230068207, "step": 725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1307.25, "completions/mean_length": 871.640625, "completions/mean_terminated_length": 617.3095474243164, "completions/min_length": 260.75, "completions/min_terminated_length": 260.75, "epoch": 0.363, "grad_norm": 0.6525475382804871, "kl": 0.04351806640625, "learning_rate": 8.239426430539243e-07, "loss": 0.5746, "num_tokens": 61447822.0, "reward": 0.43359375, "reward_std": 0.12853525765240192, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.25707052275538445, "step": 726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.125, "completions/max_length": 1574.5, "completions/max_terminated_length": 1196.5, "completions/mean_length": 886.75, "completions/mean_terminated_length": 736.6562652587891, "completions/min_length": 369.5, "completions/min_terminated_length": 369.5, "epoch": 0.3635, "grad_norm": 0.36487850546836853, "kl": 0.03875732421875, "learning_rate": 8.23319127875479e-07, "loss": 0.275, "num_tokens": 61516830.0, "reward": 0.6505560874938965, "reward_std": 0.29212525486946106, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09969210624694824, "rewards/penalized_accuracy_reward/std": 0.10296161472797394, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.19310662150382996, "step": 727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1520.75, "completions/max_terminated_length": 1356.0, "completions/mean_length": 1148.46875, "completions/mean_terminated_length": 752.2864685058594, "completions/min_length": 333.5, "completions/min_terminated_length": 333.5, "epoch": 0.364, "grad_norm": 0.5477501749992371, "kl": 0.081298828125, "learning_rate": 8.226947801195699e-07, "loss": 0.1823, "num_tokens": 61600444.0, "reward": 0.33984375, "reward_std": 0.11755681410431862, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.23511362820863724, "step": 728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1596.5, "completions/max_terminated_length": 1550.5, "completions/mean_length": 805.921875, "completions/mean_terminated_length": 673.3129272460938, "completions/min_length": 253.5, "completions/min_terminated_length": 253.5, "epoch": 0.3645, "grad_norm": 0.4799973666667938, "kl": 0.04742431640625, "learning_rate": 
8.220696016880687e-07, "loss": 0.1874, "num_tokens": 61663927.0, "reward": 0.8757006675004959, "reward_std": 0.3788723386824131, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21226438134908676, "rewards/penalized_accuracy_reward/std": 0.14865262061357498, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.16313419491052628, "step": 729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1862.25, "completions/max_terminated_length": 1460.75, "completions/mean_length": 1052.53125, "completions/mean_terminated_length": 866.934326171875, "completions/min_length": 413.5, "completions/min_terminated_length": 413.5, "epoch": 0.365, "grad_norm": 0.5254454612731934, "kl": 0.05682373046875, "learning_rate": 8.21443594485377e-07, "loss": 0.1902, "num_tokens": 61746937.0, "reward": 0.48347797989845276, "reward_std": 0.2917974665760994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024942108429968357, "rewards/penalized_accuracy_reward/std": 0.09976842999458313, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2365049198269844, "step": 730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1621.75, "completions/max_terminated_length": 1428.5, "completions/mean_length": 969.078125, "completions/mean_terminated_length": 897.3076477050781, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "epoch": 0.3655, "grad_norm": 0.5754501223564148, "kl": 0.03729248046875, "learning_rate": 8.208167604184217e-07, "loss": 0.05, "num_tokens": 61818686.0, "reward": 0.5875345319509506, "reward_std": 0.3149792104959488, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062321956269443035, "rewards/penalized_accuracy_reward/std": 0.13903803005814552, 
"rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1619652472436428, "step": 731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1688.25, "completions/max_terminated_length": 1332.25, "completions/mean_length": 765.0625, "completions/mean_terminated_length": 700.9369201660156, "completions/min_length": 237.75, "completions/min_terminated_length": 237.75, "epoch": 0.366, "grad_norm": 0.4901498854160309, "kl": 0.047027587890625, "learning_rate": 8.201891013966478e-07, "loss": 0.3064, "num_tokens": 61876866.0, "reward": 0.5724132657051086, "reward_std": 0.2527250796556473, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04987850785255432, "rewards/penalized_accuracy_reward/std": 0.08922543376684189, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1485484316945076, "step": 732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1972.0, "completions/max_terminated_length": 1637.25, "completions/mean_length": 1160.625, "completions/mean_terminated_length": 859.0941772460938, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.3665, "grad_norm": 0.33227071166038513, "kl": 0.047454833984375, "learning_rate": 8.195606193320136e-07, "loss": 0.2515, "num_tokens": 61958938.0, "reward": 0.5309629440307617, "reward_std": 0.30003656074404716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06235646829009056, "rewards/penalized_accuracy_reward/std": 0.09552284330129623, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.2534877099096775, "step": 733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1440.5, "completions/mean_length": 1256.890625, 
"completions/mean_terminated_length": 955.3670196533203, "completions/min_length": 449.75, "completions/min_terminated_length": 449.75, "epoch": 0.367, "grad_norm": 0.3111726641654968, "kl": 0.03399658203125, "learning_rate": 8.189313161389844e-07, "loss": 0.3278, "num_tokens": 62051939.0, "reward": 0.40234375, "reward_std": 0.1469247303903103, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.29384946823120117, "step": 734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1798.5, "completions/max_terminated_length": 1264.0, "completions/mean_length": 706.953125, "completions/mean_terminated_length": 596.28662109375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.3675, "grad_norm": 0.5685709714889526, "kl": 0.048095703125, "learning_rate": 8.183011937345271e-07, "loss": 0.3222, "num_tokens": 62105888.0, "reward": 0.6606056392192841, "reward_std": 0.4222455509006977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09983407333493233, "rewards/penalized_accuracy_reward/std": 0.1760636866092682, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2121710628271103, "step": 735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1884.0, "completions/max_terminated_length": 1343.75, "completions/mean_length": 1073.078125, "completions/mean_terminated_length": 814.9798355102539, "completions/min_length": 284.75, "completions/min_terminated_length": 284.75, "epoch": 0.368, "grad_norm": 0.3137330412864685, "kl": 0.041778564453125, "learning_rate": 8.176702540381036e-07, "loss": 0.2153, "num_tokens": 62183285.0, "reward": 0.46942363679409027, "reward_std": 0.27750821225345135, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037446193397045135, "rewards/penalized_accuracy_reward/std": 0.08050690591335297, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.27285773679614067, "step": 736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.5, "completions/mean_length": 973.28125, "completions/mean_terminated_length": 810.0548095703125, "completions/min_length": 312.5, "completions/min_terminated_length": 312.5, "epoch": 0.3685, "grad_norm": 0.5527155995368958, "kl": 0.039215087890625, "learning_rate": 8.170384989716657e-07, "loss": 0.4178, "num_tokens": 62254551.0, "reward": 0.439453125, "reward_std": 0.1261542122811079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2523084282875061, "step": 737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1397.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 824.53125, "completions/mean_terminated_length": 777.46875, "completions/min_length": 353.75, "completions/min_terminated_length": 353.75, "epoch": 0.369, "grad_norm": 0.3967598080635071, "kl": 0.0396728515625, "learning_rate": 8.164059304596488e-07, "loss": 0.1268, "num_tokens": 62317721.0, "reward": 0.6493288278579712, "reward_std": 0.34841088950634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873597264289856, "rewards/penalized_accuracy_reward/std": 0.14969755336642265, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11859130859375, "step": 738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, 
"completions/max_length": 1232.5, "completions/max_terminated_length": 1232.5, "completions/mean_length": 570.796875, "completions/mean_terminated_length": 570.796875, "completions/min_length": 203.5, "completions/min_terminated_length": 203.5, "epoch": 0.3695, "grad_norm": 0.39294907450675964, "kl": 0.033203125, "learning_rate": 8.157725504289664e-07, "loss": 0.0273, "num_tokens": 62365596.0, "reward": 0.6339615285396576, "reward_std": 0.3126318044960499, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07479327358305454, "rewards/penalized_accuracy_reward/std": 0.14536265656352043, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.07800374925136566, "step": 739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1537.75, "completions/max_terminated_length": 1419.25, "completions/mean_length": 712.90625, "completions/mean_terminated_length": 608.2984313964844, "completions/min_length": 262.75, "completions/min_terminated_length": 262.75, "epoch": 0.37, "grad_norm": 0.4243226945400238, "kl": 0.04296875, "learning_rate": 8.151383608090039e-07, "loss": 0.2605, "num_tokens": 62424854.0, "reward": 0.46484375, "reward_std": 0.08527670428156853, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17055341601371765, "step": 740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1359.25, "completions/max_terminated_length": 1141.5, "completions/mean_length": 701.078125, "completions/mean_terminated_length": 625.0870819091797, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.3705, "grad_norm": 0.4735656678676605, "kl": 0.024139404296875, "learning_rate": 8.145033635316128e-07, "loss": 
0.2294, "num_tokens": 62477243.0, "reward": 1.0050747096538544, "reward_std": 0.2315639912194456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2623029723763466, "rewards/penalized_accuracy_reward/std": 0.09565801511416794, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11361231282353401, "step": 741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1851.75, "completions/max_terminated_length": 1517.5, "completions/mean_length": 937.421875, "completions/mean_terminated_length": 839.8523406982422, "completions/min_length": 387.75, "completions/min_terminated_length": 387.75, "epoch": 0.371, "grad_norm": 0.46169981360435486, "kl": 0.0482177734375, "learning_rate": 8.138675605311051e-07, "loss": 0.2022, "num_tokens": 62550006.0, "reward": 0.5680734813213348, "reward_std": 0.29042537324130535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06235705316066742, "rewards/penalized_accuracy_reward/std": 0.0955238789319992, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.19875526055693626, "step": 742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1803.5, "completions/max_terminated_length": 1570.5, "completions/mean_length": 762.578125, "completions/mean_terminated_length": 720.1227722167969, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3715, "grad_norm": 0.3563061058521271, "kl": 0.03466796875, "learning_rate": 8.13230953744247e-07, "loss": 0.1649, "num_tokens": 62611211.0, "reward": 0.48046875, "reward_std": 0.04900597594678402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, 
"rewards/tag_count_reward/std": 0.09801195561885834, "step": 743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1783.75, "completions/max_terminated_length": 1624.5, "completions/mean_length": 967.1875, "completions/mean_terminated_length": 864.3206939697266, "completions/min_length": 287.25, "completions/min_terminated_length": 287.25, "epoch": 0.372, "grad_norm": 0.36488601565361023, "kl": 0.051025390625, "learning_rate": 8.125935451102528e-07, "loss": 0.2129, "num_tokens": 62681991.0, "reward": 0.455078125, "reward_std": 0.0830291211605072, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.166058249771595, "step": 744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1166.328125, "completions/mean_terminated_length": 985.4548492431641, "completions/min_length": 357.25, "completions/min_terminated_length": 357.25, "epoch": 0.3725, "grad_norm": 0.33468589186668396, "kl": 0.040252685546875, "learning_rate": 8.119553365707802e-07, "loss": 0.1355, "num_tokens": 62765708.0, "reward": 0.4967789053916931, "reward_std": 0.33116910979151726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745195269584656, "rewards/penalized_accuracy_reward/std": 0.11816548183560371, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2807880975306034, "step": 745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1398.0, "completions/mean_length": 842.90625, "completions/mean_terminated_length": 664.7122039794922, "completions/min_length": 175.5, 
"completions/min_terminated_length": 175.5, "epoch": 0.373, "grad_norm": 0.39832207560539246, "kl": 0.032135009765625, "learning_rate": 8.113163300699228e-07, "loss": 0.2025, "num_tokens": 62829654.0, "reward": 0.5584958642721176, "reward_std": 0.4396803416311741, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06245105806738138, "rewards/penalized_accuracy_reward/std": 0.18648108839988708, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2575543113052845, "step": 746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1624.0, "completions/max_terminated_length": 1377.25, "completions/mean_length": 792.046875, "completions/mean_terminated_length": 705.1568298339844, "completions/min_length": 298.25, "completions/min_terminated_length": 298.25, "epoch": 0.3735, "grad_norm": 0.3914431631565094, "kl": 0.041717529296875, "learning_rate": 8.106765275542053e-07, "loss": 0.1719, "num_tokens": 62891833.0, "reward": 0.5358215570449829, "reward_std": 0.2570416107773781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744202759116888, "rewards/penalized_accuracy_reward/std": 0.11814591661095619, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17268584668636322, "step": 747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1793.5, "completions/max_terminated_length": 1335.5, "completions/mean_length": 951.078125, "completions/mean_terminated_length": 758.4294128417969, "completions/min_length": 296.5, "completions/min_terminated_length": 296.5, "epoch": 0.374, "grad_norm": 0.44113314151763916, "kl": 0.046478271484375, "learning_rate": 8.100359309725774e-07, "loss": 0.2335, "num_tokens": 62967454.0, "reward": 0.6102351546287537, "reward_std": 0.30026749335229397, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08734412491321564, "rewards/penalized_accuracy_reward/std": 0.10228706151247025, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.19138674437999725, "step": 748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 1291.453125, "completions/mean_terminated_length": 1041.7616271972656, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 0.3745, "grad_norm": 0.4052998721599579, "kl": 0.04522705078125, "learning_rate": 8.093945422764069e-07, "loss": 0.2896, "num_tokens": 63063851.0, "reward": 0.5547842979431152, "reward_std": 0.30914999172091484, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11235307157039642, "rewards/penalized_accuracy_reward/std": 0.10233572125434875, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.23708771914243698, "step": 749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1922.0, "completions/max_terminated_length": 1726.75, "completions/mean_length": 1095.96875, "completions/mean_terminated_length": 898.3994445800781, "completions/min_length": 360.75, "completions/min_terminated_length": 360.75, "epoch": 0.375, "grad_norm": 0.35548239946365356, "kl": 0.05084228515625, "learning_rate": 8.087523634194754e-07, "loss": 0.2415, "num_tokens": 63141849.0, "reward": 0.427734375, "reward_std": 0.09891058132052422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.19782117009162903, "step": 750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, 
"completions/max_length": 1382.5, "completions/max_terminated_length": 1264.75, "completions/mean_length": 706.65625, "completions/mean_terminated_length": 672.6227722167969, "completions/min_length": 178.75, "completions/min_terminated_length": 178.75, "epoch": 0.3755, "grad_norm": 0.32261255383491516, "kl": 0.041595458984375, "learning_rate": 8.081093963579707e-07, "loss": 0.0743, "num_tokens": 63195907.0, "reward": 0.6532460451126099, "reward_std": 0.35451360046863556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08736521005630493, "rewards/penalized_accuracy_reward/std": 0.1698097139596939, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.09959635883569717, "step": 751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1751.75, "completions/max_terminated_length": 1503.25, "completions/mean_length": 1042.265625, "completions/mean_terminated_length": 865.6104125976562, "completions/min_length": 318.25, "completions/min_terminated_length": 318.25, "epoch": 0.376, "grad_norm": 0.5976893901824951, "kl": 0.0450439453125, "learning_rate": 8.074656430504823e-07, "loss": 0.2638, "num_tokens": 63273108.0, "reward": 0.4527234733104706, "reward_std": 0.211591936647892, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012494553811848164, "rewards/penalized_accuracy_reward/std": 0.04997821897268295, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.22327104210853577, "step": 752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1987.25, "completions/max_terminated_length": 1646.75, "completions/mean_length": 909.703125, "completions/mean_terminated_length": 740.0948028564453, "completions/min_length": 276.5, "completions/min_terminated_length": 276.5, "epoch": 0.3765, "grad_norm": 0.3687702715396881, "kl": 
0.04595947265625, "learning_rate": 8.068211054579943e-07, "loss": 0.2583, "num_tokens": 63341537.0, "reward": 0.7721154242753983, "reward_std": 0.327325277030468, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16242489777505398, "rewards/penalized_accuracy_reward/std": 0.1393732726573944, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.18227311596274376, "step": 753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1774.75, "completions/max_terminated_length": 1606.25, "completions/mean_length": 877.734375, "completions/mean_terminated_length": 778.4833526611328, "completions/min_length": 349.75, "completions/min_terminated_length": 349.75, "epoch": 0.377, "grad_norm": 0.34843966364860535, "kl": 0.039642333984375, "learning_rate": 8.061757855438799e-07, "loss": 0.2051, "num_tokens": 63407648.0, "reward": 0.8047696352005005, "reward_std": 0.20021026208996773, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17484575510025024, "rewards/penalized_accuracy_reward/std": 0.06825292855501175, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18227330595254898, "step": 754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1518.5, "completions/mean_length": 1163.109375, "completions/mean_terminated_length": 797.7684631347656, "completions/min_length": 286.25, "completions/min_terminated_length": 286.25, "epoch": 0.3775, "grad_norm": 0.33848077058792114, "kl": 0.036346435546875, "learning_rate": 8.055296852738956e-07, "loss": 0.3142, "num_tokens": 63494503.0, "reward": 0.380859375, "reward_std": 0.13617093302309513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.27234187349677086, "step": 755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1980.0, "completions/max_terminated_length": 1628.5, "completions/mean_length": 924.09375, "completions/mean_terminated_length": 746.7020111083984, "completions/min_length": 227.5, "completions/min_terminated_length": 227.5, "epoch": 0.378, "grad_norm": 0.4737327992916107, "kl": 0.041534423828125, "learning_rate": 8.048828066161747e-07, "loss": 0.3842, "num_tokens": 63561725.0, "reward": 0.439453125, "reward_std": 0.11963003128767014, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.23926006257534027, "step": 756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1370.5, "completions/max_terminated_length": 1311.5, "completions/mean_length": 857.8125, "completions/mean_terminated_length": 763.8147583007812, "completions/min_length": 407.5, "completions/min_terminated_length": 407.5, "epoch": 0.3785, "grad_norm": 0.4499627649784088, "kl": 0.04364013671875, "learning_rate": 8.04235151541222e-07, "loss": 0.0412, "num_tokens": 63624689.0, "reward": 0.6852559149265289, "reward_std": 0.48128681257367134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12485453207045794, "rewards/penalized_accuracy_reward/std": 0.2204672247171402, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.14791342057287693, "step": 757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1744.5, "completions/mean_length": 1381.703125, 
"completions/mean_terminated_length": 1089.2032470703125, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.379, "grad_norm": 0.3138720989227295, "kl": 0.035125732421875, "learning_rate": 8.035867220219071e-07, "loss": 0.1771, "num_tokens": 63722286.0, "reward": 0.6338729858398438, "reward_std": 0.42281437292695045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12455367669463158, "rewards/penalized_accuracy_reward/std": 0.17100490629673004, "rewards/tag_count_reward/mean": 0.76953125, "rewards/tag_count_reward/std": 0.23549684509634972, "step": 758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1870.5, "completions/max_terminated_length": 1447.25, "completions/mean_length": 1201.6875, "completions/mean_terminated_length": 774.3073120117188, "completions/min_length": 389.75, "completions/min_terminated_length": 389.75, "epoch": 0.3795, "grad_norm": 0.41582155227661133, "kl": 0.045806884765625, "learning_rate": 8.029375200334587e-07, "loss": 0.3878, "num_tokens": 63808298.0, "reward": 0.6570266485214233, "reward_std": 0.31150298938155174, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14980238676071167, "rewards/penalized_accuracy_reward/std": 0.0893249660730362, "rewards/tag_count_reward/mean": 0.71484375, "rewards/tag_count_reward/std": 0.2999761626124382, "step": 759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1409.5, "completions/max_terminated_length": 1069.25, "completions/mean_length": 836.09375, "completions/mean_terminated_length": 574.8849487304688, "completions/min_length": 229.75, "completions/min_terminated_length": 229.75, "epoch": 0.38, "grad_norm": 0.2441646307706833, "kl": 0.0552978515625, "learning_rate": 8.022875475534588e-07, "loss": 0.2305, "num_tokens": 63870864.0, "reward": 0.427734375, 
"reward_std": 0.08980410173535347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.17960820347070694, "step": 760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 1258.78125, "completions/mean_terminated_length": 959.7219848632812, "completions/min_length": 403.25, "completions/min_terminated_length": 403.25, "epoch": 0.3805, "grad_norm": 0.5523656010627747, "kl": 0.06243896484375, "learning_rate": 8.01636806561836e-07, "loss": 0.2996, "num_tokens": 63960978.0, "reward": 0.4464060366153717, "reward_std": 0.26816821843385696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024960827082395554, "rewards/penalized_accuracy_reward/std": 0.06820597499608994, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.31157951056957245, "step": 761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1551.75, "completions/max_terminated_length": 1492.75, "completions/mean_length": 959.859375, "completions/mean_terminated_length": 881.7285003662109, "completions/min_length": 503.25, "completions/min_terminated_length": 503.25, "epoch": 0.381, "grad_norm": 0.4631112217903137, "kl": 0.034881591796875, "learning_rate": 8.009852990408606e-07, "loss": 0.1496, "num_tokens": 64030297.0, "reward": 0.5219442397356033, "reward_std": 0.23435698077082634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037339307367801666, "rewards/penalized_accuracy_reward/std": 0.08027712255716324, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.19291912205517292, "step": 762 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1903.0, "completions/max_terminated_length": 1859.75, "completions/mean_length": 1076.4375, "completions/mean_terminated_length": 1039.6189880371094, "completions/min_length": 446.75, "completions/min_terminated_length": 446.75, "epoch": 0.3815, "grad_norm": 0.3357700705528259, "kl": 0.035369873046875, "learning_rate": 8.003330269751372e-07, "loss": 0.1433, "num_tokens": 64107957.0, "reward": 0.458984375, "reward_std": 0.08420451916754246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1684090457856655, "step": 763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1977.25, "completions/max_terminated_length": 1584.25, "completions/mean_length": 1064.75, "completions/mean_terminated_length": 849.1263580322266, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.382, "grad_norm": 0.46015501022338867, "kl": 0.0560302734375, "learning_rate": 7.996799923515997e-07, "loss": 0.2474, "num_tokens": 64188533.0, "reward": 0.44042452424764633, "reward_std": 0.25531620159745216, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024899762123823166, "rewards/penalized_accuracy_reward/std": 0.06803911179304123, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.26519249379634857, "step": 764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1742.5, "completions/max_terminated_length": 1336.0, "completions/mean_length": 916.8125, "completions/mean_terminated_length": 678.2976531982422, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3825, "grad_norm": 0.45131099224090576, 
"kl": 0.0516357421875, "learning_rate": 7.990261971595048e-07, "loss": 0.3312, "num_tokens": 64255481.0, "reward": 0.8215436935424805, "reward_std": 0.29333217442035675, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1998343337327242, "rewards/penalized_accuracy_reward/std": 0.0998813547194004, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2661408353596926, "step": 765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1474.0, "completions/max_terminated_length": 1343.5, "completions/mean_length": 916.140625, "completions/mean_terminated_length": 775.544921875, "completions/min_length": 395.25, "completions/min_terminated_length": 395.25, "epoch": 0.383, "grad_norm": 0.4156896770000458, "kl": 0.045440673828125, "learning_rate": 7.983716433904262e-07, "loss": 0.0785, "num_tokens": 64327010.0, "reward": 0.8695912659168243, "reward_std": 0.29248058423399925, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21213937923312187, "rewards/penalized_accuracy_reward/std": 0.11804943159222603, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.16116780787706375, "step": 766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1544.75, "completions/mean_length": 935.21875, "completions/mean_terminated_length": 780.4778747558594, "completions/min_length": 403.25, "completions/min_terminated_length": 403.25, "epoch": 0.3835, "grad_norm": 0.4867936670780182, "kl": 0.048248291015625, "learning_rate": 7.977163330382479e-07, "loss": 0.2931, "num_tokens": 64395584.0, "reward": 0.5123829692602158, "reward_std": 0.2623783554881811, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744148090481758, 
"rewards/penalized_accuracy_reward/std": 0.08049678057432175, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.25303496047854424, "step": 767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1746.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 1026.609375, "completions/mean_terminated_length": 948.0275421142578, "completions/min_length": 342.25, "completions/min_terminated_length": 342.25, "epoch": 0.384, "grad_norm": 0.3801063597202301, "kl": 0.04864501953125, "learning_rate": 7.970602680991592e-07, "loss": 0.226, "num_tokens": 64471703.0, "reward": 0.7773511707782745, "reward_std": 0.6053038686513901, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16211308911442757, "rewards/penalized_accuracy_reward/std": 0.2739698737859726, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19511458277702332, "step": 768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1825.25, "completions/max_terminated_length": 1565.75, "completions/mean_length": 765.5625, "completions/mean_terminated_length": 707.1854553222656, "completions/min_length": 310.25, "completions/min_terminated_length": 310.25, "epoch": 0.3845, "grad_norm": 0.26380637288093567, "kl": 0.031494140625, "learning_rate": 7.964034505716476e-07, "loss": 0.1471, "num_tokens": 64529179.0, "reward": 0.8548071384429932, "reward_std": 0.5495161265134811, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18716919422149658, "rewards/penalized_accuracy_reward/std": 0.2702968940138817, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11695349216461182, "step": 769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1938.0, "completions/max_terminated_length": 1630.25, 
"completions/mean_length": 1193.421875, "completions/mean_terminated_length": 986.4651641845703, "completions/min_length": 503.25, "completions/min_terminated_length": 503.25, "epoch": 0.385, "grad_norm": 0.33145540952682495, "kl": 0.033538818359375, "learning_rate": 7.957458824564931e-07, "loss": 0.1792, "num_tokens": 64613702.0, "reward": 0.41796875, "reward_std": 0.11792946048080921, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.23585893586277962, "step": 770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1778.25, "completions/max_terminated_length": 1446.75, "completions/mean_length": 1090.484375, "completions/mean_terminated_length": 945.9977111816406, "completions/min_length": 446.25, "completions/min_terminated_length": 446.25, "epoch": 0.3855, "grad_norm": 0.31593045592308044, "kl": 0.02996826171875, "learning_rate": 7.950875657567621e-07, "loss": 0.2125, "num_tokens": 64695765.0, "reward": 0.427734375, "reward_std": 0.10847755335271358, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.21695511415600777, "step": 771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1804.5, "completions/max_terminated_length": 1299.5, "completions/mean_length": 1267.640625, "completions/mean_terminated_length": 847.81298828125, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.386, "grad_norm": 0.3474520444869995, "kl": 0.049560546875, "learning_rate": 7.944285024778017e-07, "loss": 0.1962, "num_tokens": 64785038.0, "reward": 0.3878491297364235, "reward_std": 0.2908061593770981, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024979250505566597, "rewards/penalized_accuracy_reward/std": 0.09991700574755669, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.2817354165017605, "step": 772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1773.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1199.140625, "completions/mean_terminated_length": 980.821044921875, "completions/min_length": 499.75, "completions/min_terminated_length": 499.75, "epoch": 0.3865, "grad_norm": 0.26359888911247253, "kl": 0.035491943359375, "learning_rate": 7.93768694627233e-07, "loss": 0.199, "num_tokens": 64871143.0, "reward": 0.6388890147209167, "reward_std": 0.3044482506811619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11241325736045837, "rewards/penalized_accuracy_reward/std": 0.10239050537347794, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.24575175344944, "step": 773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1743.25, "completions/max_terminated_length": 1475.5, "completions/mean_length": 1080.484375, "completions/mean_terminated_length": 879.9688568115234, "completions/min_length": 304.5, "completions/min_terminated_length": 304.5, "epoch": 0.387, "grad_norm": 0.2752874195575714, "kl": 0.035675048828125, "learning_rate": 7.931081442149448e-07, "loss": 0.1885, "num_tokens": 64950406.0, "reward": 0.49720972776412964, "reward_std": 0.21377826295793056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497205138206482, "rewards/penalized_accuracy_reward/std": 0.06823664158582687, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.19796335324645042, "step": 774 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 1120.328125, "completions/mean_terminated_length": 864.1595230102539, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "epoch": 0.3875, "grad_norm": 0.46203866600990295, "kl": 0.051025390625, "learning_rate": 7.924468532530883e-07, "loss": 0.4029, "num_tokens": 65033531.0, "reward": 0.4468149244785309, "reward_std": 0.20855069160461426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012469959445297718, "rewards/penalized_accuracy_reward/std": 0.04987983778119087, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.25573352724313736, "step": 775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1660.0, "completions/max_terminated_length": 1297.5, "completions/mean_length": 901.78125, "completions/mean_terminated_length": 790.6118011474609, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.388, "grad_norm": 0.3604573905467987, "kl": 0.036529541015625, "learning_rate": 7.917848237560708e-07, "loss": 0.1649, "num_tokens": 65100669.0, "reward": 0.4991663545370102, "reward_std": 0.21191035583615303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024973805993795395, "rewards/penalized_accuracy_reward/std": 0.0682414323091507, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.19624610245227814, "step": 776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1976.25, "completions/max_terminated_length": 1774.5, "completions/mean_length": 1069.1875, "completions/mean_terminated_length": 899.5781555175781, "completions/min_length": 261.25, "completions/min_terminated_length": 261.25, "epoch": 
0.3885, "grad_norm": 0.4295975863933563, "kl": 0.0462646484375, "learning_rate": 7.911220577405484e-07, "loss": 0.2203, "num_tokens": 65177689.0, "reward": 0.6680144965648651, "reward_std": 0.38269615918397903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11232755985110998, "rewards/penalized_accuracy_reward/std": 0.15296602621674538, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2061559185385704, "step": 777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1545.0, "completions/max_terminated_length": 1397.25, "completions/mean_length": 872.359375, "completions/mean_terminated_length": 824.5504760742188, "completions/min_length": 327.75, "completions/min_terminated_length": 327.75, "epoch": 0.389, "grad_norm": 0.33896610140800476, "kl": 0.041534423828125, "learning_rate": 7.904585572254218e-07, "loss": 0.1016, "num_tokens": 65244592.0, "reward": 0.7242990434169769, "reward_std": 0.4567772001028061, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12484481558203697, "rewards/penalized_accuracy_reward/std": 0.2204490229487419, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11827929690480232, "step": 778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1432.75, "completions/max_terminated_length": 1426.75, "completions/mean_length": 777.265625, "completions/mean_terminated_length": 718.5052185058594, "completions/min_length": 296.5, "completions/min_terminated_length": 296.5, "epoch": 0.3895, "grad_norm": 0.3938414752483368, "kl": 0.0328369140625, "learning_rate": 7.897943242318285e-07, "loss": 0.1126, "num_tokens": 65301441.0, "reward": 0.9641905128955841, "reward_std": 0.4242732562124729, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.24967339634895325, "rewards/penalized_accuracy_reward/std": 0.1828368380665779, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1435973346233368, "step": 779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1760.0, "completions/max_terminated_length": 1753.75, "completions/mean_length": 740.40625, "completions/mean_terminated_length": 720.9708404541016, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.39, "grad_norm": 0.5400559306144714, "kl": 0.040435791015625, "learning_rate": 7.891293607831373e-07, "loss": 0.2372, "num_tokens": 65358363.0, "reward": 0.8360437750816345, "reward_std": 0.17789852246642113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17485782504081726, "rewards/penalized_accuracy_reward/std": 0.06825756281614304, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09947281517088413, "step": 780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1897.5, "completions/max_terminated_length": 1702.25, "completions/mean_length": 1021.515625, "completions/mean_terminated_length": 923.4219207763672, "completions/min_length": 410.5, "completions/min_terminated_length": 410.5, "epoch": 0.3905, "grad_norm": 0.35671940445899963, "kl": 0.046539306640625, "learning_rate": 7.884636689049422e-07, "loss": 0.2013, "num_tokens": 65434188.0, "reward": 0.5780421197414398, "reward_std": 0.2794995456933975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06245855987071991, "rewards/penalized_accuracy_reward/std": 0.09567923843860626, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21697916835546494, "step": 781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 
2048.0, "completions/max_terminated_length": 1154.5, "completions/mean_length": 697.09375, "completions/mean_terminated_length": 535.8857269287109, "completions/min_length": 205.75, "completions/min_terminated_length": 205.75, "epoch": 0.391, "grad_norm": 0.7086508870124817, "kl": 0.044677734375, "learning_rate": 7.877972506250562e-07, "loss": 0.5236, "num_tokens": 65487986.0, "reward": 0.658770740032196, "reward_std": 0.3053102530539036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09989318251609802, "rewards/penalized_accuracy_reward/std": 0.10316924750804901, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2277960628271103, "step": 782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1949.0, "completions/max_terminated_length": 1649.25, "completions/mean_length": 981.8125, "completions/mean_terminated_length": 841.4211730957031, "completions/min_length": 403.75, "completions/min_terminated_length": 403.75, "epoch": 0.3915, "grad_norm": 0.3728766441345215, "kl": 0.0408935546875, "learning_rate": 7.871301079735049e-07, "loss": 0.2953, "num_tokens": 65558950.0, "reward": 0.522237092256546, "reward_std": 0.25556135922670364, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748572990298271, "rewards/penalized_accuracy_reward/std": 0.08059189468622208, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.22597433626651764, "step": 783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1577.5, "completions/max_terminated_length": 1338.75, "completions/mean_length": 949.5625, "completions/mean_terminated_length": 794.8697967529297, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.392, "grad_norm": 0.48815032839775085, "kl": 0.0313262939453125, "learning_rate": 
7.864622429825204e-07, "loss": 0.2027, "num_tokens": 65627722.0, "reward": 0.6029460430145264, "reward_std": 0.2651270478963852, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491052150726318, "rewards/penalized_accuracy_reward/std": 0.09988074749708176, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.14294016361236572, "step": 784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1784.25, "completions/max_terminated_length": 1294.25, "completions/mean_length": 977.40625, "completions/mean_terminated_length": 773.1666870117188, "completions/min_length": 392.5, "completions/min_terminated_length": 392.5, "epoch": 0.3925, "grad_norm": 0.36962446570396423, "kl": 0.0323944091796875, "learning_rate": 7.857936576865356e-07, "loss": 0.3394, "num_tokens": 65700100.0, "reward": 0.44140625, "reward_std": 0.11645255237817764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.23290512338280678, "step": 785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1745.5, "completions/max_terminated_length": 1598.5, "completions/mean_length": 1027.609375, "completions/mean_terminated_length": 869.2314300537109, "completions/min_length": 377.75, "completions/min_terminated_length": 377.75, "epoch": 0.393, "grad_norm": 0.27239304780960083, "kl": 0.034332275390625, "learning_rate": 7.851243541221769e-07, "loss": 0.1107, "num_tokens": 65778443.0, "reward": 0.46635375916957855, "reward_std": 0.18832804262638092, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012473754584789276, "rewards/penalized_accuracy_reward/std": 0.049895018339157104, 
"rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.22366224229335785, "step": 786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1836.25, "completions/max_terminated_length": 1639.5, "completions/mean_length": 785.671875, "completions/mean_terminated_length": 748.5729370117188, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3935, "grad_norm": 0.4728951156139374, "kl": 0.03729248046875, "learning_rate": 7.844543343282595e-07, "loss": 0.1682, "num_tokens": 65840406.0, "reward": 1.0030574202537537, "reward_std": 0.5299651026725769, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622708985581994, "rewards/penalized_accuracy_reward/std": 0.24792122095823288, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1227458082139492, "step": 787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1959.25, "completions/max_terminated_length": 1652.0, "completions/mean_length": 1051.765625, "completions/mean_terminated_length": 893.6245727539062, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.394, "grad_norm": 0.4045305848121643, "kl": 0.04803466796875, "learning_rate": 7.837836003457793e-07, "loss": 0.238, "num_tokens": 65917607.0, "reward": 0.5604260265827179, "reward_std": 0.3545318618416786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0624395776540041, "rewards/penalized_accuracy_reward/std": 0.13930366933345795, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2406025119125843, "step": 788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1772.0, "completions/max_terminated_length": 1376.5, "completions/mean_length": 942.34375, 
"completions/mean_terminated_length": 744.2595062255859, "completions/min_length": 326.25, "completions/min_terminated_length": 326.25, "epoch": 0.3945, "grad_norm": 0.4512350559234619, "kl": 0.03021240234375, "learning_rate": 7.831121542179086e-07, "loss": 0.3142, "num_tokens": 65985213.0, "reward": 0.6793519854545593, "reward_std": 0.46456312388181686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12483224272727966, "rewards/penalized_accuracy_reward/std": 0.18280525505542755, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2585727125406265, "step": 789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1408.75, "completions/max_terminated_length": 1377.75, "completions/mean_length": 779.15625, "completions/mean_terminated_length": 683.0906219482422, "completions/min_length": 281.5, "completions/min_terminated_length": 281.5, "epoch": 0.395, "grad_norm": 0.3656206429004669, "kl": 0.034759521484375, "learning_rate": 7.824399979899889e-07, "loss": 0.0501, "num_tokens": 66043863.0, "reward": 0.6049627661705017, "reward_std": 0.25924910232424736, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07494232803583145, "rewards/penalized_accuracy_reward/std": 0.09992311894893646, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.11880578845739365, "step": 790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1528.0, "completions/max_terminated_length": 1368.5, "completions/mean_length": 773.71875, "completions/mean_terminated_length": 720.8605804443359, "completions/min_length": 323.25, "completions/min_terminated_length": 323.25, "epoch": 0.3955, "grad_norm": 0.4407970905303955, "kl": 0.034881591796875, "learning_rate": 7.817671337095244e-07, "loss": 0.0545, "num_tokens": 66101253.0, "reward": 
0.6127186268568039, "reward_std": 0.3385565038770437, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0749139990657568, "rewards/penalized_accuracy_reward/std": 0.1455671340227127, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1790002305060625, "step": 791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.5, "completions/mean_length": 1187.546875, "completions/mean_terminated_length": 916.5892944335938, "completions/min_length": 461.75, "completions/min_terminated_length": 461.75, "epoch": 0.396, "grad_norm": 0.3108740448951721, "kl": 0.030731201171875, "learning_rate": 7.810935634261764e-07, "loss": 0.2311, "num_tokens": 66188568.0, "reward": 0.4640117585659027, "reward_std": 0.2797943316400051, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024974622763693333, "rewards/penalized_accuracy_reward/std": 0.09989849478006363, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.27199968695640564, "step": 792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1614.5, "completions/mean_length": 968.546875, "completions/mean_terminated_length": 818.8017883300781, "completions/min_length": 323.75, "completions/min_terminated_length": 323.75, "epoch": 0.3965, "grad_norm": 0.3998006582260132, "kl": 0.03619384765625, "learning_rate": 7.804192891917571e-07, "loss": 0.3389, "num_tokens": 66259051.0, "reward": 0.4663480520248413, "reward_std": 0.19895833171904087, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01247089821845293, "rewards/penalized_accuracy_reward/std": 0.04988359659910202, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 
0.2513415552675724, "step": 793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1595.25, "completions/max_terminated_length": 1504.75, "completions/mean_length": 795.359375, "completions/mean_terminated_length": 759.9732208251953, "completions/min_length": 253.5, "completions/min_terminated_length": 253.5, "epoch": 0.397, "grad_norm": 0.5034902691841125, "kl": 0.04327392578125, "learning_rate": 7.797443130602226e-07, "loss": 0.1453, "num_tokens": 66320978.0, "reward": 0.5495204776525497, "reward_std": 0.27826364152133465, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745555318892002, "rewards/penalized_accuracy_reward/std": 0.11818822100758553, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13827571645379066, "step": 794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1599.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 716.953125, "completions/mean_terminated_length": 638.2153167724609, "completions/min_length": 246.5, "completions/min_terminated_length": 246.5, "epoch": 0.3975, "grad_norm": 0.5708227157592773, "kl": 0.07476806640625, "learning_rate": 7.79068637087667e-07, "loss": 0.072, "num_tokens": 66374591.0, "reward": 1.1545245349407196, "reward_std": 0.7645311057567596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3497232161462307, "rewards/penalized_accuracy_reward/std": 0.3661581948399544, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17421667277812958, "step": 795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1868.25, "completions/max_terminated_length": 1638.25, "completions/mean_length": 992.15625, "completions/mean_terminated_length": 948.3468933105469, "completions/min_length": 478.5, 
"completions/min_terminated_length": 478.5, "epoch": 0.398, "grad_norm": 0.3690018355846405, "kl": 0.0360107421875, "learning_rate": 7.783922633323169e-07, "loss": 0.0713, "num_tokens": 66446777.0, "reward": 0.7758225053548813, "reward_std": 0.5920516178011894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14963000174611807, "rewards/penalized_accuracy_reward/std": 0.2884094566106796, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14699668437242508, "step": 796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1793.75, "completions/max_terminated_length": 1340.0, "completions/mean_length": 733.78125, "completions/mean_terminated_length": 687.9437637329102, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.3985, "grad_norm": 0.6670532822608948, "kl": 0.04193115234375, "learning_rate": 7.777151938545235e-07, "loss": 0.2504, "num_tokens": 66502363.0, "reward": 0.5303771197795868, "reward_std": 0.18845407478511333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024954188615083694, "rewards/penalized_accuracy_reward/std": 0.06818787753582001, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11781632527709007, "step": 797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1735.25, "completions/max_terminated_length": 1459.75, "completions/mean_length": 921.59375, "completions/mean_terminated_length": 869.4195098876953, "completions/min_length": 403.25, "completions/min_terminated_length": 403.25, "epoch": 0.399, "grad_norm": 0.3551866114139557, "kl": 0.030303955078125, "learning_rate": 7.770374307167585e-07, "loss": 0.174, "num_tokens": 66570849.0, "reward": 0.6258149445056915, "reward_std": 0.2895685192197561, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08732154220342636, "rewards/penalized_accuracy_reward/std": 0.1022605448961258, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.1900223344564438, "step": 798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1587.25, "completions/max_terminated_length": 1445.5, "completions/mean_length": 879.765625, "completions/mean_terminated_length": 781.1658935546875, "completions/min_length": 375.5, "completions/min_terminated_length": 375.5, "epoch": 0.3995, "grad_norm": 4.843781471252441, "kl": 0.0823974609375, "learning_rate": 7.763589759836058e-07, "loss": 0.2137, "num_tokens": 66635586.0, "reward": 0.6357553005218506, "reward_std": 0.39497506991028786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0874088928103447, "rewards/penalized_accuracy_reward/std": 0.1638934090733528, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.14803672581911087, "step": 799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1789.25, "completions/max_terminated_length": 1535.5, "completions/mean_length": 1090.515625, "completions/mean_terminated_length": 880.0173645019531, "completions/min_length": 368.5, "completions/min_terminated_length": 368.5, "epoch": 0.4, "grad_norm": 0.39683619141578674, "kl": 0.0313720703125, "learning_rate": 7.756798317217558e-07, "loss": 0.1837, "num_tokens": 66712867.0, "reward": 0.5217934548854828, "reward_std": 0.33734423853456974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049959227442741394, "rewards/penalized_accuracy_reward/std": 0.1305224671959877, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.23449411243200302, "step": 800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, 
"completions/max_length": 1603.5, "completions/max_terminated_length": 1371.0, "completions/mean_length": 845.953125, "completions/mean_terminated_length": 772.2170104980469, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4005, "grad_norm": 0.5386746525764465, "kl": 0.05859375, "learning_rate": 7.75e-07, "loss": 0.1664, "num_tokens": 66779136.0, "reward": 0.46484375, "reward_std": 0.08873844146728516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17747688479721546, "step": 801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1595.0, "completions/max_terminated_length": 1424.25, "completions/mean_length": 988.234375, "completions/mean_terminated_length": 774.1935119628906, "completions/min_length": 357.75, "completions/min_terminated_length": 357.75, "epoch": 0.401, "grad_norm": 0.3405839502811432, "kl": 0.0472412109375, "learning_rate": 7.743194828892235e-07, "loss": 0.2125, "num_tokens": 66853151.0, "reward": 0.4296875, "reward_std": 0.08714957907795906, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.17429915815591812, "step": 802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1820.75, "completions/max_terminated_length": 1437.75, "completions/mean_length": 1073.09375, "completions/mean_terminated_length": 877.4170227050781, "completions/min_length": 484.25, "completions/min_terminated_length": 484.25, "epoch": 0.4015, "grad_norm": 0.30638325214385986, "kl": 0.032745361328125, "learning_rate": 7.736382824623999e-07, "loss": 0.2308, "num_tokens": 66929621.0, 
"reward": 0.5505905151367188, "reward_std": 0.3027886264026165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062404632568359375, "rewards/penalized_accuracy_reward/std": 0.09559661895036697, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.26388393342494965, "step": 803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1748.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 817.625, "completions/mean_terminated_length": 669.1830825805664, "completions/min_length": 244.75, "completions/min_terminated_length": 244.75, "epoch": 0.402, "grad_norm": 0.5207036137580872, "kl": 0.0440673828125, "learning_rate": 7.729564007945834e-07, "loss": 0.4134, "num_tokens": 66992237.0, "reward": 0.6430922448635101, "reward_std": 0.3183657303452492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09986643493175507, "rewards/penalized_accuracy_reward/std": 0.10314164310693741, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.25656110793352127, "step": 804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1594.0, "completions/max_terminated_length": 1396.75, "completions/mean_length": 934.78125, "completions/mean_terminated_length": 692.5823059082031, "completions/min_length": 275.25, "completions/min_terminated_length": 275.25, "epoch": 0.4025, "grad_norm": 0.4433252811431885, "kl": 0.050048828125, "learning_rate": 7.72273839962904e-07, "loss": 0.2244, "num_tokens": 67061599.0, "reward": 0.5237544029951096, "reward_std": 0.2547884099185467, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996313899755478, "rewards/penalized_accuracy_reward/std": 0.0893767923116684, "rewards/tag_count_reward/mean": 0.84765625, 
"rewards/tag_count_reward/std": 0.19592283479869366, "step": 805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1987.5, "completions/max_terminated_length": 1701.0, "completions/mean_length": 970.859375, "completions/mean_terminated_length": 848.7208557128906, "completions/min_length": 363.5, "completions/min_terminated_length": 363.5, "epoch": 0.403, "grad_norm": 0.5706811547279358, "kl": 0.04351806640625, "learning_rate": 7.715906020465602e-07, "loss": 0.3098, "num_tokens": 67133254.0, "reward": 0.5049925148487091, "reward_std": 0.21571943163871765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024957196786999702, "rewards/penalized_accuracy_reward/std": 0.06819604337215424, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.20875544100999832, "step": 806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1257.140625, "completions/mean_terminated_length": 1020.7310180664062, "completions/min_length": 437.25, "completions/min_terminated_length": 437.25, "epoch": 0.4035, "grad_norm": 0.3155382573604584, "kl": 0.033966064453125, "learning_rate": 7.709066891268133e-07, "loss": 0.2823, "num_tokens": 67223199.0, "reward": 0.400390625, "reward_std": 0.1503902357071638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.3007804751396179, "step": 807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1726.25, "completions/max_terminated_length": 1510.5, "completions/mean_length": 1048.484375, "completions/mean_terminated_length": 987.8104248046875, "completions/min_length": 310.5, 
"completions/min_terminated_length": 310.5, "epoch": 0.404, "grad_norm": 0.34603220224380493, "kl": 0.02911376953125, "learning_rate": 7.702221032869808e-07, "loss": 0.175, "num_tokens": 67299006.0, "reward": 0.5283876061439514, "reward_std": 0.19025876931846142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024935990571975708, "rewards/penalized_accuracy_reward/std": 0.06813815981149673, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1079649031162262, "step": 808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1581.0, "completions/max_terminated_length": 1318.5, "completions/mean_length": 702.09375, "completions/mean_terminated_length": 610.8644409179688, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.4045, "grad_norm": 0.387920618057251, "kl": 0.0609130859375, "learning_rate": 7.695368466124296e-07, "loss": 0.1746, "num_tokens": 67355060.0, "reward": 0.7972026020288467, "reward_std": 0.48345237970352173, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1622731564566493, "rewards/penalized_accuracy_reward/std": 0.23281435295939445, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.12515868619084358, "step": 809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1499.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 641.265625, "completions/mean_terminated_length": 572.7875061035156, "completions/min_length": 183.5, "completions/min_terminated_length": 183.5, "epoch": 0.405, "grad_norm": 0.4688602685928345, "kl": 0.04510498046875, "learning_rate": 7.688509211905707e-07, "loss": 0.2986, "num_tokens": 67403509.0, "reward": 0.854809045791626, "reward_std": 0.44708552956581116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.18717015534639359, "rewards/penalized_accuracy_reward/std": 0.2021149918437004, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12654344737529755, "step": 810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1237.5, "completions/max_terminated_length": 1216.25, "completions/mean_length": 626.21875, "completions/mean_terminated_length": 610.5635681152344, "completions/min_length": 221.25, "completions/min_terminated_length": 221.25, "epoch": 0.4055, "grad_norm": 0.4487760066986084, "kl": 0.04534912109375, "learning_rate": 7.681643291108517e-07, "loss": 0.0313, "num_tokens": 67451667.0, "reward": 0.48828125, "reward_std": 0.04043455049395561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 811 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1583.25, "completions/mean_length": 1019.09375, "completions/mean_terminated_length": 861.6128692626953, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.406, "grad_norm": 0.4698185324668884, "kl": 0.045440673828125, "learning_rate": 7.67477072464751e-07, "loss": 0.3632, "num_tokens": 67524713.0, "reward": 0.5103067755699158, "reward_std": 0.27150655537843704, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037379950284957886, "rewards/penalized_accuracy_reward/std": 0.08036451786756516, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2576989643275738, "step": 812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1664.0, 
"completions/max_terminated_length": 1571.25, "completions/mean_length": 831.796875, "completions/mean_terminated_length": 793.0348815917969, "completions/min_length": 316.5, "completions/min_terminated_length": 316.5, "epoch": 0.4065, "grad_norm": 0.23233801126480103, "kl": 0.0595703125, "learning_rate": 7.667891533457718e-07, "loss": 0.1043, "num_tokens": 67585820.0, "reward": 0.4878842681646347, "reward_std": 0.12634258344769478, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012496821582317352, "rewards/penalized_accuracy_reward/std": 0.04998728632926941, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.116670623421669, "step": 813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1384.5, "completions/max_terminated_length": 1284.0, "completions/mean_length": 709.34375, "completions/mean_terminated_length": 667.1049194335938, "completions/min_length": 233.5, "completions/min_terminated_length": 233.5, "epoch": 0.407, "grad_norm": 0.46760469675064087, "kl": 0.041168212890625, "learning_rate": 7.661005738494349e-07, "loss": 0.0722, "num_tokens": 67642978.0, "reward": 0.5764629542827606, "reward_std": 0.21101678907871246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995022714138031, "rewards/penalized_accuracy_reward/std": 0.08935368806123734, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11522135883569717, "step": 814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1762.25, "completions/max_terminated_length": 1337.0, "completions/mean_length": 974.703125, "completions/mean_terminated_length": 795.7195129394531, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.4075, "grad_norm": 0.48642051219940186, "kl": 0.06085205078125, "learning_rate": 
7.654113360732732e-07, "loss": 0.043, "num_tokens": 67715679.0, "reward": 0.5369401276111603, "reward_std": 0.415772020816803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062415385618805885, "rewards/penalized_accuracy_reward/std": 0.18636029213666916, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.15817352384328842, "step": 815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1589.25, "completions/max_terminated_length": 1369.0, "completions/mean_length": 832.890625, "completions/mean_terminated_length": 729.2041931152344, "completions/min_length": 300.75, "completions/min_terminated_length": 300.75, "epoch": 0.408, "grad_norm": 0.3323485255241394, "kl": 0.043212890625, "learning_rate": 7.647214421168238e-07, "loss": 0.2212, "num_tokens": 67777704.0, "reward": 0.518621951341629, "reward_std": 0.1973434630781412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024935975670814514, "rewards/penalized_accuracy_reward/std": 0.06813807040452957, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.12213464826345444, "step": 816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1558.0, "completions/max_terminated_length": 1412.75, "completions/mean_length": 767.234375, "completions/mean_terminated_length": 696.1130065917969, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.4085, "grad_norm": 0.33012962341308594, "kl": 0.03997802734375, "learning_rate": 7.640308940816239e-07, "loss": 0.123, "num_tokens": 67835719.0, "reward": 0.754634827375412, "reward_std": 0.3804667443037033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14977833814918995, "rewards/penalized_accuracy_reward/std": 0.16808483749628067, 
"rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.12221188098192215, "step": 817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1599.25, "completions/max_terminated_length": 1512.5, "completions/mean_length": 959.734375, "completions/mean_terminated_length": 809.6572113037109, "completions/min_length": 366.75, "completions/min_terminated_length": 366.75, "epoch": 0.409, "grad_norm": 0.4413408935070038, "kl": 0.03790283203125, "learning_rate": 7.633396940712023e-07, "loss": 0.2198, "num_tokens": 67906822.0, "reward": 0.416015625, "reward_std": 0.11374459974467754, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.22748920321464539, "step": 818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1720.75, "completions/max_terminated_length": 1376.5, "completions/mean_length": 827.34375, "completions/mean_terminated_length": 699.2897720336914, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.4095, "grad_norm": 0.6412301063537598, "kl": 0.06036376953125, "learning_rate": 7.626478441910744e-07, "loss": 0.2575, "num_tokens": 67970348.0, "reward": 0.4761523902416229, "reward_std": 0.17941838689148426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490259483456612, "rewards/penalized_accuracy_reward/std": 0.049961041659116745, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21216461807489395, "step": 819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1464.5, "completions/max_terminated_length": 1336.25, "completions/mean_length": 898.265625, "completions/mean_terminated_length": 773.6450958251953, 
"completions/min_length": 372.75, "completions/min_terminated_length": 372.75, "epoch": 0.41, "grad_norm": 0.3287445902824402, "kl": 0.0379638671875, "learning_rate": 7.619553465487344e-07, "loss": 0.1927, "num_tokens": 68037693.0, "reward": 0.7660070061683655, "reward_std": 0.45055120065808296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16230038926005363, "rewards/penalized_accuracy_reward/std": 0.1987818330526352, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.13071492686867714, "step": 820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1841.25, "completions/max_terminated_length": 1234.25, "completions/mean_length": 773.921875, "completions/mean_terminated_length": 687.4391632080078, "completions/min_length": 276.25, "completions/min_terminated_length": 276.25, "epoch": 0.4105, "grad_norm": 0.37275373935699463, "kl": 0.034027099609375, "learning_rate": 7.612622032536507e-07, "loss": 0.2268, "num_tokens": 68094568.0, "reward": 0.6742685437202454, "reward_std": 0.436601959168911, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17341844737529755, "step": 821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1689.0, "completions/max_terminated_length": 1661.5, "completions/mean_length": 921.671875, "completions/mean_terminated_length": 838.2499389648438, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.411, "grad_norm": 0.5492926836013794, "kl": 0.060546875, "learning_rate": 7.60568416417258e-07, "loss": 0.233, "num_tokens": 68166915.0, "reward": 0.6797896027565002, "reward_std": 0.44101744145154953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.11235574260354042, "rewards/penalized_accuracy_reward/std": 0.184958815574646, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.19218279235064983, "step": 822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1666.5, "completions/max_terminated_length": 1357.25, "completions/mean_length": 767.65625, "completions/mean_terminated_length": 701.1226043701172, "completions/min_length": 238.75, "completions/min_terminated_length": 238.75, "epoch": 0.4115, "grad_norm": 0.3872928321361542, "kl": 0.03985595703125, "learning_rate": 7.59873988152951e-07, "loss": 0.208, "num_tokens": 68226861.0, "reward": 0.482421875, "reward_std": 0.04783676192164421, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09567352384328842, "step": 823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1835.25, "completions/max_terminated_length": 1770.5, "completions/mean_length": 1296.0625, "completions/mean_terminated_length": 1135.7875061035156, "completions/min_length": 644.5, "completions/min_terminated_length": 644.5, "epoch": 0.412, "grad_norm": 0.2594967186450958, "kl": 0.04290771484375, "learning_rate": 7.591789205760789e-07, "loss": 0.1227, "num_tokens": 68320033.0, "reward": 0.44250890612602234, "reward_std": 0.2251712940633297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496539056301117, "rewards/penalized_accuracy_reward/std": 0.06821844726800919, "rewards/tag_count_reward/mean": 0.78515625, "rewards/tag_count_reward/std": 0.19495915807783604, "step": 824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1750.0, 
"completions/max_terminated_length": 1583.5, "completions/mean_length": 1115.75, "completions/mean_terminated_length": 1006.9152526855469, "completions/min_length": 493.75, "completions/min_terminated_length": 493.75, "epoch": 0.4125, "grad_norm": 0.4116116762161255, "kl": 0.048828125, "learning_rate": 7.584832158039378e-07, "loss": 0.2095, "num_tokens": 68402609.0, "reward": 0.44140625, "reward_std": 0.09796088375151157, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.19592177122831345, "step": 825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1796.0, "completions/max_terminated_length": 1333.75, "completions/mean_length": 984.796875, "completions/mean_terminated_length": 694.9130706787109, "completions/min_length": 282.25, "completions/min_terminated_length": 282.25, "epoch": 0.413, "grad_norm": 0.48539039492607117, "kl": 0.05206298828125, "learning_rate": 7.577868759557653e-07, "loss": 0.4304, "num_tokens": 68475284.0, "reward": 0.7365775108337402, "reward_std": 0.2839368060231209, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16223406791687012, "rewards/penalized_accuracy_reward/std": 0.08049080520868301, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.2459103986620903, "step": 826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1561.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 883.859375, "completions/mean_terminated_length": 883.859375, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.4135, "grad_norm": 0.32689806818962097, "kl": 0.02630615234375, "learning_rate": 7.570899031527332e-07, "loss": 0.0786, "num_tokens": 68541819.0, 
"reward": 0.534346878528595, "reward_std": 0.17516138590872288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024985941126942635, "rewards/penalized_accuracy_reward/std": 0.0682745948433876, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09088464826345444, "step": 827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1699.25, "completions/max_terminated_length": 1398.25, "completions/mean_length": 793.015625, "completions/mean_terminated_length": 722.0772552490234, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.414, "grad_norm": 0.3555731773376465, "kl": 0.032012939453125, "learning_rate": 7.563922995179418e-07, "loss": 0.1772, "num_tokens": 68600412.0, "reward": 0.7186048924922943, "reward_std": 0.26517206989228725, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12492744624614716, "rewards/penalized_accuracy_reward/std": 0.09994197636842728, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.13057629391551018, "step": 828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1662.75, "completions/max_terminated_length": 1242.25, "completions/mean_length": 637.703125, "completions/mean_terminated_length": 592.3823089599609, "completions/min_length": 215.25, "completions/min_terminated_length": 215.25, "epoch": 0.4145, "grad_norm": 0.5083458423614502, "kl": 0.035919189453125, "learning_rate": 7.556940671764124e-07, "loss": 0.2481, "num_tokens": 68649113.0, "reward": 0.6629031002521515, "reward_std": 0.35637183487415314, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731093071401119, "rewards/penalized_accuracy_reward/std": 0.16374678164720535, "rewards/tag_count_reward/mean": 0.9765625, 
"rewards/tag_count_reward/std": 0.09375, "step": 829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1647.25, "completions/mean_length": 1366.515625, "completions/mean_terminated_length": 918.6548004150391, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.415, "grad_norm": 0.3876892030239105, "kl": 0.05303955078125, "learning_rate": 7.54995208255082e-07, "loss": 0.3532, "num_tokens": 68747338.0, "reward": 0.349609375, "reward_std": 0.15747417509555817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.31494835764169693, "step": 830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.5, "completions/max_terminated_length": 1421.5, "completions/mean_length": 808.203125, "completions/mean_terminated_length": 808.203125, "completions/min_length": 319.25, "completions/min_terminated_length": 319.25, "epoch": 0.4155, "grad_norm": 0.3687044084072113, "kl": 0.04443359375, "learning_rate": 7.54295724882796e-07, "loss": 0.0048, "num_tokens": 68809031.0, "reward": 0.494140625, "reward_std": 0.0234375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1200.5, "completions/mean_length": 917.578125, "completions/mean_terminated_length": 694.9870300292969, "completions/min_length": 353.5, "completions/min_terminated_length": 353.5, "epoch": 0.416, "grad_norm": 0.544877290725708, "kl": 
0.05072021484375, "learning_rate": 7.535956191903021e-07, "loss": 0.4282, "num_tokens": 68874044.0, "reward": 0.5374500602483749, "reward_std": 0.2878213878720999, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04997503384947777, "rewards/penalized_accuracy_reward/std": 0.08939805626869202, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2550031952559948, "step": 832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1677.5, "completions/max_terminated_length": 1414.25, "completions/mean_length": 827.359375, "completions/mean_terminated_length": 751.4534606933594, "completions/min_length": 293.75, "completions/min_terminated_length": 293.75, "epoch": 0.4165, "grad_norm": 0.4582601487636566, "kl": 0.052001953125, "learning_rate": 7.528948933102438e-07, "loss": 0.1597, "num_tokens": 68936515.0, "reward": 0.6224792301654816, "reward_std": 0.2526271492242813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491148263216019, "rewards/penalized_accuracy_reward/std": 0.09988202154636383, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13390429690480232, "step": 833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1528.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 982.0625, "completions/mean_terminated_length": 823.5602722167969, "completions/min_length": 298.5, "completions/min_terminated_length": 298.5, "epoch": 0.417, "grad_norm": 0.3999004364013672, "kl": 0.038543701171875, "learning_rate": 7.521935493771534e-07, "loss": 0.1922, "num_tokens": 69008471.0, "reward": 0.5660509169101715, "reward_std": 0.2854628600180149, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062322333455085754, 
"rewards/penalized_accuracy_reward/std": 0.09547057002782822, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.18904344737529755, "step": 834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1574.5, "completions/max_terminated_length": 1460.0, "completions/mean_length": 764.15625, "completions/mean_terminated_length": 669.1554107666016, "completions/min_length": 311.75, "completions/min_terminated_length": 311.75, "epoch": 0.4175, "grad_norm": 0.49389180541038513, "kl": 0.0440673828125, "learning_rate": 7.514915895274463e-07, "loss": 0.2315, "num_tokens": 69069665.0, "reward": 0.6128427684307098, "reward_std": 0.271845031529665, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07497607171535492, "rewards/penalized_accuracy_reward/std": 0.0999680906534195, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1438177078962326, "step": 835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1976.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 1123.71875, "completions/mean_terminated_length": 1019.7053985595703, "completions/min_length": 393.75, "completions/min_terminated_length": 393.75, "epoch": 0.418, "grad_norm": 0.33106568455696106, "kl": 0.031097412109375, "learning_rate": 7.507890158994139e-07, "loss": 0.2081, "num_tokens": 69151343.0, "reward": 0.6317660063505173, "reward_std": 0.3636409305036068, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08736737910658121, "rewards/penalized_accuracy_reward/std": 0.14975539222359657, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1921535972505808, "step": 836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1620.0, "completions/max_terminated_length": 1444.75, 
"completions/mean_length": 904.296875, "completions/mean_terminated_length": 847.1201324462891, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.4185, "grad_norm": 0.3897942900657654, "kl": 0.0347900390625, "learning_rate": 7.500858306332172e-07, "loss": 0.0964, "num_tokens": 69218450.0, "reward": 0.7741357088088989, "reward_std": 0.4115989934653044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14976315200328827, "rewards/penalized_accuracy_reward/std": 0.18280881643295288, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1227458082139492, "step": 837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1516.5, "completions/max_terminated_length": 1353.5, "completions/mean_length": 717.390625, "completions/mean_terminated_length": 679.3250122070312, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.419, "grad_norm": 0.440054714679718, "kl": 0.033233642578125, "learning_rate": 7.493820358708809e-07, "loss": 0.1805, "num_tokens": 69271851.0, "reward": 0.47265625, "reward_std": 0.07002151571214199, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14004303142428398, "step": 838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1816.75, "completions/max_terminated_length": 1452.75, "completions/mean_length": 1173.484375, "completions/mean_terminated_length": 1001.6769714355469, "completions/min_length": 594.75, "completions/min_terminated_length": 594.75, "epoch": 0.4195, "grad_norm": 0.457161009311676, "kl": 0.03790283203125, "learning_rate": 7.486776337562853e-07, "loss": 0.2057, "num_tokens": 69357114.0, "reward": 0.45467251539230347, 
"reward_std": 0.21025384590029716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492503970861435, "rewards/penalized_accuracy_reward/std": 0.04997001215815544, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.24116361141204834, "step": 839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1333.75, "completions/max_terminated_length": 1219.25, "completions/mean_length": 599.984375, "completions/mean_terminated_length": 536.8297882080078, "completions/min_length": 168.5, "completions/min_terminated_length": 168.5, "epoch": 0.42, "grad_norm": 0.3352448642253876, "kl": 0.042724609375, "learning_rate": 7.479726264351618e-07, "loss": 0.2189, "num_tokens": 69403769.0, "reward": 0.482421875, "reward_std": 0.055459219962358475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1662.5, "completions/max_terminated_length": 1609.0, "completions/mean_length": 1009.796875, "completions/mean_terminated_length": 935.8350982666016, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.4205, "grad_norm": 0.4760090708732605, "kl": 0.049774169921875, "learning_rate": 7.472670160550848e-07, "loss": 0.1556, "num_tokens": 69476684.0, "reward": 0.6220276057720184, "reward_std": 0.27970814518630505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08738099038600922, "rewards/penalized_accuracy_reward/std": 0.1023302748799324, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.15009522065520287, "step": 841 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1893.5, "completions/max_terminated_length": 1419.5, "completions/mean_length": 756.25, "completions/mean_terminated_length": 712.9677276611328, "completions/min_length": 339.25, "completions/min_terminated_length": 339.25, "epoch": 0.421, "grad_norm": 0.5179550647735596, "kl": 0.0439453125, "learning_rate": 7.46560804765466e-07, "loss": 0.2313, "num_tokens": 69536764.0, "reward": 0.7685229778289795, "reward_std": 0.2535283640027046, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14988648891448975, "rewards/penalized_accuracy_reward/std": 0.08937513083219528, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1495562344789505, "step": 842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.75, "completions/max_terminated_length": 1189.75, "completions/mean_length": 500.71875, "completions/mean_terminated_length": 500.71875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.4215, "grad_norm": 0.47538965940475464, "kl": 0.0419921875, "learning_rate": 7.458539947175473e-07, "loss": 0.0766, "num_tokens": 69576202.0, "reward": 0.6860317587852478, "reward_std": 0.38724206387996674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0998518206179142, "rewards/penalized_accuracy_reward/std": 0.1761014237999916, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07007849216461182, "step": 843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1351.0, "completions/max_terminated_length": 1186.75, "completions/mean_length": 733.46875, "completions/mean_terminated_length": 660.21875, "completions/min_length": 362.5, "completions/min_terminated_length": 362.5, "epoch": 0.422, "grad_norm": 0.5129413604736328, "kl": 
0.040802001953125, "learning_rate": 7.45146588064395e-07, "loss": 0.1478, "num_tokens": 69632056.0, "reward": 0.6317241787910461, "reward_std": 0.27741651237010956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08734646439552307, "rewards/penalized_accuracy_reward/std": 0.10228972136974335, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.15746081620454788, "step": 844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1755.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 903.296875, "completions/mean_terminated_length": 791.8739318847656, "completions/min_length": 334.25, "completions/min_terminated_length": 334.25, "epoch": 0.4225, "grad_norm": 0.336095929145813, "kl": 0.042633056640625, "learning_rate": 7.444385869608921e-07, "loss": 0.169, "num_tokens": 69703643.0, "reward": 0.5897350907325745, "reward_std": 0.32160579413175583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06244567036628723, "rewards/penalized_accuracy_reward/std": 0.1392783597111702, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16800560802221298, "step": 845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1343.75, "completions/max_terminated_length": 1265.0, "completions/mean_length": 715.59375, "completions/mean_terminated_length": 681.6227722167969, "completions/min_length": 308.75, "completions/min_terminated_length": 308.75, "epoch": 0.423, "grad_norm": 0.40422120690345764, "kl": 0.044189453125, "learning_rate": 7.437299935637328e-07, "loss": 0.0641, "num_tokens": 69760673.0, "reward": 0.8839108049869537, "reward_std": 0.4219716787338257, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19976790249347687, 
"rewards/penalized_accuracy_reward/std": 0.19974808394908905, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08656632527709007, "step": 846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1423.5, "completions/mean_length": 871.046875, "completions/mean_terminated_length": 671.2208557128906, "completions/min_length": 260.5, "completions/min_terminated_length": 260.5, "epoch": 0.4235, "grad_norm": 0.5605896711349487, "kl": 0.056060791015625, "learning_rate": 7.430208100314156e-07, "loss": 0.4448, "num_tokens": 69824676.0, "reward": 0.46833738684654236, "reward_std": 0.1921681985259056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489006854593754, "rewards/penalized_accuracy_reward/std": 0.049956027418375015, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.23746410757303238, "step": 847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1854.75, "completions/max_terminated_length": 1627.75, "completions/mean_length": 980.453125, "completions/mean_terminated_length": 834.8639831542969, "completions/min_length": 403.25, "completions/min_terminated_length": 403.25, "epoch": 0.424, "grad_norm": 0.26496973633766174, "kl": 0.0391845703125, "learning_rate": 7.423110385242366e-07, "loss": 0.1193, "num_tokens": 69895425.0, "reward": 0.5891576260328293, "reward_std": 0.37484754249453545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07485223934054375, "rewards/penalized_accuracy_reward/std": 0.15747565776109695, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.20295725762844086, "step": 848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1773.0, "completions/max_terminated_length": 1379.75, 
"completions/mean_length": 936.359375, "completions/mean_terminated_length": 787.097412109375, "completions/min_length": 300.75, "completions/min_terminated_length": 300.75, "epoch": 0.4245, "grad_norm": 0.5276629328727722, "kl": 0.04693603515625, "learning_rate": 7.416006812042827e-07, "loss": 0.2714, "num_tokens": 69966264.0, "reward": 0.6451197266578674, "reward_std": 0.3039501681923866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09990361332893372, "rewards/penalized_accuracy_reward/std": 0.10318004339933395, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.19518016278743744, "step": 849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1903.5, "completions/max_terminated_length": 1610.0, "completions/mean_length": 1008.015625, "completions/mean_terminated_length": 842.9614868164062, "completions/min_length": 326.5, "completions/min_terminated_length": 326.5, "epoch": 0.425, "grad_norm": 0.4242863357067108, "kl": 0.0546875, "learning_rate": 7.408897402354255e-07, "loss": 0.217, "num_tokens": 70040057.0, "reward": 0.4796573370695114, "reward_std": 0.22665414586663246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024984916672110558, "rewards/penalized_accuracy_reward/std": 0.06827179342508316, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.21862982586026192, "step": 850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1502.0, "completions/max_terminated_length": 1333.5, "completions/mean_length": 939.296875, "completions/mean_terminated_length": 786.7187652587891, "completions/min_length": 369.5, "completions/min_terminated_length": 369.5, "epoch": 0.4255, "grad_norm": 0.3350716233253479, "kl": 0.044677734375, "learning_rate": 7.401782177833147e-07, "loss": 0.118, "num_tokens": 70108092.0, 
"reward": 0.4296875, "reward_std": 0.0876203328371048, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.1752406731247902, "step": 851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1678.25, "completions/max_terminated_length": 1413.0, "completions/mean_length": 670.28125, "completions/mean_terminated_length": 605.5419769287109, "completions/min_length": 227.25, "completions/min_terminated_length": 227.25, "epoch": 0.426, "grad_norm": 0.36517420411109924, "kl": 0.0509033203125, "learning_rate": 7.394661160153709e-07, "loss": 0.205, "num_tokens": 70161006.0, "reward": 0.6204132735729218, "reward_std": 0.2387280985713005, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07485507428646088, "rewards/penalized_accuracy_reward/std": 0.0998067781329155, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11756322160363197, "step": 852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.25, "completions/mean_length": 1085.5, "completions/mean_terminated_length": 888.6618804931641, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4265, "grad_norm": 0.414117693901062, "kl": 0.039215087890625, "learning_rate": 7.387534371007797e-07, "loss": 0.3359, "num_tokens": 70239694.0, "reward": 0.6101767420768738, "reward_std": 0.31388762034475803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731493353843689, "rewards/penalized_accuracy_reward/std": 0.10225291550159454, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2572440914809704, "step": 853 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1449.25, "completions/max_terminated_length": 1369.75, "completions/mean_length": 801.703125, "completions/mean_terminated_length": 744.6057739257812, "completions/min_length": 291.25, "completions/min_terminated_length": 291.25, "epoch": 0.427, "grad_norm": 0.42656975984573364, "kl": 0.035552978515625, "learning_rate": 7.380401832104845e-07, "loss": 0.1561, "num_tokens": 70299083.0, "reward": 0.5495204031467438, "reward_std": 0.2851836308836937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745551034808159, "rewards/penalized_accuracy_reward/std": 0.11817923933267593, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13808366656303406, "step": 854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1897.75, "completions/max_terminated_length": 1529.25, "completions/mean_length": 860.3125, "completions/mean_terminated_length": 787.7364654541016, "completions/min_length": 383.5, "completions/min_terminated_length": 383.5, "epoch": 0.4275, "grad_norm": 0.3371671736240387, "kl": 0.032135009765625, "learning_rate": 7.373263565171805e-07, "loss": 0.2761, "num_tokens": 70363215.0, "reward": 0.7974375188350677, "reward_std": 0.2209333721548319, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16239061951637268, "rewards/penalized_accuracy_reward/std": 0.0805683583021164, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1195933148264885, "step": 855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1501.5, "completions/max_terminated_length": 1255.5, "completions/mean_length": 771.0625, "completions/mean_terminated_length": 734.3531494140625, "completions/min_length": 268.75, "completions/min_terminated_length": 268.75, 
"epoch": 0.428, "grad_norm": 0.4141817092895508, "kl": 0.0408935546875, "learning_rate": 7.366119591953075e-07, "loss": 0.1429, "num_tokens": 70421907.0, "reward": 0.7472142279148102, "reward_std": 0.5515827238559723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372789889574051, "rewards/penalized_accuracy_reward/std": 0.27035993710160255, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15074944868683815, "step": 856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1761.5, "completions/max_terminated_length": 1619.25, "completions/mean_length": 824.140625, "completions/mean_terminated_length": 695.4230804443359, "completions/min_length": 281.5, "completions/min_terminated_length": 281.5, "epoch": 0.4285, "grad_norm": 0.3801182806491852, "kl": 0.037322998046875, "learning_rate": 7.358969934210438e-07, "loss": 0.3114, "num_tokens": 70484524.0, "reward": 0.7910361289978027, "reward_std": 0.3813345953822136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17481493577361107, "rewards/penalized_accuracy_reward/std": 0.15750489383935928, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.19415950030088425, "step": 857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1874.75, "completions/max_terminated_length": 1605.75, "completions/mean_length": 1041.21875, "completions/mean_terminated_length": 893.1969146728516, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.429, "grad_norm": 0.39743319153785706, "kl": 0.052978515625, "learning_rate": 7.35181461372299e-07, "loss": 0.2238, "num_tokens": 70563962.0, "reward": 0.6661253273487091, "reward_std": 0.29685039073228836, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.11235953867435455, "rewards/penalized_accuracy_reward/std": 0.10234158486127853, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2152357567101717, "step": 858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.75, "completions/mean_length": 1166.765625, "completions/mean_terminated_length": 870.0702667236328, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.4295, "grad_norm": 0.4465550482273102, "kl": 0.076171875, "learning_rate": 7.344653652287077e-07, "loss": 0.2749, "num_tokens": 70654043.0, "reward": 0.4327191114425659, "reward_std": 0.26833653077483177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02495330572128296, "rewards/penalized_accuracy_reward/std": 0.06818542629480362, "rewards/tag_count_reward/mean": 0.765625, "rewards/tag_count_reward/std": 0.30233529210090637, "step": 859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1873.25, "completions/max_terminated_length": 1755.25, "completions/mean_length": 1105.59375, "completions/mean_terminated_length": 891.8809661865234, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.43, "grad_norm": 0.3231870234012604, "kl": 0.043365478515625, "learning_rate": 7.337487071716232e-07, "loss": 0.1704, "num_tokens": 70734097.0, "reward": 0.44489410519599915, "reward_std": 0.19274215586483479, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486115097999573, "rewards/penalized_accuracy_reward/std": 0.04994446039199829, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.23868054896593094, "step": 860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 
1342.75, "completions/max_terminated_length": 1284.75, "completions/mean_length": 874.71875, "completions/mean_terminated_length": 800.9656372070312, "completions/min_length": 343.25, "completions/min_terminated_length": 343.25, "epoch": 0.4305, "grad_norm": 0.502406895160675, "kl": 0.036865234375, "learning_rate": 7.330314893841101e-07, "loss": 0.0332, "num_tokens": 70798511.0, "reward": 1.2351502478122711, "reward_std": 0.5566266812384129, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.38710638880729675, "rewards/penalized_accuracy_reward/std": 0.2566945552825928, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.11625919491052628, "step": 861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1807.5, "completions/max_terminated_length": 1221.75, "completions/mean_length": 900.453125, "completions/mean_terminated_length": 677.7973022460938, "completions/min_length": 357.5, "completions/min_terminated_length": 357.5, "epoch": 0.431, "grad_norm": 0.47678592801094055, "kl": 0.04351806640625, "learning_rate": 7.323137140509381e-07, "loss": 0.2845, "num_tokens": 70867932.0, "reward": 0.7387713193893433, "reward_std": 0.5978758819401264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14965910464525223, "rewards/penalized_accuracy_reward/std": 0.2652198448777199, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2328943032771349, "step": 862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1823.75, "completions/max_terminated_length": 1535.5, "completions/mean_length": 982.875, "completions/mean_terminated_length": 844.7550659179688, "completions/min_length": 378.5, "completions/min_terminated_length": 378.5, "epoch": 0.4315, "grad_norm": 0.33918529748916626, "kl": 0.0413818359375, "learning_rate": 
7.315953833585755e-07, "loss": 0.2667, "num_tokens": 70939732.0, "reward": 0.4453125, "reward_std": 0.09954788163304329, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.19909576326608658, "step": 863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1724.5, "completions/max_terminated_length": 1527.75, "completions/mean_length": 971.53125, "completions/mean_terminated_length": 808.5666809082031, "completions/min_length": 268.25, "completions/min_terminated_length": 268.25, "epoch": 0.432, "grad_norm": 0.48412784934043884, "kl": 0.0467529296875, "learning_rate": 7.308764994951821e-07, "loss": 0.2639, "num_tokens": 71011318.0, "reward": 0.48198390007019043, "reward_std": 0.1886712983250618, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012476328760385513, "rewards/penalized_accuracy_reward/std": 0.04990531876683235, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17772135883569717, "step": 864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1871.0, "completions/max_terminated_length": 1698.25, "completions/mean_length": 972.78125, "completions/mean_terminated_length": 883.5875091552734, "completions/min_length": 234.5, "completions/min_terminated_length": 234.5, "epoch": 0.4325, "grad_norm": 0.3607935607433319, "kl": 0.044769287109375, "learning_rate": 7.301570646506027e-07, "loss": 0.1933, "num_tokens": 71082536.0, "reward": 0.4609375, "reward_std": 0.07598912715911865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 
0.15197825618088245, "step": 865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1334.25, "completions/max_terminated_length": 971.75, "completions/mean_length": 642.96875, "completions/mean_terminated_length": 564.7468109130859, "completions/min_length": 260.25, "completions/min_terminated_length": 260.25, "epoch": 0.433, "grad_norm": 0.8921627402305603, "kl": 0.047515869140625, "learning_rate": 7.294370810163607e-07, "loss": 0.0539, "num_tokens": 71135238.0, "reward": 0.6763193905353546, "reward_std": 0.42092814296483994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09987844526767731, "rewards/penalized_accuracy_reward/std": 0.19965245947241783, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13096532225608826, "step": 866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1504.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 804.296875, "completions/mean_terminated_length": 702.1971893310547, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.4335, "grad_norm": 0.4409496784210205, "kl": 0.0401611328125, "learning_rate": 7.287165507856512e-07, "loss": 0.1068, "num_tokens": 71196505.0, "reward": 0.7472949624061584, "reward_std": 0.39347635209560394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13731933757662773, "rewards/penalized_accuracy_reward/std": 0.17058511078357697, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13238365203142166, "step": 867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1929.25, "completions/max_terminated_length": 1635.25, "completions/mean_length": 1143.671875, "completions/mean_terminated_length": 928.9046630859375, "completions/min_length": 460.5, 
"completions/min_terminated_length": 460.5, "epoch": 0.434, "grad_norm": 0.34166234731674194, "kl": 0.045806884765625, "learning_rate": 7.279954761533342e-07, "loss": 0.2368, "num_tokens": 71279892.0, "reward": 0.48697829246520996, "reward_std": 0.26657987758517265, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743446245789528, "rewards/penalized_accuracy_reward/std": 0.08048167824745178, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.26243683882057667, "step": 868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 1143.859375, "completions/mean_terminated_length": 1033.7869720458984, "completions/min_length": 602.75, "completions/min_terminated_length": 602.75, "epoch": 0.4345, "grad_norm": 0.43245965242385864, "kl": 0.0467529296875, "learning_rate": 7.27273859315928e-07, "loss": 0.2557, "num_tokens": 71363355.0, "reward": 0.5048832893371582, "reward_std": 0.2307001780718565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0249025821685791, "rewards/penalized_accuracy_reward/std": 0.06804681569337845, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.22741226106882095, "step": 869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1904.75, "completions/max_terminated_length": 1741.5, "completions/mean_length": 1058.796875, "completions/mean_terminated_length": 947.3616485595703, "completions/min_length": 431.5, "completions/min_terminated_length": 431.5, "epoch": 0.435, "grad_norm": 0.45342013239860535, "kl": 0.035552978515625, "learning_rate": 7.265517024716026e-07, "loss": 0.2958, "num_tokens": 71442670.0, "reward": 0.45703125, "reward_std": 0.09015131741762161, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18030264228582382, "step": 870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1931.25, "completions/max_terminated_length": 1459.5, "completions/mean_length": 865.484375, "completions/mean_terminated_length": 638.7979354858398, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.4355, "grad_norm": 0.49831947684288025, "kl": 0.049560546875, "learning_rate": 7.258290078201731e-07, "loss": 0.2992, "num_tokens": 71507629.0, "reward": 0.9464412331581116, "reward_std": 0.5948023982346058, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622831016778946, "rewards/penalized_accuracy_reward/std": 0.26712706685066223, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.1927468664944172, "step": 871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1972.75, "completions/max_terminated_length": 1733.25, "completions/mean_length": 1007.828125, "completions/mean_terminated_length": 945.1622314453125, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.436, "grad_norm": 0.34882044792175293, "kl": 0.0404052734375, "learning_rate": 7.251057775630927e-07, "loss": 0.0781, "num_tokens": 71580994.0, "reward": 0.5784193426370621, "reward_std": 0.2865803986787796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995186347514391, "rewards/penalized_accuracy_reward/std": 0.13045789673924446, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14369958639144897, "step": 872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 
1757.25, "completions/max_terminated_length": 1405.0, "completions/mean_length": 967.0625, "completions/mean_terminated_length": 715.1644592285156, "completions/min_length": 191.25, "completions/min_terminated_length": 191.25, "epoch": 0.4365, "grad_norm": 0.5599665641784668, "kl": 0.07220458984375, "learning_rate": 7.243820139034464e-07, "loss": 0.3455, "num_tokens": 71654646.0, "reward": 0.6579961776733398, "reward_std": 0.3128668814897537, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12489653378725052, "rewards/penalized_accuracy_reward/std": 0.09991730749607086, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.26238081976771355, "step": 873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.25, "completions/mean_length": 1193.21875, "completions/mean_terminated_length": 1032.9693603515625, "completions/min_length": 466.75, "completions/min_terminated_length": 466.75, "epoch": 0.437, "grad_norm": 0.38956502079963684, "kl": 0.04254150390625, "learning_rate": 7.236577190459433e-07, "loss": 0.284, "num_tokens": 71739924.0, "reward": 0.4468435049057007, "reward_std": 0.22067348659038544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012484249658882618, "rewards/penalized_accuracy_reward/std": 0.04993699863553047, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.29733070731163025, "step": 874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1552.5, "completions/max_terminated_length": 1423.5, "completions/mean_length": 869.28125, "completions/mean_terminated_length": 852.6468811035156, "completions/min_length": 359.5, "completions/min_terminated_length": 359.5, "epoch": 0.4375, "grad_norm": 0.27060940861701965, "kl": 0.02734375, "learning_rate": 
7.229328951969115e-07, "loss": 0.0384, "num_tokens": 71804422.0, "reward": 0.7936118394136429, "reward_std": 0.37369105219841003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14973561093211174, "rewards/penalized_accuracy_reward/std": 0.18282224237918854, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1632.0, "completions/max_terminated_length": 1492.75, "completions/mean_length": 736.5, "completions/mean_terminated_length": 714.5093841552734, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.438, "grad_norm": 0.4941197335720062, "kl": 0.05291748046875, "learning_rate": 7.222075445642904e-07, "loss": -0.0017, "num_tokens": 71863510.0, "reward": 0.9606008678674698, "reward_std": 0.5146909281611443, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2371363714337349, "rewards/penalized_accuracy_reward/std": 0.25460152328014374, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1494.75, "completions/max_terminated_length": 1348.0, "completions/mean_length": 706.046875, "completions/mean_terminated_length": 620.7375946044922, "completions/min_length": 292.25, "completions/min_terminated_length": 292.25, "epoch": 0.4385, "grad_norm": 0.4534655213356018, "kl": 0.044830322265625, "learning_rate": 7.214816693576234e-07, "loss": 0.2555, "num_tokens": 71917849.0, "reward": 0.47265625, "reward_std": 0.06981047987937927, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, 
"rewards/tag_count_reward/std": 0.13962095975875854, "step": 877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1774.75, "completions/max_terminated_length": 1474.0, "completions/mean_length": 920.8125, "completions/mean_terminated_length": 725.3116302490234, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.439, "grad_norm": 0.411729633808136, "kl": 0.03765869140625, "learning_rate": 7.207552717880522e-07, "loss": 0.2521, "num_tokens": 71985101.0, "reward": 0.6467412561178207, "reward_std": 0.44499000906944275, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11243312433362007, "rewards/penalized_accuracy_reward/std": 0.18050511926412582, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.21592704206705093, "step": 878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1701.25, "completions/max_terminated_length": 1303.5, "completions/mean_length": 928.46875, "completions/mean_terminated_length": 807.0031433105469, "completions/min_length": 229.5, "completions/min_terminated_length": 229.5, "epoch": 0.4395, "grad_norm": 0.3654100000858307, "kl": 0.034637451171875, "learning_rate": 7.200283540683102e-07, "loss": 0.2089, "num_tokens": 72055211.0, "reward": 0.5510916709899902, "reward_std": 0.2559785395860672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995989054441452, "rewards/penalized_accuracy_reward/std": 0.0893709734082222, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.19014080986380577, "step": 879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1699.5, "completions/max_terminated_length": 1499.25, "completions/mean_length": 1099.875, "completions/mean_terminated_length": 959.8009033203125, "completions/min_length": 
476.75, "completions/min_terminated_length": 476.75, "epoch": 0.44, "grad_norm": 0.47027459740638733, "kl": 0.03668212890625, "learning_rate": 7.193009184127145e-07, "loss": 0.1977, "num_tokens": 72135427.0, "reward": 0.439453125, "reward_std": 0.11316476203501225, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2263295315206051, "step": 880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1985.5, "completions/max_terminated_length": 1553.5, "completions/mean_length": 1117.65625, "completions/mean_terminated_length": 905.1340179443359, "completions/min_length": 372.75, "completions/min_terminated_length": 372.75, "epoch": 0.4405, "grad_norm": 0.3301040828227997, "kl": 0.028778076171875, "learning_rate": 7.185729670371604e-07, "loss": 0.3643, "num_tokens": 72215549.0, "reward": 0.42578125, "reward_std": 0.11671638116240501, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.23343276977539062, "step": 881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1973.75, "completions/max_terminated_length": 1479.75, "completions/mean_length": 884.984375, "completions/mean_terminated_length": 807.3409423828125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.441, "grad_norm": 0.4756188690662384, "kl": 0.051483154296875, "learning_rate": 7.17844502159114e-07, "loss": 0.2023, "num_tokens": 72281292.0, "reward": 0.5397913753986359, "reward_std": 0.24217670783400536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.03747381269931793, "rewards/penalized_accuracy_reward/std": 0.08056627959012985, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19986840710043907, "step": 882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 1109.265625, "completions/mean_terminated_length": 844.343147277832, "completions/min_length": 283.25, "completions/min_terminated_length": 283.25, "epoch": 0.4415, "grad_norm": 0.4037554860115051, "kl": 0.045867919921875, "learning_rate": 7.171155259976057e-07, "loss": 0.3754, "num_tokens": 72363213.0, "reward": 0.43123605847358704, "reward_std": 0.21415164694190025, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012493029236793518, "rewards/penalized_accuracy_reward/std": 0.04997211694717407, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.31231479346752167, "step": 883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1435.5, "completions/mean_length": 1029.640625, "completions/mean_terminated_length": 844.8750152587891, "completions/min_length": 401.5, "completions/min_terminated_length": 401.5, "epoch": 0.442, "grad_norm": 0.3430726230144501, "kl": 0.040679931640625, "learning_rate": 7.163860407732231e-07, "loss": 0.2471, "num_tokens": 72438262.0, "reward": 0.5873167365789413, "reward_std": 0.5107679218053818, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490835897624493, "rewards/penalized_accuracy_reward/std": 0.23630590736865997, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.26739031076431274, "step": 884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1991.25, 
"completions/max_terminated_length": 1387.5, "completions/mean_length": 845.46875, "completions/mean_terminated_length": 721.6113586425781, "completions/min_length": 308.75, "completions/min_terminated_length": 308.75, "epoch": 0.4425, "grad_norm": 0.46541351079940796, "kl": 0.0408935546875, "learning_rate": 7.156560487081051e-07, "loss": 0.3931, "num_tokens": 72500868.0, "reward": 0.8852511346340179, "reward_std": 0.581207737326622, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21215682104229927, "rewards/penalized_accuracy_reward/std": 0.263655848801136, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18771570920944214, "step": 885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1799.25, "completions/max_terminated_length": 1526.25, "completions/mean_length": 875.890625, "completions/mean_terminated_length": 782.7969055175781, "completions/min_length": 309.75, "completions/min_terminated_length": 309.75, "epoch": 0.443, "grad_norm": 0.572424054145813, "kl": 0.05413818359375, "learning_rate": 7.149255520259338e-07, "loss": 0.2745, "num_tokens": 72567117.0, "reward": 0.4453125, "reward_std": 0.10587087273597717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21174174547195435, "step": 886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1743.25, "completions/max_terminated_length": 1223.5, "completions/mean_length": 848.78125, "completions/mean_terminated_length": 772.6544952392578, "completions/min_length": 358.75, "completions/min_terminated_length": 358.75, "epoch": 0.4435, "grad_norm": 0.44595927000045776, "kl": 0.036041259765625, "learning_rate": 7.141945529519288e-07, "loss": 0.2233, "num_tokens": 
72629887.0, "reward": 0.5245651751756668, "reward_std": 0.192240996286273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024977896362543106, "rewards/penalized_accuracy_reward/std": 0.06825261563062668, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14488695561885834, "step": 887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1654.0, "completions/max_terminated_length": 1562.25, "completions/mean_length": 928.5, "completions/mean_terminated_length": 826.6741790771484, "completions/min_length": 451.5, "completions/min_terminated_length": 451.5, "epoch": 0.444, "grad_norm": 0.20587855577468872, "kl": 0.0364990234375, "learning_rate": 7.134630537128403e-07, "loss": 0.151, "num_tokens": 72698351.0, "reward": 0.455078125, "reward_std": 0.0712975338101387, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1425950787961483, "step": 888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1328.5, "completions/max_terminated_length": 1017.5, "completions/mean_length": 584.234375, "completions/mean_terminated_length": 545.5625152587891, "completions/min_length": 202.75, "completions/min_terminated_length": 202.75, "epoch": 0.4445, "grad_norm": 0.6978926658630371, "kl": 0.0460205078125, "learning_rate": 7.127310565369415e-07, "loss": 0.1211, "num_tokens": 72743662.0, "reward": 0.7032998502254486, "reward_std": 0.2599146328866482, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123921126127243, "rewards/penalized_accuracy_reward/std": 0.10237129777669907, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13137998431921005, 
"step": 889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1351.25, "completions/mean_length": 902.625, "completions/mean_terminated_length": 643.7955322265625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.445, "grad_norm": 0.4428101181983948, "kl": 0.045013427734375, "learning_rate": 7.11998563654023e-07, "loss": 0.4694, "num_tokens": 72809798.0, "reward": 0.45464950799942017, "reward_std": 0.22119802236557007, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012481006793677807, "rewards/penalized_accuracy_reward/std": 0.04992402717471123, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.28131748735904694, "step": 890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1809.75, "completions/max_terminated_length": 1600.25, "completions/mean_length": 867.5, "completions/mean_terminated_length": 775.4776000976562, "completions/min_length": 204.5, "completions/min_terminated_length": 204.5, "epoch": 0.4455, "grad_norm": 0.5229078531265259, "kl": 0.05389404296875, "learning_rate": 7.11265577295385e-07, "loss": 0.1918, "num_tokens": 72872742.0, "reward": 0.4897882640361786, "reward_std": 0.17500081285834312, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012472261674702168, "rewards/penalized_accuracy_reward/std": 0.04988904669880867, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17097900435328484, "step": 891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1345.5, "completions/max_terminated_length": 1118.5, "completions/mean_length": 629.4375, "completions/mean_terminated_length": 604.4562530517578, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, 
"epoch": 0.446, "grad_norm": 0.4612952172756195, "kl": 0.03948974609375, "learning_rate": 7.105320996938314e-07, "loss": 0.0891, "num_tokens": 72921282.0, "reward": 0.8359793424606323, "reward_std": 0.5021386295557022, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17482560314238071, "rewards/penalized_accuracy_reward/std": 0.2416248470544815, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1946.0, "completions/max_terminated_length": 1628.25, "completions/mean_length": 995.8125, "completions/mean_terminated_length": 877.0200958251953, "completions/min_length": 450.5, "completions/min_terminated_length": 450.5, "epoch": 0.4465, "grad_norm": 0.401496559381485, "kl": 0.03424072265625, "learning_rate": 7.097981330836616e-07, "loss": 0.3688, "num_tokens": 72993094.0, "reward": 0.447265625, "reward_std": 0.11683398112654686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23366796970367432, "step": 893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1458.0, "completions/max_terminated_length": 1088.5, "completions/mean_length": 674.921875, "completions/mean_terminated_length": 567.2450408935547, "completions/min_length": 280.25, "completions/min_terminated_length": 280.25, "epoch": 0.447, "grad_norm": 0.49624183773994446, "kl": 0.0455322265625, "learning_rate": 7.090636797006657e-07, "loss": 0.2786, "num_tokens": 73045921.0, "reward": 0.6434061229228973, "reward_std": 0.2709214352071285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08732806146144867, 
"rewards/penalized_accuracy_reward/std": 0.10226824879646301, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1327698826789856, "step": 894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1967.25, "completions/max_terminated_length": 1439.75, "completions/mean_length": 884.25, "completions/mean_terminated_length": 722.8892822265625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4475, "grad_norm": 0.40386900305747986, "kl": 0.035125732421875, "learning_rate": 7.083287417821157e-07, "loss": 0.1984, "num_tokens": 73113777.0, "reward": 0.4296875, "reward_std": 0.1238612923771143, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2477225884795189, "step": 895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1808.5, "completions/max_terminated_length": 1636.0, "completions/mean_length": 1021.40625, "completions/mean_terminated_length": 938.7583618164062, "completions/min_length": 358.25, "completions/min_terminated_length": 358.25, "epoch": 0.448, "grad_norm": 0.34492549300193787, "kl": 0.035247802734375, "learning_rate": 7.075933215667604e-07, "loss": 0.1415, "num_tokens": 73189963.0, "reward": 0.466796875, "reward_std": 0.071086585521698, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1421731822192669, "step": 896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1551.0, "completions/max_terminated_length": 1389.25, "completions/mean_length": 818.296875, "completions/mean_terminated_length": 739.2647552490234, 
"completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.4485, "grad_norm": 0.5400869846343994, "kl": 0.041534423828125, "learning_rate": 7.068574212948169e-07, "loss": 0.1491, "num_tokens": 73250238.0, "reward": 0.5128426253795624, "reward_std": 0.26384905725717545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024975997395813465, "rewards/penalized_accuracy_reward/std": 0.09990398958325386, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1701274737715721, "step": 897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1628.0, "completions/max_terminated_length": 1551.75, "completions/mean_length": 895.140625, "completions/mean_terminated_length": 793.4326934814453, "completions/min_length": 379.75, "completions/min_terminated_length": 379.75, "epoch": 0.449, "grad_norm": 0.4120320975780487, "kl": 0.0428466796875, "learning_rate": 7.06121043207965e-07, "loss": 0.2001, "num_tokens": 73316999.0, "reward": 0.4742019772529602, "reward_std": 0.1830596812069416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491613626480103, "rewards/penalized_accuracy_reward/std": 0.04996645450592041, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2155756913125515, "step": 898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1669.5, "completions/max_terminated_length": 1616.0, "completions/mean_length": 856.390625, "completions/mean_terminated_length": 805.9503326416016, "completions/min_length": 330.5, "completions/min_terminated_length": 330.5, "epoch": 0.4495, "grad_norm": 0.3699914813041687, "kl": 0.04058837890625, "learning_rate": 7.053841895493406e-07, "loss": 0.1154, "num_tokens": 73383840.0, "reward": 0.8821738958358765, "reward_std": 0.41543304920196533, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19987600483000278, "rewards/penalized_accuracy_reward/std": 0.19880098849534988, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1690.75, "completions/max_terminated_length": 1335.75, "completions/mean_length": 786.03125, "completions/mean_terminated_length": 672.8572998046875, "completions/min_length": 300.75, "completions/min_terminated_length": 300.75, "epoch": 0.45, "grad_norm": 0.47861412167549133, "kl": 0.063232421875, "learning_rate": 7.046468625635274e-07, "loss": 0.2223, "num_tokens": 73444242.0, "reward": 0.5705457627773285, "reward_std": 0.23411628603935242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049921318888664246, "rewards/penalized_accuracy_reward/std": 0.08930207788944244, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11102426052093506, "step": 900 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1541.25, "completions/max_terminated_length": 1196.25, "completions/mean_length": 723.78125, "completions/mean_terminated_length": 599.2264251708984, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.4505, "grad_norm": 0.6627294421195984, "kl": 0.057373046875, "learning_rate": 7.039090644965509e-07, "loss": 0.1833, "num_tokens": 73500804.0, "reward": 0.5280951261520386, "reward_std": 0.24320730194449425, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037485066801309586, "rewards/penalized_accuracy_reward/std": 0.08059047907590866, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.2018338106572628, "step": 901 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1940.25, "completions/max_terminated_length": 1861.25, "completions/mean_length": 1060.03125, "completions/mean_terminated_length": 826.2375335693359, "completions/min_length": 375.25, "completions/min_terminated_length": 375.25, "epoch": 0.451, "grad_norm": 0.28196197748184204, "kl": 0.05950927734375, "learning_rate": 7.031707975958726e-07, "loss": 0.1751, "num_tokens": 73579414.0, "reward": 0.5563388764858246, "reward_std": 0.27178410068154335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062349122017621994, "rewards/penalized_accuracy_reward/std": 0.09551167488098145, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.19751594588160515, "step": 902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1791.5, "completions/max_terminated_length": 1529.5, "completions/mean_length": 963.515625, "completions/mean_terminated_length": 872.6823425292969, "completions/min_length": 382.75, "completions/min_terminated_length": 382.75, "epoch": 0.4515, "grad_norm": 0.4353559911251068, "kl": 0.035125732421875, "learning_rate": 7.024320641103811e-07, "loss": 0.2089, "num_tokens": 73650615.0, "reward": 0.510857105255127, "reward_std": 0.21225575730204582, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024959806352853775, "rewards/penalized_accuracy_reward/std": 0.06820319592952728, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1516987681388855, "step": 903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1646.25, "completions/max_terminated_length": 1389.25, "completions/mean_length": 764.859375, "completions/mean_terminated_length": 677.3896102905273, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.452, 
"grad_norm": 0.4838756322860718, "kl": 0.0401611328125, "learning_rate": 7.01692866290387e-07, "loss": 0.1818, "num_tokens": 73709038.0, "reward": 0.5206267982721329, "reward_std": 0.18099568784236908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024961836636066437, "rewards/penalized_accuracy_reward/std": 0.0682087242603302, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.140625, "step": 904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1721.5, "completions/max_terminated_length": 1513.25, "completions/mean_length": 1011.984375, "completions/mean_terminated_length": 907.9030151367188, "completions/min_length": 375.25, "completions/min_terminated_length": 375.25, "epoch": 0.4525, "grad_norm": 0.3553853929042816, "kl": 0.038726806640625, "learning_rate": 7.009532063876148e-07, "loss": 0.2005, "num_tokens": 73781277.0, "reward": 0.458984375, "reward_std": 0.09738549217581749, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19477099925279617, "step": 905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1989.25, "completions/max_terminated_length": 1632.5, "completions/mean_length": 992.203125, "completions/mean_terminated_length": 888.6318359375, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.453, "grad_norm": 0.44720035791397095, "kl": 0.04052734375, "learning_rate": 7.002130866551968e-07, "loss": 0.2739, "num_tokens": 73852858.0, "reward": 0.5915529727935791, "reward_std": 0.2788460776209831, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06237805262207985, 
"rewards/penalized_accuracy_reward/std": 0.09555599093437195, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17546816915273666, "step": 906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2002.25, "completions/max_terminated_length": 1250.0, "completions/mean_length": 743.1875, "completions/mean_terminated_length": 672.8354339599609, "completions/min_length": 271.25, "completions/min_terminated_length": 271.25, "epoch": 0.4535, "grad_norm": 0.47011637687683105, "kl": 0.0439453125, "learning_rate": 6.994725093476664e-07, "loss": 0.317, "num_tokens": 73908566.0, "reward": 0.9298354983329773, "reward_std": 0.5999719947576523, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22468337789177895, "rewards/penalized_accuracy_reward/std": 0.2871624156832695, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13270078226923943, "step": 907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1953.25, "completions/max_terminated_length": 1434.75, "completions/mean_length": 791.71875, "completions/mean_terminated_length": 724.1823120117188, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.454, "grad_norm": 0.47029486298561096, "kl": 0.04608154296875, "learning_rate": 6.987314767209503e-07, "loss": 0.2962, "num_tokens": 73973556.0, "reward": 0.5226205885410309, "reward_std": 0.1987179070711136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498217299580574, "rewards/penalized_accuracy_reward/std": 0.06826429814100266, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1613014042377472, "step": 908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1747.25, "completions/max_terminated_length": 1281.0, 
"completions/mean_length": 844.578125, "completions/mean_terminated_length": 719.3609466552734, "completions/min_length": 321.75, "completions/min_terminated_length": 321.75, "epoch": 0.4545, "grad_norm": 0.42405787110328674, "kl": 0.0384521484375, "learning_rate": 6.979899910323624e-07, "loss": 0.1731, "num_tokens": 74035513.0, "reward": 0.6127912402153015, "reward_std": 0.3725506402552128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07495030388236046, "rewards/penalized_accuracy_reward/std": 0.16113832592964172, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1880394071340561, "step": 909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 1203.828125, "completions/mean_terminated_length": 851.1305084228516, "completions/min_length": 276.25, "completions/min_terminated_length": 276.25, "epoch": 0.455, "grad_norm": 0.3040239214897156, "kl": 0.06756591796875, "learning_rate": 6.972480545405968e-07, "loss": 0.2938, "num_tokens": 74128494.0, "reward": 0.4944232851266861, "reward_std": 0.3653462082147598, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049946016632020473, "rewards/penalized_accuracy_reward/std": 0.1304924637079239, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.3147721141576767, "step": 910 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1651.5, "completions/max_terminated_length": 1145.0, "completions/mean_length": 814.78125, "completions/mean_terminated_length": 659.3927154541016, "completions/min_length": 307.5, "completions/min_terminated_length": 307.5, "epoch": 0.4555, "grad_norm": 1.7316147089004517, "kl": 0.096099853515625, "learning_rate": 6.965056695057204e-07, "loss": 0.3338, "num_tokens": 
74191888.0, "reward": 0.8833970427513123, "reward_std": 0.16841546069190372, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21220633387565613, "rewards/penalized_accuracy_reward/std": 0.04995655618404271, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1745435781776905, "step": 911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1936.0, "completions/max_terminated_length": 1657.75, "completions/mean_length": 1023.171875, "completions/mean_terminated_length": 913.7433319091797, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.456, "grad_norm": 0.42997467517852783, "kl": 0.05108642578125, "learning_rate": 6.957628381891673e-07, "loss": 0.3312, "num_tokens": 74265195.0, "reward": 0.4683185815811157, "reward_std": 0.2052464783191681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012479602359235287, "rewards/penalized_accuracy_reward/std": 0.049918413162231445, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.23759080469608307, "step": 912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1453.75, "completions/max_terminated_length": 1433.5, "completions/mean_length": 816.40625, "completions/mean_terminated_length": 738.2864685058594, "completions/min_length": 226.75, "completions/min_terminated_length": 226.75, "epoch": 0.4565, "grad_norm": 0.40601667761802673, "kl": 0.044952392578125, "learning_rate": 6.950195628537299e-07, "loss": 0.0615, "num_tokens": 74326469.0, "reward": 0.6875731572508812, "reward_std": 0.33082032203674316, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11234126705676317, "rewards/penalized_accuracy_reward/std": 0.15301619097590446, "rewards/tag_count_reward/mean": 0.92578125, 
"rewards/tag_count_reward/std": 0.12690627574920654, "step": 913 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1723.25, "completions/mean_length": 1370.09375, "completions/mean_terminated_length": 1013.5958404541016, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "epoch": 0.457, "grad_norm": 0.24621082842350006, "kl": 0.024261474609375, "learning_rate": 6.942758457635543e-07, "loss": 0.2971, "num_tokens": 74425003.0, "reward": 0.4498295783996582, "reward_std": 0.338124617934227, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03741478733718395, "rewards/penalized_accuracy_reward/std": 0.11804086714982986, "rewards/tag_count_reward/mean": 0.75, "rewards/tag_count_reward/std": 0.30840588361024857, "step": 914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1799.5, "completions/max_terminated_length": 1305.5, "completions/mean_length": 704.578125, "completions/mean_terminated_length": 657.4343948364258, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.4575, "grad_norm": 0.5558103919029236, "kl": 0.039703369140625, "learning_rate": 6.935316891841315e-07, "loss": 0.2923, "num_tokens": 74479296.0, "reward": 0.4765625, "reward_std": 0.060294944792985916, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.12058989331126213, "step": 915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1679.25, "completions/max_terminated_length": 1361.5, "completions/mean_length": 837.0625, "completions/mean_terminated_length": 756.9271850585938, "completions/min_length": 283.5, 
"completions/min_terminated_length": 283.5, "epoch": 0.458, "grad_norm": 0.36190739274024963, "kl": 0.03558349609375, "learning_rate": 6.927870953822915e-07, "loss": 0.1999, "num_tokens": 74542244.0, "reward": 0.614659309387207, "reward_std": 0.3598010763525963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490777969360352, "rewards/penalized_accuracy_reward/std": 0.16104690730571747, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16065122932195663, "step": 916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1492.0, "completions/max_terminated_length": 1350.75, "completions/mean_length": 703.171875, "completions/mean_terminated_length": 645.3966369628906, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.4585, "grad_norm": 0.40362706780433655, "kl": 0.036773681640625, "learning_rate": 6.920420666261961e-07, "loss": 0.0472, "num_tokens": 74597823.0, "reward": 0.6572499573230743, "reward_std": 0.31643248349428177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741403743624687, "rewards/penalized_accuracy_reward/std": 0.14981938153505325, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08017472177743912, "step": 917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1565.75, "completions/max_terminated_length": 1206.25, "completions/mean_length": 852.078125, "completions/mean_terminated_length": 649.5965881347656, "completions/min_length": 327.25, "completions/min_terminated_length": 327.25, "epoch": 0.459, "grad_norm": 0.5996440052986145, "kl": 0.049285888671875, "learning_rate": 6.912966051853322e-07, "loss": 0.2025, "num_tokens": 74666228.0, "reward": 0.5990903377532959, "reward_std": 0.32586832344532013, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07493579387664795, "rewards/penalized_accuracy_reward/std": 0.14562638849020004, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.16788379102945328, "step": 918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1708.0, "completions/max_terminated_length": 1573.25, "completions/mean_length": 919.34375, "completions/mean_terminated_length": 817.1131439208984, "completions/min_length": 347.75, "completions/min_terminated_length": 347.75, "epoch": 0.4595, "grad_norm": 0.41150131821632385, "kl": 0.036468505859375, "learning_rate": 6.905507133305047e-07, "loss": 0.2266, "num_tokens": 74731770.0, "reward": 0.599035233259201, "reward_std": 0.28239361196756363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490824908018112, "rewards/penalized_accuracy_reward/std": 0.09987767785787582, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.1652765516191721, "step": 919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1607.75, "completions/max_terminated_length": 1364.75, "completions/mean_length": 741.0, "completions/mean_terminated_length": 681.2449645996094, "completions/min_length": 316.5, "completions/min_terminated_length": 316.5, "epoch": 0.46, "grad_norm": 0.4533882141113281, "kl": 0.03955078125, "learning_rate": 6.898043933338293e-07, "loss": 0.1336, "num_tokens": 74787914.0, "reward": 0.5513983368873596, "reward_std": 0.282422449439764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15779344737529755, "step": 920 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 
1854.0, "completions/max_terminated_length": 1722.0, "completions/mean_length": 1223.515625, "completions/mean_terminated_length": 1078.8294982910156, "completions/min_length": 523.5, "completions/min_terminated_length": 523.5, "epoch": 0.4605, "grad_norm": 0.2389804720878601, "kl": 0.029541015625, "learning_rate": 6.890576474687263e-07, "loss": 0.1191, "num_tokens": 74878907.0, "reward": 0.48312458395957947, "reward_std": 0.2576030530035496, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037460729479789734, "rewards/penalized_accuracy_reward/std": 0.08053815364837646, "rewards/tag_count_reward/mean": 0.81640625, "rewards/tag_count_reward/std": 0.22939562425017357, "step": 921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1217.0, "completions/max_terminated_length": 1103.75, "completions/mean_length": 809.4375, "completions/mean_terminated_length": 699.359375, "completions/min_length": 405.5, "completions/min_terminated_length": 405.5, "epoch": 0.461, "grad_norm": 0.2639961838722229, "kl": 0.0438232421875, "learning_rate": 6.883104780099133e-07, "loss": 0.1007, "num_tokens": 74940455.0, "reward": 0.458984375, "reward_std": 0.05273720622062683, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.10547441244125366, "step": 922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1625.75, "completions/mean_length": 976.109375, "completions/mean_terminated_length": 889.3536376953125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.4615, "grad_norm": 0.6084376573562622, "kl": 0.0538330078125, "learning_rate": 6.875628872333975e-07, "loss": 0.3094, "num_tokens": 
75012414.0, "reward": 0.5337149351835251, "reward_std": 0.25067565590143204, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03736528009176254, "rewards/penalized_accuracy_reward/std": 0.08033294975757599, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22057628631591797, "step": 923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1799.25, "completions/max_terminated_length": 1649.25, "completions/mean_length": 854.734375, "completions/mean_terminated_length": 835.1948089599609, "completions/min_length": 384.25, "completions/min_terminated_length": 384.25, "epoch": 0.462, "grad_norm": 0.38291049003601074, "kl": 0.037933349609375, "learning_rate": 6.868148774164706e-07, "loss": 0.1077, "num_tokens": 75075389.0, "reward": 1.056778222322464, "reward_std": 0.6737663298845291, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.287178173661232, "rewards/penalized_accuracy_reward/std": 0.3304397985339165, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1295.25, "completions/max_terminated_length": 1224.0, "completions/mean_length": 797.5625, "completions/mean_terminated_length": 787.6750183105469, "completions/min_length": 392.75, "completions/min_terminated_length": 392.75, "epoch": 0.4625, "grad_norm": 0.3447692096233368, "kl": 0.03302001953125, "learning_rate": 6.860664508377001e-07, "loss": 0.0957, "num_tokens": 75134897.0, "reward": 0.6880701184272766, "reward_std": 0.44713006913661957, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0998944416642189, "rewards/penalized_accuracy_reward/std": 0.21980088204145432, "rewards/tag_count_reward/mean": 0.9765625, 
"rewards/tag_count_reward/std": 0.06976010836660862, "step": 925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1972.75, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1048.796875, "completions/mean_terminated_length": 983.4989929199219, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "epoch": 0.463, "grad_norm": 0.32676970958709717, "kl": 0.030975341796875, "learning_rate": 6.853176097769228e-07, "loss": 0.1606, "num_tokens": 75211316.0, "reward": 0.47265625, "reward_std": 0.06666003540158272, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13332007452845573, "step": 926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 998.09375, "completions/mean_terminated_length": 861.7521362304688, "completions/min_length": 310.5, "completions/min_terminated_length": 310.5, "epoch": 0.4635, "grad_norm": 0.46280303597450256, "kl": 0.048309326171875, "learning_rate": 6.84568356515239e-07, "loss": 0.3378, "num_tokens": 75282138.0, "reward": 0.5569323748350143, "reward_std": 0.2731237821280956, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995056241750717, "rewards/penalized_accuracy_reward/std": 0.08935429900884628, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22578164935112, "step": 927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1952.5, "completions/max_terminated_length": 1644.0, "completions/mean_length": 1139.5, "completions/mean_terminated_length": 1069.4650115966797, "completions/min_length": 552.25, 
"completions/min_terminated_length": 552.25, "epoch": 0.464, "grad_norm": 25.29310417175293, "kl": 0.63104248046875, "learning_rate": 6.838186933350036e-07, "loss": 0.1407, "num_tokens": 75365546.0, "reward": 0.5148123651742935, "reward_std": 0.1972140111029148, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498430758714676, "rewards/penalized_accuracy_reward/std": 0.06827012449502945, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16657259315252304, "step": 928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1852.75, "completions/max_terminated_length": 1680.25, "completions/mean_length": 956.46875, "completions/mean_terminated_length": 907.8810119628906, "completions/min_length": 386.75, "completions/min_terminated_length": 386.75, "epoch": 0.4645, "grad_norm": 0.3608327805995941, "kl": 0.03173828125, "learning_rate": 6.83068622519821e-07, "loss": 0.15, "num_tokens": 75434136.0, "reward": 0.655168205499649, "reward_std": 0.36275917291641235, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08734972774982452, "rewards/penalized_accuracy_reward/std": 0.16380179673433304, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.08397135883569717, "step": 929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1860.25, "completions/max_terminated_length": 1635.25, "completions/mean_length": 1247.53125, "completions/mean_terminated_length": 1034.109375, "completions/min_length": 612.75, "completions/min_terminated_length": 612.75, "epoch": 0.465, "grad_norm": 0.30971065163612366, "kl": 0.043304443359375, "learning_rate": 6.823181463545366e-07, "loss": 0.0698, "num_tokens": 75521322.0, "reward": 0.5134457126259804, "reward_std": 0.39995156042277813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.06238691508769989, "rewards/penalized_accuracy_reward/std": 0.1802150383591652, "rewards/tag_count_reward/mean": 0.77734375, "rewards/tag_count_reward/std": 0.11429412476718426, "step": 930 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1823.0, "completions/max_terminated_length": 1666.5, "completions/mean_length": 890.65625, "completions/mean_terminated_length": 811.7156524658203, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.4655, "grad_norm": 8.933405876159668, "kl": 0.1126708984375, "learning_rate": 6.815672671252315e-07, "loss": 0.1561, "num_tokens": 75586052.0, "reward": 0.518733024597168, "reward_std": 0.1932173501700163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024991512298583984, "rewards/penalized_accuracy_reward/std": 0.06828982383012772, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15148866176605225, "step": 931 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1943.5, "completions/max_terminated_length": 1856.75, "completions/mean_length": 929.953125, "completions/mean_terminated_length": 870.9251708984375, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.466, "grad_norm": 0.3287794291973114, "kl": 0.0352783203125, "learning_rate": 6.808159871192136e-07, "loss": 0.1886, "num_tokens": 75652625.0, "reward": 0.7993905246257782, "reward_std": 0.2264639399945736, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623905897140503, "rewards/penalized_accuracy_reward/std": 0.08056829869747162, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1440858170390129, "step": 932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.25, 
"completions/max_terminated_length": 1029.25, "completions/mean_length": 525.28125, "completions/mean_terminated_length": 525.28125, "completions/min_length": 241.5, "completions/min_terminated_length": 241.5, "epoch": 0.4665, "grad_norm": 0.2965310215950012, "kl": 0.03045654296875, "learning_rate": 6.800643086250121e-07, "loss": 0.0692, "num_tokens": 75694515.0, "reward": 0.7976270020008087, "reward_std": 0.18644759058952332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14979007840156555, "rewards/penalized_accuracy_reward/std": 0.08931756019592285, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1828.25, "completions/max_terminated_length": 1409.25, "completions/mean_length": 1000.34375, "completions/mean_terminated_length": 874.2134552001953, "completions/min_length": 442.75, "completions/min_terminated_length": 442.75, "epoch": 0.467, "grad_norm": 0.4095255136489868, "kl": 0.0380859375, "learning_rate": 6.793122339323705e-07, "loss": 0.2121, "num_tokens": 75768937.0, "reward": 0.7354792952537537, "reward_std": 0.2766250316053629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13727089762687683, "rewards/penalized_accuracy_reward/std": 0.09558338671922684, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20475103333592415, "step": 934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1515.75, "completions/max_terminated_length": 1465.75, "completions/mean_length": 797.453125, "completions/mean_terminated_length": 766.5223388671875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.4675, "grad_norm": 0.2791343331336975, "kl": 0.03656005859375, "learning_rate": 6.78559765332238e-07, "loss": 
-0.0014, "num_tokens": 75826726.0, "reward": 0.956919938325882, "reward_std": 0.30512047931551933, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.23724904283881187, "rewards/penalized_accuracy_reward/std": 0.13926167786121368, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09803754836320877, "step": 935 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1854.25, "completions/max_terminated_length": 1481.0, "completions/mean_length": 1024.546875, "completions/mean_terminated_length": 891.6151580810547, "completions/min_length": 360.75, "completions/min_terminated_length": 360.75, "epoch": 0.468, "grad_norm": 0.47800424695014954, "kl": 0.04150390625, "learning_rate": 6.778069051167653e-07, "loss": 0.3035, "num_tokens": 75902745.0, "reward": 0.552958533167839, "reward_std": 0.3312564380466938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499167749658227, "rewards/penalized_accuracy_reward/std": 0.130374226719141, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19534877501428127, "step": 936 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1875.25, "completions/max_terminated_length": 1468.75, "completions/mean_length": 979.609375, "completions/mean_terminated_length": 882.3822784423828, "completions/min_length": 433.25, "completions/min_terminated_length": 433.25, "epoch": 0.4685, "grad_norm": 0.39929547905921936, "kl": 0.049560546875, "learning_rate": 6.770536555792944e-07, "loss": 0.2639, "num_tokens": 75973760.0, "reward": 0.46875, "reward_std": 0.07875495962798595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 
0.1575099192559719, "step": 937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1475.25, "completions/mean_length": 961.1875, "completions/mean_terminated_length": 788.8994293212891, "completions/min_length": 420.75, "completions/min_terminated_length": 420.75, "epoch": 0.469, "grad_norm": 0.5052503943443298, "kl": 0.0499267578125, "learning_rate": 6.763000190143545e-07, "loss": 0.2952, "num_tokens": 76048892.0, "reward": 0.6179974675178528, "reward_std": 0.41185810789465904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731904625892639, "rewards/penalized_accuracy_reward/std": 0.16376107931137085, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.25947367399930954, "step": 938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1772.5, "completions/max_terminated_length": 1385.0, "completions/mean_length": 965.78125, "completions/mean_terminated_length": 799.4301300048828, "completions/min_length": 333.5, "completions/min_terminated_length": 333.5, "epoch": 0.4695, "grad_norm": 0.4045129716396332, "kl": 0.038055419921875, "learning_rate": 6.755459977176532e-07, "loss": 0.2632, "num_tokens": 76118750.0, "reward": 0.4971597194671631, "reward_std": 0.28031598031520844, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494704257696867, "rewards/penalized_accuracy_reward/std": 0.09978817030787468, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2365270033478737, "step": 939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1360.75, "completions/max_terminated_length": 1159.5, "completions/mean_length": 748.6875, "completions/mean_terminated_length": 718.3729553222656, "completions/min_length": 418.25, 
"completions/min_terminated_length": 418.25, "epoch": 0.47, "grad_norm": 0.4237317740917206, "kl": 0.041473388671875, "learning_rate": 6.747915939860701e-07, "loss": 0.1264, "num_tokens": 76177850.0, "reward": 0.5841989815235138, "reward_std": 0.22843345999717712, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499119870364666, "rewards/penalized_accuracy_reward/std": 0.08928529918193817, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 940 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1660.25, "completions/max_terminated_length": 1409.25, "completions/mean_length": 813.734375, "completions/mean_terminated_length": 774.8791809082031, "completions/min_length": 300.5, "completions/min_terminated_length": 300.5, "epoch": 0.4705, "grad_norm": 0.49351996183395386, "kl": 0.03643798828125, "learning_rate": 6.740368101176495e-07, "loss": 0.2359, "num_tokens": 76236649.0, "reward": 0.6840329021215439, "reward_std": 0.3911346197128296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09982895851135254, "rewards/penalized_accuracy_reward/std": 0.17857951670885086, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11211910098791122, "step": 941 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1774.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 1189.8125, "completions/mean_terminated_length": 892.7924194335938, "completions/min_length": 479.25, "completions/min_terminated_length": 479.25, "epoch": 0.471, "grad_norm": 0.3656640648841858, "kl": 0.043548583984375, "learning_rate": 6.732816484115946e-07, "loss": 0.1232, "num_tokens": 76323629.0, "reward": 0.41558346152305603, "reward_std": 0.1491929478943348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.012479232624173164, "rewards/penalized_accuracy_reward/std": 0.049916934221982956, "rewards/tag_count_reward/mean": 0.78125, "rewards/tag_count_reward/std": 0.1432051993906498, "step": 942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1786.25, "completions/max_terminated_length": 1563.75, "completions/mean_length": 1051.25, "completions/mean_terminated_length": 919.4403533935547, "completions/min_length": 473.5, "completions/min_terminated_length": 473.5, "epoch": 0.4715, "grad_norm": 0.4714430570602417, "kl": 0.052734375, "learning_rate": 6.725261111682584e-07, "loss": 0.1938, "num_tokens": 76399165.0, "reward": 0.4683106243610382, "reward_std": 0.17457574233412743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012475625611841679, "rewards/penalized_accuracy_reward/std": 0.04990250617265701, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.20362093299627304, "step": 943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2001.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 1084.078125, "completions/mean_terminated_length": 952.2377777099609, "completions/min_length": 421.75, "completions/min_terminated_length": 421.75, "epoch": 0.472, "grad_norm": 0.3961322009563446, "kl": 0.036346435546875, "learning_rate": 6.717702006891386e-07, "loss": 0.2666, "num_tokens": 76478082.0, "reward": 0.4453125, "reward_std": 0.10758432000875473, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21516864374279976, "step": 944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1645.0, "completions/max_terminated_length": 
1230.0, "completions/mean_length": 730.328125, "completions/mean_terminated_length": 690.152099609375, "completions/min_length": 267.5, "completions/min_terminated_length": 267.5, "epoch": 0.4725, "grad_norm": 0.5115809440612793, "kl": 0.04437255859375, "learning_rate": 6.710139192768694e-07, "loss": 0.1935, "num_tokens": 76536663.0, "reward": 0.6341219544410706, "reward_std": 0.2503880597651005, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07487349212169647, "rewards/penalized_accuracy_reward/std": 0.09983134269714355, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1667.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 823.296875, "completions/mean_terminated_length": 787.4062652587891, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.473, "grad_norm": 0.30688801407814026, "kl": 0.039886474609375, "learning_rate": 6.702572692352155e-07, "loss": 0.0878, "num_tokens": 76599418.0, "reward": 0.6932358741760254, "reward_std": 0.3462311215698719, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11224293895065784, "rewards/penalized_accuracy_reward/std": 0.15290019288659096, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.10735589265823364, "step": 946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1968.5, "completions/max_terminated_length": 1818.5, "completions/mean_length": 1340.390625, "completions/mean_terminated_length": 1131.3905639648438, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.4735, "grad_norm": 0.27965524792671204, "kl": 0.0252685546875, "learning_rate": 6.695002528690639e-07, "loss": 0.1666, "num_tokens": 
76695715.0, "reward": 0.5847046971321106, "reward_std": 0.2986071724444628, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0872742310166359, "rewards/penalized_accuracy_reward/std": 0.10220520198345184, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.2200508452951908, "step": 947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1908.5, "completions/max_terminated_length": 1461.75, "completions/mean_length": 989.5, "completions/mean_terminated_length": 803.9111633300781, "completions/min_length": 339.5, "completions/min_terminated_length": 339.5, "epoch": 0.474, "grad_norm": 1.1408426761627197, "kl": 0.056549072265625, "learning_rate": 6.687428724844179e-07, "loss": 0.2698, "num_tokens": 76767651.0, "reward": 0.4566120579838753, "reward_std": 0.18596418760716915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485715560615063, "rewards/penalized_accuracy_reward/std": 0.04994286224246025, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.22485998645424843, "step": 948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1775.25, "completions/max_terminated_length": 1580.75, "completions/mean_length": 1042.515625, "completions/mean_terminated_length": 937.3542022705078, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.4745, "grad_norm": 0.22320182621479034, "kl": 0.0360107421875, "learning_rate": 6.679851303883891e-07, "loss": 0.1093, "num_tokens": 76845172.0, "reward": 0.5396399199962616, "reward_std": 0.2035933267325163, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0373980849981308, "rewards/penalized_accuracy_reward/std": 0.08040352165699005, "rewards/tag_count_reward/mean": 0.9296875, 
"rewards/tag_count_reward/std": 0.13582947477698326, "step": 949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1945.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 1051.484375, "completions/mean_terminated_length": 925.6875305175781, "completions/min_length": 326.75, "completions/min_terminated_length": 326.75, "epoch": 0.475, "grad_norm": 0.3846583664417267, "kl": 0.03643798828125, "learning_rate": 6.672270288891918e-07, "loss": 0.2512, "num_tokens": 76920355.0, "reward": 0.5278931707143784, "reward_std": 0.24744723364710808, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03738408163189888, "rewards/penalized_accuracy_reward/std": 0.0803733691573143, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1993006393313408, "step": 950 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1609.25, "completions/mean_length": 1278.578125, "completions/mean_terminated_length": 1013.4362182617188, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.4755, "grad_norm": 0.3840214014053345, "kl": 0.029327392578125, "learning_rate": 6.664685702961344e-07, "loss": 0.2874, "num_tokens": 77009704.0, "reward": 0.404296875, "reward_std": 0.1247348040342331, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.2494696080684662, "step": 951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1795.5, "completions/max_terminated_length": 1548.25, "completions/mean_length": 890.1875, "completions/mean_terminated_length": 799.3366241455078, "completions/min_length": 292.25, 
"completions/min_terminated_length": 292.25, "epoch": 0.476, "grad_norm": 0.4384143352508545, "kl": 0.029632568359375, "learning_rate": 6.657097569196133e-07, "loss": 0.1766, "num_tokens": 77077076.0, "reward": 0.6185539662837982, "reward_std": 0.3552247639745474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490198407322168, "rewards/penalized_accuracy_reward/std": 0.14555081352591515, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16667437925934792, "step": 952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1565.75, "completions/max_terminated_length": 1338.75, "completions/mean_length": 894.265625, "completions/mean_terminated_length": 764.34375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.4765, "grad_norm": 0.46649369597435, "kl": 0.0435791015625, "learning_rate": 6.649505910711058e-07, "loss": 0.227, "num_tokens": 77144581.0, "reward": 0.5280373692512512, "reward_std": 0.3019108921289444, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745618276298046, "rewards/penalized_accuracy_reward/std": 0.11818651109933853, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.18681886047124863, "step": 953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1959.25, "completions/max_terminated_length": 1439.0, "completions/mean_length": 871.375, "completions/mean_terminated_length": 782.9589233398438, "completions/min_length": 367.75, "completions/min_terminated_length": 367.75, "epoch": 0.477, "grad_norm": 0.41770243644714355, "kl": 0.037567138671875, "learning_rate": 6.641910750631626e-07, "loss": 0.2508, "num_tokens": 77212541.0, "reward": 0.6242839843034744, "reward_std": 0.338595949113369, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.07483730372041464, "rewards/penalized_accuracy_reward/std": 0.1454489454627037, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.109375, "step": 954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1774.75, "completions/max_terminated_length": 1558.75, "completions/mean_length": 1043.421875, "completions/mean_terminated_length": 932.6753997802734, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "epoch": 0.4775, "grad_norm": 0.30176833271980286, "kl": 0.032318115234375, "learning_rate": 6.634312112094013e-07, "loss": 0.1443, "num_tokens": 77288008.0, "reward": 0.5627230703830719, "reward_std": 0.33411380648612976, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499162208288908, "rewards/penalized_accuracy_reward/std": 0.1363971084356308, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.12263917922973633, "step": 955 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1580.25, "completions/mean_length": 1099.265625, "completions/mean_terminated_length": 856.1712188720703, "completions/min_length": 394.25, "completions/min_terminated_length": 394.25, "epoch": 0.478, "grad_norm": 0.5318382978439331, "kl": 0.063232421875, "learning_rate": 6.626710018244987e-07, "loss": 0.3774, "num_tokens": 77369257.0, "reward": 0.421875, "reward_std": 0.13675822876393795, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2735164575278759, "step": 956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1713.75, "completions/max_terminated_length": 
1511.0, "completions/mean_length": 910.90625, "completions/mean_terminated_length": 851.6826019287109, "completions/min_length": 386.75, "completions/min_terminated_length": 386.75, "epoch": 0.4785, "grad_norm": 0.40677565336227417, "kl": 0.0308837890625, "learning_rate": 6.619104492241847e-07, "loss": 0.2407, "num_tokens": 77436563.0, "reward": 1.0162328481674194, "reward_std": 0.43254092521965504, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2747179791331291, "rewards/penalized_accuracy_reward/std": 0.18925581127405167, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18460387364029884, "step": 957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1615.0, "completions/max_terminated_length": 1545.5, "completions/mean_length": 1066.140625, "completions/mean_terminated_length": 900.6062622070312, "completions/min_length": 328.5, "completions/min_terminated_length": 328.5, "epoch": 0.479, "grad_norm": 0.42275962233543396, "kl": 0.03216552734375, "learning_rate": 6.611495557252344e-07, "loss": 0.2229, "num_tokens": 77513628.0, "reward": 0.8621268421411514, "reward_std": 0.17632184327521827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21231341920793056, "rewards/penalized_accuracy_reward/std": 0.050037106600939296, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.17926881089806557, "step": 958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1884.0, "completions/max_terminated_length": 1732.5, "completions/mean_length": 1130.265625, "completions/mean_terminated_length": 1008.9330596923828, "completions/min_length": 547.5, "completions/min_terminated_length": 547.5, "epoch": 0.4795, "grad_norm": 0.34551897644996643, "kl": 0.035736083984375, "learning_rate": 6.603883236454612e-07, "loss": 0.2517, 
"num_tokens": 77597469.0, "reward": 0.5318502187728882, "reward_std": 0.25821051746606827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740948066115379, "rewards/penalized_accuracy_reward/std": 0.08042796701192856, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1947091706097126, "step": 959 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1571.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 781.421875, "completions/mean_terminated_length": 639.9779815673828, "completions/min_length": 275.5, "completions/min_terminated_length": 275.5, "epoch": 0.48, "grad_norm": 0.4649176001548767, "kl": 0.051025390625, "learning_rate": 6.596267553037102e-07, "loss": 0.2658, "num_tokens": 77659768.0, "reward": 0.5550170391798019, "reward_std": 0.25017984211444855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996946081519127, "rewards/penalized_accuracy_reward/std": 0.08938808739185333, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.19068621844053268, "step": 960 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1280.5, "completions/mean_length": 916.546875, "completions/mean_terminated_length": 673.8084716796875, "completions/min_length": 263.5, "completions/min_terminated_length": 263.5, "epoch": 0.4805, "grad_norm": 0.4903619587421417, "kl": 0.059600830078125, "learning_rate": 6.588648530198504e-07, "loss": 0.4751, "num_tokens": 77727579.0, "reward": 0.5275617241859436, "reward_std": 0.294077355414629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499136745929718, "rewards/penalized_accuracy_reward/std": 0.08928830921649933, "rewards/tag_count_reward/mean": 
0.85546875, "rewards/tag_count_reward/std": 0.2751017287373543, "step": 961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1974.5, "completions/max_terminated_length": 1552.25, "completions/mean_length": 1019.265625, "completions/mean_terminated_length": 836.7160415649414, "completions/min_length": 323.25, "completions/min_terminated_length": 323.25, "epoch": 0.481, "grad_norm": 0.47484391927719116, "kl": 0.06072998046875, "learning_rate": 6.581026191147687e-07, "loss": 0.3066, "num_tokens": 77805916.0, "reward": 0.48549312353134155, "reward_std": 0.2553938813507557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024973122403025627, "rewards/penalized_accuracy_reward/std": 0.09989249333739281, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.21236922964453697, "step": 962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1622.75, "completions/max_terminated_length": 1301.75, "completions/mean_length": 728.5, "completions/mean_terminated_length": 643.9552230834961, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4815, "grad_norm": 2.959747314453125, "kl": 0.037506103515625, "learning_rate": 6.573400559103613e-07, "loss": -0.0305, "num_tokens": 77869084.0, "reward": 0.380859375, "reward_std": 0.09077189117670059, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.76171875, "rewards/tag_count_reward/std": 0.18154378235340118, "step": 963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1880.75, "completions/max_terminated_length": 1660.0, "completions/mean_length": 975.828125, "completions/mean_terminated_length": 902.3624420166016, "completions/min_length": 479.5, 
"completions/min_terminated_length": 479.5, "epoch": 0.482, "grad_norm": 0.36452457308769226, "kl": 0.035125732421875, "learning_rate": 6.565771657295285e-07, "loss": 0.2403, "num_tokens": 77939249.0, "reward": 0.695393979549408, "reward_std": 0.27957817912101746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11234541982412338, "rewards/penalized_accuracy_reward/std": 0.10232875496149063, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.14984130859375, "step": 964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 1049.53125, "completions/mean_terminated_length": 891.1526336669922, "completions/min_length": 404.75, "completions/min_terminated_length": 404.75, "epoch": 0.4825, "grad_norm": 0.3706720769405365, "kl": 0.05120849609375, "learning_rate": 6.558139508961654e-07, "loss": 0.2077, "num_tokens": 78019683.0, "reward": 0.5142548829317093, "reward_std": 0.3036527056246996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.25008493661880493, "step": 965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1824.25, "completions/max_terminated_length": 1417.75, "completions/mean_length": 1045.265625, "completions/mean_terminated_length": 867.4648895263672, "completions/min_length": 263.75, "completions/min_terminated_length": 263.75, "epoch": 0.483, "grad_norm": 0.33797380328178406, "kl": 0.03887939453125, "learning_rate": 6.550504137351575e-07, "loss": 0.2939, "num_tokens": 78097492.0, "reward": 0.6028713434934616, "reward_std": 0.2688403092324734, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0748731717467308, "rewards/penalized_accuracy_reward/std": 0.0998309776186943, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.16845724917948246, "step": 966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1936.0, "completions/max_terminated_length": 1797.5, "completions/mean_length": 1093.109375, "completions/mean_terminated_length": 962.9217376708984, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.4835, "grad_norm": 0.32223188877105713, "kl": 0.034515380859375, "learning_rate": 6.542865565723707e-07, "loss": 0.2384, "num_tokens": 78175691.0, "reward": 0.4912979304790497, "reward_std": 0.23424532264471054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024945836514234543, "rewards/penalized_accuracy_reward/std": 0.06816502660512924, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2500963918864727, "step": 967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.25, "completions/mean_length": 1059.46875, "completions/mean_terminated_length": 953.0000762939453, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.484, "grad_norm": 0.3556400239467621, "kl": 0.040191650390625, "learning_rate": 6.53522381734647e-07, "loss": 0.2352, "num_tokens": 78250841.0, "reward": 0.8393541276454926, "reward_std": 0.44332002103328705, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18725517019629478, "rewards/penalized_accuracy_reward/std": 0.18492865562438965, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.21390652284026146, "step": 968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 
1723.25, "completions/max_terminated_length": 1375.25, "completions/mean_length": 864.3125, "completions/mean_terminated_length": 790.8384399414062, "completions/min_length": 363.25, "completions/min_terminated_length": 363.25, "epoch": 0.4845, "grad_norm": 0.43303635716438293, "kl": 0.042388916015625, "learning_rate": 6.527578915497951e-07, "loss": 0.2387, "num_tokens": 78315373.0, "reward": 0.462890625, "reward_std": 0.08695381693542004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17390763387084007, "step": 969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.75, "completions/max_terminated_length": 1225.75, "completions/mean_length": 773.21875, "completions/mean_terminated_length": 773.21875, "completions/min_length": 383.25, "completions/min_terminated_length": 383.25, "epoch": 0.485, "grad_norm": 0.3109491467475891, "kl": 0.035430908203125, "learning_rate": 6.519930883465847e-07, "loss": 0.092, "num_tokens": 78375099.0, "reward": 0.6647911667823792, "reward_std": 0.31840643659234047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08727839589118958, "rewards/penalized_accuracy_reward/std": 0.14958390593528748, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06524410098791122, "step": 970 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1370.0, "completions/max_terminated_length": 1283.75, "completions/mean_length": 884.40625, "completions/mean_terminated_length": 812.1812438964844, "completions/min_length": 374.5, "completions/min_terminated_length": 374.5, "epoch": 0.4855, "grad_norm": 0.5134212374687195, "kl": 0.039581298828125, "learning_rate": 6.512279744547392e-07, "loss": 0.0714, 
"num_tokens": 78442965.0, "reward": 0.6686618030071259, "reward_std": 0.25960060581564903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09995590150356293, "rewards/penalized_accuracy_reward/std": 0.10323400795459747, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.10626518912613392, "step": 971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.5, "completions/mean_length": 1250.234375, "completions/mean_terminated_length": 1033.1483917236328, "completions/min_length": 510.5, "completions/min_terminated_length": 510.5, "epoch": 0.486, "grad_norm": 0.29738444089889526, "kl": 0.037689208984375, "learning_rate": 6.50462552204928e-07, "loss": 0.2291, "num_tokens": 78533268.0, "reward": 0.6696784794330597, "reward_std": 0.46219057589769363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12487829849123955, "rewards/penalized_accuracy_reward/std": 0.18293289840221405, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.27272310480475426, "step": 972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1982.0, "completions/max_terminated_length": 1543.5, "completions/mean_length": 1010.40625, "completions/mean_terminated_length": 849.5384826660156, "completions/min_length": 376.25, "completions/min_terminated_length": 376.25, "epoch": 0.4865, "grad_norm": 0.3877555727958679, "kl": 0.0411376953125, "learning_rate": 6.496968239287603e-07, "loss": 0.2257, "num_tokens": 78608398.0, "reward": 0.5220916569232941, "reward_std": 0.312452370300889, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03741301503032446, "rewards/penalized_accuracy_reward/std": 0.11806262284517288, "rewards/tag_count_reward/mean": 
0.89453125, "rewards/tag_count_reward/std": 0.23484240099787712, "step": 973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1638.5, "completions/max_terminated_length": 1319.25, "completions/mean_length": 692.328125, "completions/mean_terminated_length": 632.9531555175781, "completions/min_length": 307.75, "completions/min_terminated_length": 307.75, "epoch": 0.487, "grad_norm": 0.6318943500518799, "kl": 0.033599853515625, "learning_rate": 6.489307919587769e-07, "loss": 0.3466, "num_tokens": 78660291.0, "reward": 0.7627832293510437, "reward_std": 0.23801366984844208, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13725098967552185, "rewards/penalized_accuracy_reward/std": 0.09556933492422104, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1524.75, "completions/mean_length": 1056.359375, "completions/mean_terminated_length": 818.5320281982422, "completions/min_length": 347.75, "completions/min_terminated_length": 347.75, "epoch": 0.4875, "grad_norm": 0.4565117359161377, "kl": 0.04730224609375, "learning_rate": 6.481644586284442e-07, "loss": 0.4154, "num_tokens": 78735450.0, "reward": 0.7063267827033997, "reward_std": 0.31241485849022865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13734307885169983, "rewards/penalized_accuracy_reward/std": 0.09563349932432175, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.2670883461833, "step": 975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1627.5, "completions/mean_length": 1077.90625, "completions/mean_terminated_length": 962.1988372802734, 
"completions/min_length": 364.5, "completions/min_terminated_length": 364.5, "epoch": 0.488, "grad_norm": 0.5567187070846558, "kl": 0.04840087890625, "learning_rate": 6.473978262721463e-07, "loss": 0.2716, "num_tokens": 78817012.0, "reward": 0.4991767704486847, "reward_std": 0.22915126755833626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0249790046364069, "rewards/penalized_accuracy_reward/std": 0.06825564056634903, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.23225072026252747, "step": 976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1448.25, "completions/max_terminated_length": 928.0, "completions/mean_length": 675.171875, "completions/mean_terminated_length": 529.0238800048828, "completions/min_length": 235.5, "completions/min_terminated_length": 235.5, "epoch": 0.4885, "grad_norm": 0.40632644295692444, "kl": 0.06201171875, "learning_rate": 6.466308972251785e-07, "loss": 0.2745, "num_tokens": 78868495.0, "reward": 1.0045007467269897, "reward_std": 0.4522459898144007, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.27471132576465607, "rewards/penalized_accuracy_reward/std": 0.1892596036195755, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18064343929290771, "step": 977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1638.75, "completions/max_terminated_length": 1347.75, "completions/mean_length": 853.875, "completions/mean_terminated_length": 797.9903411865234, "completions/min_length": 328.75, "completions/min_terminated_length": 328.75, "epoch": 0.489, "grad_norm": 0.35597723722457886, "kl": 0.032684326171875, "learning_rate": 6.458636738237395e-07, "loss": 0.143, "num_tokens": 78932087.0, "reward": 0.6800364255905151, "reward_std": 0.335312195122242, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09978383406996727, "rewards/penalized_accuracy_reward/std": 0.15219174325466156, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10415080189704895, "step": 978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1737.75, "completions/max_terminated_length": 1450.25, "completions/mean_length": 807.25, "completions/mean_terminated_length": 692.4429321289062, "completions/min_length": 282.25, "completions/min_terminated_length": 282.25, "epoch": 0.4895, "grad_norm": 0.5782247185707092, "kl": 0.04315185546875, "learning_rate": 6.45096158404925e-07, "loss": 0.2923, "num_tokens": 78991383.0, "reward": 0.4781160056591034, "reward_std": 0.19124192371964455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249550562351942, "rewards/penalized_accuracy_reward/std": 0.04998202621936798, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.18255575001239777, "step": 979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1499.0, "completions/max_terminated_length": 1182.75, "completions/mean_length": 657.890625, "completions/mean_terminated_length": 590.0989685058594, "completions/min_length": 275.75, "completions/min_terminated_length": 275.75, "epoch": 0.49, "grad_norm": 0.4407811164855957, "kl": 0.0537109375, "learning_rate": 6.443283533067198e-07, "loss": 0.2723, "num_tokens": 79043216.0, "reward": 0.474609375, "reward_std": 0.05902016907930374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11804034188389778, "step": 980 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 
1648.25, "completions/max_terminated_length": 1304.25, "completions/mean_length": 758.3125, "completions/mean_terminated_length": 648.1755523681641, "completions/min_length": 207.25, "completions/min_terminated_length": 207.25, "epoch": 0.4905, "grad_norm": 0.46231570839881897, "kl": 0.04888916015625, "learning_rate": 6.435602608679916e-07, "loss": 0.2248, "num_tokens": 79098052.0, "reward": 0.48395924270153046, "reward_std": 0.16657893359661102, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012487435713410378, "rewards/penalized_accuracy_reward/std": 0.04994974657893181, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18082401901483536, "step": 981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1702.75, "completions/max_terminated_length": 1686.0, "completions/mean_length": 876.515625, "completions/mean_terminated_length": 803.71875, "completions/min_length": 410.25, "completions/min_terminated_length": 410.25, "epoch": 0.491, "grad_norm": 0.3127359449863434, "kl": 0.029266357421875, "learning_rate": 6.427918834284834e-07, "loss": 0.1191, "num_tokens": 79162197.0, "reward": 0.8511944562196732, "reward_std": 0.3185293674468994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18731597810983658, "rewards/penalized_accuracy_reward/std": 0.14869160950183868, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09375, "step": 982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1693.75, "completions/max_terminated_length": 1549.25, "completions/mean_length": 707.03125, "completions/mean_terminated_length": 686.1614685058594, "completions/min_length": 291.5, "completions/min_terminated_length": 291.5, "epoch": 0.4915, "grad_norm": 0.5435469746589661, "kl": 0.05206298828125, "learning_rate": 
6.420232233288055e-07, "loss": 0.0564, "num_tokens": 79215495.0, "reward": 0.7572470754384995, "reward_std": 0.49062396213412285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13741259183734655, "rewards/penalized_accuracy_reward/std": 0.23929088562726974, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08159362338483334, "step": 983 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1780.5, "completions/max_terminated_length": 1473.75, "completions/mean_length": 890.96875, "completions/mean_terminated_length": 822.8723602294922, "completions/min_length": 354.75, "completions/min_terminated_length": 354.75, "epoch": 0.492, "grad_norm": 0.41063132882118225, "kl": 0.0413818359375, "learning_rate": 6.412542829104306e-07, "loss": 0.2391, "num_tokens": 79284581.0, "reward": 0.497632771730423, "reward_std": 0.17332101613283157, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012488256208598614, "rewards/penalized_accuracy_reward/std": 0.049953024834394455, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14682991802692413, "step": 984 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1539.5, "completions/mean_length": 1009.0625, "completions/mean_terminated_length": 903.8321990966797, "completions/min_length": 367.5, "completions/min_terminated_length": 367.5, "epoch": 0.4925, "grad_norm": 0.5502820014953613, "kl": 0.03765869140625, "learning_rate": 6.404850645156841e-07, "loss": 0.2766, "num_tokens": 79361673.0, "reward": 0.5186351537704468, "reward_std": 0.20844950154423714, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024942580610513687, "rewards/penalized_accuracy_reward/std": 
0.06815611571073532, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17186805978417397, "step": 985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1396.25, "completions/max_terminated_length": 1298.0, "completions/mean_length": 721.0, "completions/mean_terminated_length": 684.1071472167969, "completions/min_length": 265.75, "completions/min_terminated_length": 265.75, "epoch": 0.493, "grad_norm": 0.4838447868824005, "kl": 0.04443359375, "learning_rate": 6.397155704877388e-07, "loss": 0.0976, "num_tokens": 79414633.0, "reward": 0.5592676103115082, "reward_std": 0.19660865887999535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744630143046379, "rewards/penalized_accuracy_reward/std": 0.08050712943077087, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08450498431921005, "step": 986 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1627.5, "completions/max_terminated_length": 1363.5, "completions/mean_length": 605.640625, "completions/mean_terminated_length": 581.4906311035156, "completions/min_length": 236.75, "completions/min_terminated_length": 236.75, "epoch": 0.4935, "grad_norm": 0.47245657444000244, "kl": 0.0439453125, "learning_rate": 6.389458031706068e-07, "loss": 0.0264, "num_tokens": 79464450.0, "reward": 0.6401020586490631, "reward_std": 0.38575369119644165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07493384554982185, "rewards/penalized_accuracy_reward/std": 0.1892768181860447, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1341.5, "completions/max_terminated_length": 1315.5, "completions/mean_length": 757.8125, 
"completions/mean_terminated_length": 746.0354309082031, "completions/min_length": 378.75, "completions/min_terminated_length": 378.75, "epoch": 0.494, "grad_norm": 0.8407354950904846, "kl": 0.0723876953125, "learning_rate": 6.381757649091329e-07, "loss": 0.0133, "num_tokens": 79522646.0, "reward": 0.490234375, "reward_std": 0.0390625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1713.25, "completions/max_terminated_length": 1282.5, "completions/mean_length": 686.421875, "completions/mean_terminated_length": 597.6007690429688, "completions/min_length": 245.75, "completions/min_terminated_length": 245.75, "epoch": 0.4945, "grad_norm": 0.5135132074356079, "kl": 0.06170654296875, "learning_rate": 6.374054580489873e-07, "loss": 0.2224, "num_tokens": 79579073.0, "reward": 0.7473484724760056, "reward_std": 0.5045844838023186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13734611123800278, "rewards/penalized_accuracy_reward/std": 0.23284417763352394, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17495574057102203, "step": 989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1436.75, "completions/max_terminated_length": 1213.5, "completions/mean_length": 688.34375, "completions/mean_terminated_length": 649.9354400634766, "completions/min_length": 345.25, "completions/min_terminated_length": 345.25, "epoch": 0.495, "grad_norm": 0.5993620157241821, "kl": 0.04339599609375, "learning_rate": 6.366348849366583e-07, "loss": 0.0932, "num_tokens": 79631703.0, "reward": 0.6552124917507172, "reward_std": 0.4524923264980316, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08737187832593918, "rewards/penalized_accuracy_reward/std": 0.2169223129749298, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12235792353749275, "step": 990 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2018.5, "completions/max_terminated_length": 1768.5, "completions/mean_length": 1016.65625, "completions/mean_terminated_length": 952.4613494873047, "completions/min_length": 331.5, "completions/min_terminated_length": 331.5, "epoch": 0.4955, "grad_norm": 0.40146422386169434, "kl": 0.035614013671875, "learning_rate": 6.358640479194451e-07, "loss": 0.146, "num_tokens": 79705441.0, "reward": 0.5606954246759415, "reward_std": 0.2570592798292637, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04987895488739014, "rewards/penalized_accuracy_reward/std": 0.08922623097896576, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1957952082157135, "step": 991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1258.0, "completions/max_terminated_length": 1027.5, "completions/mean_length": 563.890625, "completions/mean_terminated_length": 541.6927185058594, "completions/min_length": 231.5, "completions/min_terminated_length": 231.5, "epoch": 0.496, "grad_norm": 0.5492409467697144, "kl": 0.044525146484375, "learning_rate": 6.35092949345451e-07, "loss": 0.0285, "num_tokens": 79750986.0, "reward": 0.5054555237293243, "reward_std": 0.12943263351917267, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249338872730732, "rewards/penalized_accuracy_reward/std": 0.04997355490922928, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.0923399031162262, "step": 992 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.046875, "completions/max_length": 1699.25, "completions/max_terminated_length": 1428.5, "completions/mean_length": 722.40625, "completions/mean_terminated_length": 659.7227783203125, "completions/min_length": 293.75, "completions/min_terminated_length": 293.75, "epoch": 0.4965, "grad_norm": 0.5145401954650879, "kl": 0.03204345703125, "learning_rate": 6.343215915635761e-07, "loss": 0.3228, "num_tokens": 79805396.0, "reward": 0.7682163119316101, "reward_std": 0.35173944756388664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14973315596580505, "rewards/penalized_accuracy_reward/std": 0.14548839256167412, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18760817870497704, "step": 993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1861.5, "completions/max_terminated_length": 1328.0, "completions/mean_length": 811.328125, "completions/mean_terminated_length": 731.4308319091797, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.497, "grad_norm": 0.5266659259796143, "kl": 0.049652099609375, "learning_rate": 6.335499769235098e-07, "loss": 0.2243, "num_tokens": 79868793.0, "reward": 0.51283760368824, "reward_std": 0.26160251908004284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024973484687507153, "rewards/penalized_accuracy_reward/std": 0.09989394247531891, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1751544177532196, "step": 994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.75, "completions/max_terminated_length": 1262.75, "completions/mean_length": 585.015625, "completions/mean_terminated_length": 585.015625, "completions/min_length": 263.5, "completions/min_terminated_length": 263.5, "epoch": 0.4975, "grad_norm": 
0.6080282330513, "kl": 0.0438232421875, "learning_rate": 6.327781077757241e-07, "loss": 0.059, "num_tokens": 79915946.0, "reward": 0.9188090264797211, "reward_std": 0.38463154435157776, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21233420819044113, "rewards/penalized_accuracy_reward/std": 0.1850016638636589, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 995 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1856.0, "completions/max_terminated_length": 1208.25, "completions/mean_length": 740.828125, "completions/mean_terminated_length": 649.5067138671875, "completions/min_length": 275.5, "completions/min_terminated_length": 275.5, "epoch": 0.498, "grad_norm": 0.7049466371536255, "kl": 0.0538330078125, "learning_rate": 6.320059864714664e-07, "loss": 0.4498, "num_tokens": 79973103.0, "reward": 0.46484375, "reward_std": 0.10409492254257202, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.20818985998630524, "step": 996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1621.25, "completions/max_terminated_length": 1305.5, "completions/mean_length": 845.8125, "completions/mean_terminated_length": 767.1049194335938, "completions/min_length": 241.5, "completions/min_terminated_length": 241.5, "epoch": 0.4985, "grad_norm": 0.5689443945884705, "kl": 0.049835205078125, "learning_rate": 6.31233615362752e-07, "loss": 0.2405, "num_tokens": 80039651.0, "reward": 0.5148210823535919, "reward_std": 0.21331962198019028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024988669902086258, "rewards/penalized_accuracy_reward/std": 
0.06828204542398453, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19530896097421646, "step": 997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1780.5, "completions/max_terminated_length": 1444.25, "completions/mean_length": 798.234375, "completions/mean_terminated_length": 736.6480865478516, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.499, "grad_norm": 0.49004602432250977, "kl": 0.04449462890625, "learning_rate": 6.304609968023572e-07, "loss": 0.1414, "num_tokens": 80100258.0, "reward": 0.8281608819961548, "reward_std": 0.4010786935687065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1748226396739483, "rewards/penalized_accuracy_reward/std": 0.1761467531323433, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12236407771706581, "step": 998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1578.0, "completions/max_terminated_length": 1366.5, "completions/mean_length": 683.984375, "completions/mean_terminated_length": 639.5479431152344, "completions/min_length": 251.75, "completions/min_terminated_length": 251.75, "epoch": 0.4995, "grad_norm": 0.45918557047843933, "kl": 0.03717041015625, "learning_rate": 6.296881331438126e-07, "loss": 0.1683, "num_tokens": 80153089.0, "reward": 0.8531024903059006, "reward_std": 0.3439999707043171, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18729343079030514, "rewards/penalized_accuracy_reward/std": 0.14881925284862518, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14832578226923943, "step": 999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1622.25, "completions/max_terminated_length": 1408.75, "completions/mean_length": 929.96875, 
"completions/mean_terminated_length": 733.9135437011719, "completions/min_length": 375.5, "completions/min_terminated_length": 375.5, "epoch": 0.5, "grad_norm": 0.3790642023086548, "kl": 0.054412841796875, "learning_rate": 6.289150267413942e-07, "loss": 0.1703, "num_tokens": 80222351.0, "reward": 0.42578125, "reward_std": 0.06492474302649498, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.12984948605298996, "step": 1000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1757.75, "completions/max_terminated_length": 1488.0, "completions/mean_length": 762.5625, "completions/mean_terminated_length": 635.0349578857422, "completions/min_length": 312.5, "completions/min_terminated_length": 312.5, "epoch": 0.5005, "grad_norm": 0.3926351070404053, "kl": 0.04876708984375, "learning_rate": 6.281416799501187e-07, "loss": 0.3468, "num_tokens": 80282403.0, "reward": 0.756243109703064, "reward_std": 0.38211724534630775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14960592612624168, "rewards/penalized_accuracy_reward/std": 0.16790466010570526, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1443345732986927, "step": 1001 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1702.75, "completions/mean_length": 948.75, "completions/mean_terminated_length": 836.2619323730469, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.501, "grad_norm": 0.4110569357872009, "kl": 0.0430908203125, "learning_rate": 6.273680951257342e-07, "loss": 0.2465, "num_tokens": 80353331.0, "reward": 0.5627556890249252, "reward_std": 0.32377340644598007, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993253666907549, "rewards/penalized_accuracy_reward/std": 0.1304669752717018, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2106337696313858, "step": 1002 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1554.75, "completions/max_terminated_length": 1292.25, "completions/mean_length": 840.84375, "completions/mean_terminated_length": 747.5474090576172, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.5015, "grad_norm": 0.4723946452140808, "kl": 0.041748046875, "learning_rate": 6.265942746247146e-07, "loss": 0.1918, "num_tokens": 80418169.0, "reward": 0.6279982328414917, "reward_std": 0.35267777368426323, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08743661362677813, "rewards/penalized_accuracy_reward/std": 0.14989228174090385, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21739886701107025, "step": 1003 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1731.75, "completions/max_terminated_length": 1438.0, "completions/mean_length": 1024.890625, "completions/mean_terminated_length": 904.1820678710938, "completions/min_length": 536.75, "completions/min_terminated_length": 536.75, "epoch": 0.502, "grad_norm": 0.32954949140548706, "kl": 0.05230712890625, "learning_rate": 6.258202208042511e-07, "loss": 0.1658, "num_tokens": 80493602.0, "reward": 0.4933355152606964, "reward_std": 0.20644189976155758, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024988070130348206, "rewards/penalized_accuracy_reward/std": 0.0682804062962532, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.18155991658568382, "step": 1004 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1360.25, "completions/max_terminated_length": 1360.25, "completions/mean_length": 628.046875, "completions/mean_terminated_length": 628.046875, "completions/min_length": 286.5, "completions/min_terminated_length": 286.5, "epoch": 0.5025, "grad_norm": 0.4514228105545044, "kl": 0.029296875, "learning_rate": 6.25045936022246e-07, "loss": 0.2793, "num_tokens": 80540869.0, "reward": 1.0201766043901443, "reward_std": 0.6158956736326218, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2620414271950722, "rewards/penalized_accuracy_reward/std": 0.3001353368163109, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1713.75, "completions/max_terminated_length": 1430.5, "completions/mean_length": 876.234375, "completions/mean_terminated_length": 740.0346984863281, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.503, "grad_norm": 0.4763568639755249, "kl": 0.05035400390625, "learning_rate": 6.242714226373049e-07, "loss": 0.3321, "num_tokens": 80612372.0, "reward": 0.8484911322593689, "reward_std": 0.11966166459023952, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19963619112968445, "rewards/penalized_accuracy_reward/std": 0.00010056952305603772, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.23922347277402878, "step": 1006 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 660.234375, "completions/mean_terminated_length": 660.234375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.5035, "grad_norm": 1.0684740543365479, 
"kl": 0.0479736328125, "learning_rate": 6.2349668300873e-07, "loss": 0.0606, "num_tokens": 80664931.0, "reward": 0.6227239668369293, "reward_std": 0.3049125224351883, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06233854591846466, "rewards/penalized_accuracy_reward/std": 0.14855001121759415, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1976.75, "completions/max_terminated_length": 1463.0, "completions/mean_length": 879.140625, "completions/mean_terminated_length": 747.3067474365234, "completions/min_length": 236.5, "completions/min_terminated_length": 236.5, "epoch": 0.504, "grad_norm": 0.489240437746048, "kl": 0.04608154296875, "learning_rate": 6.227217194965125e-07, "loss": 0.265, "num_tokens": 80729164.0, "reward": 0.47767114639282227, "reward_std": 0.27103443816304207, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024968386627733707, "rewards/penalized_accuracy_reward/std": 0.09987355396151543, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.23936587944626808, "step": 1008 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1578.75, "completions/max_terminated_length": 1562.0, "completions/mean_length": 861.890625, "completions/mean_terminated_length": 833.4464416503906, "completions/min_length": 305.5, "completions/min_terminated_length": 305.5, "epoch": 0.5045, "grad_norm": 0.46307143568992615, "kl": 0.039306640625, "learning_rate": 6.219465344613258e-07, "loss": 0.1577, "num_tokens": 80795525.0, "reward": 0.47265625, "reward_std": 0.053015146404504776, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, 
"rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.10603029653429985, "step": 1009 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1779.5, "completions/max_terminated_length": 1716.25, "completions/mean_length": 1141.34375, "completions/mean_terminated_length": 1081.8541870117188, "completions/min_length": 558.25, "completions/min_terminated_length": 558.25, "epoch": 0.505, "grad_norm": 0.21202026307582855, "kl": 0.033966064453125, "learning_rate": 6.211711302645177e-07, "loss": 0.0914, "num_tokens": 80880331.0, "reward": 0.4765625, "reward_std": 0.04554459825158119, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09108919650316238, "step": 1010 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1768.5, "completions/max_terminated_length": 1484.75, "completions/mean_length": 823.484375, "completions/mean_terminated_length": 767.4448089599609, "completions/min_length": 370.5, "completions/min_terminated_length": 370.5, "epoch": 0.5055, "grad_norm": 1.1861815452575684, "kl": 0.052978515625, "learning_rate": 6.203955092681039e-07, "loss": 0.2171, "num_tokens": 80943706.0, "reward": 0.6319992244243622, "reward_std": 0.24575938284397125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07478867471218109, "rewards/penalized_accuracy_reward/std": 0.09971825778484344, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.12082063034176826, "step": 1011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1840.25, "completions/max_terminated_length": 1224.5, "completions/mean_length": 809.015625, "completions/mean_terminated_length": 643.985107421875, 
"completions/min_length": 290.25, "completions/min_terminated_length": 290.25, "epoch": 0.506, "grad_norm": 0.6069717407226562, "kl": 0.0645751953125, "learning_rate": 6.196196738347607e-07, "loss": 0.3365, "num_tokens": 81006187.0, "reward": 0.46635669469833374, "reward_std": 0.18976467661559582, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012475223280489445, "rewards/penalized_accuracy_reward/std": 0.04990089312195778, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2282046489417553, "step": 1012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1679.25, "completions/max_terminated_length": 1646.0, "completions/mean_length": 921.15625, "completions/mean_terminated_length": 883.5312652587891, "completions/min_length": 346.75, "completions/min_terminated_length": 346.75, "epoch": 0.5065, "grad_norm": 0.4143922030925751, "kl": 0.03680419921875, "learning_rate": 6.188436263278172e-07, "loss": 0.1375, "num_tokens": 81074565.0, "reward": 0.6221559643745422, "reward_std": 0.2648408003151417, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07474984973669052, "rewards/penalized_accuracy_reward/std": 0.09966651350259781, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1310155801475048, "step": 1013 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1682.75, "completions/max_terminated_length": 1192.5, "completions/mean_length": 693.546875, "completions/mean_terminated_length": 626.2460021972656, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.507, "grad_norm": 0.5346431136131287, "kl": 0.04443359375, "learning_rate": 6.180673691112486e-07, "loss": 0.18, "num_tokens": 81129752.0, "reward": 0.6052153259515762, "reward_std": 0.23570768907666206, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062373287975788116, "rewards/penalized_accuracy_reward/std": 0.09554865956306458, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11366254836320877, "step": 1014 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2036.75, "completions/max_terminated_length": 1547.5, "completions/mean_length": 924.6875, "completions/mean_terminated_length": 810.7823181152344, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "epoch": 0.5075, "grad_norm": 0.4464888870716095, "kl": 0.039276123046875, "learning_rate": 6.172909045496694e-07, "loss": 0.271, "num_tokens": 81198292.0, "reward": 0.735695481300354, "reward_std": 0.27523817867040634, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.137378990650177, "rewards/penalized_accuracy_reward/std": 0.09565847367048264, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18607009947299957, "step": 1015 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1821.0, "completions/max_terminated_length": 1504.5, "completions/mean_length": 1060.875, "completions/mean_terminated_length": 982.2785949707031, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.508, "grad_norm": 0.28564634919166565, "kl": 0.0335693359375, "learning_rate": 6.165142350083249e-07, "loss": 0.1901, "num_tokens": 81273580.0, "reward": 0.451171875, "reward_std": 0.09639623202383518, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.19279246404767036, "step": 1016 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.078125, "completions/max_length": 1653.0, "completions/max_terminated_length": 1378.5, "completions/mean_length": 752.96875, "completions/mean_terminated_length": 643.9771728515625, "completions/min_length": 275.75, "completions/min_terminated_length": 275.75, "epoch": 0.5085, "grad_norm": 0.38409364223480225, "kl": 0.0513916015625, "learning_rate": 6.157373628530852e-07, "loss": 0.2092, "num_tokens": 81330394.0, "reward": 0.48588642477989197, "reward_std": 0.16596362739801407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01247446145862341, "rewards/penalized_accuracy_reward/std": 0.049897849559783936, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18529859744012356, "step": 1017 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1858.75, "completions/max_terminated_length": 1522.25, "completions/mean_length": 911.28125, "completions/mean_terminated_length": 747.8810577392578, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "epoch": 0.509, "grad_norm": 0.5492093563079834, "kl": 0.057861328125, "learning_rate": 6.149602904504378e-07, "loss": 0.2747, "num_tokens": 81399724.0, "reward": 0.5085500478744507, "reward_std": 0.26220937073230743, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747814521193504, "rewards/penalized_accuracy_reward/std": 0.08057559281587601, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.20211635529994965, "step": 1018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1737.0, "completions/max_terminated_length": 1481.5, "completions/mean_length": 823.0, "completions/mean_terminated_length": 661.9130096435547, "completions/min_length": 226.5, "completions/min_terminated_length": 226.5, "epoch": 0.5095, "grad_norm": 0.49711158871650696, "kl": 
0.05987548828125, "learning_rate": 6.141830201674802e-07, "loss": 0.3011, "num_tokens": 81460812.0, "reward": 0.4820139706134796, "reward_std": 0.1806828584522009, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491359375417233, "rewards/penalized_accuracy_reward/std": 0.04996544122695923, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17548329010605812, "step": 1019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1749.25, "completions/mean_length": 1303.59375, "completions/mean_terminated_length": 988.5364074707031, "completions/min_length": 305.25, "completions/min_terminated_length": 305.25, "epoch": 0.51, "grad_norm": 0.27147728204727173, "kl": 0.0389404296875, "learning_rate": 6.134055543719121e-07, "loss": 0.2179, "num_tokens": 81552882.0, "reward": 0.6768075823783875, "reward_std": 0.4678993225097656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13723190873861313, "rewards/penalized_accuracy_reward/std": 0.18360642343759537, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2740439847111702, "step": 1020 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1756.25, "completions/max_terminated_length": 1585.0, "completions/mean_length": 976.234375, "completions/mean_terminated_length": 870.0029907226562, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.5105, "grad_norm": 0.3212156295776367, "kl": 0.04315185546875, "learning_rate": 6.126278954320294e-07, "loss": 0.1219, "num_tokens": 81623585.0, "reward": 0.6086461842060089, "reward_std": 0.3702331930398941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07483090460300446, 
"rewards/penalized_accuracy_reward/std": 0.1608816385269165, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19477099925279617, "step": 1021 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.25, "completions/mean_length": 1097.796875, "completions/mean_terminated_length": 895.7113342285156, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.511, "grad_norm": 0.32780998945236206, "kl": 0.0379638671875, "learning_rate": 6.118500457167159e-07, "loss": 0.1816, "num_tokens": 81702852.0, "reward": 0.854115828871727, "reward_std": 0.6633635386824608, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2122141644358635, "rewards/penalized_accuracy_reward/std": 0.2915225028991699, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2826947495341301, "step": 1022 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1976.0, "completions/max_terminated_length": 1722.75, "completions/mean_length": 995.75, "completions/mean_terminated_length": 930.8368072509766, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.5115, "grad_norm": 0.3102157413959503, "kl": 0.032958984375, "learning_rate": 6.11072007595437e-07, "loss": 0.1781, "num_tokens": 81774484.0, "reward": 0.5206457376480103, "reward_std": 0.18851526454091072, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024971304461359978, "rewards/penalized_accuracy_reward/std": 0.06823459267616272, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.14620652049779892, "step": 1023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1590.0, 
"completions/mean_length": 901.390625, "completions/mean_terminated_length": 756.1964569091797, "completions/min_length": 302.5, "completions/min_terminated_length": 302.5, "epoch": 0.512, "grad_norm": 0.9471465945243835, "kl": 0.0416259765625, "learning_rate": 6.102937834382315e-07, "loss": 0.3959, "num_tokens": 81841389.0, "reward": 0.5338646769523621, "reward_std": 0.25055598840117455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744015470147133, "rewards/penalized_accuracy_reward/std": 0.08049391955137253, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22413434088230133, "step": 1024 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1844.25, "completions/max_terminated_length": 1386.0, "completions/mean_length": 809.515625, "completions/mean_terminated_length": 728.5266571044922, "completions/min_length": 318.5, "completions/min_terminated_length": 318.5, "epoch": 0.5125, "grad_norm": 0.5600014925003052, "kl": 0.0540771484375, "learning_rate": 6.095153756157051e-07, "loss": 0.2334, "num_tokens": 81903166.0, "reward": 0.6146568655967712, "reward_std": 0.2773335184901953, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490655034780502, "rewards/penalized_accuracy_reward/std": 0.09987540543079376, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19571788981556892, "step": 1025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1309.25, "completions/max_terminated_length": 1160.25, "completions/mean_length": 572.171875, "completions/mean_terminated_length": 523.7812576293945, "completions/min_length": 285.5, "completions/min_terminated_length": 285.5, "epoch": 0.513, "grad_norm": 0.48976606130599976, "kl": 0.041534423828125, "learning_rate": 6.087367864990232e-07, "loss": 0.1651, "num_tokens": 
81948697.0, "reward": 0.6380780339241028, "reward_std": 0.22635666653513908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07489839941263199, "rewards/penalized_accuracy_reward/std": 0.09986460208892822, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.05325498431921005, "step": 1026 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1745.5, "completions/max_terminated_length": 1611.75, "completions/mean_length": 863.296875, "completions/mean_terminated_length": 789.2053680419922, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5135, "grad_norm": 0.3312412202358246, "kl": 0.068634033203125, "learning_rate": 6.079580184599032e-07, "loss": 0.128, "num_tokens": 82014684.0, "reward": 0.5206340402364731, "reward_std": 0.17665143683552742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024965457618236542, "rewards/penalized_accuracy_reward/std": 0.06821861863136292, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11384230479598045, "step": 1027 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1824.0, "completions/max_terminated_length": 1471.25, "completions/mean_length": 859.3125, "completions/mean_terminated_length": 795.0858764648438, "completions/min_length": 372.25, "completions/min_terminated_length": 372.25, "epoch": 0.514, "grad_norm": 0.646034300327301, "kl": 0.06982421875, "learning_rate": 6.071790738706078e-07, "loss": 0.2038, "num_tokens": 82078544.0, "reward": 0.7833283543586731, "reward_std": 0.24499286524951458, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16217198967933655, "rewards/penalized_accuracy_reward/std": 0.08045991510152817, "rewards/tag_count_reward/mean": 0.91796875, 
"rewards/tag_count_reward/std": 0.18758258782327175, "step": 1028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1608.0, "completions/max_terminated_length": 1284.75, "completions/mean_length": 740.984375, "completions/mean_terminated_length": 716.6958389282227, "completions/min_length": 322.5, "completions/min_terminated_length": 322.5, "epoch": 0.5145, "grad_norm": 0.48399367928504944, "kl": 0.04644775390625, "learning_rate": 6.06399955103937e-07, "loss": 0.2539, "num_tokens": 82134815.0, "reward": 0.6670586466789246, "reward_std": 0.22899428009986877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08743558079004288, "rewards/penalized_accuracy_reward/std": 0.10239406675100327, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.04841229319572449, "step": 1029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1731.0, "completions/max_terminated_length": 1474.75, "completions/mean_length": 959.234375, "completions/mean_terminated_length": 803.8251037597656, "completions/min_length": 327.75, "completions/min_terminated_length": 327.75, "epoch": 0.515, "grad_norm": 0.4440304636955261, "kl": 0.04986572265625, "learning_rate": 6.056206645332217e-07, "loss": 0.2928, "num_tokens": 82203038.0, "reward": 0.686919167637825, "reward_std": 0.43128650076687336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1247095838189125, "rewards/penalized_accuracy_reward/std": 0.17124565690755844, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2567436769604683, "step": 1030 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2021.25, "completions/max_terminated_length": 1581.0, "completions/mean_length": 948.28125, "completions/mean_terminated_length": 894.2156600952148, 
"completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.5155, "grad_norm": 0.5111699104309082, "kl": 0.04351806640625, "learning_rate": 6.048412045323164e-07, "loss": 0.1683, "num_tokens": 82273904.0, "reward": 0.5533183217048645, "reward_std": 0.28175486996769905, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037401347421109676, "rewards/penalized_accuracy_reward/std": 0.11803251877427101, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1391897313296795, "step": 1031 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1840.75, "completions/max_terminated_length": 1353.75, "completions/mean_length": 800.59375, "completions/mean_terminated_length": 713.6821594238281, "completions/min_length": 343.75, "completions/min_terminated_length": 343.75, "epoch": 0.516, "grad_norm": 0.49093174934387207, "kl": 0.045562744140625, "learning_rate": 6.040615774755911e-07, "loss": 0.3117, "num_tokens": 82332854.0, "reward": 0.462890625, "reward_std": 0.08917871303856373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17835742980241776, "step": 1032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1557.75, "completions/max_terminated_length": 1262.0, "completions/mean_length": 677.890625, "completions/mean_terminated_length": 632.8698120117188, "completions/min_length": 271.75, "completions/min_terminated_length": 271.75, "epoch": 0.5165, "grad_norm": 1.0800665616989136, "kl": 0.067474365234375, "learning_rate": 6.032817857379256e-07, "loss": 0.0998, "num_tokens": 82385103.0, "reward": 0.7569123208522797, "reward_std": 0.3258417621254921, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372452238574624, "rewards/penalized_accuracy_reward/std": 0.1497177630662918, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11707578226923943, "step": 1033 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1795.75, "completions/max_terminated_length": 1526.5, "completions/mean_length": 1011.609375, "completions/mean_terminated_length": 879.2354583740234, "completions/min_length": 468.25, "completions/min_terminated_length": 468.25, "epoch": 0.517, "grad_norm": 0.33932963013648987, "kl": 0.038116455078125, "learning_rate": 6.025018316946999e-07, "loss": 0.1619, "num_tokens": 82460182.0, "reward": 0.7049674093723297, "reward_std": 0.27721283212304115, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12494462728500366, "rewards/penalized_accuracy_reward/std": 0.099955715239048, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18350879102945328, "step": 1034 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1657.75, "completions/max_terminated_length": 1350.0, "completions/mean_length": 835.859375, "completions/mean_terminated_length": 679.9009094238281, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5175, "grad_norm": 0.4812312126159668, "kl": 0.0506591796875, "learning_rate": 6.017217177217899e-07, "loss": 0.3403, "num_tokens": 82522285.0, "reward": 0.8046576976776123, "reward_std": 0.47618842124938965, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17478980123996735, "rewards/penalized_accuracy_reward/std": 0.20305103808641434, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17557398229837418, "step": 1035 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.09375, "completions/max_length": 1727.5, "completions/max_terminated_length": 1280.0, "completions/mean_length": 887.984375, "completions/mean_terminated_length": 783.4998321533203, "completions/min_length": 331.75, "completions/min_terminated_length": 331.75, "epoch": 0.518, "grad_norm": 0.5085933208465576, "kl": 0.04815673828125, "learning_rate": 6.009414461955581e-07, "loss": 0.3071, "num_tokens": 82591676.0, "reward": 0.8586592674255371, "reward_std": 0.08958576271834318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19983746111392975, "rewards/penalized_accuracy_reward/std": 4.986653584637679e-05, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1789720468223095, "step": 1036 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1761.25, "completions/max_terminated_length": 1507.5, "completions/mean_length": 745.40625, "completions/mean_terminated_length": 706.9458465576172, "completions/min_length": 276.75, "completions/min_terminated_length": 276.75, "epoch": 0.5185, "grad_norm": 0.43805640935897827, "kl": 0.03973388671875, "learning_rate": 6.001610194928464e-07, "loss": 0.2449, "num_tokens": 82648182.0, "reward": 0.5705551654100418, "reward_std": 0.2513638585805893, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049926020205020905, "rewards/penalized_accuracy_reward/std": 0.0893104076385498, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17097627744078636, "step": 1037 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1462.0, "completions/max_terminated_length": 1318.25, "completions/mean_length": 719.296875, "completions/mean_terminated_length": 659.8810119628906, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.519, "grad_norm": 0.3461098372936249, 
"kl": 0.03857421875, "learning_rate": 5.993804399909703e-07, "loss": 0.0501, "num_tokens": 82703353.0, "reward": 0.9319495260715485, "reward_std": 0.2851938307285309, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22476381435990334, "rewards/penalized_accuracy_reward/std": 0.13050272688269615, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08212131634354591, "step": 1038 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1793.75, "completions/max_terminated_length": 1586.75, "completions/mean_length": 995.640625, "completions/mean_terminated_length": 840.4805450439453, "completions/min_length": 376.25, "completions/min_terminated_length": 376.25, "epoch": 0.5195, "grad_norm": 0.26563042402267456, "kl": 0.047515869140625, "learning_rate": 5.985997100677103e-07, "loss": 0.1908, "num_tokens": 82780738.0, "reward": 0.583345927298069, "reward_std": 0.35532718896865845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2099173739552498, "step": 1039 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1360.0, "completions/max_terminated_length": 1335.25, "completions/mean_length": 773.890625, "completions/mean_terminated_length": 741.1317138671875, "completions/min_length": 336.25, "completions/min_terminated_length": 336.25, "epoch": 0.52, "grad_norm": 0.35447490215301514, "kl": 0.033935546875, "learning_rate": 5.97818832101305e-07, "loss": 0.1312, "num_tokens": 82837691.0, "reward": 0.8775034546852112, "reward_std": 0.4365996718406677, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19949393719434738, "rewards/penalized_accuracy_reward/std":
0.19951672106981277, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.08734130859375, "step": 1040 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1934.0, "completions/max_terminated_length": 1563.75, "completions/mean_length": 814.59375, "completions/mean_terminated_length": 754.5010757446289, "completions/min_length": 294.25, "completions/min_terminated_length": 294.25, "epoch": 0.5205, "grad_norm": 0.5564824938774109, "kl": 0.051971435546875, "learning_rate": 5.97037808470444e-07, "loss": 0.267, "num_tokens": 82898321.0, "reward": 0.545516699552536, "reward_std": 0.2791803181171417, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740679379552603, "rewards/penalized_accuracy_reward/std": 0.1180284135043621, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1440858170390129, "step": 1041 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1622.0, "completions/max_terminated_length": 1294.5, "completions/mean_length": 775.890625, "completions/mean_terminated_length": 725.6489715576172, "completions/min_length": 359.75, "completions/min_terminated_length": 359.75, "epoch": 0.521, "grad_norm": 0.6196043491363525, "kl": 0.0582275390625, "learning_rate": 5.96256641554261e-07, "loss": 0.1756, "num_tokens": 82956954.0, "reward": 0.6954045295715332, "reward_std": 0.26444656029343605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123507097363472, "rewards/penalized_accuracy_reward/std": 0.10233353823423386, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11955896764993668, "step": 1042 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1944.0, "completions/max_terminated_length": 1542.25, "completions/mean_length": 869.625, 
"completions/mean_terminated_length": 815.7930297851562, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.5215, "grad_norm": 0.4921649098396301, "kl": 0.04052734375, "learning_rate": 5.954753337323259e-07, "loss": 0.1676, "num_tokens": 83023042.0, "reward": 0.5284777283668518, "reward_std": 0.1822870373725891, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024981049820780754, "rewards/penalized_accuracy_reward/std": 0.06826122850179672, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.129237312823534, "step": 1043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1727.75, "completions/max_terminated_length": 1389.75, "completions/mean_length": 938.453125, "completions/mean_terminated_length": 797.6973571777344, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.522, "grad_norm": 0.29937979578971863, "kl": 0.044647216796875, "learning_rate": 5.946938873846375e-07, "loss": 0.1629, "num_tokens": 83094527.0, "reward": 0.5490530282258987, "reward_std": 0.32867421582341194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04991713725030422, "rewards/penalized_accuracy_reward/std": 0.13639959692955017, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.19058910757303238, "step": 1044 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1313.5, "completions/max_terminated_length": 1214.5, "completions/mean_length": 646.546875, "completions/mean_terminated_length": 606.2745666503906, "completions/min_length": 235.75, "completions/min_terminated_length": 235.75, "epoch": 0.5225, "grad_norm": 0.5957460999488831, "kl": 0.0533447265625, "learning_rate": 5.939123048916173e-07, "loss": 0.1217, "num_tokens": 83143650.0, "reward": 0.6992026716470718, 
"reward_std": 0.35952380672097206, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1122966492548585, "rewards/penalized_accuracy_reward/std": 0.1530277244746685, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13370942324399948, "step": 1045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1733.75, "completions/max_terminated_length": 1622.25, "completions/mean_length": 979.21875, "completions/mean_terminated_length": 823.4026947021484, "completions/min_length": 386.25, "completions/min_terminated_length": 386.25, "epoch": 0.523, "grad_norm": 3.273259401321411, "kl": 0.09503173828125, "learning_rate": 5.931305886341008e-07, "loss": 0.2156, "num_tokens": 83214816.0, "reward": 0.5430737137794495, "reward_std": 0.343471247702837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049857172183692455, "rewards/penalized_accuracy_reward/std": 0.13029048964381218, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.21845246106386185, "step": 1046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1833.25, "completions/max_terminated_length": 1363.25, "completions/mean_length": 895.65625, "completions/mean_terminated_length": 781.4665679931641, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.5235, "grad_norm": 0.48037704825401306, "kl": 0.049591064453125, "learning_rate": 5.923487409933315e-07, "loss": 0.3721, "num_tokens": 83281050.0, "reward": 0.451171875, "reward_std": 0.11184331029653549, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22368662804365158, "step": 1047 }, { "clip_ratio": 
0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1583.0, "completions/max_terminated_length": 1198.5, "completions/mean_length": 793.046875, "completions/mean_terminated_length": 667.8617935180664, "completions/min_length": 361.75, "completions/min_terminated_length": 361.75, "epoch": 0.524, "grad_norm": 0.36380621790885925, "kl": 0.032684326171875, "learning_rate": 5.915667643509528e-07, "loss": 0.2903, "num_tokens": 83339821.0, "reward": 0.8064375221729279, "reward_std": 0.46874840557575226, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17470313608646393, "rewards/penalized_accuracy_reward/std": 0.20459141582250595, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17852510511875153, "step": 1048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1396.25, "completions/mean_length": 998.609375, "completions/mean_terminated_length": 834.0420074462891, "completions/min_length": 354.25, "completions/min_terminated_length": 354.25, "epoch": 0.5245, "grad_norm": 0.4777292013168335, "kl": 0.0660400390625, "learning_rate": 5.907846610890011e-07, "loss": 0.2443, "num_tokens": 83415604.0, "reward": 0.5221187025308609, "reward_std": 0.3000098243355751, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03742653597146273, "rewards/penalized_accuracy_reward/std": 0.11806191131472588, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23014883697032928, "step": 1049 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1887.25, "completions/max_terminated_length": 1538.0, "completions/mean_length": 851.015625, "completions/mean_terminated_length": 791.9406585693359, "completions/min_length": 419.25, "completions/min_terminated_length": 419.25, "epoch": 0.525, 
"grad_norm": 0.533278226852417, "kl": 0.05487060546875, "learning_rate": 5.900024335898987e-07, "loss": 0.3007, "num_tokens": 83479861.0, "reward": 0.5074072927236557, "reward_std": 0.1702541708946228, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249270886182785, "rewards/penalized_accuracy_reward/std": 0.0499708317220211, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.140625, "step": 1050 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1424.25, "completions/max_terminated_length": 1183.5, "completions/mean_length": 683.3125, "completions/mean_terminated_length": 660.9239654541016, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.5255, "grad_norm": 0.364460825920105, "kl": 0.039154052734375, "learning_rate": 5.892200842364462e-07, "loss": 0.1058, "num_tokens": 83533305.0, "reward": 0.7208813726902008, "reward_std": 0.2203705906867981, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123938113451004, "rewards/penalized_accuracy_reward/std": 0.10237281024456024, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1051 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1492.5, "completions/mean_length": 1064.3125, "completions/mean_terminated_length": 890.5312194824219, "completions/min_length": 379.5, "completions/min_terminated_length": 379.5, "epoch": 0.526, "grad_norm": 0.41949647665023804, "kl": 0.04974365234375, "learning_rate": 5.884376154118154e-07, "loss": 0.2418, "num_tokens": 83608829.0, "reward": 0.48551127314567566, "reward_std": 0.23575543239712715, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498219721019268, 
"rewards/penalized_accuracy_reward/std": 0.06826436519622803, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.24631675332784653, "step": 1052 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1514.5, "completions/max_terminated_length": 1511.5, "completions/mean_length": 697.953125, "completions/mean_terminated_length": 678.8875122070312, "completions/min_length": 251.5, "completions/min_terminated_length": 251.5, "epoch": 0.5265, "grad_norm": 0.5038074254989624, "kl": 0.059112548828125, "learning_rate": 5.87655029499542e-07, "loss": -0.0026, "num_tokens": 83663962.0, "reward": 0.7687188982963562, "reward_std": 0.4865109473466873, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13728913571685553, "rewards/penalized_accuracy_reward/std": 0.24114472791552544, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1053 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1770.25, "completions/max_terminated_length": 1589.0, "completions/mean_length": 901.171875, "completions/mean_terminated_length": 802.4747772216797, "completions/min_length": 301.5, "completions/min_terminated_length": 301.5, "epoch": 0.527, "grad_norm": 0.4734256863594055, "kl": 0.041290283203125, "learning_rate": 5.868723288835184e-07, "loss": 0.2323, "num_tokens": 83730117.0, "reward": 0.6049368977546692, "reward_std": 0.26919882744550705, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.074929378926754, "rewards/penalized_accuracy_reward/std": 0.0999058410525322, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18999560549855232, "step": 1054 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.5, 
"completions/mean_length": 1164.359375, "completions/mean_terminated_length": 986.0111083984375, "completions/min_length": 283.25, "completions/min_terminated_length": 283.25, "epoch": 0.5275, "grad_norm": 0.3931572735309601, "kl": 0.0560302734375, "learning_rate": 5.860895159479864e-07, "loss": 0.3262, "num_tokens": 83819308.0, "reward": 0.435546875, "reward_std": 0.12847079150378704, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.256941594183445, "step": 1055 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1527.75, "completions/mean_length": 1029.1875, "completions/mean_terminated_length": 923.6190795898438, "completions/min_length": 387.5, "completions/min_terminated_length": 387.5, "epoch": 0.528, "grad_norm": 0.3830527365207672, "kl": 0.04010009765625, "learning_rate": 5.853065930775303e-07, "loss": 0.2933, "num_tokens": 83896696.0, "reward": 0.4609375, "reward_std": 0.10378459095954895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2075691893696785, "step": 1056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1517.5, "completions/max_terminated_length": 1387.25, "completions/mean_length": 685.0625, "completions/mean_terminated_length": 668.4843902587891, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.5285, "grad_norm": 0.4389186501502991, "kl": 0.05438232421875, "learning_rate": 5.845235626570683e-07, "loss": 0.1037, "num_tokens": 83947772.0, "reward": 0.6131372302770615, "reward_std": 0.22479701042175293, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242798641324043, "rewards/penalized_accuracy_reward/std": 0.095632404088974, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07966229319572449, "step": 1057 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1808.5, "completions/max_terminated_length": 1528.25, "completions/mean_length": 878.921875, "completions/mean_terminated_length": 824.3726348876953, "completions/min_length": 317.75, "completions/min_terminated_length": 317.75, "epoch": 0.529, "grad_norm": 0.4148883819580078, "kl": 0.035400390625, "learning_rate": 5.837404270718475e-07, "loss": 0.1571, "num_tokens": 84015495.0, "reward": 1.0550464689731598, "reward_std": 0.39023883640766144, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2872888594865799, "rewards/penalized_accuracy_reward/std": 0.17067573219537735, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12213464826345444, "step": 1058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1795.25, "completions/max_terminated_length": 1565.25, "completions/mean_length": 942.03125, "completions/mean_terminated_length": 898.25, "completions/min_length": 390.25, "completions/min_terminated_length": 390.25, "epoch": 0.5295, "grad_norm": 0.3065997064113617, "kl": 0.034942626953125, "learning_rate": 5.829571887074343e-07, "loss": 0.0996, "num_tokens": 84084201.0, "reward": 0.6493048220872879, "reward_std": 0.5104449391365051, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08734771888703108, "rewards/penalized_accuracy_reward/std": 0.24851710349321365, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.10198230296373367, "step": 1059 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.21875, "completions/max_length": 1861.5, "completions/max_terminated_length": 1580.25, "completions/mean_length": 1126.75, "completions/mean_terminated_length": 891.7890014648438, "completions/min_length": 335.75, "completions/min_terminated_length": 335.75, "epoch": 0.53, "grad_norm": 0.43033701181411743, "kl": 0.0343017578125, "learning_rate": 5.821738499497086e-07, "loss": 0.2141, "num_tokens": 84165145.0, "reward": 0.421875, "reward_std": 0.11409684270620346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.22819368727505207, "step": 1060 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1970.5, "completions/max_terminated_length": 1507.25, "completions/mean_length": 964.78125, "completions/mean_terminated_length": 888.2218933105469, "completions/min_length": 448.25, "completions/min_terminated_length": 448.25, "epoch": 0.5305, "grad_norm": 0.527501106262207, "kl": 0.03375244140625, "learning_rate": 5.813904131848564e-07, "loss": 0.3101, "num_tokens": 84235243.0, "reward": 0.466796875, "reward_std": 0.08350180089473724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1670036017894745, "step": 1061 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1829.5, "completions/max_terminated_length": 1536.0, "completions/mean_length": 914.96875, "completions/mean_terminated_length": 782.0669708251953, "completions/min_length": 362.75, "completions/min_terminated_length": 362.75, "epoch": 0.531, "grad_norm": 0.44778522849082947, "kl": 0.0380859375, "learning_rate": 5.806068807993617e-07, 
"loss": 0.2674, "num_tokens": 84301513.0, "reward": 0.5030737519264221, "reward_std": 0.2393711842596531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497437782585621, "rewards/penalized_accuracy_reward/std": 0.06824300438165665, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.23222409188747406, "step": 1062 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1952.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 783.75, "completions/mean_terminated_length": 696.7294921875, "completions/min_length": 377.75, "completions/min_terminated_length": 377.75, "epoch": 0.5315, "grad_norm": 0.4983115792274475, "kl": 0.037994384765625, "learning_rate": 5.798232551800002e-07, "loss": 0.3243, "num_tokens": 84361321.0, "reward": 0.466796875, "reward_std": 0.09874238260090351, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19748477265238762, "step": 1063 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 693.4375, "completions/mean_terminated_length": 693.4375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.532, "grad_norm": 2.213775157928467, "kl": 0.06610107421875, "learning_rate": 5.790395387138311e-07, "loss": 0.0586, "num_tokens": 84415845.0, "reward": 0.6226582676172256, "reward_std": 0.1880398988723755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062305696308612823, "rewards/penalized_accuracy_reward/std": 0.09544512629508972, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 
0.015625, "step": 1064 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1817.75, "completions/max_terminated_length": 1524.25, "completions/mean_length": 908.265625, "completions/mean_terminated_length": 794.6323089599609, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "epoch": 0.5325, "grad_norm": 0.4661436378955841, "kl": 0.036773681640625, "learning_rate": 5.78255733788191e-07, "loss": 0.2212, "num_tokens": 84485238.0, "reward": 0.6756826341152191, "reward_std": 0.2743220627307892, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11225538700819016, "rewards/penalized_accuracy_reward/std": 0.10224680602550507, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.15856516361236572, "step": 1065 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1580.75, "completions/max_terminated_length": 1096.75, "completions/mean_length": 833.109375, "completions/mean_terminated_length": 601.7379608154297, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.533, "grad_norm": 0.3329556882381439, "kl": 0.069427490234375, "learning_rate": 5.774718427906856e-07, "loss": 0.1698, "num_tokens": 84547853.0, "reward": 0.4683457016944885, "reward_std": 0.16398588195443153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249315869063139, "rewards/penalized_accuracy_reward/std": 0.04997263476252556, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.1665133461356163, "step": 1066 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2038.75, "completions/max_terminated_length": 1920.5, "completions/mean_length": 1020.296875, "completions/mean_terminated_length": 965.9854583740234, "completions/min_length": 461.25, 
"completions/min_terminated_length": 461.25, "epoch": 0.5335, "grad_norm": 0.43187910318374634, "kl": 0.045684814453125, "learning_rate": 5.766878681091828e-07, "loss": 0.1799, "num_tokens": 84621760.0, "reward": 0.541705995798111, "reward_std": 0.2401275746524334, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745456412434578, "rewards/penalized_accuracy_reward/std": 0.08052489906549454, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19872219488024712, "step": 1067 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1880.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 760.984375, "completions/mean_terminated_length": 603.2181701660156, "completions/min_length": 244.5, "completions/min_terminated_length": 244.5, "epoch": 0.534, "grad_norm": 0.5134800672531128, "kl": 0.058807373046875, "learning_rate": 5.759038121318052e-07, "loss": 0.4672, "num_tokens": 84678255.0, "reward": 0.7966319918632507, "reward_std": 0.49270331114530563, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17468317598104477, "rewards/penalized_accuracy_reward/std": 0.20291051268577576, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.22746118158102036, "step": 1068 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1815.5, "completions/mean_length": 1161.34375, "completions/mean_terminated_length": 935.9128570556641, "completions/min_length": 407.5, "completions/min_terminated_length": 407.5, "epoch": 0.5345, "grad_norm": 0.2872561514377594, "kl": 0.03948974609375, "learning_rate": 5.751196772469237e-07, "loss": 0.2129, "num_tokens": 84759221.0, "reward": 0.690989300608635, "reward_std": 0.5529280118644238, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12479152530431747, "rewards/penalized_accuracy_reward/std": 0.24423116445541382, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.26246585696935654, "step": 1069 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1793.75, "completions/max_terminated_length": 1504.5, "completions/mean_length": 762.984375, "completions/mean_terminated_length": 701.7229461669922, "completions/min_length": 304.75, "completions/min_terminated_length": 304.75, "epoch": 0.535, "grad_norm": 0.485596239566803, "kl": 0.0528564453125, "learning_rate": 5.743354658431489e-07, "loss": 0.2519, "num_tokens": 84819908.0, "reward": 0.5955927819013596, "reward_std": 0.3267718181014061, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06244483403861523, "rewards/penalized_accuracy_reward/std": 0.13931722566485405, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1826503686606884, "step": 1070 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1827.75, "completions/max_terminated_length": 1345.25, "completions/mean_length": 956.53125, "completions/mean_terminated_length": 802.4500579833984, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.5355, "grad_norm": 0.42547404766082764, "kl": 0.0455322265625, "learning_rate": 5.735511803093248e-07, "loss": 0.3017, "num_tokens": 84893286.0, "reward": 0.44140625, "reward_std": 0.10661093704402447, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.21322187408804893, "step": 1071 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 
1628.25, "completions/max_terminated_length": 1549.25, "completions/mean_length": 977.0, "completions/mean_terminated_length": 895.5662536621094, "completions/min_length": 394.75, "completions/min_terminated_length": 394.75, "epoch": 0.536, "grad_norm": 0.4596778154373169, "kl": 0.06378173828125, "learning_rate": 5.727668230345209e-07, "loss": 0.1606, "num_tokens": 84965286.0, "reward": 1.0561996102333069, "reward_std": 0.6401895880699158, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.29958418384194374, "rewards/penalized_accuracy_reward/std": 0.29435475915670395, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.16386458277702332, "step": 1072 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1881.5, "completions/max_terminated_length": 1627.5, "completions/mean_length": 892.046875, "completions/mean_terminated_length": 837.1823272705078, "completions/min_length": 381.75, "completions/min_terminated_length": 381.75, "epoch": 0.5365, "grad_norm": 0.40621864795684814, "kl": 0.034088134765625, "learning_rate": 5.71982396408026e-07, "loss": 0.1804, "num_tokens": 85030633.0, "reward": 0.6822061985731125, "reward_std": 0.37445096485316753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09989217296242714, "rewards/penalized_accuracy_reward/std": 0.16814497113227844, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10219132527709007, "step": 1073 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1488.5, "completions/mean_length": 1239.71875, "completions/mean_terminated_length": 904.9788208007812, "completions/min_length": 392.5, "completions/min_terminated_length": 392.5, "epoch": 0.537, "grad_norm": 0.34504953026771545, "kl": 0.06640625, "learning_rate": 
5.711979028193391e-07, "loss": 0.3902, "num_tokens": 85124535.0, "reward": 0.40625, "reward_std": 0.1433923840522766, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8125, "rewards/tag_count_reward/std": 0.2867847681045532, "step": 1074 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1936.75, "completions/max_terminated_length": 1693.25, "completions/mean_length": 1005.140625, "completions/mean_terminated_length": 865.1789398193359, "completions/min_length": 394.5, "completions/min_terminated_length": 394.5, "epoch": 0.5375, "grad_norm": 0.40698811411857605, "kl": 0.057373046875, "learning_rate": 5.704133446581642e-07, "loss": 0.2431, "num_tokens": 85200528.0, "reward": 0.4990599751472473, "reward_std": 0.20871040225028992, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024920612573623657, "rewards/penalized_accuracy_reward/std": 0.0680960863828659, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.1884649395942688, "step": 1075 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1709.75, "completions/max_terminated_length": 1654.0, "completions/mean_length": 861.09375, "completions/mean_terminated_length": 785.9407196044922, "completions/min_length": 351.5, "completions/min_terminated_length": 351.5, "epoch": 0.538, "grad_norm": 0.3586120903491974, "kl": 0.044769287109375, "learning_rate": 5.696287243144012e-07, "loss": 0.1297, "num_tokens": 85264902.0, "reward": 0.470703125, "reward_std": 0.07116112858057022, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 
0.14232225716114044, "step": 1076 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1672.25, "completions/max_terminated_length": 1363.75, "completions/mean_length": 943.875, "completions/mean_terminated_length": 848.0037536621094, "completions/min_length": 460.5, "completions/min_terminated_length": 460.5, "epoch": 0.5385, "grad_norm": 0.37890002131462097, "kl": 0.039154052734375, "learning_rate": 5.688440441781398e-07, "loss": 0.2404, "num_tokens": 85334174.0, "reward": 0.5088082849979401, "reward_std": 0.21530112251639366, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024911954998970032, "rewards/penalized_accuracy_reward/std": 0.06807243078947067, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.15831255912780762, "step": 1077 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1521.25, "completions/max_terminated_length": 1178.0, "completions/mean_length": 655.40625, "completions/mean_terminated_length": 612.2906341552734, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.539, "grad_norm": 0.3261450231075287, "kl": 0.049560546875, "learning_rate": 5.680593066396518e-07, "loss": 0.0579, "num_tokens": 85387976.0, "reward": 0.56315478682518, "reward_std": 0.24469464272260666, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037436772137880325, "rewards/penalized_accuracy_reward/std": 0.11811777204275131, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1078 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1343.0, "completions/max_terminated_length": 1247.5, "completions/mean_length": 912.078125, "completions/mean_terminated_length": 779.0738830566406, "completions/min_length": 368.5, 
"completions/min_terminated_length": 368.5, "epoch": 0.5395, "grad_norm": 0.1861879825592041, "kl": 0.0419921875, "learning_rate": 5.672745140893839e-07, "loss": 0.1112, "num_tokens": 85452781.0, "reward": 0.447265625, "reward_std": 0.07439808174967766, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1487961709499359, "step": 1079 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1316.25, "completions/max_terminated_length": 1264.25, "completions/mean_length": 695.875, "completions/mean_terminated_length": 629.4791717529297, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.54, "grad_norm": 0.4462956488132477, "kl": 0.0560302734375, "learning_rate": 5.664896689179504e-07, "loss": 0.0646, "num_tokens": 85505845.0, "reward": 0.8683021515607834, "reward_std": 0.5104341432452202, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.199776079505682, "rewards/penalized_accuracy_reward/std": 0.24539267644286156, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.12984948605298996, "step": 1080 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1872.75, "completions/max_terminated_length": 1811.75, "completions/mean_length": 1145.609375, "completions/mean_terminated_length": 959.489013671875, "completions/min_length": 492.75, "completions/min_terminated_length": 492.75, "epoch": 0.5405, "grad_norm": 0.33869361877441406, "kl": 0.044891357421875, "learning_rate": 5.657047735161255e-07, "loss": 0.1239, "num_tokens": 85588028.0, "reward": 0.6451361700892448, "reward_std": 0.3735292237251997, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.099911835975945, "rewards/penalized_accuracy_reward/std": 0.15234635770320892, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.18873781338334084, "step": 1081 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1725.25, "completions/max_terminated_length": 1694.0, "completions/mean_length": 1183.59375, "completions/mean_terminated_length": 1082.9042663574219, "completions/min_length": 584.25, "completions/min_terminated_length": 584.25, "epoch": 0.541, "grad_norm": 0.40582767128944397, "kl": 0.03411865234375, "learning_rate": 5.649198302748368e-07, "loss": 0.1733, "num_tokens": 85674802.0, "reward": 0.6175332963466644, "reward_std": 0.42836691066622734, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09978226944804192, "rewards/penalized_accuracy_reward/std": 0.17598001658916473, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.21080743707716465, "step": 1082 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1345.0, "completions/max_terminated_length": 1296.5, "completions/mean_length": 702.78125, "completions/mean_terminated_length": 662.5691986083984, "completions/min_length": 263.25, "completions/min_terminated_length": 263.25, "epoch": 0.5415, "grad_norm": 0.372941792011261, "kl": 0.044830322265625, "learning_rate": 5.641348415851577e-07, "loss": 0.1904, "num_tokens": 85728020.0, "reward": 0.466796875, "reward_std": 0.07202449440956116, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14404898881912231, "step": 1083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1819.0, 
"completions/max_terminated_length": 1445.0, "completions/mean_length": 911.328125, "completions/mean_terminated_length": 769.6363220214844, "completions/min_length": 346.5, "completions/min_terminated_length": 346.5, "epoch": 0.542, "grad_norm": 0.502940833568573, "kl": 0.08380126953125, "learning_rate": 5.633498098382998e-07, "loss": 0.3164, "num_tokens": 85797065.0, "reward": 0.5720036774873734, "reward_std": 0.34910082817077637, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06236902065575123, "rewards/penalized_accuracy_reward/std": 0.13917358964681625, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23384516686201096, "step": 1084 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1662.75, "completions/max_terminated_length": 1637.75, "completions/mean_length": 921.5, "completions/mean_terminated_length": 882.2223815917969, "completions/min_length": 383.25, "completions/min_terminated_length": 383.25, "epoch": 0.5425, "grad_norm": 0.3401668667793274, "kl": 0.03515625, "learning_rate": 5.625647374256061e-07, "loss": 0.1007, "num_tokens": 85863817.0, "reward": 0.8820998072624207, "reward_std": 0.0435695163832861, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19983896613121033, "rewards/penalized_accuracy_reward/std": 3.327928061480634e-05, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08700593188405037, "step": 1085 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1545.25, "completions/max_terminated_length": 1251.25, "completions/mean_length": 800.96875, "completions/mean_terminated_length": 736.8604354858398, "completions/min_length": 408.75, "completions/min_terminated_length": 408.75, "epoch": 0.543, "grad_norm": 8.88481330871582, "kl": 0.181640625, "learning_rate": 
5.617796267385429e-07, "loss": 0.1719, "num_tokens": 85926391.0, "reward": 0.470703125, "reward_std": 0.07305660098791122, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.14611320197582245, "step": 1086 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1625.0, "completions/max_terminated_length": 1320.5, "completions/mean_length": 850.734375, "completions/mean_terminated_length": 798.8738250732422, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.5435, "grad_norm": 0.4539920389652252, "kl": 0.045806884765625, "learning_rate": 5.60994480168694e-07, "loss": 0.096, "num_tokens": 85994262.0, "reward": 0.8317916095256805, "reward_std": 0.5017671585083008, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17468486540019512, "rewards/penalized_accuracy_reward/std": 0.24151184409856796, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1087 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1575.25, "completions/mean_length": 943.015625, "completions/mean_terminated_length": 830.0653991699219, "completions/min_length": 313.75, "completions/min_terminated_length": 313.75, "epoch": 0.544, "grad_norm": 0.43685808777809143, "kl": 0.0321044921875, "learning_rate": 5.602093001077517e-07, "loss": 0.2803, "num_tokens": 86061799.0, "reward": 0.6856830567121506, "reward_std": 0.4334903731942177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11237278580665588, "rewards/penalized_accuracy_reward/std": 0.18042617291212082, "rewards/tag_count_reward/mean": 
0.921875, "rewards/tag_count_reward/std": 0.2174222618341446, "step": 1088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1577.5, "completions/mean_length": 982.84375, "completions/mean_terminated_length": 816.6548309326172, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.5445, "grad_norm": 40.555946350097656, "kl": 0.469207763671875, "learning_rate": 5.594240889475106e-07, "loss": 0.254, "num_tokens": 86133837.0, "reward": 0.7449734210968018, "reward_std": 0.5123496651649475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14983045123517513, "rewards/penalized_accuracy_reward/std": 0.22039442509412766, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.25286252051591873, "step": 1089 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 732.015625, "completions/mean_terminated_length": 732.015625, "completions/min_length": 319.25, "completions/min_terminated_length": 319.25, "epoch": 0.545, "grad_norm": 0.3575473427772522, "kl": 0.0345458984375, "learning_rate": 5.586388490798604e-07, "loss": 0.0017, "num_tokens": 86187934.0, "reward": 0.5191216915845871, "reward_std": 0.11637212336063385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490533292293549, "rewards/penalized_accuracy_reward/std": 0.049962133169174194, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1090 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1961.5, "completions/max_terminated_length": 1705.25, "completions/mean_length": 1139.78125, "completions/mean_terminated_length": 1025.570556640625, "completions/min_length": 
393.25, "completions/min_terminated_length": 393.25, "epoch": 0.5455, "grad_norm": 0.30068615078926086, "kl": 0.048614501953125, "learning_rate": 5.578535828967777e-07, "loss": 0.151, "num_tokens": 86268544.0, "reward": 0.6238437443971634, "reward_std": 0.4090370498597622, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873124934732914, "rewards/penalized_accuracy_reward/std": 0.16364894062280655, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.20187422633171082, "step": 1091 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1988.0, "completions/max_terminated_length": 1505.75, "completions/mean_length": 905.796875, "completions/mean_terminated_length": 782.1533966064453, "completions/min_length": 278.5, "completions/min_terminated_length": 278.5, "epoch": 0.546, "grad_norm": 0.3442077338695526, "kl": 0.037078857421875, "learning_rate": 5.570682927903193e-07, "loss": 0.2871, "num_tokens": 86334915.0, "reward": 0.570491686463356, "reward_std": 0.24297036230564117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049894288182258606, "rewards/penalized_accuracy_reward/std": 0.08925361931324005, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16587430611252785, "step": 1092 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 1047.109375, "completions/mean_terminated_length": 669.8131408691406, "completions/min_length": 334.25, "completions/min_terminated_length": 334.25, "epoch": 0.5465, "grad_norm": 0.5245501399040222, "kl": 0.07427978515625, "learning_rate": 5.562829811526154e-07, "loss": 0.51, "num_tokens": 86409866.0, "reward": 0.404296875, "reward_std": 0.15397946164011955, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.3079589232802391, "step": 1093 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1858.25, "completions/max_terminated_length": 1275.5, "completions/mean_length": 872.90625, "completions/mean_terminated_length": 762.2485046386719, "completions/min_length": 277.25, "completions/min_terminated_length": 277.25, "epoch": 0.547, "grad_norm": 0.4514864981174469, "kl": 0.045745849609375, "learning_rate": 5.554976503758612e-07, "loss": 0.3464, "num_tokens": 86475508.0, "reward": 0.470703125, "reward_std": 0.08466683328151703, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16933366656303406, "step": 1094 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1699.75, "completions/max_terminated_length": 1578.75, "completions/mean_length": 781.765625, "completions/mean_terminated_length": 700.4396057128906, "completions/min_length": 324.25, "completions/min_terminated_length": 324.25, "epoch": 0.5475, "grad_norm": 0.3992753028869629, "kl": 0.048828125, "learning_rate": 5.547123028523106e-07, "loss": 0.2564, "num_tokens": 86536549.0, "reward": 0.8606709092855453, "reward_std": 0.3151169940829277, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19986671954393387, "rewards/penalized_accuracy_reward/std": 0.1365106776356697, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.16549166291952133, "step": 1095 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1483.0, 
"completions/max_terminated_length": 1318.75, "completions/mean_length": 876.609375, "completions/mean_terminated_length": 777.4020843505859, "completions/min_length": 390.75, "completions/min_terminated_length": 390.75, "epoch": 0.548, "grad_norm": 0.5523794889450073, "kl": 0.05169677734375, "learning_rate": 5.539269409742683e-07, "loss": 0.0692, "num_tokens": 86601500.0, "reward": 0.5779505968093872, "reward_std": 0.35267695784568787, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06241280771791935, "rewards/penalized_accuracy_reward/std": 0.14872678369283676, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.15223969146609306, "step": 1096 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1657.0, "completions/max_terminated_length": 1293.25, "completions/mean_length": 834.609375, "completions/mean_terminated_length": 674.8563995361328, "completions/min_length": 312.5, "completions/min_terminated_length": 312.5, "epoch": 0.5485, "grad_norm": 0.4781716465950012, "kl": 0.058349609375, "learning_rate": 5.531415671340826e-07, "loss": 0.1983, "num_tokens": 86663603.0, "reward": 0.47609366476535797, "reward_std": 0.1589040867984295, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012460894882678986, "rewards/penalized_accuracy_reward/std": 0.04984357953071594, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.17332791164517403, "step": 1097 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.5, "completions/mean_length": 974.046875, "completions/mean_terminated_length": 732.0838928222656, "completions/min_length": 295.5, "completions/min_terminated_length": 295.5, "epoch": 0.549, "grad_norm": 0.49400338530540466, "kl": 0.0528564453125, "learning_rate": 
5.523561837241387e-07, "loss": 0.4346, "num_tokens": 86736998.0, "reward": 0.5007654130458832, "reward_std": 0.28203877806663513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03749208152294159, "rewards/penalized_accuracy_reward/std": 0.08060554414987564, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2922305725514889, "step": 1098 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 1080.203125, "completions/mean_terminated_length": 783.3555603027344, "completions/min_length": 364.25, "completions/min_terminated_length": 364.25, "epoch": 0.5495, "grad_norm": 0.6898369789123535, "kl": 0.05596923828125, "learning_rate": 5.515707931368507e-07, "loss": 0.3416, "num_tokens": 86814435.0, "reward": 0.42924994230270386, "reward_std": 0.2101033255457878, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012476532720029354, "rewards/penalized_accuracy_reward/std": 0.049906134605407715, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.3044755198061466, "step": 1099 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1846.5, "completions/max_terminated_length": 1171.75, "completions/mean_length": 841.234375, "completions/mean_terminated_length": 725.6386871337891, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.55, "grad_norm": 0.42844030261039734, "kl": 0.065673828125, "learning_rate": 5.507853977646543e-07, "loss": 0.2609, "num_tokens": 86879794.0, "reward": 0.852687269449234, "reward_std": 0.5918524712324142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1997811235487461, "rewards/penalized_accuracy_reward/std": 
0.2722948342561722, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1967521719634533, "step": 1100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1777.75, "completions/max_terminated_length": 1494.75, "completions/mean_length": 859.625, "completions/mean_terminated_length": 767.8597412109375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.5505, "grad_norm": 0.579088568687439, "kl": 0.060791015625, "learning_rate": 5.5e-07, "loss": 0.0202, "num_tokens": 86943114.0, "reward": 0.6146662831306458, "reward_std": 0.37971943244338036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491127029061317, "rewards/penalized_accuracy_reward/std": 0.16105441004037857, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16787280701100826, "step": 1101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1784.5, "completions/max_terminated_length": 1300.0, "completions/mean_length": 816.296875, "completions/mean_terminated_length": 739.9113311767578, "completions/min_length": 397.5, "completions/min_terminated_length": 397.5, "epoch": 0.551, "grad_norm": 0.4062618017196655, "kl": 0.03228759765625, "learning_rate": 5.492146022353459e-07, "loss": 0.1633, "num_tokens": 87004893.0, "reward": 0.6762376427650452, "reward_std": 0.270545557141304, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09983757883310318, "rewards/penalized_accuracy_reward/std": 0.10311185568571091, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13996089063584805, "step": 1102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1956.75, "completions/max_terminated_length": 1525.0, "completions/mean_length": 883.21875, 
"completions/mean_terminated_length": 740.1090698242188, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.5515, "grad_norm": 0.48678460717201233, "kl": 0.04998779296875, "learning_rate": 5.484292068631494e-07, "loss": 0.3284, "num_tokens": 87072347.0, "reward": 0.5260766595602036, "reward_std": 0.24189393222332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037452392280101776, "rewards/penalized_accuracy_reward/std": 0.08052023500204086, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21197444200515747, "step": 1103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1911.5, "completions/max_terminated_length": 1454.5, "completions/mean_length": 1017.953125, "completions/mean_terminated_length": 874.9993438720703, "completions/min_length": 410.75, "completions/min_terminated_length": 410.75, "epoch": 0.552, "grad_norm": 0.43462786078453064, "kl": 0.0506591796875, "learning_rate": 5.476438162758611e-07, "loss": 0.2036, "num_tokens": 87150248.0, "reward": 0.524136021733284, "reward_std": 0.24378586187958717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037458635866642, "rewards/penalized_accuracy_reward/std": 0.08053364604711533, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.21973111480474472, "step": 1104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1927.25, "completions/max_terminated_length": 1366.75, "completions/mean_length": 924.578125, "completions/mean_terminated_length": 836.5236663818359, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.5525, "grad_norm": 0.4179590046405792, "kl": 0.040069580078125, "learning_rate": 5.468584328659172e-07, "loss": 0.3057, "num_tokens": 87219677.0, "reward": 0.5476016849279404, 
"reward_std": 0.28822851553559303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747271653264761, "rewards/penalized_accuracy_reward/std": 0.11823247745633125, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15024930611252785, "step": 1105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.25, "completions/mean_length": 1268.328125, "completions/mean_terminated_length": 1078.7191772460938, "completions/min_length": 578.75, "completions/min_terminated_length": 578.75, "epoch": 0.553, "grad_norm": 0.37763088941574097, "kl": 0.03253173828125, "learning_rate": 5.460730590257317e-07, "loss": 0.2909, "num_tokens": 87310658.0, "reward": 0.42578125, "reward_std": 0.13574975170195103, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.27149951830506325, "step": 1106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1775.75, "completions/max_terminated_length": 1364.75, "completions/mean_length": 802.15625, "completions/mean_terminated_length": 704.2221984863281, "completions/min_length": 321.75, "completions/min_terminated_length": 321.75, "epoch": 0.5535, "grad_norm": 0.4817950129508972, "kl": 0.04864501953125, "learning_rate": 5.452876971476896e-07, "loss": 0.2337, "num_tokens": 87374780.0, "reward": 0.6068057715892792, "reward_std": 0.2827882692217827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07488726079463959, "rewards/penalized_accuracy_reward/std": 0.09984982758760452, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2142927534878254, "step": 1107 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1683.75, "completions/max_terminated_length": 1535.5, "completions/mean_length": 900.015625, "completions/mean_terminated_length": 821.1875, "completions/min_length": 324.75, "completions/min_terminated_length": 324.75, "epoch": 0.554, "grad_norm": 0.3253597021102905, "kl": 0.03704833984375, "learning_rate": 5.445023496241388e-07, "loss": 0.1612, "num_tokens": 87439629.0, "reward": 0.7508525550365448, "reward_std": 0.2406102642416954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1371450126171112, "rewards/penalized_accuracy_reward/std": 0.09549558907747269, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09923820197582245, "step": 1108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1635.5, "completions/max_terminated_length": 1464.75, "completions/mean_length": 798.359375, "completions/mean_terminated_length": 748.6835174560547, "completions/min_length": 340.25, "completions/min_terminated_length": 340.25, "epoch": 0.5545, "grad_norm": 0.527878999710083, "kl": 0.04736328125, "learning_rate": 5.437170188473847e-07, "loss": 0.1226, "num_tokens": 87497972.0, "reward": 0.5284601449966431, "reward_std": 0.2007286287844181, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024972258135676384, "rewards/penalized_accuracy_reward/std": 0.06823720782995224, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 1109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.25, "completions/mean_length": 1055.75, "completions/mean_terminated_length": 782.0250244140625, "completions/min_length": 350.75, "completions/min_terminated_length": 350.75, "epoch": 0.555, 
"grad_norm": 0.3711378276348114, "kl": 0.042755126953125, "learning_rate": 5.429317072096807e-07, "loss": 0.324, "num_tokens": 87575732.0, "reward": 0.7388339638710022, "reward_std": 0.48944220319390297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623857356607914, "rewards/penalized_accuracy_reward/std": 0.19173723459243774, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.280843622982502, "step": 1110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1744.25, "completions/max_terminated_length": 1545.75, "completions/mean_length": 1052.78125, "completions/mean_terminated_length": 926.4295959472656, "completions/min_length": 509.5, "completions/min_terminated_length": 509.5, "epoch": 0.5555, "grad_norm": 0.4807211458683014, "kl": 0.043365478515625, "learning_rate": 5.421464171032224e-07, "loss": 0.1533, "num_tokens": 87652294.0, "reward": 0.47219105809926987, "reward_std": 0.184242632240057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012462716549634933, "rewards/penalized_accuracy_reward/std": 0.049850866198539734, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.22332160733640194, "step": 1111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1666.75, "completions/max_terminated_length": 1424.75, "completions/mean_length": 759.265625, "completions/mean_terminated_length": 737.2260437011719, "completions/min_length": 312.5, "completions/min_terminated_length": 312.5, "epoch": 0.556, "grad_norm": 0.37930887937545776, "kl": 0.04150390625, "learning_rate": 5.413611509201396e-07, "loss": 0.1089, "num_tokens": 87709223.0, "reward": 0.5628522634506226, "reward_std": 0.30068599060177803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.049980822019279, "rewards/penalized_accuracy_reward/std": 0.1305759809911251, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17445354536175728, "step": 1112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1535.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 763.5625, "completions/mean_terminated_length": 709.3787384033203, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.5565, "grad_norm": 0.5107172131538391, "kl": 0.0413818359375, "learning_rate": 5.405759110524894e-07, "loss": 0.1979, "num_tokens": 87766699.0, "reward": 0.466796875, "reward_std": 0.0835272278636694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1670544557273388, "step": 1113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1625.75, "completions/max_terminated_length": 1435.25, "completions/mean_length": 1036.09375, "completions/mean_terminated_length": 910.3560791015625, "completions/min_length": 416.75, "completions/min_terminated_length": 416.75, "epoch": 0.557, "grad_norm": 0.34856101870536804, "kl": 0.036895751953125, "learning_rate": 5.397906998922483e-07, "loss": 0.1468, "num_tokens": 87842145.0, "reward": 0.5624040365219116, "reward_std": 0.3658301383256912, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062452008947730064, "rewards/penalized_accuracy_reward/std": 0.1393446959555149, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.22521265596151352, "step": 1114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1868.25, "completions/max_terminated_length": 1641.25, "completions/mean_length": 
1039.953125, "completions/mean_terminated_length": 971.9768524169922, "completions/min_length": 488.25, "completions/min_terminated_length": 488.25, "epoch": 0.5575, "grad_norm": 0.3607749938964844, "kl": 0.04852294921875, "learning_rate": 5.390055198313061e-07, "loss": 0.104, "num_tokens": 87918910.0, "reward": 0.6105764359235764, "reward_std": 0.36681514233350754, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07481946796178818, "rewards/penalized_accuracy_reward/std": 0.16085704416036606, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1791466288268566, "step": 1115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1686.25, "completions/max_terminated_length": 1413.75, "completions/mean_length": 1008.765625, "completions/mean_terminated_length": 909.226318359375, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.558, "grad_norm": 0.4008159041404724, "kl": 0.030059814453125, "learning_rate": 5.382203732614571e-07, "loss": 0.1989, "num_tokens": 87993167.0, "reward": 0.6697000414133072, "reward_std": 0.43701013550162315, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1121937707066536, "rewards/penalized_accuracy_reward/std": 0.18469837307929993, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2067924290895462, "step": 1116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1409.0, "completions/max_terminated_length": 1284.5, "completions/mean_length": 600.71875, "completions/mean_terminated_length": 558.5357208251953, "completions/min_length": 222.75, "completions/min_terminated_length": 222.75, "epoch": 0.5585, "grad_norm": 0.32381048798561096, "kl": 0.043304443359375, "learning_rate": 5.37435262574394e-07, "loss": 0.1682, "num_tokens": 88041677.0, "reward": 
0.5382188856601715, "reward_std": 0.16847731545567513, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024968814104795456, "rewards/penalized_accuracy_reward/std": 0.06822779029607773, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 1117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1842.25, "completions/max_terminated_length": 1552.75, "completions/mean_length": 964.359375, "completions/mean_terminated_length": 877.8616485595703, "completions/min_length": 389.5, "completions/min_terminated_length": 389.5, "epoch": 0.559, "grad_norm": 0.3963477611541748, "kl": 0.04412841796875, "learning_rate": 5.366501901617001e-07, "loss": 0.2505, "num_tokens": 88111268.0, "reward": 0.6856613159179688, "reward_std": 0.280204052105546, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11236189305782318, "rewards/penalized_accuracy_reward/std": 0.10234373062849045, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17631714418530464, "step": 1118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1780.0, "completions/max_terminated_length": 1525.25, "completions/mean_length": 860.671875, "completions/mean_terminated_length": 748.5308074951172, "completions/min_length": 233.75, "completions/min_terminated_length": 233.75, "epoch": 0.5595, "grad_norm": 0.3619835078716278, "kl": 0.052459716796875, "learning_rate": 5.358651584148423e-07, "loss": 0.2097, "num_tokens": 88177439.0, "reward": 0.5857815891504288, "reward_std": 0.26428040117025375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062422044575214386, "rewards/penalized_accuracy_reward/std": 0.09562329202890396, "rewards/tag_count_reward/mean": 0.921875, 
"rewards/tag_count_reward/std": 0.18206512182950974, "step": 1119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1531.25, "completions/mean_length": 1222.203125, "completions/mean_terminated_length": 1041.5453033447266, "completions/min_length": 496.25, "completions/min_terminated_length": 496.25, "epoch": 0.56, "grad_norm": 0.291507750749588, "kl": 0.047698974609375, "learning_rate": 5.350801697251633e-07, "loss": 0.1995, "num_tokens": 88264348.0, "reward": 0.5372294783592224, "reward_std": 0.2742696814239025, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0498647466301918, "rewards/penalized_accuracy_reward/std": 0.08920080959796906, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2365734688937664, "step": 1120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.75, "completions/mean_length": 992.390625, "completions/mean_terminated_length": 822.0673370361328, "completions/min_length": 341.75, "completions/min_terminated_length": 341.75, "epoch": 0.5605, "grad_norm": 0.485091894865036, "kl": 0.05474853515625, "learning_rate": 5.342952264838747e-07, "loss": 0.3531, "num_tokens": 88335941.0, "reward": 0.4453125, "reward_std": 0.12727093137800694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.25454187765717506, "step": 1121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1767.25, "completions/max_terminated_length": 1346.0, "completions/mean_length": 983.3125, "completions/mean_terminated_length": 864.6760711669922, "completions/min_length": 434.5, 
"completions/min_terminated_length": 434.5, "epoch": 0.561, "grad_norm": 0.45401087403297424, "kl": 0.04986572265625, "learning_rate": 5.335103310820496e-07, "loss": 0.147, "num_tokens": 88407977.0, "reward": 0.6642204821109772, "reward_std": 0.43474795669317245, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11238368228077888, "rewards/penalized_accuracy_reward/std": 0.1804211437702179, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2188561111688614, "step": 1122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1209.75, "completions/mean_length": 860.015625, "completions/mean_terminated_length": 714.2223815917969, "completions/min_length": 241.5, "completions/min_terminated_length": 241.5, "epoch": 0.5615, "grad_norm": 1.5491451025009155, "kl": 0.08099365234375, "learning_rate": 5.32725485910616e-07, "loss": 0.3347, "num_tokens": 88475194.0, "reward": 0.46832987666130066, "reward_std": 0.20203210413455963, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485254555940628, "rewards/penalized_accuracy_reward/std": 0.04994102194905281, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.24272903054952621, "step": 1123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1612.75, "completions/max_terminated_length": 1490.25, "completions/mean_length": 919.375, "completions/mean_terminated_length": 878.3519439697266, "completions/min_length": 422.25, "completions/min_terminated_length": 422.25, "epoch": 0.562, "grad_norm": 0.42552468180656433, "kl": 0.03271484375, "learning_rate": 5.319406933603482e-07, "loss": 0.1366, "num_tokens": 88541730.0, "reward": 0.6072395443916321, "reward_std": 0.24229254201054573, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06240883469581604, "rewards/penalized_accuracy_reward/std": 0.09560307115316391, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10217283479869366, "step": 1124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1318.34375, "completions/mean_terminated_length": 1036.1292877197266, "completions/min_length": 613.5, "completions/min_terminated_length": 613.5, "epoch": 0.5625, "grad_norm": 0.3126542866230011, "kl": 0.034210205078125, "learning_rate": 5.311559558218603e-07, "loss": 0.3412, "num_tokens": 88637048.0, "reward": 0.45422810316085815, "reward_std": 0.26651569455862045, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024965612217783928, "rewards/penalized_accuracy_reward/std": 0.06821904331445694, "rewards/tag_count_reward/mean": 0.80859375, "rewards/tag_count_reward/std": 0.3099826090037823, "step": 1125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1579.5, "completions/max_terminated_length": 1495.5, "completions/mean_length": 703.09375, "completions/mean_terminated_length": 682.8583374023438, "completions/min_length": 282.75, "completions/min_terminated_length": 282.75, "epoch": 0.563, "grad_norm": 0.3774036169052124, "kl": 0.05206298828125, "learning_rate": 5.303712756855988e-07, "loss": 0.105, "num_tokens": 88689838.0, "reward": 0.5093390345573425, "reward_std": 0.12820176780223846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482021003961563, "rewards/penalized_accuracy_reward/std": 0.04992808774113655, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09088464826345444, "step": 1126 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.078125, "completions/max_length": 1858.25, "completions/max_terminated_length": 1664.5, "completions/mean_length": 1022.828125, "completions/mean_terminated_length": 941.6732788085938, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.5635, "grad_norm": 0.3744426965713501, "kl": 0.042205810546875, "learning_rate": 5.295866553418358e-07, "loss": 0.1551, "num_tokens": 88763603.0, "reward": 0.7316446453332901, "reward_std": 0.5003492534160614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13730669859796762, "rewards/penalized_accuracy_reward/std": 0.23274057731032372, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18196558579802513, "step": 1127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1692.25, "completions/max_terminated_length": 1571.75, "completions/mean_length": 942.828125, "completions/mean_terminated_length": 911.9948120117188, "completions/min_length": 399.75, "completions/min_terminated_length": 399.75, "epoch": 0.564, "grad_norm": 0.4216429889202118, "kl": 0.0439453125, "learning_rate": 5.288020971806608e-07, "loss": 0.0736, "num_tokens": 88833672.0, "reward": 0.6321220993995667, "reward_std": 0.25642867013812065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07485012710094452, "rewards/penalized_accuracy_reward/std": 0.0998002365231514, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11365639418363571, "step": 1128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1537.25, "completions/mean_length": 1054.859375, "completions/mean_terminated_length": 889.3026123046875, "completions/min_length": 453.75, "completions/min_terminated_length": 453.75, "epoch": 0.5645, 
"grad_norm": 0.43966421484947205, "kl": 0.041748046875, "learning_rate": 5.28017603591974e-07, "loss": 0.3548, "num_tokens": 88919279.0, "reward": 0.48939818143844604, "reward_std": 0.2751966565847397, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497253194451332, "rewards/penalized_accuracy_reward/std": 0.09989013150334358, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2405036948621273, "step": 1129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1799.25, "completions/max_terminated_length": 1711.75, "completions/mean_length": 853.0, "completions/mean_terminated_length": 799.520263671875, "completions/min_length": 285.5, "completions/min_terminated_length": 285.5, "epoch": 0.565, "grad_norm": 0.3645751476287842, "kl": 0.042022705078125, "learning_rate": 5.27233176965479e-07, "loss": 0.14, "num_tokens": 88985215.0, "reward": 0.478515625, "reward_std": 0.06332266703248024, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12664533779025078, "step": 1130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1971.0, "completions/max_terminated_length": 1513.25, "completions/mean_length": 950.828125, "completions/mean_terminated_length": 833.5386810302734, "completions/min_length": 353.75, "completions/min_terminated_length": 353.75, "epoch": 0.5655, "grad_norm": 0.40806350111961365, "kl": 0.044921875, "learning_rate": 5.264488196906752e-07, "loss": 0.3081, "num_tokens": 89053060.0, "reward": 0.5300202071666718, "reward_std": 0.29905685782432556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037471041083335876, 
"rewards/penalized_accuracy_reward/std": 0.11822953447699547, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.21620172634720802, "step": 1131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1678.75, "completions/max_terminated_length": 1631.0, "completions/mean_length": 913.171875, "completions/mean_terminated_length": 880.0602722167969, "completions/min_length": 431.75, "completions/min_terminated_length": 431.75, "epoch": 0.566, "grad_norm": 0.3855467736721039, "kl": 0.02874755859375, "learning_rate": 5.256645341568511e-07, "loss": 0.0556, "num_tokens": 89121423.0, "reward": 0.7820223271846771, "reward_std": 0.37874985486268997, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14980023354291916, "rewards/penalized_accuracy_reward/std": 0.16793209314346313, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08577133901417255, "step": 1132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1719.75, "completions/max_terminated_length": 1332.0, "completions/mean_length": 711.59375, "completions/mean_terminated_length": 671.7198181152344, "completions/min_length": 271.5, "completions/min_terminated_length": 271.5, "epoch": 0.5665, "grad_norm": 0.3213393986225128, "kl": 0.0660400390625, "learning_rate": 5.248803227530763e-07, "loss": 0.107, "num_tokens": 89176581.0, "reward": 0.532378762960434, "reward_std": 0.17414817214012146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497844025492668, "rewards/penalized_accuracy_reward/std": 0.06825409829616547, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11707578226923943, "step": 1133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1852.0, "completions/max_terminated_length": 1511.75, 
"completions/mean_length": 945.421875, "completions/mean_terminated_length": 806.9460144042969, "completions/min_length": 372.25, "completions/min_terminated_length": 372.25, "epoch": 0.567, "grad_norm": 0.4344850182533264, "kl": 0.0548095703125, "learning_rate": 5.240961878681947e-07, "loss": 0.2402, "num_tokens": 89250432.0, "reward": 0.49521034955978394, "reward_std": 0.22947092726826668, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494892105460167, "rewards/penalized_accuracy_reward/std": 0.06817345321178436, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.23850593343377113, "step": 1134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1462.0, "completions/max_terminated_length": 1125.75, "completions/mean_length": 743.171875, "completions/mean_terminated_length": 646.3822937011719, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5675, "grad_norm": 0.5299329161643982, "kl": 0.0445556640625, "learning_rate": 5.233121318908173e-07, "loss": 0.2673, "num_tokens": 89308683.0, "reward": 0.4609375, "reward_std": 0.06548266112804413, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.13096532225608826, "step": 1135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1755.5, "completions/max_terminated_length": 1619.75, "completions/mean_length": 933.109375, "completions/mean_terminated_length": 773.876953125, "completions/min_length": 266.25, "completions/min_terminated_length": 266.25, "epoch": 0.568, "grad_norm": 0.37691277265548706, "kl": 0.035064697265625, "learning_rate": 5.225281572093143e-07, "loss": 0.1819, "num_tokens": 89376978.0, "reward": 0.5910644233226776, 
"reward_std": 0.2738419994711876, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0748290866613388, "rewards/penalized_accuracy_reward/std": 0.09977219253778458, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.17190942913293839, "step": 1136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1637.25, "completions/mean_length": 1056.125, "completions/mean_terminated_length": 899.0079345703125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 0.5685, "grad_norm": 0.360857218503952, "kl": 0.037994384765625, "learning_rate": 5.21744266211809e-07, "loss": 0.2317, "num_tokens": 89453786.0, "reward": 0.5990481227636337, "reward_std": 0.3550828546285629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491468265652657, "rewards/penalized_accuracy_reward/std": 0.14548784121870995, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.21193470805883408, "step": 1137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1885.75, "completions/max_terminated_length": 1250.25, "completions/mean_length": 1069.390625, "completions/mean_terminated_length": 746.8562622070312, "completions/min_length": 363.25, "completions/min_terminated_length": 363.25, "epoch": 0.569, "grad_norm": 0.3878116309642792, "kl": 0.0509033203125, "learning_rate": 5.20960461286169e-07, "loss": 0.303, "num_tokens": 89535011.0, "reward": 0.43359375, "reward_std": 0.10185929574072361, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.20371859893202782, "step": 1138 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.015625, "completions/max_length": 1639.5, "completions/max_terminated_length": 1582.0, "completions/mean_length": 876.390625, "completions/mean_terminated_length": 860.0823059082031, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.5695, "grad_norm": 0.37038394808769226, "kl": 0.042388916015625, "learning_rate": 5.2017674482e-07, "loss": 0.0415, "num_tokens": 89599660.0, "reward": 0.6361352205276489, "reward_std": 0.32635925710201263, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490354776382446, "rewards/penalized_accuracy_reward/std": 0.14550575613975525, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1578.25, "completions/max_terminated_length": 1428.0, "completions/mean_length": 801.703125, "completions/mean_terminated_length": 705.5854339599609, "completions/min_length": 301.75, "completions/min_terminated_length": 301.75, "epoch": 0.57, "grad_norm": 0.6114969253540039, "kl": 0.0521240234375, "learning_rate": 5.193931192006385e-07, "loss": 0.2457, "num_tokens": 89661097.0, "reward": 0.5799128711223602, "reward_std": 0.26678648218512535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062417369335889816, "rewards/penalized_accuracy_reward/std": 0.0956161618232727, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17555104941129684, "step": 1140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1901.75, "completions/max_terminated_length": 1367.0, "completions/mean_length": 1294.40625, "completions/mean_terminated_length": 844.9361190795898, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.5705, 
"grad_norm": 0.41516101360321045, "kl": 0.07366943359375, "learning_rate": 5.186095868151436e-07, "loss": 0.217, "num_tokens": 89754515.0, "reward": 0.37109375, "reward_std": 0.12826421111822128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.2565284315496683, "step": 1141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1806.5, "completions/max_terminated_length": 1416.25, "completions/mean_length": 956.21875, "completions/mean_terminated_length": 818.1403961181641, "completions/min_length": 383.25, "completions/min_terminated_length": 383.25, "epoch": 0.571, "grad_norm": 0.5409483313560486, "kl": 0.05950927734375, "learning_rate": 5.178261500502912e-07, "loss": 0.1396, "num_tokens": 89826273.0, "reward": 0.549036055803299, "reward_std": 0.272281177341938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049908652901649475, "rewards/penalized_accuracy_reward/std": 0.08927936851978302, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.22347351163625717, "step": 1142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1307.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 771.140625, "completions/mean_terminated_length": 686.7753143310547, "completions/min_length": 401.25, "completions/min_terminated_length": 401.25, "epoch": 0.5715, "grad_norm": 0.6421815156936646, "kl": 0.042724609375, "learning_rate": 5.170428112925659e-07, "loss": 0.2784, "num_tokens": 89884154.0, "reward": 0.7643980383872986, "reward_std": 0.25492632761597633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1497771441936493, 
"rewards/penalized_accuracy_reward/std": 0.08930984139442444, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1526133306324482, "step": 1143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1507.25, "completions/max_terminated_length": 1507.0, "completions/mean_length": 874.953125, "completions/mean_terminated_length": 858.0937652587891, "completions/min_length": 350.25, "completions/min_terminated_length": 350.25, "epoch": 0.572, "grad_norm": 0.322893351316452, "kl": 0.04608154296875, "learning_rate": 5.162595729281526e-07, "loss": 0.0683, "num_tokens": 89950759.0, "reward": 0.6897852569818497, "reward_std": 0.43369196355342865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09977545216679573, "rewards/penalized_accuracy_reward/std": 0.21364974230527878, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.054575782269239426, "step": 1144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1751.5, "completions/mean_length": 1280.09375, "completions/mean_terminated_length": 1062.4841613769531, "completions/min_length": 459.5, "completions/min_terminated_length": 459.5, "epoch": 0.5725, "grad_norm": 0.47104108333587646, "kl": 0.056732177734375, "learning_rate": 5.154764373429315e-07, "loss": 0.1829, "num_tokens": 90047917.0, "reward": 0.4948471486568451, "reward_std": 0.33269279450178146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037462638691067696, "rewards/penalized_accuracy_reward/std": 0.11819793656468391, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.28330276533961296, "step": 1145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1776.75, 
"completions/mean_length": 1233.53125, "completions/mean_terminated_length": 1072.2330017089844, "completions/min_length": 512.75, "completions/min_terminated_length": 512.75, "epoch": 0.573, "grad_norm": 0.3702712953090668, "kl": 0.037872314453125, "learning_rate": 5.146934069224698e-07, "loss": 0.2067, "num_tokens": 90140063.0, "reward": 0.5182141959667206, "reward_std": 0.2614179253578186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03742741048336029, "rewards/penalized_accuracy_reward/std": 0.08046653866767883, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.23874584585428238, "step": 1146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1713.75, "completions/max_terminated_length": 1468.75, "completions/mean_length": 1098.40625, "completions/mean_terminated_length": 946.8965454101562, "completions/min_length": 424.25, "completions/min_terminated_length": 424.25, "epoch": 0.5735, "grad_norm": 0.3663688898086548, "kl": 0.053955078125, "learning_rate": 5.139104840520135e-07, "loss": 0.2139, "num_tokens": 90218825.0, "reward": 0.4375, "reward_std": 0.12473787739872932, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.24947576224803925, "step": 1147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2047.0, "completions/max_terminated_length": 1660.5, "completions/mean_length": 984.0625, "completions/mean_terminated_length": 766.5428466796875, "completions/min_length": 314.25, "completions/min_terminated_length": 314.25, "epoch": 0.574, "grad_norm": 0.41313764452934265, "kl": 0.0753173828125, "learning_rate": 5.131276711164815e-07, "loss": 0.2845, "num_tokens": 90290893.0, "reward": 0.5045026391744614, 
"reward_std": 0.25479676201939583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740756958723068, "rewards/penalized_accuracy_reward/std": 0.0804239809513092, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.23289087414741516, "step": 1148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1613.25, "completions/mean_length": 851.125, "completions/mean_terminated_length": 750.8155059814453, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.5745, "grad_norm": 0.4403568208217621, "kl": 0.036163330078125, "learning_rate": 5.123449705004581e-07, "loss": 0.3783, "num_tokens": 90354133.0, "reward": 0.6665387451648712, "reward_std": 0.365218386054039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09987091738730669, "rewards/penalized_accuracy_reward/std": 0.1522696278989315, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19370491802692413, "step": 1149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1496.0, "completions/max_terminated_length": 1324.25, "completions/mean_length": 716.765625, "completions/mean_terminated_length": 696.7354278564453, "completions/min_length": 285.25, "completions/min_terminated_length": 285.25, "epoch": 0.575, "grad_norm": 0.3398098349571228, "kl": 0.040313720703125, "learning_rate": 5.115623845881847e-07, "loss": -0.0379, "num_tokens": 90409126.0, "reward": 0.6668583750724792, "reward_std": 0.3394938260316849, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08733544126152992, "rewards/penalized_accuracy_reward/std": 0.16372209787368774, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 
1150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1901.0, "completions/max_terminated_length": 1616.25, "completions/mean_length": 951.328125, "completions/mean_terminated_length": 793.4754638671875, "completions/min_length": 395.5, "completions/min_terminated_length": 395.5, "epoch": 0.5755, "grad_norm": 0.34071993827819824, "kl": 0.035736083984375, "learning_rate": 5.107799157635538e-07, "loss": 0.3363, "num_tokens": 90477243.0, "reward": 0.458984375, "reward_std": 0.09441948495805264, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18883897736668587, "step": 1151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1481.25, "completions/max_terminated_length": 1071.5, "completions/mean_length": 573.53125, "completions/mean_terminated_length": 525.7687606811523, "completions/min_length": 259.25, "completions/min_terminated_length": 259.25, "epoch": 0.576, "grad_norm": 0.5141123533248901, "kl": 0.0675048828125, "learning_rate": 5.099975664101014e-07, "loss": 0.3061, "num_tokens": 90521517.0, "reward": 0.48828125, "reward_std": 0.046875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1637.5, "completions/max_terminated_length": 1411.0, "completions/mean_length": 913.484375, "completions/mean_terminated_length": 834.0750274658203, "completions/min_length": 437.5, "completions/min_terminated_length": 437.5, "epoch": 0.5765, "grad_norm": 0.29469650983810425, "kl": 0.040863037109375, "learning_rate": 
5.09215338910999e-07, "loss": 0.0566, "num_tokens": 90587772.0, "reward": 0.49171361327171326, "reward_std": 0.15269193053245544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012458366341888905, "rewards/penalized_accuracy_reward/std": 0.04983346536755562, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14832578226923943, "step": 1153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1748.0, "completions/max_terminated_length": 1622.5, "completions/mean_length": 995.796875, "completions/mean_terminated_length": 928.6207275390625, "completions/min_length": 387.25, "completions/min_terminated_length": 387.25, "epoch": 0.577, "grad_norm": 0.30873119831085205, "kl": 0.03863525390625, "learning_rate": 5.084332356490472e-07, "loss": 0.0743, "num_tokens": 90662063.0, "reward": 0.6166212260723114, "reward_std": 0.34088203869760036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491217274218798, "rewards/penalized_accuracy_reward/std": 0.14557445421814919, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1673666164278984, "step": 1154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1501.0, "completions/max_terminated_length": 1388.25, "completions/mean_length": 798.4375, "completions/mean_terminated_length": 738.0769348144531, "completions/min_length": 329.5, "completions/min_terminated_length": 329.5, "epoch": 0.5775, "grad_norm": 0.5158293843269348, "kl": 0.044281005859375, "learning_rate": 5.076512590066685e-07, "loss": 0.1475, "num_tokens": 90722715.0, "reward": 0.5415935218334198, "reward_std": 0.28262321650981903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03739831503480673, "rewards/penalized_accuracy_reward/std": 
0.11802903190255165, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14611320197582245, "step": 1155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1880.5, "completions/max_terminated_length": 1515.0, "completions/mean_length": 999.203125, "completions/mean_terminated_length": 846.5269775390625, "completions/min_length": 397.5, "completions/min_terminated_length": 397.5, "epoch": 0.578, "grad_norm": 0.4601103663444519, "kl": 0.043121337890625, "learning_rate": 5.068694113658992e-07, "loss": 0.2822, "num_tokens": 90796616.0, "reward": 0.44921875, "reward_std": 0.10521316714584827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.21042634174227715, "step": 1156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1926.5, "completions/max_terminated_length": 1277.75, "completions/mean_length": 777.296875, "completions/mean_terminated_length": 672.7968978881836, "completions/min_length": 303.75, "completions/min_terminated_length": 303.75, "epoch": 0.5785, "grad_norm": 0.5231736302375793, "kl": 0.06884765625, "learning_rate": 5.060876951083828e-07, "loss": 0.3135, "num_tokens": 90855371.0, "reward": 0.476168692111969, "reward_std": 0.19561817310750484, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012498411349952221, "rewards/penalized_accuracy_reward/std": 0.04999364912509918, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.19126176089048386, "step": 1157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1824.0, "completions/max_terminated_length": 1465.25, "completions/mean_length": 902.3125, "completions/mean_terminated_length": 
694.8913879394531, "completions/min_length": 271.25, "completions/min_terminated_length": 271.25, "epoch": 0.579, "grad_norm": 0.49698472023010254, "kl": 0.0545654296875, "learning_rate": 5.053061126153624e-07, "loss": 0.2852, "num_tokens": 90922927.0, "reward": 0.427734375, "reward_std": 0.09744690544903278, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.19489381089806557, "step": 1158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1875.25, "completions/max_terminated_length": 1692.75, "completions/mean_length": 978.75, "completions/mean_terminated_length": 917.933349609375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.5795, "grad_norm": 0.3443211615085602, "kl": 0.029876708984375, "learning_rate": 5.045246662676741e-07, "loss": 0.1689, "num_tokens": 90993807.0, "reward": 0.5857777297496796, "reward_std": 0.2517623733729124, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242011860013008, "rewards/penalized_accuracy_reward/std": 0.09562035650014877, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1642562747001648, "step": 1159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1788.25, "completions/max_terminated_length": 1260.25, "completions/mean_length": 726.375, "completions/mean_terminated_length": 588.7233276367188, "completions/min_length": 243.75, "completions/min_terminated_length": 243.75, "epoch": 0.58, "grad_norm": 0.46162474155426025, "kl": 0.04205322265625, "learning_rate": 5.037433584457389e-07, "loss": 0.445, "num_tokens": 91053303.0, "reward": 0.5187066346406937, "reward_std": 0.2172122336924076, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024978317320346832, "rewards/penalized_accuracy_reward/std": 0.06825375556945801, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16140944883227348, "step": 1160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1787.75, "completions/max_terminated_length": 1230.0, "completions/mean_length": 961.71875, "completions/mean_terminated_length": 759.7210006713867, "completions/min_length": 422.75, "completions/min_terminated_length": 422.75, "epoch": 0.5805, "grad_norm": 0.4004286825656891, "kl": 0.056060791015625, "learning_rate": 5.02962191529556e-07, "loss": 0.3209, "num_tokens": 91128133.0, "reward": 0.427734375, "reward_std": 0.11263919994235039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.22527840360999107, "step": 1161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1685.75, "completions/max_terminated_length": 1374.25, "completions/mean_length": 973.515625, "completions/mean_terminated_length": 813.3574676513672, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.581, "grad_norm": 0.39369478821754456, "kl": 0.0511474609375, "learning_rate": 5.021811678986951e-07, "loss": 0.3072, "num_tokens": 91200086.0, "reward": 0.5777368098497391, "reward_std": 0.2678428739309311, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06230590492486954, "rewards/penalized_accuracy_reward/std": 0.095445416867733, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.15390408039093018, "step": 1162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 
1819.75, "completions/max_terminated_length": 1578.75, "completions/mean_length": 881.28125, "completions/mean_terminated_length": 823.9284515380859, "completions/min_length": 380.25, "completions/min_terminated_length": 380.25, "epoch": 0.5815, "grad_norm": 0.4476839601993561, "kl": 0.035400390625, "learning_rate": 5.014002899322896e-07, "loss": 0.2158, "num_tokens": 91263480.0, "reward": 0.5725020170211792, "reward_std": 0.23623967170715332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499228797852993, "rewards/penalized_accuracy_reward/std": 0.08930476754903793, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1405348777770996, "step": 1163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1587.25, "completions/max_terminated_length": 1347.0, "completions/mean_length": 785.03125, "completions/mean_terminated_length": 686.1297454833984, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.582, "grad_norm": 0.42496830224990845, "kl": 0.0469970703125, "learning_rate": 5.006195600090296e-07, "loss": 0.2912, "num_tokens": 91320842.0, "reward": 0.9623556435108185, "reward_std": 0.6028456129133701, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.24973251298069954, "rewards/penalized_accuracy_reward/std": 0.276045523583889, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.16313419491052628, "step": 1164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1865.25, "completions/max_terminated_length": 1582.25, "completions/mean_length": 854.359375, "completions/mean_terminated_length": 785.5585784912109, "completions/min_length": 437.25, "completions/min_terminated_length": 437.25, "epoch": 0.5825, "grad_norm": 0.4057561457157135, "kl": 0.045562744140625, "learning_rate": 
4.998389805071536e-07, "loss": 0.211, "num_tokens": 91390817.0, "reward": 0.4995855838060379, "reward_std": 0.15556010603904724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012488102540373802, "rewards/penalized_accuracy_reward/std": 0.04995241016149521, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13808366656303406, "step": 1165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1955.5, "completions/max_terminated_length": 1644.5, "completions/mean_length": 1031.375, "completions/mean_terminated_length": 915.0373687744141, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.583, "grad_norm": 0.46131572127342224, "kl": 0.035003662109375, "learning_rate": 4.990585538044419e-07, "loss": 0.1806, "num_tokens": 91466073.0, "reward": 0.5357464849948883, "reward_std": 0.24094254337251186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740449249744415, "rewards/penalized_accuracy_reward/std": 0.08041735738515854, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1602156963199377, "step": 1166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1667.25, "completions/max_terminated_length": 1458.5, "completions/mean_length": 909.484375, "completions/mean_terminated_length": 838.6093902587891, "completions/min_length": 340.25, "completions/min_terminated_length": 340.25, "epoch": 0.5835, "grad_norm": 0.28687235713005066, "kl": 0.04302978515625, "learning_rate": 4.982782822782101e-07, "loss": 0.1126, "num_tokens": 91534616.0, "reward": 0.570416122674942, "reward_std": 0.21585353650152683, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04985649883747101, "rewards/penalized_accuracy_reward/std": 
0.08918603509664536, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11979937925934792, "step": 1167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1606.0, "completions/max_terminated_length": 1413.0, "completions/mean_length": 1116.296875, "completions/mean_terminated_length": 957.3767547607422, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.584, "grad_norm": 0.28603291511535645, "kl": 0.050018310546875, "learning_rate": 4.974981683053001e-07, "loss": 0.1348, "num_tokens": 91623067.0, "reward": 0.431640625, "reward_std": 0.08628249168395996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.17256498336791992, "step": 1168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1644.25, "completions/max_terminated_length": 1586.25, "completions/mean_length": 757.140625, "completions/mean_terminated_length": 716.4799194335938, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5845, "grad_norm": 0.4144996404647827, "kl": 0.04815673828125, "learning_rate": 4.967182142620745e-07, "loss": 0.0751, "num_tokens": 91683300.0, "reward": 0.5783982872962952, "reward_std": 0.2313947044312954, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994133859872818, "rewards/penalized_accuracy_reward/std": 0.08933782577514648, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10543813742697239, "step": 1169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1966.75, "completions/max_terminated_length": 1482.0, "completions/mean_length": 957.21875, "completions/mean_terminated_length": 
818.7878570556641, "completions/min_length": 403.25, "completions/min_terminated_length": 403.25, "epoch": 0.585, "grad_norm": 0.5568026900291443, "kl": 0.042510986328125, "learning_rate": 4.959384225244087e-07, "loss": 0.329, "num_tokens": 91756498.0, "reward": 0.5317873358726501, "reward_std": 0.2559615299105644, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03737804666161537, "rewards/penalized_accuracy_reward/std": 0.08036043494939804, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19048132747411728, "step": 1170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1663.5, "completions/max_terminated_length": 1360.5, "completions/mean_length": 792.015625, "completions/mean_terminated_length": 686.6958465576172, "completions/min_length": 332.75, "completions/min_terminated_length": 332.75, "epoch": 0.5855, "grad_norm": 0.4999534487724304, "kl": 0.041778564453125, "learning_rate": 4.951587954676837e-07, "loss": 0.3246, "num_tokens": 91816963.0, "reward": 1.0874408185482025, "reward_std": 0.27454247720743297, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.31227509677410126, "rewards/penalized_accuracy_reward/std": 0.10237499821232632, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1395849771797657, "step": 1171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1793.5, "completions/max_terminated_length": 1341.5, "completions/mean_length": 909.078125, "completions/mean_terminated_length": 738.2031402587891, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.586, "grad_norm": 0.3159729838371277, "kl": 0.0408935546875, "learning_rate": 4.943793354667783e-07, "loss": 0.2753, "num_tokens": 91883192.0, "reward": 0.8277707397937775, "reward_std": 0.1837584152817726, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18732285499572754, "rewards/penalized_accuracy_reward/std": 0.04995293170213699, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.16770510375499725, "step": 1172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1171.25, "completions/max_terminated_length": 1027.5, "completions/mean_length": 625.9375, "completions/mean_terminated_length": 582.0825958251953, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5865, "grad_norm": 0.5073890686035156, "kl": 0.05133056640625, "learning_rate": 4.93600044896063e-07, "loss": 0.1015, "num_tokens": 91931348.0, "reward": 0.6263815760612488, "reward_std": 0.3529805988073349, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490953430533409, "rewards/penalized_accuracy_reward/std": 0.1610506772994995, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14254852384328842, "step": 1173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1389.75, "completions/max_terminated_length": 1270.5, "completions/mean_length": 722.234375, "completions/mean_terminated_length": 687.8950958251953, "completions/min_length": 295.5, "completions/min_terminated_length": 295.5, "epoch": 0.587, "grad_norm": 0.4117385745048523, "kl": 0.0439453125, "learning_rate": 4.928209261293923e-07, "loss": 0.1115, "num_tokens": 91987155.0, "reward": 0.48046875, "reward_std": 0.06327171996235847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12654344737529755, "step": 1174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, 
"completions/max_length": 1776.5, "completions/max_terminated_length": 1527.5, "completions/mean_length": 980.921875, "completions/mean_terminated_length": 900.5687866210938, "completions/min_length": 389.75, "completions/min_terminated_length": 389.75, "epoch": 0.5875, "grad_norm": 0.300703763961792, "kl": 0.0355224609375, "learning_rate": 4.920419815400968e-07, "loss": 0.1852, "num_tokens": 92059054.0, "reward": 0.5936169326305389, "reward_std": 0.34913358464837074, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06243346817791462, "rewards/penalized_accuracy_reward/std": 0.14878203719854355, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18185977265238762, "step": 1175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.25, "completions/mean_length": 830.125, "completions/mean_terminated_length": 730.9179077148438, "completions/min_length": 329.75, "completions/min_terminated_length": 329.75, "epoch": 0.588, "grad_norm": 0.6235924363136292, "kl": 0.05328369140625, "learning_rate": 4.912632135009769e-07, "loss": 0.3941, "num_tokens": 92121462.0, "reward": 0.462890625, "reward_std": 0.10839555040001869, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.21679110452532768, "step": 1176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1669.5, "completions/max_terminated_length": 1494.25, "completions/mean_length": 1001.890625, "completions/mean_terminated_length": 889.7729187011719, "completions/min_length": 358.25, "completions/min_terminated_length": 358.25, "epoch": 0.5885, "grad_norm": 0.346434086561203, "kl": 0.04473876953125, "learning_rate": 
4.904846243842949e-07, "loss": 0.2149, "num_tokens": 92192943.0, "reward": 0.7601786106824875, "reward_std": 0.5420852676033974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14962055999785662, "rewards/penalized_accuracy_reward/std": 0.24516575038433075, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18188394606113434, "step": 1177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1633.25, "completions/mean_length": 988.515625, "completions/mean_terminated_length": 904.3309936523438, "completions/min_length": 385.5, "completions/min_terminated_length": 385.5, "epoch": 0.589, "grad_norm": 0.3867397904396057, "kl": 0.0455322265625, "learning_rate": 4.897062165617686e-07, "loss": 0.222, "num_tokens": 92265952.0, "reward": 0.6494522094726562, "reward_std": 0.3577841594815254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08742141909897327, "rewards/penalized_accuracy_reward/std": 0.14987171813845634, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1767093911767006, "step": 1178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1620.25, "completions/max_terminated_length": 1456.75, "completions/mean_length": 1075.625, "completions/mean_terminated_length": 901.1897583007812, "completions/min_length": 298.75, "completions/min_terminated_length": 298.75, "epoch": 0.5895, "grad_norm": 0.4139617681503296, "kl": 0.033203125, "learning_rate": 4.88927992404563e-07, "loss": 0.1368, "num_tokens": 92345992.0, "reward": 0.4990735501050949, "reward_std": 0.2563570812344551, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02492740098387003, "rewards/penalized_accuracy_reward/std": 0.09970961138606071, 
"rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.1419392004609108, "step": 1179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1647.75, "completions/max_terminated_length": 1349.75, "completions/mean_length": 744.453125, "completions/mean_terminated_length": 705.5271148681641, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.59, "grad_norm": 0.492278128862381, "kl": 0.033203125, "learning_rate": 4.881499542832841e-07, "loss": 0.1726, "num_tokens": 92404165.0, "reward": 0.8757349699735641, "reward_std": 0.35628581792116165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19958623591810465, "rewards/penalized_accuracy_reward/std": 0.16793207451701164, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1228807382285595, "step": 1180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1807.5, "completions/max_terminated_length": 1442.25, "completions/mean_length": 873.1875, "completions/mean_terminated_length": 755.8811645507812, "completions/min_length": 289.25, "completions/min_terminated_length": 289.25, "epoch": 0.5905, "grad_norm": 0.40655747056007385, "kl": 0.04730224609375, "learning_rate": 4.873721045679706e-07, "loss": 0.3036, "num_tokens": 92471313.0, "reward": 0.458984375, "reward_std": 0.087533850222826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1750677078962326, "step": 1181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1979.25, "completions/max_terminated_length": 1547.25, "completions/mean_length": 894.515625, "completions/mean_terminated_length": 776.5080642700195, 
"completions/min_length": 333.25, "completions/min_terminated_length": 333.25, "epoch": 0.591, "grad_norm": 0.44774070382118225, "kl": 0.039093017578125, "learning_rate": 4.865944456280878e-07, "loss": 0.2732, "num_tokens": 92538002.0, "reward": 0.5128087401390076, "reward_std": 0.21746210753917694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024959055706858635, "rewards/penalized_accuracy_reward/std": 0.06820112466812134, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.19982432574033737, "step": 1182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1973.0, "completions/max_terminated_length": 1524.5, "completions/mean_length": 957.921875, "completions/mean_terminated_length": 888.7079162597656, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.5915, "grad_norm": 0.3247816562652588, "kl": 0.029052734375, "learning_rate": 4.858169798325198e-07, "loss": 0.0839, "num_tokens": 92610461.0, "reward": 0.5552884042263031, "reward_std": 0.2605578899383545, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037409826181828976, "rewards/penalized_accuracy_reward/std": 0.11805575713515282, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11476518586277962, "step": 1183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1431.5, "completions/max_terminated_length": 1400.25, "completions/mean_length": 734.0, "completions/mean_terminated_length": 684.0168304443359, "completions/min_length": 219.75, "completions/min_terminated_length": 219.75, "epoch": 0.592, "grad_norm": 0.32154178619384766, "kl": 0.032470703125, "learning_rate": 4.850397095495621e-07, "loss": 0.0968, "num_tokens": 92665373.0, "reward": 0.5054529309272766, "reward_std": 0.13855137676000595, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249209139496088, "rewards/penalized_accuracy_reward/std": 0.04996836557984352, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09120866656303406, "step": 1184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1855.75, "completions/max_terminated_length": 1454.5, "completions/mean_length": 918.625, "completions/mean_terminated_length": 745.5542297363281, "completions/min_length": 212.75, "completions/min_terminated_length": 212.75, "epoch": 0.5925, "grad_norm": 0.4603999853134155, "kl": 0.045166015625, "learning_rate": 4.842626371469149e-07, "loss": 0.3663, "num_tokens": 92735333.0, "reward": 0.46834199130535126, "reward_std": 0.18773328140377998, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491310015320778, "rewards/penalized_accuracy_reward/std": 0.04996524006128311, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.23909321427345276, "step": 1185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1542.5, "completions/max_terminated_length": 1432.25, "completions/mean_length": 658.078125, "completions/mean_terminated_length": 639.3718872070312, "completions/min_length": 274.25, "completions/min_terminated_length": 274.25, "epoch": 0.593, "grad_norm": 0.48170286417007446, "kl": 0.04425048828125, "learning_rate": 4.834857649916752e-07, "loss": 0.0298, "num_tokens": 92786874.0, "reward": 0.6379989981651306, "reward_std": 0.24039429426193237, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0748588889837265, "rewards/penalized_accuracy_reward/std": 0.0998118668794632, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1186 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.234375, "completions/max_length": 2004.5, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1341.046875, "completions/mean_terminated_length": 1137.0104217529297, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.5935, "grad_norm": 0.23140285909175873, "kl": 0.04888916015625, "learning_rate": 4.827090954503308e-07, "loss": 0.1666, "num_tokens": 92882285.0, "reward": 0.4987744837999344, "reward_std": 0.30477727577090263, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037473177537322044, "rewards/penalized_accuracy_reward/std": 0.11821584403514862, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.23629537224769592, "step": 1187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1832.25, "completions/max_terminated_length": 1416.0, "completions/mean_length": 999.859375, "completions/mean_terminated_length": 813.2681427001953, "completions/min_length": 346.25, "completions/min_terminated_length": 346.25, "epoch": 0.594, "grad_norm": 0.3719935417175293, "kl": 0.05487060546875, "learning_rate": 4.819326308887513e-07, "loss": 0.3283, "num_tokens": 92956820.0, "reward": 0.44140625, "reward_std": 0.1110742911696434, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.222148597240448, "step": 1188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1834.25, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1165.234375, "completions/mean_terminated_length": 1040.4481658935547, "completions/min_length": 514.5, "completions/min_terminated_length": 514.5, "epoch": 0.5945, "grad_norm": 0.2610819339752197, "kl": 
0.02935791015625, "learning_rate": 4.811563736721829e-07, "loss": 0.1747, "num_tokens": 93040083.0, "reward": 0.45703125, "reward_std": 0.08932400494813919, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17864800989627838, "step": 1189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1524.75, "completions/max_terminated_length": 1335.75, "completions/mean_length": 815.53125, "completions/mean_terminated_length": 773.4709930419922, "completions/min_length": 408.25, "completions/min_terminated_length": 408.25, "epoch": 0.595, "grad_norm": 0.4158121347427368, "kl": 0.046173095703125, "learning_rate": 4.803803261652395e-07, "loss": 0.0956, "num_tokens": 93101765.0, "reward": 0.5821986198425293, "reward_std": 0.2948165014386177, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049888371489942074, "rewards/penalized_accuracy_reward/std": 0.1303216628730297, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.06834635883569717, "step": 1190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1326.5, "completions/max_terminated_length": 1264.75, "completions/mean_length": 691.359375, "completions/mean_terminated_length": 635.2463989257812, "completions/min_length": 263.5, "completions/min_terminated_length": 263.5, "epoch": 0.5955, "grad_norm": 0.5090142488479614, "kl": 0.043975830078125, "learning_rate": 4.79604490731896e-07, "loss": 0.1324, "num_tokens": 93156012.0, "reward": 0.4995469003915787, "reward_std": 0.1559875775128603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012468760833144188, "rewards/penalized_accuracy_reward/std": 
0.04987504705786705, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1124749705195427, "step": 1191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 733.734375, "completions/mean_terminated_length": 733.734375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.596, "grad_norm": 0.262821227312088, "kl": 0.05010986328125, "learning_rate": 4.788288697354824e-07, "loss": 0.0013, "num_tokens": 93210683.0, "reward": 0.5230080634355545, "reward_std": 0.10765725374221802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480593286454678, "rewards/penalized_accuracy_reward/std": 0.04992237687110901, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1592.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 869.1875, "completions/mean_terminated_length": 827.032470703125, "completions/min_length": 381.5, "completions/min_terminated_length": 381.5, "epoch": 0.5965, "grad_norm": 0.4306821823120117, "kl": 0.052337646484375, "learning_rate": 4.780534655386743e-07, "loss": 0.0979, "num_tokens": 93279927.0, "reward": 0.47265625, "reward_std": 0.05997256934642792, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": null, "rewards/penalized_accuracy_reward/std": null, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.11994514800608158, "step": 1193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1326.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 722.828125, "completions/mean_terminated_length": 665.3774108886719,
"completions/min_length": 308.25, "completions/min_terminated_length": 308.25, "epoch": 0.597, "grad_norm": 0.48509758710861206, "kl": 0.048095703125, "learning_rate": 4.772782805034876e-07, "loss": 0.1517, "num_tokens": 93335116.0, "reward": 0.6225392669439316, "reward_std": 0.37423184514045715, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07494151219725609, "rewards/penalized_accuracy_reward/std": 0.15762703120708466, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.11795559898018837, "step": 1194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1556.5, "completions/max_terminated_length": 1356.75, "completions/mean_length": 841.328125, "completions/mean_terminated_length": 744.3281402587891, "completions/min_length": 298.5, "completions/min_terminated_length": 298.5, "epoch": 0.5975, "grad_norm": 0.30000609159469604, "kl": 0.043487548828125, "learning_rate": 4.7650331699127013e-07, "loss": 0.1177, "num_tokens": 93399153.0, "reward": 0.7204199880361557, "reward_std": 0.48534201085567474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12485843244940042, "rewards/penalized_accuracy_reward/std": 0.23037000373005867, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12312964349985123, "step": 1195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1642.0, "completions/max_terminated_length": 1192.75, "completions/mean_length": 560.140625, "completions/mean_terminated_length": 489.95105743408203, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.598, "grad_norm": 0.692812442779541, "kl": 0.06280517578125, "learning_rate": 4.75728577362695e-07, "loss": 0.3669, "num_tokens": 93444458.0, "reward": 0.5108246505260468, "reward_std": 0.21410391479730606, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024943571537733078, "rewards/penalized_accuracy_reward/std": 0.068158820271492, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.19513116031885147, "step": 1196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1715.75, "completions/max_terminated_length": 1419.5, "completions/mean_length": 850.609375, "completions/mean_terminated_length": 814.4094085693359, "completions/min_length": 299.75, "completions/min_terminated_length": 299.75, "epoch": 0.5985, "grad_norm": 0.33206579089164734, "kl": 0.037933349609375, "learning_rate": 4.749540639777539e-07, "loss": 0.0586, "num_tokens": 93506337.0, "reward": 0.6131185293197632, "reward_std": 0.3061147928237915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062418632209300995, "rewards/penalized_accuracy_reward/std": 0.1392253041267395, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1517.75, "completions/max_terminated_length": 1315.75, "completions/mean_length": 740.421875, "completions/mean_terminated_length": 700.8385620117188, "completions/min_length": 355.75, "completions/min_terminated_length": 355.75, "epoch": 0.599, "grad_norm": 0.43108069896698, "kl": 0.05206298828125, "learning_rate": 4.741797791957489e-07, "loss": 0.0957, "num_tokens": 93563116.0, "reward": 0.703155130147934, "reward_std": 0.4512936696410179, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11231976095587015, "rewards/penalized_accuracy_reward/std": 0.21801185235381126, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11363695561885834, "step": 1198 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.0625, "completions/max_length": 1541.25, "completions/max_terminated_length": 1444.25, "completions/mean_length": 748.546875, "completions/mean_terminated_length": 668.7221374511719, "completions/min_length": 279.25, "completions/min_terminated_length": 279.25, "epoch": 0.5995, "grad_norm": 0.37550368905067444, "kl": 0.04522705078125, "learning_rate": 4.7340572537528547e-07, "loss": 0.1847, "num_tokens": 93618207.0, "reward": 1.0450846552848816, "reward_std": 0.26446752436459064, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2871907651424408, "rewards/penalized_accuracy_reward/std": 0.102403484590468, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1533849686384201, "step": 1199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1905.75, "completions/max_terminated_length": 1765.25, "completions/mean_length": 959.15625, "completions/mean_terminated_length": 925.3375244140625, "completions/min_length": 406.5, "completions/min_terminated_length": 406.5, "epoch": 0.6, "grad_norm": 0.36093294620513916, "kl": 0.030670166015625, "learning_rate": 4.7263190487426563e-07, "loss": 0.1443, "num_tokens": 93689001.0, "reward": 0.9110638946294785, "reward_std": 0.12836385081754997, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.212367856875062, "rewards/penalized_accuracy_reward/std": 0.04996699993353104, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1882.0, "completions/max_terminated_length": 1399.75, "completions/mean_length": 1011.859375, "completions/mean_terminated_length": 751.2943115234375, "completions/min_length": 269.5, "completions/min_terminated_length": 269.5, "epoch": 0.6005, 
"grad_norm": 1.1319817304611206, "kl": 0.07830810546875, "learning_rate": 4.7185832004988133e-07, "loss": 0.2096, "num_tokens": 93766992.0, "reward": 0.6366680860519409, "reward_std": 0.4551665186882019, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11227935180068016, "rewards/penalized_accuracy_reward/std": 0.18025685846805573, "rewards/tag_count_reward/mean": 0.82421875, "rewards/tag_count_reward/std": 0.25385861098766327, "step": 1201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.5, "completions/max_terminated_length": 1149.5, "completions/mean_length": 660.703125, "completions/mean_terminated_length": 660.703125, "completions/min_length": 297.25, "completions/min_terminated_length": 297.25, "epoch": 0.601, "grad_norm": 0.4994664788246155, "kl": 0.04486083984375, "learning_rate": 4.710849732586059e-07, "loss": 0.1044, "num_tokens": 93817645.0, "reward": 0.9971500039100647, "reward_std": 0.3733859360218048, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.24955155700445175, "rewards/penalized_accuracy_reward/std": 0.1827867180109024, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1192.25, "completions/max_terminated_length": 986.75, "completions/mean_length": 575.9375, "completions/mean_terminated_length": 462.2727355957031, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6015, "grad_norm": 0.2496490627527237, "kl": 0.052734375, "learning_rate": 4.703118668561875e-07, "loss": 0.0802, "num_tokens": 93865225.0, "reward": 0.49566251784563065, "reward_std": 0.11636865884065628, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012479697354137897, 
"rewards/penalized_accuracy_reward/std": 0.04991879314184189, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.08975879102945328, "step": 1203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1518.5, "completions/mean_length": 1164.703125, "completions/mean_terminated_length": 937.433349609375, "completions/min_length": 476.75, "completions/min_terminated_length": 476.75, "epoch": 0.602, "grad_norm": 0.49900978803634644, "kl": 0.052703857421875, "learning_rate": 4.6953900319764274e-07, "loss": 0.2598, "num_tokens": 93951174.0, "reward": 0.553677149116993, "reward_std": 0.4628095757216215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873854523524642, "rewards/penalized_accuracy_reward/std": 0.19539305567741394, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.3087031953036785, "step": 1204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1474.0, "completions/max_terminated_length": 1318.25, "completions/mean_length": 701.34375, "completions/mean_terminated_length": 680.2843780517578, "completions/min_length": 288.75, "completions/min_terminated_length": 288.75, "epoch": 0.6025, "grad_norm": 0.23838387429714203, "kl": 0.04595947265625, "learning_rate": 4.68766384637248e-07, "loss": 0.0586, "num_tokens": 94005820.0, "reward": 0.668910801410675, "reward_std": 0.21124333143234253, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08738510310649872, "rewards/penalized_accuracy_reward/std": 0.10233500599861145, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1732.75, "completions/max_terminated_length": 1205.25, 
"completions/mean_length": 750.96875, "completions/mean_terminated_length": 620.1273040771484, "completions/min_length": 257.25, "completions/min_terminated_length": 257.25, "epoch": 0.603, "grad_norm": 0.47832682728767395, "kl": 0.0645751953125, "learning_rate": 4.679940135285336e-07, "loss": 0.2514, "num_tokens": 94066106.0, "reward": 0.7373316287994385, "reward_std": 0.4464542791247368, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372205000370741, "rewards/penalized_accuracy_reward/std": 0.20215995237231255, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.179734468460083, "step": 1206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1828.25, "completions/max_terminated_length": 1551.25, "completions/mean_length": 1135.5, "completions/mean_terminated_length": 1036.8516540527344, "completions/min_length": 439.75, "completions/min_terminated_length": 439.75, "epoch": 0.6035, "grad_norm": 0.2783260643482208, "kl": 0.05792236328125, "learning_rate": 4.672218922242759e-07, "loss": 0.0744, "num_tokens": 94151898.0, "reward": 0.6563698649406433, "reward_std": 0.28449829295277596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11236461997032166, "rewards/penalized_accuracy_reward/std": 0.10234620422124863, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.18227311596274376, "step": 1207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1680.75, "completions/mean_length": 1058.1875, "completions/mean_terminated_length": 836.9561920166016, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.604, "grad_norm": 0.3984740972518921, "kl": 0.036590576171875, "learning_rate": 4.664500230764903e-07, "loss": 0.4129, "num_tokens": 
94227878.0, "reward": 0.4375, "reward_std": 0.1318471673876047, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2636943459510803, "step": 1208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1950.5, "completions/max_terminated_length": 1794.5, "completions/mean_length": 1086.625, "completions/mean_terminated_length": 1025.140365600586, "completions/min_length": 377.75, "completions/min_terminated_length": 377.75, "epoch": 0.6045, "grad_norm": 0.32399630546569824, "kl": 0.0321044921875, "learning_rate": 4.656784084364238e-07, "loss": 0.2204, "num_tokens": 94304430.0, "reward": 0.6570180654525757, "reward_std": 0.2480907365679741, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08729809522628784, "rewards/penalized_accuracy_reward/std": 0.10223309695720673, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.08724908530712128, "step": 1209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1242.75, "completions/max_terminated_length": 1054.5, "completions/mean_length": 603.078125, "completions/mean_terminated_length": 565.5089416503906, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.605, "grad_norm": 0.5686039328575134, "kl": 0.0498046875, "learning_rate": 4.6490705065454883e-07, "loss": 0.2257, "num_tokens": 94352419.0, "reward": 0.8860783576965332, "reward_std": 0.04005711439822335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1998751014471054, "rewards/penalized_accuracy_reward/std": 0.00011145394091727212, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, 
"step": 1210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1289.5, "completions/max_terminated_length": 1027.75, "completions/mean_length": 591.046875, "completions/mean_terminated_length": 568.6843872070312, "completions/min_length": 283.5, "completions/min_terminated_length": 283.5, "epoch": 0.6055, "grad_norm": 0.4321802854537964, "kl": 0.03619384765625, "learning_rate": 4.641359520805548e-07, "loss": 0.165, "num_tokens": 94402934.0, "reward": 0.7881364822387695, "reward_std": 0.20135599374771118, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14992761611938477, "rewards/penalized_accuracy_reward/std": 0.08939957618713379, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07966229319572449, "step": 1211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1627.75, "completions/max_terminated_length": 1480.75, "completions/mean_length": 841.671875, "completions/mean_terminated_length": 742.2549896240234, "completions/min_length": 374.25, "completions/min_terminated_length": 374.25, "epoch": 0.606, "grad_norm": 0.49234193563461304, "kl": 0.042022705078125, "learning_rate": 4.6336511506334177e-07, "loss": 0.2326, "num_tokens": 94464689.0, "reward": 0.4609375, "reward_std": 0.10090170428156853, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20180341601371765, "step": 1212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1634.0, "completions/max_terminated_length": 1378.25, "completions/mean_length": 813.21875, "completions/mean_terminated_length": 755.4090881347656, "completions/min_length": 369.25, "completions/min_terminated_length": 369.25, "epoch": 0.6065, 
"grad_norm": 0.5003955960273743, "kl": 0.037109375, "learning_rate": 4.6259454195101267e-07, "loss": 0.2458, "num_tokens": 94526863.0, "reward": 1.0316628217697144, "reward_std": 0.39367066137492657, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.274620458483696, "rewards/penalized_accuracy_reward/std": 0.17128750681877136, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10219132527709007, "step": 1213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1869.5, "completions/max_terminated_length": 1596.75, "completions/mean_length": 946.96875, "completions/mean_terminated_length": 859.8750457763672, "completions/min_length": 365.25, "completions/min_terminated_length": 365.25, "epoch": 0.607, "grad_norm": 0.4678648114204407, "kl": 0.050567626953125, "learning_rate": 4.61824235090867e-07, "loss": 0.2135, "num_tokens": 94599549.0, "reward": 0.4765625, "reward_std": 0.07261843979358673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14523687958717346, "step": 1214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1726.75, "completions/max_terminated_length": 1458.75, "completions/mean_length": 883.390625, "completions/mean_terminated_length": 771.9020385742188, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.6075, "grad_norm": 0.3996659815311432, "kl": 0.04248046875, "learning_rate": 4.6105419682939316e-07, "loss": 0.2444, "num_tokens": 94665238.0, "reward": 0.8394597172737122, "reward_std": 0.3103123791515827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1873079827055335, 
"rewards/penalized_accuracy_reward/std": 0.11813652142882347, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1865021139383316, "step": 1215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1692.0, "completions/max_terminated_length": 1428.25, "completions/mean_length": 908.84375, "completions/mean_terminated_length": 796.6497344970703, "completions/min_length": 444.75, "completions/min_terminated_length": 444.75, "epoch": 0.608, "grad_norm": 0.5147249102592468, "kl": 0.04071044921875, "learning_rate": 4.602844295122613e-07, "loss": 0.2399, "num_tokens": 94733196.0, "reward": 0.4800005257129669, "reward_std": 0.1814851388335228, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012461202219128609, "rewards/penalized_accuracy_reward/std": 0.049844808876514435, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.21929743513464928, "step": 1216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1905.0, "completions/max_terminated_length": 1624.5, "completions/mean_length": 882.734375, "completions/mean_terminated_length": 795.2294311523438, "completions/min_length": 317.5, "completions/min_terminated_length": 317.5, "epoch": 0.6085, "grad_norm": 0.44880351424217224, "kl": 0.03753662109375, "learning_rate": 4.59514935484316e-07, "loss": 0.3249, "num_tokens": 94797979.0, "reward": 0.4609375, "reward_std": 0.08753077685832977, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17506155371665955, "step": 1217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1394.75, "completions/max_terminated_length": 1375.25, "completions/mean_length": 695.71875, 
"completions/mean_terminated_length": 618.546875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.609, "grad_norm": 0.4053499400615692, "kl": 0.04327392578125, "learning_rate": 4.5874571708956953e-07, "loss": 0.2064, "num_tokens": 94851673.0, "reward": 0.470703125, "reward_std": 0.06536377221345901, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13072755187749863, "step": 1218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1892.25, "completions/max_terminated_length": 1732.25, "completions/mean_length": 1079.6875, "completions/mean_terminated_length": 915.0205535888672, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6095, "grad_norm": 0.3543846309185028, "kl": 0.05206298828125, "learning_rate": 4.579767766711944e-07, "loss": 0.1115, "num_tokens": 94929445.0, "reward": 0.6121222972869873, "reward_std": 0.40110914409160614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731114864349365, "rewards/penalized_accuracy_reward/std": 0.16368676722049713, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.16113058850169182, "step": 1219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1552.5, "completions/max_terminated_length": 1550.75, "completions/mean_length": 825.765625, "completions/mean_terminated_length": 796.3995666503906, "completions/min_length": 318.25, "completions/min_terminated_length": 318.25, "epoch": 0.61, "grad_norm": 0.32606634497642517, "kl": 0.031341552734375, "learning_rate": 4.572081165715167e-07, "loss": 0.1294, "num_tokens": 94991222.0, "reward": 0.482421875, "reward_std": 0.04681437276303768, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09362874925136566, "step": 1220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1802.75, "completions/max_terminated_length": 1376.5, "completions/mean_length": 841.3125, "completions/mean_terminated_length": 783.3958740234375, "completions/min_length": 360.75, "completions/min_terminated_length": 360.75, "epoch": 0.6105, "grad_norm": 0.38328689336776733, "kl": 0.044586181640625, "learning_rate": 4.5643973913200837e-07, "loss": 0.2208, "num_tokens": 95053338.0, "reward": 0.7742246687412262, "reward_std": 0.23266303725540638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1498076319694519, "rewards/penalized_accuracy_reward/std": 0.08932822197675705, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1351175718009472, "step": 1221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1355.75, "completions/max_terminated_length": 1294.5, "completions/mean_length": 713.28125, "completions/mean_terminated_length": 697.2385559082031, "completions/min_length": 245.75, "completions/min_terminated_length": 245.75, "epoch": 0.611, "grad_norm": 0.2305457442998886, "kl": 0.043701171875, "learning_rate": 4.556716466932803e-07, "loss": 0.0649, "num_tokens": 95109820.0, "reward": 0.4921875, "reward_std": 0.024206146597862244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.04841229319572449, "step": 1222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 
1878.25, "completions/max_terminated_length": 1625.0, "completions/mean_length": 1124.296875, "completions/mean_terminated_length": 1025.0609436035156, "completions/min_length": 543.5, "completions/min_terminated_length": 543.5, "epoch": 0.6115, "grad_norm": 0.3559255003929138, "kl": 0.040679931640625, "learning_rate": 4.549038415950751e-07, "loss": 0.0882, "num_tokens": 95192575.0, "reward": 0.5261100828647614, "reward_std": 0.22515235468745232, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037469103932380676, "rewards/penalized_accuracy_reward/std": 0.08055616170167923, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.1836090050637722, "step": 1223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1664.25, "completions/max_terminated_length": 1394.75, "completions/mean_length": 885.40625, "completions/mean_terminated_length": 824.499267578125, "completions/min_length": 453.75, "completions/min_terminated_length": 453.75, "epoch": 0.612, "grad_norm": 0.33585605025291443, "kl": 0.03515625, "learning_rate": 4.5413632617626054e-07, "loss": 0.2168, "num_tokens": 95258265.0, "reward": 0.5436223447322845, "reward_std": 0.22297713719308376, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743617236614227, "rewards/penalized_accuracy_reward/std": 0.08048535883426666, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16178976371884346, "step": 1224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.5, "completions/mean_length": 1137.015625, "completions/mean_terminated_length": 925.8084716796875, "completions/min_length": 443.5, "completions/min_terminated_length": 443.5, "epoch": 0.6125, "grad_norm": 0.4149802029132843, "kl": 0.042083740234375, "learning_rate": 
4.5336910277482155e-07, "loss": 0.3521, "num_tokens": 95339194.0, "reward": 0.42578125, "reward_std": 0.13588897138834, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2717779614031315, "step": 1225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1693.5, "completions/max_terminated_length": 1505.5, "completions/mean_length": 806.671875, "completions/mean_terminated_length": 787.4406280517578, "completions/min_length": 304.25, "completions/min_terminated_length": 304.25, "epoch": 0.613, "grad_norm": 0.5566760897636414, "kl": 0.03912353515625, "learning_rate": 4.526021737278537e-07, "loss": 0.164, "num_tokens": 95398037.0, "reward": 0.484375, "reward_std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 1226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1814.75, "completions/max_terminated_length": 1624.0, "completions/mean_length": 1145.25, "completions/mean_terminated_length": 920.4364776611328, "completions/min_length": 333.25, "completions/min_terminated_length": 333.25, "epoch": 0.6135, "grad_norm": 0.36493560671806335, "kl": 0.044586181640625, "learning_rate": 4.51835541371556e-07, "loss": 0.2011, "num_tokens": 95481237.0, "reward": 0.7501366138458252, "reward_std": 0.48593607544898987, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1621776819229126, "rewards/penalized_accuracy_reward/std": 0.19154760986566544, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.22961556911468506, "step": 1227 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1707.25, "completions/max_terminated_length": 1540.5, "completions/mean_length": 723.109375, "completions/mean_terminated_length": 702.6635437011719, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.614, "grad_norm": 0.37772783637046814, "kl": 0.038177490234375, "learning_rate": 4.5106920804122304e-07, "loss": 0.115, "num_tokens": 95534956.0, "reward": 0.6610243618488312, "reward_std": 0.23609137535095215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08734811842441559, "rewards/penalized_accuracy_reward/std": 0.10229166597127914, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08957063034176826, "step": 1228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1913.25, "completions/max_terminated_length": 1729.0, "completions/mean_length": 943.125, "completions/mean_terminated_length": 854.4958648681641, "completions/min_length": 383.5, "completions/min_terminated_length": 383.5, "epoch": 0.6145, "grad_norm": 0.46794766187667847, "kl": 0.040740966796875, "learning_rate": 4.503031760712397e-07, "loss": 0.3531, "num_tokens": 95602692.0, "reward": 0.458984375, "reward_std": 0.1054474376142025, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.2108948826789856, "step": 1229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1684.75, "completions/max_terminated_length": 1626.0, "completions/mean_length": 904.390625, "completions/mean_terminated_length": 890.6885681152344, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.615, "grad_norm": 
0.3238963782787323, "kl": 0.03466796875, "learning_rate": 4.4953744779507197e-07, "loss": 0.0282, "num_tokens": 95670077.0, "reward": 0.5862451493740082, "reward_std": 0.19722317159175873, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995851218700409, "rewards/penalized_accuracy_reward/std": 0.08936850726604462, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08082501962780952, "step": 1230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1754.5, "completions/max_terminated_length": 1418.25, "completions/mean_length": 945.5, "completions/mean_terminated_length": 868.9775390625, "completions/min_length": 364.5, "completions/min_terminated_length": 364.5, "epoch": 0.6155, "grad_norm": 0.26215338706970215, "kl": 0.034576416015625, "learning_rate": 4.4877202554526084e-07, "loss": 0.101, "num_tokens": 95740237.0, "reward": 0.5147920697927475, "reward_std": 0.20507432334125042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497415989637375, "rewards/penalized_accuracy_reward/std": 0.0682424008846283, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17558613047003746, "step": 1231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1600.0, "completions/max_terminated_length": 1231.75, "completions/mean_length": 718.5625, "completions/mean_terminated_length": 678.7198257446289, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.616, "grad_norm": 0.4839765131473541, "kl": 0.04241943359375, "learning_rate": 4.480069116534151e-07, "loss": 0.2343, "num_tokens": 95794609.0, "reward": 0.48828125, "reward_std": 0.046875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 
0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1463.5, "completions/max_terminated_length": 1420.5, "completions/mean_length": 726.109375, "completions/mean_terminated_length": 703.6343841552734, "completions/min_length": 257.5, "completions/min_terminated_length": 257.5, "epoch": 0.6165, "grad_norm": 0.5664671063423157, "kl": 0.047393798828125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0975, "num_tokens": 95850552.0, "reward": 0.5401146709918976, "reward_std": 0.17536143958568573, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494015172123909, "rewards/penalized_accuracy_reward/std": 0.06814946979284286, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1191.5, "completions/max_terminated_length": 1172.25, "completions/mean_length": 855.46875, "completions/mean_terminated_length": 781.0781555175781, "completions/min_length": 363.5, "completions/min_terminated_length": 363.5, "epoch": 0.617, "grad_norm": 0.5609845519065857, "kl": 0.037200927734375, "learning_rate": 4.4647761826535303e-07, "loss": 0.0592, "num_tokens": 95918022.0, "reward": 0.455078125, "reward_std": 0.05718260630965233, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1143652144819498, "step": 1234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1620.5, "completions/max_terminated_length": 1311.75, "completions/mean_length": 745.8125, "completions/mean_terminated_length": 687.4684753417969, "completions/min_length": 
210.25, "completions/min_terminated_length": 210.25, "epoch": 0.6175, "grad_norm": 0.4087170660495758, "kl": 0.03424072265625, "learning_rate": 4.457134434276293e-07, "loss": -0.0126, "num_tokens": 95975610.0, "reward": 0.6568788439035416, "reward_std": 0.401363804936409, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08722847606986761, "rewards/penalized_accuracy_reward/std": 0.19520386680960655, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10219132527709007, "step": 1235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1949.5, "completions/max_terminated_length": 1642.0, "completions/mean_length": 1025.671875, "completions/mean_terminated_length": 913.5232543945312, "completions/min_length": 320.75, "completions/min_terminated_length": 320.75, "epoch": 0.618, "grad_norm": 0.3596786856651306, "kl": 0.05059814453125, "learning_rate": 4.449495862648427e-07, "loss": 0.144, "num_tokens": 96049589.0, "reward": 0.5568153411149979, "reward_std": 0.3239019028842449, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04989204928278923, "rewards/penalized_accuracy_reward/std": 0.13633104413747787, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19515107572078705, "step": 1236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1519.5, "completions/mean_length": 1021.59375, "completions/mean_terminated_length": 865.2091369628906, "completions/min_length": 467.75, "completions/min_terminated_length": 467.75, "epoch": 0.6185, "grad_norm": 0.35122814774513245, "kl": 0.0404052734375, "learning_rate": 4.441860491038345e-07, "loss": 0.1781, "num_tokens": 96126379.0, "reward": 0.8816634863615036, "reward_std": 0.5211027599871159, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2123161144554615, "rewards/penalized_accuracy_reward/std": 0.23817338049411774, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18504608422517776, "step": 1237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1984.75, "completions/max_terminated_length": 1655.5, "completions/mean_length": 1044.359375, "completions/mean_terminated_length": 936.5702667236328, "completions/min_length": 400.75, "completions/min_terminated_length": 400.75, "epoch": 0.619, "grad_norm": 0.4023553431034088, "kl": 0.038848876953125, "learning_rate": 4.4342283427047164e-07, "loss": 0.2509, "num_tokens": 96202226.0, "reward": 0.46484375, "reward_std": 0.08642388880252838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17284777760505676, "step": 1238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1960.75, "completions/max_terminated_length": 1801.5, "completions/mean_length": 1038.703125, "completions/mean_terminated_length": 925.3015441894531, "completions/min_length": 353.75, "completions/min_terminated_length": 353.75, "epoch": 0.6195, "grad_norm": 0.37279754877090454, "kl": 0.036346435546875, "learning_rate": 4.4265994408963867e-07, "loss": 0.1736, "num_tokens": 96284639.0, "reward": 0.6929878294467926, "reward_std": 0.45917442813515663, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1248142197728157, "rewards/penalized_accuracy_reward/std": 0.1891392022371292, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.19505928456783295, "step": 1239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, 
"completions/max_length": 926.5, "completions/max_terminated_length": 926.5, "completions/mean_length": 537.921875, "completions/mean_terminated_length": 537.921875, "completions/min_length": 250.5, "completions/min_terminated_length": 250.5, "epoch": 0.62, "grad_norm": 0.45318037271499634, "kl": 0.0447998046875, "learning_rate": 4.418973808852313e-07, "loss": 0.0446, "num_tokens": 96326778.0, "reward": 0.9167413711547852, "reward_std": 0.5402057617902756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21227693185210228, "rewards/penalized_accuracy_reward/std": 0.26977764070034027, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 1240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 932.578125, "completions/mean_terminated_length": 858.2167053222656, "completions/min_length": 388.25, "completions/min_terminated_length": 388.25, "epoch": 0.6205, "grad_norm": 0.5851032137870789, "kl": 0.0496826171875, "learning_rate": 4.4113514698014953e-07, "loss": 0.232, "num_tokens": 96398335.0, "reward": 0.4917227327823639, "reward_std": 0.17866912484169006, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012462932616472244, "rewards/penalized_accuracy_reward/std": 0.04985173046588898, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2024081014096737, "step": 1241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1711.25, "completions/max_terminated_length": 1243.5, "completions/mean_length": 751.09375, "completions/mean_terminated_length": 703.0771026611328, "completions/min_length": 285.25, "completions/min_terminated_length": 285.25, "epoch": 0.621, "grad_norm": 0.46525511145591736, "kl": 0.045257568359375, 
"learning_rate": 4.403732446962899e-07, "loss": 0.1814, "num_tokens": 96455189.0, "reward": 0.5132658332586288, "reward_std": 0.12759744375944138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492290697991848, "rewards/penalized_accuracy_reward/std": 0.04996916651725769, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1938.0, "completions/max_terminated_length": 1420.75, "completions/mean_length": 961.453125, "completions/mean_terminated_length": 848.0986938476562, "completions/min_length": 307.75, "completions/min_terminated_length": 307.75, "epoch": 0.6215, "grad_norm": 0.46453359723091125, "kl": 0.04046630859375, "learning_rate": 4.3961167635453876e-07, "loss": 0.2331, "num_tokens": 96527026.0, "reward": 0.4956674575805664, "reward_std": 0.15786530636250973, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482170946896076, "rewards/penalized_accuracy_reward/std": 0.0499286875128746, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15830440074205399, "step": 1243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 906.515625, "completions/mean_terminated_length": 743.5935516357422, "completions/min_length": 311.5, "completions/min_terminated_length": 311.5, "epoch": 0.622, "grad_norm": 0.5289730429649353, "kl": 0.0594482421875, "learning_rate": 4.388504442747657e-07, "loss": 0.3288, "num_tokens": 96594227.0, "reward": 0.5472248643636703, "reward_std": 0.2806225121021271, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04997961223125458, "rewards/penalized_accuracy_reward/std": 
0.08940625190734863, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.24846871569752693, "step": 1244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.25, "completions/max_terminated_length": 1173.25, "completions/mean_length": 646.421875, "completions/mean_terminated_length": 646.421875, "completions/min_length": 277.25, "completions/min_terminated_length": 277.25, "epoch": 0.6225, "grad_norm": 0.35407841205596924, "kl": 0.0325927734375, "learning_rate": 4.3808955077581546e-07, "loss": 0.0586, "num_tokens": 96643118.0, "reward": 0.595916211605072, "reward_std": 0.19419288635253906, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04991123452782631, "rewards/penalized_accuracy_reward/std": 0.08928395062685013, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1846.75, "completions/max_terminated_length": 1309.0, "completions/mean_length": 761.96875, "completions/mean_terminated_length": 714.7541885375977, "completions/min_length": 377.25, "completions/min_terminated_length": 377.25, "epoch": 0.623, "grad_norm": 0.5614272952079773, "kl": 0.0394287109375, "learning_rate": 4.373289981755013e-07, "loss": 0.2997, "num_tokens": 96699676.0, "reward": 0.5342850387096405, "reward_std": 0.18710558488965034, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02495501935482025, "rewards/penalized_accuracy_reward/std": 0.0681900903582573, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1815.5, "completions/max_terminated_length": 1581.25, "completions/mean_length": 919.375, 
"completions/mean_terminated_length": 885.7073211669922, "completions/min_length": 390.5, "completions/min_terminated_length": 390.5, "epoch": 0.6235, "grad_norm": 0.2794892191886902, "kl": 0.037322998046875, "learning_rate": 4.365687887905988e-07, "loss": 0.1423, "num_tokens": 96769380.0, "reward": 0.486328125, "reward_std": 0.047643646597862244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1715.25, "completions/max_terminated_length": 1284.5, "completions/mean_length": 1135.3125, "completions/mean_terminated_length": 875.0166168212891, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.624, "grad_norm": 0.4428296983242035, "kl": 0.0614013671875, "learning_rate": 4.358089249368375e-07, "loss": 0.2714, "num_tokens": 96852312.0, "reward": 0.5978466272354126, "reward_std": 0.3008718080818653, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123998835682869, "rewards/penalized_accuracy_reward/std": 0.10237833112478256, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.19223029538989067, "step": 1248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1709.5, "completions/max_terminated_length": 1294.5, "completions/mean_length": 862.875, "completions/mean_terminated_length": 727.8260498046875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6245, "grad_norm": 0.38615214824676514, "kl": 0.043609619140625, "learning_rate": 4.350494089288943e-07, "loss": 0.182, "num_tokens": 96916528.0, "reward": 0.9141811281442642, "reward_std": 0.7076674252748489, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2246686890721321, "rewards/penalized_accuracy_reward/std": 0.347469761967659, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.14307335764169693, "step": 1249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1824.75, "completions/max_terminated_length": 1679.75, "completions/mean_length": 1011.921875, "completions/mean_terminated_length": 935.0229339599609, "completions/min_length": 407.25, "completions/min_terminated_length": 407.25, "epoch": 0.625, "grad_norm": 0.43813198804855347, "kl": 0.03155517578125, "learning_rate": 4.3429024308038686e-07, "loss": 0.126, "num_tokens": 96990507.0, "reward": 0.48200052976608276, "reward_std": 0.17752202972769737, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012484642677009106, "rewards/penalized_accuracy_reward/std": 0.04993857070803642, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1894841343164444, "step": 1250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1672.5, "completions/max_terminated_length": 1452.0, "completions/mean_length": 921.09375, "completions/mean_terminated_length": 890.8166809082031, "completions/min_length": 480.5, "completions/min_terminated_length": 480.5, "epoch": 0.6255, "grad_norm": 0.42665359377861023, "kl": 0.05487060546875, "learning_rate": 4.3353142970386557e-07, "loss": 0.1238, "num_tokens": 97064353.0, "reward": 0.5745603293180466, "reward_std": 0.3731037359684706, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049975477159023285, "rewards/penalized_accuracy_reward/std": 0.16822673380374908, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1385057382285595, "step": 1251 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1555.75, "completions/max_terminated_length": 1457.5, "completions/mean_length": 922.1875, "completions/mean_terminated_length": 889.0558166503906, "completions/min_length": 347.25, "completions/min_terminated_length": 347.25, "epoch": 0.626, "grad_norm": 0.27697378396987915, "kl": 0.0263671875, "learning_rate": 4.327729711108082e-07, "loss": 0.0725, "num_tokens": 97130909.0, "reward": 0.6320609748363495, "reward_std": 0.234563410282135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07481954991817474, "rewards/penalized_accuracy_reward/std": 0.09975944459438324, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1846.75, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1094.859375, "completions/mean_terminated_length": 995.3547058105469, "completions/min_length": 554.5, "completions/min_terminated_length": 554.5, "epoch": 0.6265, "grad_norm": 0.2160225361585617, "kl": 0.0388031005859375, "learning_rate": 4.3201486961161093e-07, "loss": 0.1031, "num_tokens": 97213476.0, "reward": 0.5089405328035355, "reward_std": 0.17448396235704422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497807890176773, "rewards/penalized_accuracy_reward/std": 0.06825310736894608, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.14264702796936035, "step": 1253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1471.25, "completions/max_terminated_length": 1392.25, "completions/mean_length": 868.03125, "completions/mean_terminated_length": 797.3137359619141, "completions/min_length": 352.25, "completions/min_terminated_length": 352.25, "epoch": 
0.627, "grad_norm": 0.22595493495464325, "kl": 0.040435791015625, "learning_rate": 4.312571275155823e-07, "loss": 0.0836, "num_tokens": 97277878.0, "reward": 0.5763096213340759, "reward_std": 0.20890728943049908, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049873560667037964, "rewards/penalized_accuracy_reward/std": 0.08921656012535095, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.10004068538546562, "step": 1254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1914.5, "completions/max_terminated_length": 1810.0, "completions/mean_length": 897.09375, "completions/mean_terminated_length": 858.4364776611328, "completions/min_length": 378.5, "completions/min_terminated_length": 378.5, "epoch": 0.6275, "grad_norm": 0.4106295108795166, "kl": 0.0380859375, "learning_rate": 4.304997471309361e-07, "loss": 0.0692, "num_tokens": 97342844.0, "reward": 0.6761561334133148, "reward_std": 0.3957499600946903, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09979681298136711, "rewards/penalized_accuracy_reward/std": 0.1760285496711731, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13996089063584805, "step": 1255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1470.5, "completions/max_terminated_length": 1457.75, "completions/mean_length": 847.875, "completions/mean_terminated_length": 834.3218994140625, "completions/min_length": 392.5, "completions/min_terminated_length": 392.5, "epoch": 0.628, "grad_norm": 0.4619412422180176, "kl": 0.04901123046875, "learning_rate": 4.297427307647844e-07, "loss": 0.0933, "num_tokens": 97406916.0, "reward": 0.8818670213222504, "reward_std": 0.423956960439682, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.1997225796803832, "rewards/penalized_accuracy_reward/std": 0.19863570109009743, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09179970622062683, "step": 1256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1604.25, "completions/max_terminated_length": 1600.75, "completions/mean_length": 781.21875, "completions/mean_terminated_length": 761.0718841552734, "completions/min_length": 302.25, "completions/min_terminated_length": 302.25, "epoch": 0.6285, "grad_norm": 0.4688666760921478, "kl": 0.03546142578125, "learning_rate": 4.2898608072313045e-07, "loss": 0.1847, "num_tokens": 97463618.0, "reward": 0.6707046627998352, "reward_std": 0.22010846436023712, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873054563999176, "rewards/penalized_accuracy_reward/std": 0.10224173218011856, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1585.25, "completions/max_terminated_length": 1515.0, "completions/mean_length": 1200.234375, "completions/mean_terminated_length": 1070.0531311035156, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.629, "grad_norm": 0.20772969722747803, "kl": 0.036712646484375, "learning_rate": 4.2822979931086144e-07, "loss": 0.123, "num_tokens": 97548689.0, "reward": 0.533447340130806, "reward_std": 0.2696927450597286, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049926795065402985, "rewards/penalized_accuracy_reward/std": 0.08931178599596024, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.1821383461356163, "step": 1258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1889.5, "completions/max_terminated_length": 
1779.25, "completions/mean_length": 1048.265625, "completions/mean_terminated_length": 993.0425720214844, "completions/min_length": 476.25, "completions/min_terminated_length": 476.25, "epoch": 0.6295, "grad_norm": 0.353431761264801, "kl": 0.037445068359375, "learning_rate": 4.2747388883174154e-07, "loss": 0.0864, "num_tokens": 97628370.0, "reward": 0.5553292036056519, "reward_std": 0.20532863214612007, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743021935224533, "rewards/penalized_accuracy_reward/std": 0.08047255873680115, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12654344737529755, "step": 1259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1532.25, "completions/max_terminated_length": 1418.5, "completions/mean_length": 864.34375, "completions/mean_terminated_length": 773.6761474609375, "completions/min_length": 377.5, "completions/min_terminated_length": 377.5, "epoch": 0.63, "grad_norm": 0.3196336030960083, "kl": 0.057861328125, "learning_rate": 4.267183515884054e-07, "loss": 0.0852, "num_tokens": 97695608.0, "reward": 0.5857421159744263, "reward_std": 0.33301275596022606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062402307987213135, "rewards/penalized_accuracy_reward/std": 0.1487097069621086, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.12080313265323639, "step": 1260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2008.5, "completions/max_terminated_length": 1857.0, "completions/mean_length": 1061.765625, "completions/mean_terminated_length": 942.0472106933594, "completions/min_length": 385.25, "completions/min_terminated_length": 385.25, "epoch": 0.6305, "grad_norm": 0.3595232665538788, "kl": 0.032440185546875, "learning_rate": 4.2596318988235037e-07, "loss": 0.296, 
"num_tokens": 97773721.0, "reward": 0.447265625, "reward_std": 0.1179349273443222, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2358698584139347, "step": 1261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1040.75, "completions/max_terminated_length": 945.25, "completions/mean_length": 582.078125, "completions/mean_terminated_length": 512.3385467529297, "completions/min_length": 232.75, "completions/min_terminated_length": 232.75, "epoch": 0.631, "grad_norm": 0.6660357713699341, "kl": 0.06036376953125, "learning_rate": 4.2520840601392996e-07, "loss": 0.0946, "num_tokens": 97820814.0, "reward": 0.474609375, "reward_std": 0.049738772213459015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.09947755187749863, "step": 1262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1791.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 836.578125, "completions/mean_terminated_length": 699.0867767333984, "completions/min_length": 249.5, "completions/min_terminated_length": 249.5, "epoch": 0.6315, "grad_norm": 0.5246081352233887, "kl": 0.043975830078125, "learning_rate": 4.2445400228234687e-07, "loss": 0.2604, "num_tokens": 97883235.0, "reward": 0.7337432950735092, "reward_std": 0.36776578053832054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13737944420427084, "rewards/penalized_accuracy_reward/std": 0.14976321533322334, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18942352384328842, "step": 
1263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1484.0, "completions/max_terminated_length": 1122.25, "completions/mean_length": 597.484375, "completions/mean_terminated_length": 571.6687545776367, "completions/min_length": 262.5, "completions/min_terminated_length": 262.5, "epoch": 0.632, "grad_norm": 0.26384228467941284, "kl": 0.0386962890625, "learning_rate": 4.2369998098564554e-07, "loss": 0.1459, "num_tokens": 97929986.0, "reward": 0.7611366510391235, "reward_std": 0.21012458205223083, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13740426301956177, "rewards/penalized_accuracy_reward/std": 0.09567607194185257, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07525964826345444, "step": 1264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1550.75, "completions/max_terminated_length": 1450.75, "completions/mean_length": 844.421875, "completions/mean_terminated_length": 826.0343933105469, "completions/min_length": 363.5, "completions/min_terminated_length": 363.5, "epoch": 0.6325, "grad_norm": 0.2750760316848755, "kl": 0.030609130859375, "learning_rate": 4.2294634442070553e-07, "loss": 0.0497, "num_tokens": 97992733.0, "reward": 0.486328125, "reward_std": 0.04824705049395561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09649410098791122, "step": 1265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2034.25, "completions/max_terminated_length": 1540.5, "completions/mean_length": 739.3125, "completions/mean_terminated_length": 656.2777099609375, "completions/min_length": 272.75, "completions/min_terminated_length": 272.75, "epoch": 0.633, "grad_norm": 
0.5235368013381958, "kl": 0.048736572265625, "learning_rate": 4.2219309488323487e-07, "loss": 0.5211, "num_tokens": 98048529.0, "reward": 1.0050484538078308, "reward_std": 0.4410785585641861, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622898519039154, "rewards/penalized_accuracy_reward/std": 0.19175013154745102, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1421622931957245, "step": 1266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.25, "completions/max_terminated_length": 1040.25, "completions/mean_length": 538.15625, "completions/mean_terminated_length": 538.15625, "completions/min_length": 264.5, "completions/min_terminated_length": 264.5, "epoch": 0.6335, "grad_norm": 0.5537868738174438, "kl": 0.05108642578125, "learning_rate": 4.214402346677619e-07, "loss": 0.006, "num_tokens": 98092795.0, "reward": 0.720661997795105, "reward_std": 0.22017082571983337, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11228412389755249, "rewards/penalized_accuracy_reward/std": 0.10227291285991669, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1836.75, "completions/max_terminated_length": 1554.75, "completions/mean_length": 971.40625, "completions/mean_terminated_length": 860.4198303222656, "completions/min_length": 387.5, "completions/min_terminated_length": 387.5, "epoch": 0.634, "grad_norm": 0.4376303255558014, "kl": 0.042755126953125, "learning_rate": 4.206877660676297e-07, "loss": 0.1466, "num_tokens": 98163797.0, "reward": 0.583862379193306, "reward_std": 0.3479172121733427, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062439002096652985, 
"rewards/penalized_accuracy_reward/std": 0.14878999441862106, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.16634058579802513, "step": 1268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1346.25, "completions/max_terminated_length": 1079.25, "completions/mean_length": 630.953125, "completions/mean_terminated_length": 583.8125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6345, "grad_norm": 0.41387057304382324, "kl": 0.04168701171875, "learning_rate": 4.1993569137498776e-07, "loss": 0.0498, "num_tokens": 98213650.0, "reward": 0.9320699572563171, "reward_std": 0.2814323753118515, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22482404857873917, "rewards/penalized_accuracy_reward/std": 0.13052178174257278, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09567352384328842, "step": 1269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1481.5, "completions/max_terminated_length": 1263.0, "completions/mean_length": 729.71875, "completions/mean_terminated_length": 691.1104278564453, "completions/min_length": 229.75, "completions/min_terminated_length": 229.75, "epoch": 0.635, "grad_norm": 0.4674742519855499, "kl": 0.036865234375, "learning_rate": 4.1918401288078633e-07, "loss": 0.03, "num_tokens": 98268208.0, "reward": 0.5401618480682373, "reward_std": 0.16226382553577423, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024963732808828354, "rewards/penalized_accuracy_reward/std": 0.06821390986442566, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1838.0, 
"completions/mean_length": 1295.5625, "completions/mean_terminated_length": 1187.230209350586, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "epoch": 0.6355, "grad_norm": 0.3259451985359192, "kl": 0.0386962890625, "learning_rate": 4.1843273287476854e-07, "loss": 0.1016, "num_tokens": 98361092.0, "reward": 0.5240155830979347, "reward_std": 0.3485107384622097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03739842027425766, "rewards/penalized_accuracy_reward/std": 0.14959368854761124, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.21603111550211906, "step": 1271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1207.5, "completions/max_terminated_length": 1121.75, "completions/mean_length": 542.296875, "completions/mean_terminated_length": 498.8013458251953, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.636, "grad_norm": 0.4868929386138916, "kl": 0.04498291015625, "learning_rate": 4.1768185364546326e-07, "loss": 0.1696, "num_tokens": 98405223.0, "reward": 0.8510788679122925, "reward_std": 0.447768896818161, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18725818395614624, "rewards/penalized_accuracy_reward/std": 0.2021978795528412, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11808442324399948, "step": 1272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1906.0, "completions/max_terminated_length": 1523.25, "completions/mean_length": 933.6875, "completions/mean_terminated_length": 810.2715911865234, "completions/min_length": 329.25, "completions/min_terminated_length": 329.25, "epoch": 0.6365, "grad_norm": 0.3890499174594879, "kl": 0.040069580078125, "learning_rate": 4.1693137748017915e-07, "loss": 0.2989, "num_tokens": 
98473667.0, "reward": 0.4878758043050766, "reward_std": 0.1769145503640175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0124925896525383, "rewards/penalized_accuracy_reward/std": 0.0499703586101532, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.15394768491387367, "step": 1273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1518.5, "completions/max_terminated_length": 1447.0, "completions/mean_length": 751.59375, "completions/mean_terminated_length": 735.0656433105469, "completions/min_length": 365.5, "completions/min_terminated_length": 365.5, "epoch": 0.637, "grad_norm": 0.5476288199424744, "kl": 0.036865234375, "learning_rate": 4.161813066649963e-07, "loss": 0.1581, "num_tokens": 98534377.0, "reward": 0.48046875, "reward_std": 0.06464069709181786, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1292813941836357, "step": 1274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1822.25, "completions/max_terminated_length": 1445.25, "completions/mean_length": 1014.96875, "completions/mean_terminated_length": 829.9873657226562, "completions/min_length": 411.5, "completions/min_terminated_length": 411.5, "epoch": 0.6375, "grad_norm": 0.3945067226886749, "kl": 0.051025390625, "learning_rate": 4.15431643484761e-07, "loss": 0.3318, "num_tokens": 98608151.0, "reward": 0.43359375, "reward_std": 0.11512119695544243, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.23024240881204605, "step": 1275 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1570.25, "completions/mean_length": 1099.890625, "completions/mean_terminated_length": 939.2119598388672, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.638, "grad_norm": 0.3228168785572052, "kl": 0.048095703125, "learning_rate": 4.146823902230772e-07, "loss": 0.1667, "num_tokens": 98688416.0, "reward": 0.5989343523979187, "reward_std": 0.42971283942461014, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0748578030616045, "rewards/penalized_accuracy_reward/std": 0.1891060583293438, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.24015576019883156, "step": 1276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2019.5, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1190.015625, "completions/mean_terminated_length": 1061.994369506836, "completions/min_length": 480.25, "completions/min_terminated_length": 480.25, "epoch": 0.6385, "grad_norm": 0.3298008143901825, "kl": 0.033966064453125, "learning_rate": 4.1393354916230005e-07, "loss": 0.2128, "num_tokens": 98775057.0, "reward": 0.5608015060424805, "reward_std": 0.26631321012973785, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993200674653053, "rewards/penalized_accuracy_reward/std": 0.08932112157344818, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17534197121858597, "step": 1277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1750.75, "completions/max_terminated_length": 1519.75, "completions/mean_length": 894.671875, "completions/mean_terminated_length": 759.7427215576172, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.639, 
"grad_norm": 0.27179744839668274, "kl": 0.030670166015625, "learning_rate": 4.1318512258352936e-07, "loss": 0.2265, "num_tokens": 98840524.0, "reward": 0.6778703033924103, "reward_std": 0.40465445071458817, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11237265542149544, "rewards/penalized_accuracy_reward/std": 0.18042610585689545, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1009209007024765, "step": 1278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1827.0, "completions/max_terminated_length": 1655.25, "completions/mean_length": 986.984375, "completions/mean_terminated_length": 877.3681182861328, "completions/min_length": 378.75, "completions/min_terminated_length": 378.75, "epoch": 0.6395, "grad_norm": 0.31474238634109497, "kl": 0.028350830078125, "learning_rate": 4.124371127666024e-07, "loss": 0.1519, "num_tokens": 98911563.0, "reward": 0.733289361000061, "reward_std": 0.4558956455439329, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13715249300003052, "rewards/penalized_accuracy_reward/std": 0.19524968415498734, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.14339008927345276, "step": 1279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1856.25, "completions/max_terminated_length": 1408.75, "completions/mean_length": 771.125, "completions/mean_terminated_length": 668.5979309082031, "completions/min_length": 267.5, "completions/min_terminated_length": 267.5, "epoch": 0.64, "grad_norm": 0.47147923707962036, "kl": 0.03656005859375, "learning_rate": 4.1168952199008677e-07, "loss": 0.2998, "num_tokens": 98970707.0, "reward": 0.6645678877830505, "reward_std": 0.2767983376979828, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.09986206889152527, "rewards/penalized_accuracy_reward/std": 0.10313712060451508, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1734122931957245, "step": 1280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1588.5, "completions/max_terminated_length": 1314.75, "completions/mean_length": 851.40625, "completions/mean_terminated_length": 814.6062774658203, "completions/min_length": 437.75, "completions/min_terminated_length": 437.75, "epoch": 0.6405, "grad_norm": 0.36119362711906433, "kl": 0.038543701171875, "learning_rate": 4.1094235253127374e-07, "loss": 0.0685, "num_tokens": 99035133.0, "reward": 0.5264946669340134, "reward_std": 0.19426202774047852, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024966079741716385, "rewards/penalized_accuracy_reward/std": 0.06822031736373901, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15404859744012356, "step": 1281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1676.5, "completions/max_terminated_length": 1612.25, "completions/mean_length": 836.796875, "completions/mean_terminated_length": 787.8269348144531, "completions/min_length": 317.5, "completions/min_terminated_length": 317.5, "epoch": 0.641, "grad_norm": 0.3594115972518921, "kl": 0.03851318359375, "learning_rate": 4.101956066661708e-07, "loss": 0.1045, "num_tokens": 99096336.0, "reward": 0.5725347995758057, "reward_std": 0.2248990684747696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993927478790283, "rewards/penalized_accuracy_reward/std": 0.08933412283658981, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1377599686384201, "step": 1282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1904.5, 
"completions/max_terminated_length": 1582.5, "completions/mean_length": 924.0, "completions/mean_terminated_length": 834.3385620117188, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.6415, "grad_norm": 0.39047694206237793, "kl": 0.034088134765625, "learning_rate": 4.0944928666949527e-07, "loss": 0.2391, "num_tokens": 99164240.0, "reward": 0.8587172031402588, "reward_std": 0.10916285589337349, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1998664140701294, "rewards/penalized_accuracy_reward/std": 6.328168819891289e-05, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21833455190062523, "step": 1283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1938.75, "completions/max_terminated_length": 1474.0, "completions/mean_length": 870.703125, "completions/mean_terminated_length": 813.1708679199219, "completions/min_length": 341.5, "completions/min_terminated_length": 341.5, "epoch": 0.642, "grad_norm": 0.42679527401924133, "kl": 0.04815673828125, "learning_rate": 4.0870339481466774e-07, "loss": 0.2098, "num_tokens": 99230525.0, "reward": 0.6224841773509979, "reward_std": 0.32929718121886253, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491396553814411, "rewards/penalized_accuracy_reward/std": 0.14558646827936172, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15673990920186043, "step": 1284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1767.25, "completions/max_terminated_length": 1682.0, "completions/mean_length": 1051.75, "completions/mean_terminated_length": 1001.4754943847656, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.6425, "grad_norm": 0.30420297384262085, "kl": 0.033843994140625, "learning_rate": 
4.079579333738039e-07, "loss": 0.1037, "num_tokens": 99305629.0, "reward": 0.6302964091300964, "reward_std": 0.3231951966881752, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491382583975792, "rewards/penalized_accuracy_reward/std": 0.14551942050457, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11245574057102203, "step": 1285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1700.25, "completions/mean_length": 1043.515625, "completions/mean_terminated_length": 900.0178985595703, "completions/min_length": 386.5, "completions/min_terminated_length": 386.5, "epoch": 0.643, "grad_norm": 0.35260534286499023, "kl": 0.043487548828125, "learning_rate": 4.0721290461770863e-07, "loss": 0.1524, "num_tokens": 99383358.0, "reward": 0.6279024630784988, "reward_std": 0.44887444004416466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08738873805850744, "rewards/penalized_accuracy_reward/std": 0.195448849350214, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.24708620086312294, "step": 1286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1938.0, "completions/max_terminated_length": 1815.75, "completions/mean_length": 1017.8125, "completions/mean_terminated_length": 891.2131195068359, "completions/min_length": 355.75, "completions/min_terminated_length": 355.75, "epoch": 0.6435, "grad_norm": 0.33343690633773804, "kl": 0.04046630859375, "learning_rate": 4.064683108158685e-07, "loss": 0.24, "num_tokens": 99457874.0, "reward": 0.5070025324821472, "reward_std": 0.20280104130506516, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498563751578331, "rewards/penalized_accuracy_reward/std": 0.06827376782894135, 
"rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17941861599683762, "step": 1287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1518.25, "completions/max_terminated_length": 1505.0, "completions/mean_length": 867.5625, "completions/mean_terminated_length": 854.8197937011719, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.644, "grad_norm": 0.16397817432880402, "kl": 0.04681396484375, "learning_rate": 4.057241542364457e-07, "loss": 0.0453, "num_tokens": 99523158.0, "reward": 0.4921875, "reward_std": 0.024206146597862244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.04841229319572449, "step": 1288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1745.5, "completions/max_terminated_length": 1716.75, "completions/mean_length": 847.546875, "completions/mean_terminated_length": 774.1428833007812, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.6445, "grad_norm": 0.4005764126777649, "kl": 0.04644775390625, "learning_rate": 4.0498043714627006e-07, "loss": 0.1529, "num_tokens": 99588617.0, "reward": 0.4765625, "reward_std": 0.05806645750999451, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11613291501998901, "step": 1289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1475.25, "completions/max_terminated_length": 1472.5, "completions/mean_length": 821.21875, "completions/mean_terminated_length": 766.8645935058594, "completions/min_length": 350.0, 
"completions/min_terminated_length": 350.0, "epoch": 0.645, "grad_norm": 0.2804063856601715, "kl": 0.029510498046875, "learning_rate": 4.042371618108329e-07, "loss": 0.0674, "num_tokens": 99648935.0, "reward": 1.1529284715652466, "reward_std": 0.31597311422228813, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3372064232826233, "rewards/penalized_accuracy_reward/std": 0.13930721953511238, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.07471735030412674, "step": 1290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1787.25, "completions/max_terminated_length": 1304.0, "completions/mean_length": 716.90625, "completions/mean_terminated_length": 633.0327453613281, "completions/min_length": 297.25, "completions/min_terminated_length": 297.25, "epoch": 0.6455, "grad_norm": 0.43929558992385864, "kl": 0.034423828125, "learning_rate": 4.034943304942796e-07, "loss": 0.3732, "num_tokens": 99702113.0, "reward": 0.5495464205741882, "reward_std": 0.22653301060199738, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037468522787094116, "rewards/penalized_accuracy_reward/std": 0.08055492490530014, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13084635883569717, "step": 1291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1623.75, "completions/max_terminated_length": 1605.25, "completions/mean_length": 831.78125, "completions/mean_terminated_length": 777.3810119628906, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.646, "grad_norm": 0.4589586555957794, "kl": 0.04864501953125, "learning_rate": 4.027519454594033e-07, "loss": 0.1238, "num_tokens": 99766947.0, "reward": 0.8665045201778412, "reward_std": 0.30480803176760674, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1998538337647915, "rewards/penalized_accuracy_reward/std": 0.13650806993246078, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1456152144819498, "step": 1292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1770.0, "completions/max_terminated_length": 1221.75, "completions/mean_length": 754.65625, "completions/mean_terminated_length": 684.7769470214844, "completions/min_length": 296.5, "completions/min_terminated_length": 296.5, "epoch": 0.6465, "grad_norm": 0.5714460611343384, "kl": 0.039825439453125, "learning_rate": 4.020100089676376e-07, "loss": 0.2168, "num_tokens": 99826317.0, "reward": 0.4916926622390747, "reward_std": 0.18022658675909042, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01244788896292448, "rewards/penalized_accuracy_reward/std": 0.04979155585169792, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16128693893551826, "step": 1293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1490.25, "completions/max_terminated_length": 1264.25, "completions/mean_length": 872.328125, "completions/mean_terminated_length": 726.9730987548828, "completions/min_length": 279.25, "completions/min_terminated_length": 279.25, "epoch": 0.647, "grad_norm": 0.3457006514072418, "kl": 0.05133056640625, "learning_rate": 4.012685232790497e-07, "loss": 0.2279, "num_tokens": 99891074.0, "reward": 0.6068501174449921, "reward_std": 0.26391077414155006, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490943372249603, "rewards/penalized_accuracy_reward/std": 0.09987924993038177, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.12830451875925064, "step": 1294 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.109375, "completions/max_length": 1750.0, "completions/max_terminated_length": 1632.75, "completions/mean_length": 1006.8125, "completions/mean_terminated_length": 903.1254119873047, "completions/min_length": 383.75, "completions/min_terminated_length": 383.75, "epoch": 0.6475, "grad_norm": 0.5284269452095032, "kl": 0.07000732421875, "learning_rate": 4.005274906523336e-07, "loss": 0.2249, "num_tokens": 99967142.0, "reward": 0.7085640132427216, "reward_std": 0.28707488626241684, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12478981912136078, "rewards/penalized_accuracy_reward/std": 0.0998319759964943, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1748218685388565, "step": 1295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1628.75, "completions/max_terminated_length": 1609.0, "completions/mean_length": 772.125, "completions/mean_terminated_length": 756.5302124023438, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.648, "grad_norm": 0.30457374453544617, "kl": 0.04351806640625, "learning_rate": 3.9978691334480306e-07, "loss": 0.1126, "num_tokens": 100026302.0, "reward": 0.6401263773441315, "reward_std": 0.22699500620365143, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07494600117206573, "rewards/penalized_accuracy_reward/std": 0.09992799907922745, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1736.25, "completions/mean_length": 1306.640625, "completions/mean_terminated_length": 1143.4778137207031, "completions/min_length": 480.75, "completions/min_terminated_length": 480.75, "epoch": 0.6485, "grad_norm": 
0.35143792629241943, "kl": 0.040924072265625, "learning_rate": 3.9904679361238526e-07, "loss": 0.1498, "num_tokens": 100119175.0, "reward": 0.4834805130958557, "reward_std": 0.23956365883350372, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024943383410573006, "rewards/penalized_accuracy_reward/std": 0.06815831363201141, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2448965087532997, "step": 1297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1696.5, "completions/max_terminated_length": 1212.5, "completions/mean_length": 950.109375, "completions/mean_terminated_length": 737.3097381591797, "completions/min_length": 283.5, "completions/min_terminated_length": 283.5, "epoch": 0.649, "grad_norm": 0.42746874690055847, "kl": 0.0506591796875, "learning_rate": 3.9830713370961313e-07, "loss": 0.1117, "num_tokens": 100187198.0, "reward": 0.9828902930021286, "reward_std": 0.6575552150607109, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.27464827708899975, "rewards/penalized_accuracy_reward/std": 0.3065078780055046, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.22825736552476883, "step": 1298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1795.75, "completions/max_terminated_length": 1069.75, "completions/mean_length": 722.671875, "completions/mean_terminated_length": 656.6073150634766, "completions/min_length": 313.5, "completions/min_terminated_length": 313.5, "epoch": 0.6495, "grad_norm": 0.6544716954231262, "kl": 0.05291748046875, "learning_rate": 3.975679358896189e-07, "loss": 0.3042, "num_tokens": 100243881.0, "reward": 0.4765625, "reward_std": 0.08197538927197456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16395078226923943, "step": 1299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1459.0, "completions/max_terminated_length": 1305.5, "completions/mean_length": 824.875, "completions/mean_terminated_length": 793.6205444335938, "completions/min_length": 358.25, "completions/min_terminated_length": 358.25, "epoch": 0.65, "grad_norm": 0.3301491141319275, "kl": 0.045135498046875, "learning_rate": 3.968292024041275e-07, "loss": 0.0547, "num_tokens": 100304641.0, "reward": 0.6841401904821396, "reward_std": 0.32681888341903687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988258872181177, "rewards/penalized_accuracy_reward/std": 0.1522655300796032, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08004852384328842, "step": 1300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1791.0, "completions/max_terminated_length": 1030.25, "completions/mean_length": 669.1875, "completions/mean_terminated_length": 579.1205520629883, "completions/min_length": 237.25, "completions/min_terminated_length": 237.25, "epoch": 0.6505, "grad_norm": 0.6442942023277283, "kl": 0.05029296875, "learning_rate": 3.9609093550344907e-07, "loss": 0.4154, "num_tokens": 100358189.0, "reward": 0.5935982465744019, "reward_std": 0.26705826073884964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242411956191063, "rewards/penalized_accuracy_reward/std": 0.09562648087739944, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18760817870497704, "step": 1301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1766.5, "completions/max_terminated_length": 1307.25, "completions/mean_length": 
830.78125, "completions/mean_terminated_length": 735.3134002685547, "completions/min_length": 347.5, "completions/min_terminated_length": 347.5, "epoch": 0.651, "grad_norm": 0.4784785807132721, "kl": 0.0479736328125, "learning_rate": 3.953531374364728e-07, "loss": 0.3386, "num_tokens": 100418143.0, "reward": 0.4898194968700409, "reward_std": 0.1832104418426752, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248787622898817, "rewards/penalized_accuracy_reward/std": 0.04995150864124298, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1666148491203785, "step": 1302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1740.25, "completions/max_terminated_length": 1314.25, "completions/mean_length": 878.703125, "completions/mean_terminated_length": 753.3425750732422, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6515, "grad_norm": 0.5206872820854187, "kl": 0.06207275390625, "learning_rate": 3.946158104506594e-07, "loss": 0.3002, "num_tokens": 100487484.0, "reward": 0.48201437294483185, "reward_std": 0.19687895476818085, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249156054109335, "rewards/penalized_accuracy_reward/std": 0.0499662421643734, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1938929334282875, "step": 1303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1627.5, "completions/max_terminated_length": 1546.75, "completions/mean_length": 957.796875, "completions/mean_terminated_length": 895.7204895019531, "completions/min_length": 532.75, "completions/min_terminated_length": 532.75, "epoch": 0.652, "grad_norm": 0.3416632413864136, "kl": 0.03521728515625, "learning_rate": 3.938789567920349e-07, "loss": 0.0638, "num_tokens": 100562047.0, "reward": 
0.5186741352081299, "reward_std": 0.18901651352643967, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024962063878774643, "rewards/penalized_accuracy_reward/std": 0.06820934265851974, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.10519563034176826, "step": 1304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1816.0, "completions/max_terminated_length": 1429.0, "completions/mean_length": 1033.296875, "completions/mean_terminated_length": 955.1625366210938, "completions/min_length": 403.75, "completions/min_terminated_length": 403.75, "epoch": 0.6525, "grad_norm": 0.21687516570091248, "kl": 0.038787841796875, "learning_rate": 3.931425787051832e-07, "loss": 0.1551, "num_tokens": 100635442.0, "reward": 0.54562808573246, "reward_std": 0.26763221248984337, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746248688548803, "rewards/penalized_accuracy_reward/std": 0.11818495392799377, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1492154598236084, "step": 1305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1629.75, "completions/max_terminated_length": 1152.75, "completions/mean_length": 973.0625, "completions/mean_terminated_length": 685.5539855957031, "completions/min_length": 283.25, "completions/min_terminated_length": 283.25, "epoch": 0.653, "grad_norm": 0.47120630741119385, "kl": 0.05303955078125, "learning_rate": 3.924066784332396e-07, "loss": 0.2524, "num_tokens": 100706678.0, "reward": 0.8043109178543091, "reward_std": 0.1934831254184246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18731170892715454, "rewards/penalized_accuracy_reward/std": 0.049949828535318375, "rewards/tag_count_reward/mean": 0.859375, 
"rewards/tag_count_reward/std": 0.18716693669557571, "step": 1306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1535.0, "completions/max_terminated_length": 1457.25, "completions/mean_length": 846.390625, "completions/mean_terminated_length": 788.6614685058594, "completions/min_length": 333.25, "completions/min_terminated_length": 333.25, "epoch": 0.6535, "grad_norm": 0.3619089424610138, "kl": 0.06134033203125, "learning_rate": 3.9167125821788416e-07, "loss": 0.0956, "num_tokens": 100770847.0, "reward": 0.5205516368150711, "reward_std": 0.18649474903941154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024924255907535553, "rewards/penalized_accuracy_reward/std": 0.0681060403585434, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11422448605298996, "step": 1307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1730.0, "completions/max_terminated_length": 1305.0, "completions/mean_length": 936.171875, "completions/mean_terminated_length": 749.6111907958984, "completions/min_length": 407.75, "completions/min_terminated_length": 407.75, "epoch": 0.654, "grad_norm": 0.4458838105201721, "kl": 0.054595947265625, "learning_rate": 3.909363202993343e-07, "loss": 0.314, "num_tokens": 100843450.0, "reward": 0.5872556045651436, "reward_std": 0.3629651665687561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07487780507653952, "rewards/penalized_accuracy_reward/std": 0.14554450288414955, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.24318470992147923, "step": 1308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1940.0, "completions/max_terminated_length": 1878.5, "completions/mean_length": 1158.203125, "completions/mean_terminated_length": 1100.194320678711, 
"completions/min_length": 510.5, "completions/min_terminated_length": 510.5, "epoch": 0.6545, "grad_norm": 0.2520367205142975, "kl": 0.031158447265625, "learning_rate": 3.902018669163384e-07, "loss": 0.0896, "num_tokens": 100925895.0, "reward": 0.5956085622310638, "reward_std": 0.321606881916523, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06245271861553192, "rewards/penalized_accuracy_reward/std": 0.1393447034060955, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12426739931106567, "step": 1309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1832.75, "completions/max_terminated_length": 1483.75, "completions/mean_length": 983.375, "completions/mean_terminated_length": 871.8908081054688, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.655, "grad_norm": 0.3848455846309662, "kl": 0.059478759765625, "learning_rate": 3.894679003061686e-07, "loss": 0.1151, "num_tokens": 101002719.0, "reward": 0.6875730454921722, "reward_std": 0.35992975533008575, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123412111774087, "rewards/penalized_accuracy_reward/std": 0.15308094024658203, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18617841601371765, "step": 1310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1699.5, "completions/max_terminated_length": 1448.5, "completions/mean_length": 908.984375, "completions/mean_terminated_length": 816.1092681884766, "completions/min_length": 316.25, "completions/min_terminated_length": 316.25, "epoch": 0.6555, "grad_norm": 0.424098938703537, "kl": 0.0467071533203125, "learning_rate": 3.8873442270461485e-07, "loss": 0.2738, "num_tokens": 101070846.0, "reward": 0.7104327082633972, "reward_std": 0.29332852363586426, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12474760413169861, "rewards/penalized_accuracy_reward/std": 0.09979810565710068, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1874646283686161, "step": 1311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1723.75, "completions/max_terminated_length": 1693.25, "completions/mean_length": 817.890625, "completions/mean_terminated_length": 801.8416748046875, "completions/min_length": 258.75, "completions/min_terminated_length": 258.75, "epoch": 0.656, "grad_norm": 0.42802366614341736, "kl": 0.043060302734375, "learning_rate": 3.88001436345977e-07, "loss": 0.1134, "num_tokens": 101131767.0, "reward": 0.5803912132978439, "reward_std": 0.23865669965744019, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996123164892197, "rewards/penalized_accuracy_reward/std": 0.089373379945755, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11981988325715065, "step": 1312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1792.75, "completions/max_terminated_length": 1389.25, "completions/mean_length": 894.140625, "completions/mean_terminated_length": 799.4793701171875, "completions/min_length": 355.5, "completions/min_terminated_length": 355.5, "epoch": 0.6565, "grad_norm": 0.3721417188644409, "kl": 0.05712890625, "learning_rate": 3.872689434630585e-07, "loss": 0.1977, "num_tokens": 101199408.0, "reward": 0.7411488592624664, "reward_std": 0.4765331894159317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13717597257345915, "rewards/penalized_accuracy_reward/std": 0.22102967277169228, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18758258782327175, "step": 1313 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1975.25, "completions/max_terminated_length": 1924.5, "completions/mean_length": 1187.9375, "completions/mean_terminated_length": 1081.8638610839844, "completions/min_length": 404.75, "completions/min_terminated_length": 404.75, "epoch": 0.657, "grad_norm": 0.2615261673927307, "kl": 0.039398193359375, "learning_rate": 3.8653694628715984e-07, "loss": 0.162, "num_tokens": 101284028.0, "reward": 0.455078125, "reward_std": 0.09675251692533493, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.19350504502654076, "step": 1314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 991.640625, "completions/mean_terminated_length": 744.0433502197266, "completions/min_length": 342.25, "completions/min_terminated_length": 342.25, "epoch": 0.6575, "grad_norm": 0.39333871006965637, "kl": 0.051239013671875, "learning_rate": 3.8580544704807117e-07, "loss": 0.4359, "num_tokens": 101357925.0, "reward": 0.7544306218624115, "reward_std": 0.4994998574256897, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16237156093120575, "rewards/penalized_accuracy_reward/std": 0.19885912537574768, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.28277548402547836, "step": 1315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1620.0, "completions/max_terminated_length": 1088.25, "completions/mean_length": 759.546875, "completions/mean_terminated_length": 595.0906372070312, "completions/min_length": 266.5, "completions/min_terminated_length": 266.5, "epoch": 0.658, "grad_norm": 
0.42303815484046936, "kl": 0.0582275390625, "learning_rate": 3.850744479740663e-07, "loss": 0.1625, "num_tokens": 101414952.0, "reward": 0.5589115768671036, "reward_std": 0.3592389076948166, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996359907090664, "rewards/penalized_accuracy_reward/std": 0.16818731278181076, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.140625, "step": 1316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1800.75, "completions/max_terminated_length": 1221.5, "completions/mean_length": 760.171875, "completions/mean_terminated_length": 681.2297973632812, "completions/min_length": 348.25, "completions/min_terminated_length": 348.25, "epoch": 0.6585, "grad_norm": 0.6263769865036011, "kl": 0.06353759765625, "learning_rate": 3.843439512918949e-07, "loss": 0.3363, "num_tokens": 101473763.0, "reward": 0.45703125, "reward_std": 0.097872793674469, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.195745587348938, "step": 1317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1394.25, "completions/max_terminated_length": 1157.25, "completions/mean_length": 693.0, "completions/mean_terminated_length": 571.0464172363281, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.659, "grad_norm": 0.6990945339202881, "kl": 0.0609130859375, "learning_rate": 3.8361395922677687e-07, "loss": 0.3541, "num_tokens": 101528387.0, "reward": 0.453125, "reward_std": 0.09585395082831383, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 
0.90625, "rewards/tag_count_reward/std": 0.19170790165662766, "step": 1318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1704.25, "completions/max_terminated_length": 1324.25, "completions/mean_length": 906.953125, "completions/mean_terminated_length": 705.8782577514648, "completions/min_length": 275.5, "completions/min_terminated_length": 275.5, "epoch": 0.6595, "grad_norm": 0.33419331908226013, "kl": 0.0655517578125, "learning_rate": 3.8288447400239443e-07, "loss": 0.2226, "num_tokens": 101597168.0, "reward": 0.5471183955669403, "reward_std": 0.31873543933033943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04992638621479273, "rewards/penalized_accuracy_reward/std": 0.13045156374573708, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.20168789848685265, "step": 1319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1527.75, "completions/max_terminated_length": 1115.5, "completions/mean_length": 694.21875, "completions/mean_terminated_length": 651.9437561035156, "completions/min_length": 262.25, "completions/min_terminated_length": 262.25, "epoch": 0.66, "grad_norm": 0.544408917427063, "kl": 0.0430908203125, "learning_rate": 3.82155497840886e-07, "loss": 0.241, "num_tokens": 101655262.0, "reward": 0.5323831737041473, "reward_std": 0.1948378048837185, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498064935207367, "rewards/penalized_accuracy_reward/std": 0.06826013326644897, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11663510836660862, "step": 1320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1621.0, "completions/max_terminated_length": 1401.25, "completions/mean_length": 825.125, "completions/mean_terminated_length": 773.5495910644531, 
"completions/min_length": 358.25, "completions/min_terminated_length": 358.25, "epoch": 0.6605, "grad_norm": 0.2588162422180176, "kl": 0.03106689453125, "learning_rate": 3.8142703296283953e-07, "loss": 0.1375, "num_tokens": 101720310.0, "reward": 0.486328125, "reward_std": 0.041479695588350296, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08295939117670059, "step": 1321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1679.75, "completions/max_terminated_length": 1476.75, "completions/mean_length": 961.484375, "completions/mean_terminated_length": 862.3545837402344, "completions/min_length": 373.5, "completions/min_terminated_length": 373.5, "epoch": 0.661, "grad_norm": 0.39392659068107605, "kl": 0.0462646484375, "learning_rate": 3.806990815872855e-07, "loss": 0.1922, "num_tokens": 101790917.0, "reward": 0.8393965661525726, "reward_std": 0.3083457909524441, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18727640435099602, "rewards/penalized_accuracy_reward/std": 0.11805014684796333, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17125719040632248, "step": 1322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1382.5, "completions/max_terminated_length": 1166.75, "completions/mean_length": 748.6875, "completions/mean_terminated_length": 696.8086547851562, "completions/min_length": 300.25, "completions/min_terminated_length": 300.25, "epoch": 0.6615, "grad_norm": 0.5571526885032654, "kl": 0.078582763671875, "learning_rate": 3.7997164593168983e-07, "loss": 0.1312, "num_tokens": 101849153.0, "reward": 0.6397359073162079, "reward_std": 0.361524973064661, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08744607213884592, "rewards/penalized_accuracy_reward/std": 0.14989124238491058, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1576344631612301, "step": 1323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1014.5, "completions/max_terminated_length": 947.75, "completions/mean_length": 542.375, "completions/mean_terminated_length": 505.5714416503906, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.662, "grad_norm": 0.3612987697124481, "kl": 0.0543212890625, "learning_rate": 3.7924472821194765e-07, "loss": 0.1179, "num_tokens": 101894393.0, "reward": 0.7398191094398499, "reward_std": 0.21413729712367058, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12479237467050552, "rewards/penalized_accuracy_reward/std": 0.09983393549919128, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06524410098791122, "step": 1324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2033.0, "completions/max_terminated_length": 1560.5, "completions/mean_length": 922.578125, "completions/mean_terminated_length": 798.4574584960938, "completions/min_length": 356.25, "completions/min_terminated_length": 356.25, "epoch": 0.6625, "grad_norm": 0.46479204297065735, "kl": 0.05157470703125, "learning_rate": 3.785183306423767e-07, "loss": 0.3362, "num_tokens": 101966798.0, "reward": 0.4878719300031662, "reward_std": 0.19056031852960587, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490653432905674, "rewards/penalized_accuracy_reward/std": 0.049962613731622696, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2093360722064972, "step": 1325 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.015625, "completions/max_length": 1609.0, "completions/max_terminated_length": 1598.25, "completions/mean_length": 875.921875, "completions/mean_terminated_length": 862.9593811035156, "completions/min_length": 366.5, "completions/min_terminated_length": 366.5, "epoch": 0.663, "grad_norm": 0.34259212017059326, "kl": 0.054443359375, "learning_rate": 3.777924554357096e-07, "loss": 0.0621, "num_tokens": 102030841.0, "reward": 0.5132716149091721, "reward_std": 0.12576636672019958, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012495181523263454, "rewards/penalized_accuracy_reward/std": 0.049980729818344116, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.06976010836660862, "step": 1326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1605.75, "completions/max_terminated_length": 1457.25, "completions/mean_length": 914.0625, "completions/mean_terminated_length": 804.8726043701172, "completions/min_length": 363.5, "completions/min_terminated_length": 363.5, "epoch": 0.6635, "grad_norm": 0.28508082032203674, "kl": 0.047119140625, "learning_rate": 3.7706710480308835e-07, "loss": 0.2022, "num_tokens": 102097565.0, "reward": 0.660699337720871, "reward_std": 0.40452393889427185, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988091513514519, "rewards/penalized_accuracy_reward/std": 0.1761297956109047, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.14647135883569717, "step": 1327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1853.25, "completions/max_terminated_length": 1713.75, "completions/mean_length": 963.1875, "completions/mean_terminated_length": 808.1935882568359, "completions/min_length": 293.75, "completions/min_terminated_length": 293.75, "epoch": 0.664, "grad_norm": 
0.2712076008319855, "kl": 0.04986572265625, "learning_rate": 3.7634228095405673e-07, "loss": 0.2334, "num_tokens": 102166937.0, "reward": 0.50505730509758, "reward_std": 0.20708443596959114, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024989590048789978, "rewards/penalized_accuracy_reward/std": 0.06828457117080688, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18889591097831726, "step": 1328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1957.75, "completions/max_terminated_length": 1604.5, "completions/mean_length": 1066.796875, "completions/mean_terminated_length": 920.5444488525391, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.6645, "grad_norm": 0.3932078182697296, "kl": 0.035797119140625, "learning_rate": 3.7561798609655373e-07, "loss": 0.3017, "num_tokens": 102243612.0, "reward": 0.545136958360672, "reward_std": 0.28604351356625557, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049912229180336, "rewards/penalized_accuracy_reward/std": 0.0892857238650322, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.22790803015232086, "step": 1329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1694.75, "completions/max_terminated_length": 1269.5, "completions/mean_length": 790.296875, "completions/mean_terminated_length": 727.5537414550781, "completions/min_length": 340.75, "completions/min_terminated_length": 340.75, "epoch": 0.665, "grad_norm": 0.40199482440948486, "kl": 0.037841796875, "learning_rate": 3.748942224369073e-07, "loss": 0.1728, "num_tokens": 102305119.0, "reward": 0.8221975564956665, "reward_std": 0.44020669162273407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.17477066069841385, "rewards/penalized_accuracy_reward/std": 0.20300352573394775, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15492809563875198, "step": 1330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 978.09375, "completions/mean_terminated_length": 846.9810028076172, "completions/min_length": 409.75, "completions/min_terminated_length": 409.75, "epoch": 0.6655, "grad_norm": 0.4290872812271118, "kl": 0.05352783203125, "learning_rate": 3.7417099217982686e-07, "loss": 0.3048, "num_tokens": 102377637.0, "reward": 0.48396891355514526, "reward_std": 0.19017981737852097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492268346250057, "rewards/penalized_accuracy_reward/std": 0.04996907338500023, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.22234784811735153, "step": 1331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1600.75, "completions/max_terminated_length": 1503.25, "completions/mean_length": 957.9375, "completions/mean_terminated_length": 927.34375, "completions/min_length": 403.25, "completions/min_terminated_length": 403.25, "epoch": 0.666, "grad_norm": 0.25173917412757874, "kl": 0.025909423828125, "learning_rate": 3.734482975283975e-07, "loss": 0.0255, "num_tokens": 102447809.0, "reward": 0.5152053833007812, "reward_std": 0.1325060836970806, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485504150390625, "rewards/penalized_accuracy_reward/std": 0.0499420203268528, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06524410098791122, "step": 1332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.5, 
"completions/max_terminated_length": 1179.5, "completions/mean_length": 622.453125, "completions/mean_terminated_length": 622.453125, "completions/min_length": 276.5, "completions/min_terminated_length": 276.5, "epoch": 0.6665, "grad_norm": 0.38049620389938354, "kl": 0.032562255859375, "learning_rate": 3.72726140684072e-07, "loss": 0.153, "num_tokens": 102495902.0, "reward": 1.1451212465763092, "reward_std": 0.6530624032020569, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.32451373152434826, "rewards/penalized_accuracy_reward/std": 0.32556694746017456, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.021347815170884132, "step": 1333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1330.25, "completions/max_terminated_length": 941.75, "completions/mean_length": 577.0, "completions/mean_terminated_length": 508.7732238769531, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.667, "grad_norm": 0.5724824070930481, "kl": 0.037353515625, "learning_rate": 3.720045238466658e-07, "loss": 0.3068, "num_tokens": 102543470.0, "reward": 0.6531148999929428, "reward_std": 0.3867821618914604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08729963935911655, "rewards/penalized_accuracy_reward/std": 0.1637362763285637, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11861922964453697, "step": 1334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1492.75, "completions/max_terminated_length": 1230.75, "completions/mean_length": 789.671875, "completions/mean_terminated_length": 757.2448120117188, "completions/min_length": 387.25, "completions/min_terminated_length": 387.25, "epoch": 0.6675, "grad_norm": 0.4295820891857147, "kl": 0.041168212890625, "learning_rate": 
3.712834492143487e-07, "loss": 0.1206, "num_tokens": 102604361.0, "reward": 1.1371818780899048, "reward_std": 0.49900077283382416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3244503289461136, "rewards/penalized_accuracy_reward/std": 0.2361278086900711, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 1335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1748.5, "completions/max_terminated_length": 1451.25, "completions/mean_length": 796.328125, "completions/mean_terminated_length": 735.2857208251953, "completions/min_length": 300.25, "completions/min_terminated_length": 300.25, "epoch": 0.668, "grad_norm": 0.44577327370643616, "kl": 0.051177978515625, "learning_rate": 3.7056291898363925e-07, "loss": 0.1714, "num_tokens": 102662926.0, "reward": 0.5053887814283371, "reward_std": 0.14461685717105865, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012460015714168549, "rewards/penalized_accuracy_reward/std": 0.049840062856674194, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11664126254618168, "step": 1336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1367.5, "completions/max_terminated_length": 1306.25, "completions/mean_length": 750.046875, "completions/mean_terminated_length": 732.3541717529297, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 0.6685, "grad_norm": 0.3497569262981415, "kl": 0.04217529296875, "learning_rate": 3.6984293534939737e-07, "loss": 0.08, "num_tokens": 102722081.0, "reward": 0.7629188895225525, "reward_std": 0.2188913356512785, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13731881976127625, "rewards/penalized_accuracy_reward/std": 
0.09561658650636673, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.05531632527709007, "step": 1337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1837.0, "completions/max_terminated_length": 1768.5, "completions/mean_length": 860.171875, "completions/mean_terminated_length": 803.8866271972656, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.669, "grad_norm": 0.35343310236930847, "kl": 0.0433349609375, "learning_rate": 3.69123500504818e-07, "loss": 0.1591, "num_tokens": 102788684.0, "reward": 0.8527645170688629, "reward_std": 0.45679527521133423, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18712443858385086, "rewards/penalized_accuracy_reward/std": 0.20533476024866104, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12808074057102203, "step": 1338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1682.5, "completions/mean_length": 1042.78125, "completions/mean_terminated_length": 877.9725646972656, "completions/min_length": 336.25, "completions/min_terminated_length": 336.25, "epoch": 0.6695, "grad_norm": 0.4195035398006439, "kl": 0.06402587890625, "learning_rate": 3.6840461664142444e-07, "loss": 0.3644, "num_tokens": 102870510.0, "reward": 0.4375, "reward_std": 0.12184108421206474, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.24368217587471008, "step": 1339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1731.75, "completions/max_terminated_length": 1684.25, "completions/mean_length": 881.90625, "completions/mean_terminated_length": 
867.3416748046875, "completions/min_length": 408.75, "completions/min_terminated_length": 408.75, "epoch": 0.67, "grad_norm": 0.3273753225803375, "kl": 0.029052734375, "learning_rate": 3.6768628594906193e-07, "loss": 0.0975, "num_tokens": 102938856.0, "reward": 0.484375, "reward_std": 0.050725389271974564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1333.75, "completions/max_terminated_length": 1324.75, "completions/mean_length": 783.78125, "completions/mean_terminated_length": 752.875, "completions/min_length": 328.25, "completions/min_terminated_length": 328.25, "epoch": 0.6705, "grad_norm": 0.38395485281944275, "kl": 0.045196533203125, "learning_rate": 3.6696851061588994e-07, "loss": 0.032, "num_tokens": 102997466.0, "reward": 0.7878468334674835, "reward_std": 0.4090902004390955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14978279173374176, "rewards/penalized_accuracy_reward/std": 0.19246572256088257, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.05963464826345444, "step": 1341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1657.5, "completions/mean_length": 995.359375, "completions/mean_terminated_length": 840.9500122070312, "completions/min_length": 285.5, "completions/min_terminated_length": 285.5, "epoch": 0.671, "grad_norm": 0.5476993322372437, "kl": 0.04400634765625, "learning_rate": 3.6625129282837685e-07, "loss": 0.3906, "num_tokens": 103069809.0, "reward": 0.453125, "reward_std": 0.10043558105826378, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20087117329239845, "step": 1342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1393.25, "completions/max_terminated_length": 1091.75, "completions/mean_length": 592.40625, "completions/mean_terminated_length": 568.5291748046875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6715, "grad_norm": 0.4112747013568878, "kl": 0.04827880859375, "learning_rate": 3.655346347712922e-07, "loss": 0.0973, "num_tokens": 103116523.0, "reward": 0.486328125, "reward_std": 0.03300705552101135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.0660141110420227, "step": 1343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1409.25, "completions/max_terminated_length": 1272.75, "completions/mean_length": 833.59375, "completions/mean_terminated_length": 698.4830932617188, "completions/min_length": 314.75, "completions/min_terminated_length": 314.75, "epoch": 0.672, "grad_norm": 0.3882710933685303, "kl": 0.063079833984375, "learning_rate": 3.6481853862770107e-07, "loss": 0.2199, "num_tokens": 103179553.0, "reward": 0.5280652791261673, "reward_std": 0.23741407878696918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747013956308365, "rewards/penalized_accuracy_reward/std": 0.08055838197469711, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.17906921729445457, "step": 1344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1504.25, 
"completions/max_terminated_length": 1437.5, "completions/mean_length": 863.921875, "completions/mean_terminated_length": 784.8948974609375, "completions/min_length": 263.75, "completions/min_terminated_length": 263.75, "epoch": 0.6725, "grad_norm": 0.3707081377506256, "kl": 0.04095458984375, "learning_rate": 3.641030065789562e-07, "loss": 0.1305, "num_tokens": 103247148.0, "reward": 0.7300494164228439, "reward_std": 0.40811343863606453, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12479032576084137, "rewards/penalized_accuracy_reward/std": 0.18909692764282227, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.059839196503162384, "step": 1345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.5, "completions/mean_length": 958.96875, "completions/mean_terminated_length": 863.1833648681641, "completions/min_length": 376.25, "completions/min_terminated_length": 376.25, "epoch": 0.673, "grad_norm": 0.5961228609085083, "kl": 0.05517578125, "learning_rate": 3.6338804080469253e-07, "loss": 0.2359, "num_tokens": 103318234.0, "reward": 0.5608296096324921, "reward_std": 0.2615963239222765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049946051090955734, "rewards/penalized_accuracy_reward/std": 0.08934621512889862, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2027587704360485, "step": 1346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1816.75, "completions/max_terminated_length": 1328.75, "completions/mean_length": 1073.03125, "completions/mean_terminated_length": 911.1401062011719, "completions/min_length": 446.5, "completions/min_terminated_length": 446.5, "epoch": 0.6735, "grad_norm": 0.41303467750549316, "kl": 0.036407470703125, "learning_rate": 
3.6267364348281946e-07, "loss": 0.2426, "num_tokens": 103398956.0, "reward": 0.6834914684295654, "reward_std": 0.3809049651026726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1122535327449441, "rewards/penalized_accuracy_reward/std": 0.15294642373919487, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.17209654301404953, "step": 1347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1994.25, "completions/max_terminated_length": 1666.25, "completions/mean_length": 981.109375, "completions/mean_terminated_length": 903.8130798339844, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.674, "grad_norm": 0.43231189250946045, "kl": 0.03045654296875, "learning_rate": 3.6195981678951535e-07, "loss": 0.2733, "num_tokens": 103470067.0, "reward": 0.46875, "reward_std": 0.08883712254464626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17767424881458282, "step": 1348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1441.25, "completions/max_terminated_length": 1331.5, "completions/mean_length": 679.875, "completions/mean_terminated_length": 610.4170684814453, "completions/min_length": 275.25, "completions/min_terminated_length": 275.25, "epoch": 0.6745, "grad_norm": 0.53657466173172, "kl": 0.050048828125, "learning_rate": 3.612465628992203e-07, "loss": 0.2481, "num_tokens": 103522715.0, "reward": 0.5705563127994537, "reward_std": 0.2503672055900097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04992659017443657, "rewards/penalized_accuracy_reward/std": 0.08931140601634979, "rewards/tag_count_reward/mean": 
0.94140625, "rewards/tag_count_reward/std": 0.15645276755094528, "step": 1349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1624.75, "completions/mean_length": 1053.4375, "completions/mean_terminated_length": 863.1373596191406, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.675, "grad_norm": 0.39458420872688293, "kl": 0.05499267578125, "learning_rate": 3.60533883984629e-07, "loss": 0.3787, "num_tokens": 103603447.0, "reward": 0.4854930639266968, "reward_std": 0.2508828602731228, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02497309073805809, "rewards/penalized_accuracy_reward/std": 0.0682394728064537, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.2670181319117546, "step": 1350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1518.25, "completions/max_terminated_length": 1391.0, "completions/mean_length": 881.03125, "completions/mean_terminated_length": 821.6942138671875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.6755, "grad_norm": 0.5485340356826782, "kl": 0.0406494140625, "learning_rate": 3.5982178221668533e-07, "loss": 0.0667, "num_tokens": 103669353.0, "reward": 0.7125789225101471, "reward_std": 0.36101583018898964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17859234660863876, "step": 1351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1346.25, "completions/max_terminated_length": 833.0, "completions/mean_length": 622.828125, "completions/mean_terminated_length": 539.131706237793, "completions/min_length": 261.0, 
"completions/min_terminated_length": 261.0, "epoch": 0.676, "grad_norm": 0.4971848428249359, "kl": 0.054931640625, "learning_rate": 3.591102597645743e-07, "loss": 0.3057, "num_tokens": 103717982.0, "reward": 0.474609375, "reward_std": 0.07185593992471695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1437118947505951, "step": 1352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1719.0, "completions/max_terminated_length": 1589.5, "completions/mean_length": 863.75, "completions/mean_terminated_length": 801.2794799804688, "completions/min_length": 350.75, "completions/min_terminated_length": 350.75, "epoch": 0.6765, "grad_norm": 0.4674924612045288, "kl": 0.0516357421875, "learning_rate": 3.5839931879571725e-07, "loss": 0.1499, "num_tokens": 103781438.0, "reward": 0.6531688570976257, "reward_std": 0.33474768325686455, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08732661511749029, "rewards/penalized_accuracy_reward/std": 0.1497608870267868, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11861922964453697, "step": 1353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1649.5, "completions/max_terminated_length": 1442.5, "completions/mean_length": 845.625, "completions/mean_terminated_length": 803.0290222167969, "completions/min_length": 384.25, "completions/min_terminated_length": 384.25, "epoch": 0.677, "grad_norm": 0.4174695611000061, "kl": 0.040802001953125, "learning_rate": 3.5768896147576344e-07, "loss": 0.2092, "num_tokens": 103846454.0, "reward": 0.9775767028331757, "reward_std": 0.566397774964571, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.24953054264187813, "rewards/penalized_accuracy_reward/std": 0.26538168638944626, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12928754836320877, "step": 1354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1447.75, "completions/max_terminated_length": 1242.25, "completions/mean_length": 749.625, "completions/mean_terminated_length": 708.7142944335938, "completions/min_length": 374.25, "completions/min_terminated_length": 374.25, "epoch": 0.6775, "grad_norm": 0.4658714234828949, "kl": 0.041778564453125, "learning_rate": 3.5697918996858443e-07, "loss": 0.157, "num_tokens": 103904430.0, "reward": 0.6760237365961075, "reward_std": 0.385949045419693, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09973062202334404, "rewards/penalized_accuracy_reward/std": 0.17590896040201187, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1361130028963089, "step": 1355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1934.5, "completions/max_terminated_length": 1265.25, "completions/mean_length": 823.0625, "completions/mean_terminated_length": 670.5046234130859, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 0.678, "grad_norm": 0.4889172613620758, "kl": 0.059539794921875, "learning_rate": 3.5627000643626704e-07, "loss": 0.3416, "num_tokens": 103969650.0, "reward": 0.5510195791721344, "reward_std": 0.3422410190105438, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04992385022342205, "rewards/penalized_accuracy_reward/std": 0.13040756434202194, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.22717764228582382, "step": 1356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 
1200.75, "completions/max_terminated_length": 1200.75, "completions/mean_length": 584.859375, "completions/mean_terminated_length": 584.859375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6785, "grad_norm": 0.4548604488372803, "kl": 0.0423583984375, "learning_rate": 3.555614130391079e-07, "loss": -0.012, "num_tokens": 104018265.0, "reward": 0.6170626729726791, "reward_std": 0.3153496980667114, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06243759021162987, "rewards/penalized_accuracy_reward/std": 0.14879398792982101, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2028.25, "completions/max_terminated_length": 1457.25, "completions/mean_length": 825.859375, "completions/mean_terminated_length": 742.5754699707031, "completions/min_length": 418.75, "completions/min_terminated_length": 418.75, "epoch": 0.679, "grad_norm": 0.48048198223114014, "kl": 0.044189453125, "learning_rate": 3.5485341193560503e-07, "loss": 0.2449, "num_tokens": 104081008.0, "reward": 0.6475719213485718, "reward_std": 0.27072223275899887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08745783567428589, "rewards/penalized_accuracy_reward/std": 0.1024201363325119, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16549422964453697, "step": 1358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1408.75, "completions/max_terminated_length": 1320.5, "completions/mean_length": 731.140625, "completions/mean_terminated_length": 713.7520904541016, "completions/min_length": 330.75, "completions/min_terminated_length": 330.75, "epoch": 0.6795, "grad_norm": 0.4147406220436096, "kl": 0.04083251953125, "learning_rate": 
3.5414600528245266e-07, "loss": 0.047, "num_tokens": 104139449.0, "reward": 0.5152275413274765, "reward_std": 0.13204535841941833, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012496581301093102, "rewards/penalized_accuracy_reward/std": 0.049986325204372406, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1957.25, "completions/max_terminated_length": 1566.5, "completions/mean_length": 1082.40625, "completions/mean_terminated_length": 998.8345642089844, "completions/min_length": 484.5, "completions/min_terminated_length": 484.5, "epoch": 0.68, "grad_norm": 0.29929184913635254, "kl": 0.0359039306640625, "learning_rate": 3.534391952345341e-07, "loss": 0.1265, "num_tokens": 104223683.0, "reward": 0.6702735126018524, "reward_std": 0.39384439401328564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09978519007563591, "rewards/penalized_accuracy_reward/std": 0.17599239200353622, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16623477265238762, "step": 1360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1691.5, "completions/max_terminated_length": 1066.0, "completions/mean_length": 642.875, "completions/mean_terminated_length": 575.8885650634766, "completions/min_length": 231.25, "completions/min_terminated_length": 231.25, "epoch": 0.6805, "grad_norm": 0.46211326122283936, "kl": 0.046966552734375, "learning_rate": 3.5273298394491515e-07, "loss": 0.3109, "num_tokens": 104274379.0, "reward": 0.8049368858337402, "reward_std": 0.21590344235301018, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16223406791687012, "rewards/penalized_accuracy_reward/std": 
0.08049070090055466, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1292813941836357, "step": 1361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1389.25, "completions/max_terminated_length": 1247.0, "completions/mean_length": 804.296875, "completions/mean_terminated_length": 760.8581848144531, "completions/min_length": 390.75, "completions/min_terminated_length": 390.75, "epoch": 0.681, "grad_norm": 0.2020951807498932, "kl": 0.0423583984375, "learning_rate": 3.5202737356483816e-07, "loss": 0.032, "num_tokens": 104335326.0, "reward": 0.4765625, "reward_std": 0.047646719962358475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09529344737529755, "step": 1362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1569.75, "completions/max_terminated_length": 1377.25, "completions/mean_length": 794.75, "completions/mean_terminated_length": 730.4252471923828, "completions/min_length": 321.75, "completions/min_terminated_length": 321.75, "epoch": 0.6815, "grad_norm": 0.522006094455719, "kl": 0.05145263671875, "learning_rate": 3.513223662437147e-07, "loss": 0.1804, "num_tokens": 104396766.0, "reward": 0.46875, "reward_std": 0.07155892811715603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1431178580969572, "step": 1363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1421.25, "completions/max_terminated_length": 1219.0, "completions/mean_length": 643.171875, "completions/mean_terminated_length": 620.3291778564453, "completions/min_length": 238.0, 
"completions/min_terminated_length": 238.0, "epoch": 0.682, "grad_norm": 0.6468093991279602, "kl": 0.041412353515625, "learning_rate": 3.5061796412911913e-07, "loss": 0.1911, "num_tokens": 104447033.0, "reward": 0.553358718752861, "reward_std": 0.284311406314373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12304970622062683, "step": 1364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1534.0, "completions/max_terminated_length": 1159.25, "completions/mean_length": 696.0625, "completions/mean_terminated_length": 600.8415374755859, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6825, "grad_norm": 0.40325433015823364, "kl": 0.0433349609375, "learning_rate": 3.4991416936678276e-07, "loss": 0.2961, "num_tokens": 104499517.0, "reward": 0.7703066468238831, "reward_std": 0.38084324076771736, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14980176649987698, "rewards/penalized_accuracy_reward/std": 0.16802658140659332, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1481337696313858, "step": 1365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1613.25, "completions/max_terminated_length": 1451.5, "completions/mean_length": 880.578125, "completions/mean_terminated_length": 815.8772583007812, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.683, "grad_norm": 0.3722813129425049, "kl": 0.039459228515625, "learning_rate": 3.49210984100586e-07, "loss": 0.1072, "num_tokens": 104566370.0, "reward": 0.5109085589647293, "reward_std": 0.22095463797450066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.024985529482364655, "rewards/penalized_accuracy_reward/std": 0.06827346980571747, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1952698826789856, "step": 1366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1743.75, "completions/max_terminated_length": 1461.5, "completions/mean_length": 1064.4375, "completions/mean_terminated_length": 833.391845703125, "completions/min_length": 366.75, "completions/min_terminated_length": 366.75, "epoch": 0.6835, "grad_norm": 0.4238141179084778, "kl": 0.061248779296875, "learning_rate": 3.4850841047255364e-07, "loss": 0.3021, "num_tokens": 104643406.0, "reward": 0.44295018911361694, "reward_std": 0.22277865931391716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012490717694163322, "rewards/penalized_accuracy_reward/std": 0.04996287077665329, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.27247942239046097, "step": 1367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1714.75, "completions/max_terminated_length": 1513.5, "completions/mean_length": 937.171875, "completions/mean_terminated_length": 883.2093963623047, "completions/min_length": 301.5, "completions/min_terminated_length": 301.5, "epoch": 0.684, "grad_norm": 0.33153173327445984, "kl": 0.0360107421875, "learning_rate": 3.4780645062284665e-07, "loss": 0.1376, "num_tokens": 104710713.0, "reward": 0.474609375, "reward_std": 0.06834406778216362, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1366881374269724, "step": 1368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1959.75, 
"completions/max_terminated_length": 1711.0, "completions/mean_length": 965.21875, "completions/mean_terminated_length": 916.5927429199219, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.6845, "grad_norm": 0.2980339825153351, "kl": 0.031707763671875, "learning_rate": 3.471051066897562e-07, "loss": 0.1858, "num_tokens": 104783495.0, "reward": 0.48046875, "reward_std": 0.05847674608230591, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11695349216461182, "step": 1369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1926.75, "completions/max_terminated_length": 1476.25, "completions/mean_length": 993.859375, "completions/mean_terminated_length": 914.397705078125, "completions/min_length": 452.75, "completions/min_terminated_length": 452.75, "epoch": 0.685, "grad_norm": 0.44568151235580444, "kl": 0.034576416015625, "learning_rate": 3.4640438080969773e-07, "loss": 0.2586, "num_tokens": 104856414.0, "reward": 0.466796875, "reward_std": 0.090545654296875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18109130859375, "step": 1370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1297.75, "completions/max_terminated_length": 1012.75, "completions/mean_length": 651.421875, "completions/mean_terminated_length": 582.3713989257812, "completions/min_length": 235.5, "completions/min_terminated_length": 235.5, "epoch": 0.6855, "grad_norm": 0.3914555013179779, "kl": 0.03887939453125, "learning_rate": 3.45704275117204e-07, "loss": 0.1993, "num_tokens": 104908953.0, "reward": 
0.47265625, "reward_std": 0.04769963026046753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.09539926052093506, "step": 1371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1587.75, "completions/max_terminated_length": 1521.25, "completions/mean_length": 966.453125, "completions/mean_terminated_length": 915.8137054443359, "completions/min_length": 440.75, "completions/min_terminated_length": 440.75, "epoch": 0.686, "grad_norm": 0.24685484170913696, "kl": 0.041259765625, "learning_rate": 3.450047917449181e-07, "loss": 0.0769, "num_tokens": 104981286.0, "reward": 0.7241732180118561, "reward_std": 0.41347144544124603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12478193268179893, "rewards/penalized_accuracy_reward/std": 0.18909364938735962, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1221349686384201, "step": 1372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1905.25, "completions/max_terminated_length": 1564.5, "completions/mean_length": 1022.21875, "completions/mean_terminated_length": 893.7857513427734, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.6865, "grad_norm": 0.3976253271102905, "kl": 0.03662109375, "learning_rate": 3.4430593282358777e-07, "loss": 0.2074, "num_tokens": 105064884.0, "reward": 0.9853729605674744, "reward_std": 0.6303859353065491, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622177265584469, "rewards/penalized_accuracy_reward/std": 0.2916456386446953, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1924363076686859, "step": 1373 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1412.25, "completions/max_terminated_length": 1161.0, "completions/mean_length": 684.953125, "completions/mean_terminated_length": 641.40625, "completions/min_length": 255.25, "completions/min_terminated_length": 255.25, "epoch": 0.687, "grad_norm": 0.4272247552871704, "kl": 0.041107177734375, "learning_rate": 3.4360770048205843e-07, "loss": 0.1584, "num_tokens": 105117569.0, "reward": 0.8068724274635315, "reward_std": 0.20021573826670647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16222527623176575, "rewards/penalized_accuracy_reward/std": 0.08048640191555023, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09567352384328842, "step": 1374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.25, "completions/mean_length": 996.34375, "completions/mean_terminated_length": 818.1237487792969, "completions/min_length": 277.5, "completions/min_terminated_length": 277.5, "epoch": 0.6875, "grad_norm": 0.42778724431991577, "kl": 0.049346923828125, "learning_rate": 3.429100968472668e-07, "loss": 0.3357, "num_tokens": 105191159.0, "reward": 0.5221861302852631, "reward_std": 0.25838617980480194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746025636792183, "rewards/penalized_accuracy_reward/std": 0.08053714781999588, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2459489107131958, "step": 1375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1924.75, "completions/max_terminated_length": 1650.5, "completions/mean_length": 1026.140625, "completions/mean_terminated_length": 946.2698364257812, "completions/min_length": 405.75, "completions/min_terminated_length": 405.75, 
"epoch": 0.688, "grad_norm": 0.40386682748794556, "kl": 0.0457763671875, "learning_rate": 3.4221312404423486e-07, "loss": 0.1459, "num_tokens": 105265872.0, "reward": 0.6779356896877289, "reward_std": 0.2809483092278242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11240534484386444, "rewards/penalized_accuracy_reward/std": 0.10238330811262131, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19823409989476204, "step": 1376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1331.25, "completions/max_terminated_length": 1275.25, "completions/mean_length": 747.515625, "completions/mean_terminated_length": 695.90625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6885, "grad_norm": 0.315573513507843, "kl": 0.03778076171875, "learning_rate": 3.4151678419606233e-07, "loss": 0.0585, "num_tokens": 105323537.0, "reward": 0.46875, "reward_std": 0.054947055876255035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.10989411175251007, "step": 1377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1824.75, "completions/max_terminated_length": 1254.5, "completions/mean_length": 876.859375, "completions/mean_terminated_length": 783.1514739990234, "completions/min_length": 355.25, "completions/min_terminated_length": 355.25, "epoch": 0.689, "grad_norm": 0.41156548261642456, "kl": 0.0361328125, "learning_rate": 3.4082107942392136e-07, "loss": 0.2105, "num_tokens": 105391096.0, "reward": 0.5476319193840027, "reward_std": 0.21851530484855175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748783469200134, 
"rewards/penalized_accuracy_reward/std": 0.080596424639225, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1462009735405445, "step": 1378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1596.0, "completions/max_terminated_length": 1316.5, "completions/mean_length": 829.1875, "completions/mean_terminated_length": 774.5396118164062, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 0.6895, "grad_norm": 0.27165836095809937, "kl": 0.0435791015625, "learning_rate": 3.4012601184704904e-07, "loss": 0.1531, "num_tokens": 105453380.0, "reward": 0.7323117852210999, "reward_std": 0.24091817811131477, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12494497001171112, "rewards/penalized_accuracy_reward/std": 0.0999559760093689, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2029.5, "completions/max_terminated_length": 1591.0, "completions/mean_length": 1019.4375, "completions/mean_terminated_length": 921.7954254150391, "completions/min_length": 435.25, "completions/min_terminated_length": 435.25, "epoch": 0.69, "grad_norm": 0.41583409905433655, "kl": 0.037628173828125, "learning_rate": 3.3943158358274203e-07, "loss": 0.3051, "num_tokens": 105528144.0, "reward": 0.5167697370052338, "reward_std": 0.22596118599176407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024986427277326584, "rewards/penalized_accuracy_reward/std": 0.06827591359615326, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19630970992147923, "step": 1380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1557.0, "completions/max_terminated_length": 1376.25, 
"completions/mean_length": 750.890625, "completions/mean_terminated_length": 717.7968902587891, "completions/min_length": 348.75, "completions/min_terminated_length": 348.75, "epoch": 0.6905, "grad_norm": 0.32781100273132324, "kl": 0.03619384765625, "learning_rate": 3.387377967463493e-07, "loss": 0.1342, "num_tokens": 105583705.0, "reward": 0.484375, "reward_std": 0.050725389271974564, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1718.0, "completions/max_terminated_length": 1561.75, "completions/mean_length": 939.671875, "completions/mean_terminated_length": 834.9040222167969, "completions/min_length": 369.75, "completions/min_terminated_length": 369.75, "epoch": 0.691, "grad_norm": 0.49908268451690674, "kl": 0.055816650390625, "learning_rate": 3.3804465345126545e-07, "loss": 0.2254, "num_tokens": 105653860.0, "reward": 0.6337387561798096, "reward_std": 0.2954738028347492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08737719058990479, "rewards/penalized_accuracy_reward/std": 0.10232573747634888, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20466844737529755, "step": 1382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1551.75, "completions/max_terminated_length": 1260.0, "completions/mean_length": 821.21875, "completions/mean_terminated_length": 785.3812713623047, "completions/min_length": 414.25, "completions/min_terminated_length": 414.25, "epoch": 0.6915, "grad_norm": 0.2898655831813812, "kl": 0.041778564453125, "learning_rate": 3.3735215580892575e-07, "loss": 0.1674, "num_tokens": 105713922.0, "reward": 
0.490234375, "reward_std": 0.0390625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1562.25, "completions/max_terminated_length": 1408.75, "completions/mean_length": 688.734375, "completions/mean_terminated_length": 644.0982208251953, "completions/min_length": 306.5, "completions/min_terminated_length": 306.5, "epoch": 0.692, "grad_norm": 94.15632629394531, "kl": 0.969512939453125, "learning_rate": 3.366603059287977e-07, "loss": 0.1632, "num_tokens": 105765921.0, "reward": 0.7360462844371796, "reward_std": 0.2396089769899845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12485907971858978, "rewards/penalized_accuracy_reward/std": 0.09988737851381302, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 1384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1921.5, "completions/max_terminated_length": 1575.75, "completions/mean_length": 1093.15625, "completions/mean_terminated_length": 1018.4658966064453, "completions/min_length": 432.25, "completions/min_terminated_length": 432.25, "epoch": 0.6925, "grad_norm": 0.3364189565181732, "kl": 0.027130126953125, "learning_rate": 3.359691059183761e-07, "loss": 0.1621, "num_tokens": 105846395.0, "reward": 0.7474376559257507, "reward_std": 0.2464491631835699, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13739070296287537, "rewards/penalized_accuracy_reward/std": 0.09566661715507507, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.11023188941180706, "step": 1385 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1827.75, "completions/max_terminated_length": 1575.75, "completions/mean_length": 802.65625, "completions/mean_terminated_length": 738.74853515625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.693, "grad_norm": 0.43323975801467896, "kl": 0.0478515625, "learning_rate": 3.3527855788317614e-07, "loss": 0.0867, "num_tokens": 105906117.0, "reward": 0.6762737035751343, "reward_std": 0.48140208423137665, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09985560551285744, "rewards/penalized_accuracy_reward/std": 0.22574804723262787, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14370574057102203, "step": 1386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1592.0, "completions/max_terminated_length": 1508.5, "completions/mean_length": 958.296875, "completions/mean_terminated_length": 886.4462890625, "completions/min_length": 306.75, "completions/min_terminated_length": 306.75, "epoch": 0.6935, "grad_norm": 0.1976926475763321, "kl": 0.034027099609375, "learning_rate": 3.3458866392672694e-07, "loss": 0.0959, "num_tokens": 105973848.0, "reward": 0.5627600848674774, "reward_std": 0.20726257376372814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993472993373871, "rewards/penalized_accuracy_reward/std": 0.08932598680257797, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.13509058579802513, "step": 1387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1898.75, "completions/max_terminated_length": 1614.5, "completions/mean_length": 910.609375, "completions/mean_terminated_length": 779.5876312255859, "completions/min_length": 249.25, "completions/min_terminated_length": 249.25, 
"epoch": 0.694, "grad_norm": 0.42483028769493103, "kl": 0.058685302734375, "learning_rate": 3.338994261505649e-07, "loss": 0.1678, "num_tokens": 106040783.0, "reward": 0.6318072378635406, "reward_std": 0.4611310660839081, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08738800045102835, "rewards/penalized_accuracy_reward/std": 0.20751722529530525, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18504608422517776, "step": 1388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1623.0, "completions/max_terminated_length": 1243.75, "completions/mean_length": 788.421875, "completions/mean_terminated_length": 727.8655700683594, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.6945, "grad_norm": 0.4430370628833771, "kl": 0.039398193359375, "learning_rate": 3.3321084665422803e-07, "loss": 0.1827, "num_tokens": 106100330.0, "reward": 0.8107485473155975, "reward_std": 0.20287672616541386, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17490552365779877, "rewards/penalized_accuracy_reward/std": 0.06827617436647415, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.16513093188405037, "step": 1389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1404.25, "completions/max_terminated_length": 1346.25, "completions/mean_length": 709.453125, "completions/mean_terminated_length": 693.2531433105469, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.695, "grad_norm": 0.4334413409233093, "kl": 0.03729248046875, "learning_rate": 3.325229275352489e-07, "loss": 0.1586, "num_tokens": 106153239.0, "reward": 0.741765707731247, "reward_std": 0.33037565648555756, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.12478912062942982, "rewards/penalized_accuracy_reward/std": 0.1522580273449421, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1912.5, "completions/max_terminated_length": 1550.75, "completions/mean_length": 804.453125, "completions/mean_terminated_length": 701.1131134033203, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.6955, "grad_norm": 0.5413482785224915, "kl": 0.05224609375, "learning_rate": 3.3183567088914833e-07, "loss": 0.3532, "num_tokens": 106214836.0, "reward": 0.5896951258182526, "reward_std": 0.28113968297839165, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062425680458545685, "rewards/penalized_accuracy_reward/std": 0.09562887996435165, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17976384609937668, "step": 1391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2035.25, "completions/max_terminated_length": 1397.0, "completions/mean_length": 856.515625, "completions/mean_terminated_length": 730.0605010986328, "completions/min_length": 302.25, "completions/min_terminated_length": 302.25, "epoch": 0.696, "grad_norm": 0.45384493470191956, "kl": 0.05517578125, "learning_rate": 3.3114907880942933e-07, "loss": 0.3871, "num_tokens": 106279029.0, "reward": 0.545549064874649, "reward_std": 0.23130611330270767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037422969937324524, "rewards/penalized_accuracy_reward/std": 0.08045713603496552, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.14078368619084358, "step": 1392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1955.75, 
"completions/max_terminated_length": 1742.5, "completions/mean_length": 1129.109375, "completions/mean_terminated_length": 1007.6621704101562, "completions/min_length": 435.5, "completions/min_terminated_length": 435.5, "epoch": 0.6965, "grad_norm": 0.31663137674331665, "kl": 0.030364990234375, "learning_rate": 3.3046315338757026e-07, "loss": 0.1778, "num_tokens": 106360428.0, "reward": 0.5799638777971268, "reward_std": 0.3411155715584755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062442874535918236, "rewards/penalized_accuracy_reward/std": 0.13931311666965485, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1726691871881485, "step": 1393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1550.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 911.453125, "completions/mean_terminated_length": 770.7012176513672, "completions/min_length": 284.5, "completions/min_terminated_length": 284.5, "epoch": 0.697, "grad_norm": 0.35855624079704285, "kl": 0.05865478515625, "learning_rate": 3.297778967130191e-07, "loss": 0.0624, "num_tokens": 106428809.0, "reward": 0.6278017461299896, "reward_std": 0.2762135863304138, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08733838051557541, "rewards/penalized_accuracy_reward/std": 0.10228025913238525, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1788995936512947, "step": 1394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.25, "completions/max_terminated_length": 1421.25, "completions/mean_length": 626.8125, "completions/mean_terminated_length": 626.8125, "completions/min_length": 258.5, "completions/min_terminated_length": 258.5, "epoch": 0.6975, "grad_norm": 0.45311886072158813, "kl": 0.03497314453125, "learning_rate": 
3.290933108731866e-07, "loss": 0.0485, "num_tokens": 106477453.0, "reward": 1.0495559573173523, "reward_std": 0.584965318441391, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.27477798238396645, "rewards/penalized_accuracy_reward/std": 0.2924826741218567, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1804.5, "completions/max_terminated_length": 1548.75, "completions/mean_length": 948.0, "completions/mean_terminated_length": 899.6659545898438, "completions/min_length": 427.5, "completions/min_terminated_length": 427.5, "epoch": 0.698, "grad_norm": 0.27389195561408997, "kl": 0.038116455078125, "learning_rate": 3.2840939795343987e-07, "loss": 0.0667, "num_tokens": 106547389.0, "reward": 0.5343067795038223, "reward_std": 0.16574729979038239, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024965887889266014, "rewards/penalized_accuracy_reward/std": 0.06821979582309723, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09682458639144897, "step": 1396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1317.75, "completions/max_terminated_length": 1173.75, "completions/mean_length": 727.453125, "completions/mean_terminated_length": 710.7239685058594, "completions/min_length": 271.5, "completions/min_terminated_length": 271.5, "epoch": 0.6985, "grad_norm": 0.4439019560813904, "kl": 0.045074462890625, "learning_rate": 3.2772616003709616e-07, "loss": 0.1124, "num_tokens": 106602330.0, "reward": 0.517173707485199, "reward_std": 0.1311948224902153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012493102811276913, "rewards/penalized_accuracy_reward/std": 0.04997241124510765, 
"rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1397.75, "completions/max_terminated_length": 1226.5, "completions/mean_length": 717.25, "completions/mean_terminated_length": 634.67919921875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.699, "grad_norm": 0.2560904920101166, "kl": 0.037200927734375, "learning_rate": 3.270435992054166e-07, "loss": 0.1688, "num_tokens": 106656490.0, "reward": 0.478515625, "reward_std": 0.05138834938406944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10277670249342918, "step": 1398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1642.5, "completions/max_terminated_length": 1222.5, "completions/mean_length": 754.6875, "completions/mean_terminated_length": 614.5952835083008, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6995, "grad_norm": 0.42993342876434326, "kl": 0.053680419921875, "learning_rate": 3.263617175376001e-07, "loss": 0.1585, "num_tokens": 106716870.0, "reward": 0.6817231327295303, "reward_std": 0.47100166231393814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11234593763947487, "rewards/penalized_accuracy_reward/std": 0.2180437184870243, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.16958122327923775, "step": 1399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1378.5, "completions/max_terminated_length": 1071.25, "completions/mean_length": 581.296875, "completions/mean_terminated_length": 556.2406311035156, "completions/min_length": 
281.5, "completions/min_terminated_length": 281.5, "epoch": 0.7, "grad_norm": 0.5796995759010315, "kl": 0.047119140625, "learning_rate": 3.2568051711077636e-07, "loss": 0.2212, "num_tokens": 106762105.0, "reward": 0.6879887729883194, "reward_std": 0.3976822905242443, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09985376521945, "rewards/penalized_accuracy_reward/std": 0.17862387001514435, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 1400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1799.0, "completions/max_terminated_length": 1284.75, "completions/mean_length": 807.015625, "completions/mean_terminated_length": 728.6515197753906, "completions/min_length": 353.75, "completions/min_terminated_length": 353.75, "epoch": 0.7005, "grad_norm": 0.578904926776886, "kl": 0.05596923828125, "learning_rate": 3.250000000000001e-07, "loss": 0.2389, "num_tokens": 106822714.0, "reward": 0.5706189274787903, "reward_std": 0.31823622435331345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995789937674999, "rewards/penalized_accuracy_reward/std": 0.13050976768136024, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16375261545181274, "step": 1401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1689.75, "completions/max_terminated_length": 1682.25, "completions/mean_length": 892.640625, "completions/mean_terminated_length": 833.09375, "completions/min_length": 476.5, "completions/min_terminated_length": 476.5, "epoch": 0.701, "grad_norm": 0.6069672107696533, "kl": 0.06097412109375, "learning_rate": 3.2432016827824414e-07, "loss": 0.072, "num_tokens": 106892243.0, "reward": 0.5705579817295074, "reward_std": 0.2934810444712639, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049927433021366596, "rewards/penalized_accuracy_reward/std": 0.13044629991054535, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.140625, "step": 1402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1636.25, "completions/max_terminated_length": 1329.5, "completions/mean_length": 784.953125, "completions/mean_terminated_length": 729.9836578369141, "completions/min_length": 288.25, "completions/min_terminated_length": 288.25, "epoch": 0.7015, "grad_norm": 0.5121973156929016, "kl": 0.0364990234375, "learning_rate": 3.2364102401639423e-07, "loss": 0.0676, "num_tokens": 106952064.0, "reward": 0.6592374294996262, "reward_std": 0.33148832246661186, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08743123058229685, "rewards/penalized_accuracy_reward/std": 0.14983787760138512, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1724.5, "completions/max_terminated_length": 1537.0, "completions/mean_length": 819.203125, "completions/mean_terminated_length": 744.497802734375, "completions/min_length": 321.75, "completions/min_terminated_length": 321.75, "epoch": 0.702, "grad_norm": 0.47434261441230774, "kl": 0.046630859375, "learning_rate": 3.229625692832414e-07, "loss": 0.1639, "num_tokens": 107017613.0, "reward": 0.5417207777500153, "reward_std": 0.22915148362517357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746195137500763, "rewards/penalized_accuracy_reward/std": 0.08054078370332718, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.13613983243703842, "step": 1404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.046875, "completions/max_length": 1431.25, "completions/max_terminated_length": 1361.5, "completions/mean_length": 748.5, "completions/mean_terminated_length": 692.9675598144531, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7025, "grad_norm": 0.3729986250400543, "kl": 0.041107177734375, "learning_rate": 3.222848061454764e-07, "loss": 0.1141, "num_tokens": 107075181.0, "reward": 0.8069023489952087, "reward_std": 0.19877934455871582, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16224023699760437, "rewards/penalized_accuracy_reward/std": 0.08049376308917999, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 1405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1602.25, "completions/max_terminated_length": 1381.75, "completions/mean_length": 778.0, "completions/mean_terminated_length": 724.2902069091797, "completions/min_length": 324.75, "completions/min_terminated_length": 324.75, "epoch": 0.703, "grad_norm": 0.36682456731796265, "kl": 0.040771484375, "learning_rate": 3.216077366676833e-07, "loss": 0.1967, "num_tokens": 107133949.0, "reward": 0.5572175681591034, "reward_std": 0.2162652499973774, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037397850304841995, "rewards/penalized_accuracy_reward/std": 0.08040302246809006, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1869.75, "completions/max_terminated_length": 1590.0, "completions/mean_length": 920.234375, "completions/mean_terminated_length": 834.198600769043, "completions/min_length": 305.5, "completions/min_terminated_length": 305.5, "epoch": 0.7035, "grad_norm": 0.4127624034881592, "kl": 
0.049468994140625, "learning_rate": 3.209313629123329e-07, "loss": 0.1126, "num_tokens": 107202284.0, "reward": 0.9972420781850815, "reward_std": 0.5013109873980284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622929122298956, "rewards/penalized_accuracy_reward/std": 0.2321554198861122, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15334771946072578, "step": 1407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1263.25, "completions/max_terminated_length": 1037.75, "completions/mean_length": 561.109375, "completions/mean_terminated_length": 538.1875, "completions/min_length": 229.5, "completions/min_terminated_length": 229.5, "epoch": 0.704, "grad_norm": 0.4740467071533203, "kl": 0.043609619140625, "learning_rate": 3.2025568693977745e-07, "loss": 0.2005, "num_tokens": 107246531.0, "reward": 0.8624898195266724, "reward_std": 0.43826927058398724, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18710428476333618, "rewards/penalized_accuracy_reward/std": 0.2053055688738823, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.05531632527709007, "step": 1408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1639.5, "completions/max_terminated_length": 1379.25, "completions/mean_length": 866.65625, "completions/mean_terminated_length": 765.2929382324219, "completions/min_length": 399.75, "completions/min_terminated_length": 399.75, "epoch": 0.7045, "grad_norm": 0.30740147829055786, "kl": 0.05340576171875, "learning_rate": 3.195807108082429e-07, "loss": 0.1709, "num_tokens": 107311885.0, "reward": 0.545620933175087, "reward_std": 0.2018749751150608, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745890036225319, 
"rewards/penalized_accuracy_reward/std": 0.08053424209356308, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12983575090765953, "step": 1409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.5, "completions/mean_length": 968.1875, "completions/mean_terminated_length": 872.2917022705078, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.705, "grad_norm": 0.42817550897598267, "kl": 0.0460205078125, "learning_rate": 3.1890643657382356e-07, "loss": 0.2309, "num_tokens": 107382457.0, "reward": 0.49763165414333344, "reward_std": 0.1798112951219082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012487703002989292, "rewards/penalized_accuracy_reward/std": 0.04995081201195717, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17379852384328842, "step": 1410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1715.25, "completions/max_terminated_length": 1421.25, "completions/mean_length": 802.28125, "completions/mean_terminated_length": 724.6178131103516, "completions/min_length": 324.5, "completions/min_terminated_length": 324.5, "epoch": 0.7055, "grad_norm": 1.6942050457000732, "kl": 0.08734130859375, "learning_rate": 3.182328662904756e-07, "loss": 0.0351, "num_tokens": 107447259.0, "reward": 0.828266978263855, "reward_std": 0.33839504420757294, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17487567849457264, "rewards/penalized_accuracy_reward/std": 0.15760762244462967, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11404262483119965, "step": 1411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2006.5, "completions/max_terminated_length": 1633.25, 
"completions/mean_length": 983.875, "completions/mean_terminated_length": 923.6053771972656, "completions/min_length": 311.5, "completions/min_terminated_length": 311.5, "epoch": 0.706, "grad_norm": 0.37346580624580383, "kl": 0.0404052734375, "learning_rate": 3.175600020100112e-07, "loss": 0.1911, "num_tokens": 107518979.0, "reward": 0.6343335211277008, "reward_std": 0.23845261335372925, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0749792605638504, "rewards/penalized_accuracy_reward/std": 0.09997235238552094, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10519563034176826, "step": 1412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 754.421875, "completions/mean_terminated_length": 754.421875, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7065, "grad_norm": 0.4531639814376831, "kl": 0.034332275390625, "learning_rate": 3.168878457820915e-07, "loss": 0.0487, "num_tokens": 107575838.0, "reward": 0.6209135055541992, "reward_std": 0.38832957297563553, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06240988150238991, "rewards/penalized_accuracy_reward/std": 0.18635228648781776, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1490.25, "completions/mean_length": 870.640625, "completions/mean_terminated_length": 772.3869323730469, "completions/min_length": 307.5, "completions/min_terminated_length": 307.5, "epoch": 0.707, "grad_norm": 0.3917335867881775, "kl": 0.05181884765625, "learning_rate": 3.162163996542209e-07, "loss": 0.2838, "num_tokens": 107639479.0, "reward": 
0.547593966126442, "reward_std": 0.29560423269867897, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037468855269253254, "rewards/penalized_accuracy_reward/std": 0.11821139231324196, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.19520078226923943, "step": 1414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1890.75, "completions/max_terminated_length": 1789.75, "completions/mean_length": 1225.625, "completions/mean_terminated_length": 1147.8922424316406, "completions/min_length": 511.5, "completions/min_terminated_length": 511.5, "epoch": 0.7075, "grad_norm": 0.3055698871612549, "kl": 0.039642333984375, "learning_rate": 3.155456656717408e-07, "loss": 0.1633, "num_tokens": 107726367.0, "reward": 0.7873168289661407, "reward_std": 0.3858417347073555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16221309825778008, "rewards/penalized_accuracy_reward/std": 0.16379336267709732, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18666185066103935, "step": 1415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1796.5, "completions/max_terminated_length": 1502.0, "completions/mean_length": 969.484375, "completions/mean_terminated_length": 818.6774749755859, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.708, "grad_norm": 0.5083766579627991, "kl": 0.04180908203125, "learning_rate": 3.14875645877823e-07, "loss": 0.1913, "num_tokens": 107797806.0, "reward": 0.47216957807540894, "reward_std": 0.17951378971338272, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012451971881091595, "rewards/penalized_accuracy_reward/std": 0.04980788752436638, "rewards/tag_count_reward/mean": 0.89453125, 
"rewards/tag_count_reward/std": 0.20163869485259056, "step": 1416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1980.25, "completions/max_terminated_length": 1578.25, "completions/mean_length": 951.609375, "completions/mean_terminated_length": 849.9114074707031, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.7085, "grad_norm": 0.6450545191764832, "kl": 0.050140380859375, "learning_rate": 3.142063423134644e-07, "loss": 0.2882, "num_tokens": 107872405.0, "reward": 0.462890625, "reward_std": 0.08633984625339508, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17267969250679016, "step": 1417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1485.75, "completions/max_terminated_length": 1331.5, "completions/mean_length": 769.96875, "completions/mean_terminated_length": 681.6614685058594, "completions/min_length": 285.75, "completions/min_terminated_length": 285.75, "epoch": 0.709, "grad_norm": 0.5027422308921814, "kl": 0.06048583984375, "learning_rate": 3.135377570174796e-07, "loss": 0.1745, "num_tokens": 107932179.0, "reward": 0.46484375, "reward_std": 0.07728258147835732, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.15456517040729523, "step": 1418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1723.25, "completions/max_terminated_length": 1420.0, "completions/mean_length": 706.8125, "completions/mean_terminated_length": 658.9084930419922, "completions/min_length": 285.75, "completions/min_terminated_length": 285.75, "epoch": 
0.7095, "grad_norm": 0.4311954975128174, "kl": 0.043121337890625, "learning_rate": 3.1286989202649503e-07, "loss": 0.1545, "num_tokens": 107988007.0, "reward": 0.6572727859020233, "reward_std": 0.2468390315771103, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08742545545101166, "rewards/penalized_accuracy_reward/std": 0.10238221287727356, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10717359744012356, "step": 1419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1977.75, "completions/max_terminated_length": 1715.5, "completions/mean_length": 1023.734375, "completions/mean_terminated_length": 971.5823364257812, "completions/min_length": 448.5, "completions/min_terminated_length": 448.5, "epoch": 0.71, "grad_norm": 0.41216281056404114, "kl": 0.0379638671875, "learning_rate": 3.122027493749438e-07, "loss": 0.1975, "num_tokens": 108063382.0, "reward": 0.5573879182338715, "reward_std": 0.21853291988372803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748302534222603, "rewards/penalized_accuracy_reward/std": 0.08058608323335648, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.140625, "step": 1420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.75, "completions/mean_length": 974.03125, "completions/mean_terminated_length": 862.1125335693359, "completions/min_length": 335.5, "completions/min_terminated_length": 335.5, "epoch": 0.7105, "grad_norm": 0.3707869350910187, "kl": 0.039459228515625, "learning_rate": 3.115363310950578e-07, "loss": 0.1881, "num_tokens": 108134056.0, "reward": 0.581935465335846, "reward_std": 0.4030364528298378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.06245210114866495, "rewards/penalized_accuracy_reward/std": 0.18648812919855118, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22645078226923943, "step": 1421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1355.5, "completions/mean_length": 875.703125, "completions/mean_terminated_length": 676.9898986816406, "completions/min_length": 252.5, "completions/min_terminated_length": 252.5, "epoch": 0.711, "grad_norm": 0.7489117980003357, "kl": 0.0579833984375, "learning_rate": 3.1087063921686263e-07, "loss": 0.5841, "num_tokens": 108199013.0, "reward": 0.4375, "reward_std": 0.13094056397676468, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.26188113540410995, "step": 1422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1849.75, "completions/max_terminated_length": 1665.25, "completions/mean_length": 1318.3125, "completions/mean_terminated_length": 1143.3255615234375, "completions/min_length": 523.75, "completions/min_terminated_length": 523.75, "epoch": 0.7115, "grad_norm": 0.3292495608329773, "kl": 0.035919189453125, "learning_rate": 3.102056757681715e-07, "loss": 0.1472, "num_tokens": 108291849.0, "reward": 0.6598013639450073, "reward_std": 0.4673864431679249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12482254952192307, "rewards/penalized_accuracy_reward/std": 0.18284907937049866, "rewards/tag_count_reward/mean": 0.8203125, "rewards/tag_count_reward/std": 0.25050561130046844, "step": 1423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1561.75, "completions/max_terminated_length": 1280.0, "completions/mean_length": 
650.34375, "completions/mean_terminated_length": 608.1468963623047, "completions/min_length": 343.75, "completions/min_terminated_length": 343.75, "epoch": 0.712, "grad_norm": 0.7288137674331665, "kl": 0.0572509765625, "learning_rate": 3.0954144277457817e-07, "loss": 0.1841, "num_tokens": 108343567.0, "reward": 0.484375, "reward_std": 0.04555395990610123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09110792353749275, "step": 1424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1879.5, "completions/max_terminated_length": 1565.25, "completions/mean_length": 976.625, "completions/mean_terminated_length": 798.4152069091797, "completions/min_length": 374.5, "completions/min_terminated_length": 374.5, "epoch": 0.7125, "grad_norm": 0.42028796672821045, "kl": 0.048797607421875, "learning_rate": 3.0887794225945143e-07, "loss": 0.3172, "num_tokens": 108413895.0, "reward": 0.4952508509159088, "reward_std": 0.24557046964764595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024969179183244705, "rewards/penalized_accuracy_reward/std": 0.06822879612445831, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21822576224803925, "step": 1425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1843.25, "completions/max_terminated_length": 1303.5, "completions/mean_length": 911.515625, "completions/mean_terminated_length": 799.8470458984375, "completions/min_length": 393.25, "completions/min_terminated_length": 393.25, "epoch": 0.713, "grad_norm": 20.86483383178711, "kl": 0.3079833984375, "learning_rate": 3.0821517624392925e-07, "loss": 0.3349, "num_tokens": 108480968.0, "reward": 0.453125, "reward_std": 0.10456501692533493, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20913004502654076, "step": 1426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1661.5, "completions/max_terminated_length": 1556.75, "completions/mean_length": 835.796875, "completions/mean_terminated_length": 820.8333435058594, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.7135, "grad_norm": 0.3206171691417694, "kl": 0.0487060546875, "learning_rate": 3.075531467469116e-07, "loss": 0.1088, "num_tokens": 108543243.0, "reward": 0.6880761384963989, "reward_std": 0.2467818520963192, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09989744424819946, "rewards/penalized_accuracy_reward/std": 0.10317365825176239, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 1427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1516.0, "completions/max_terminated_length": 1494.5, "completions/mean_length": 742.203125, "completions/mean_terminated_length": 696.8930358886719, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.714, "grad_norm": 0.37642550468444824, "kl": 0.03240966796875, "learning_rate": 3.0689185578505525e-07, "loss": 0.2214, "num_tokens": 108597880.0, "reward": 0.708842545747757, "reward_std": 0.24278623796999454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11223377287387848, "rewards/penalized_accuracy_reward/std": 0.1022270917892456, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0766641404479742, "step": 1428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1427.5, "completions/mean_length": 942.8125, "completions/mean_terminated_length": 790.9923477172852, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.7145, "grad_norm": 0.5264390707015991, "kl": 0.05279541015625, "learning_rate": 3.062313053727671e-07, "loss": 0.3031, "num_tokens": 108666476.0, "reward": 0.6778410077095032, "reward_std": 0.43609586358070374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11235800012946129, "rewards/penalized_accuracy_reward/std": 0.1803656369447708, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.21620866656303406, "step": 1429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1779.75, "completions/max_terminated_length": 1457.25, "completions/mean_length": 829.890625, "completions/mean_terminated_length": 791.8031463623047, "completions/min_length": 315.75, "completions/min_terminated_length": 315.75, "epoch": 0.715, "grad_norm": 0.45266175270080566, "kl": 0.034332275390625, "learning_rate": 3.055714975221981e-07, "loss": 0.1384, "num_tokens": 108728341.0, "reward": 0.536257266998291, "reward_std": 0.18467942997813225, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496456727385521, "rewards/penalized_accuracy_reward/std": 0.06821619719266891, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09649410098791122, "step": 1430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2032.5, "completions/max_terminated_length": 1430.5, "completions/mean_length": 814.203125, "completions/mean_terminated_length": 749.9427337646484, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.7155, "grad_norm": 0.5653957724571228, "kl": 
0.05572509765625, "learning_rate": 3.0491243424323783e-07, "loss": 0.268, "num_tokens": 108792450.0, "reward": 0.478515625, "reward_std": 0.07889364659786224, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1577872931957245, "step": 1431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1486.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 687.46875, "completions/mean_terminated_length": 663.6135559082031, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.716, "grad_norm": 0.6336126923561096, "kl": 0.04010009765625, "learning_rate": 3.0425411754350694e-07, "loss": 0.2198, "num_tokens": 108845440.0, "reward": 0.6592549681663513, "reward_std": 0.3441665694117546, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08743997570127249, "rewards/penalized_accuracy_reward/std": 0.14988258108496666, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11211910098791122, "step": 1432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2019.0, "completions/max_terminated_length": 1519.75, "completions/mean_length": 1273.46875, "completions/mean_terminated_length": 867.4680328369141, "completions/min_length": 418.75, "completions/min_terminated_length": 418.75, "epoch": 0.7165, "grad_norm": 0.28628823161125183, "kl": 0.058837890625, "learning_rate": 3.0359654942835247e-07, "loss": 0.2201, "num_tokens": 108937262.0, "reward": 0.4346599578857422, "reward_std": 0.212314760312438, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024947164580225945, "rewards/penalized_accuracy_reward/std": 0.06816863268613815, 
"rewards/tag_count_reward/mean": 0.76953125, "rewards/tag_count_reward/std": 0.19374465942382812, "step": 1433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1929.5, "completions/max_terminated_length": 1713.75, "completions/mean_length": 847.359375, "completions/mean_terminated_length": 787.6802368164062, "completions/min_length": 332.25, "completions/min_terminated_length": 332.25, "epoch": 0.717, "grad_norm": 0.4655703902244568, "kl": 0.035308837890625, "learning_rate": 3.029397319008407e-07, "loss": 0.2407, "num_tokens": 109003813.0, "reward": 0.5723491013050079, "reward_std": 0.2429777067154646, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049846433103084564, "rewards/penalized_accuracy_reward/std": 0.08916802704334259, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.16622861847281456, "step": 1434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2045.5, "completions/max_terminated_length": 1799.75, "completions/mean_length": 1208.265625, "completions/mean_terminated_length": 1074.0604400634766, "completions/min_length": 502.75, "completions/min_terminated_length": 502.75, "epoch": 0.7175, "grad_norm": 0.2515089213848114, "kl": 0.03485107421875, "learning_rate": 3.02283666961752e-07, "loss": 0.127, "num_tokens": 109089110.0, "reward": 0.5721243470907211, "reward_std": 0.35796893760561943, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062429361045360565, "rewards/penalized_accuracy_reward/std": 0.1487622857093811, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.20797448605298996, "step": 1435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1994.5, "completions/max_terminated_length": 1856.25, "completions/mean_length": 1223.9375, 
"completions/mean_terminated_length": 1121.4298553466797, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.718, "grad_norm": 0.30916744470596313, "kl": 0.033721923828125, "learning_rate": 3.016283566095739e-07, "loss": 0.1587, "num_tokens": 109176642.0, "reward": 0.5911406576633453, "reward_std": 0.40175894275307655, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07486720383167267, "rewards/penalized_accuracy_reward/std": 0.1609596610069275, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.23428086936473846, "step": 1436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1366.75, "completions/max_terminated_length": 1304.0, "completions/mean_length": 754.609375, "completions/mean_terminated_length": 688.4322967529297, "completions/min_length": 310.25, "completions/min_terminated_length": 310.25, "epoch": 0.7185, "grad_norm": 0.4495655298233032, "kl": 0.0445556640625, "learning_rate": 3.0097380284049523e-07, "loss": 0.0599, "num_tokens": 109235225.0, "reward": 0.6244693398475647, "reward_std": 0.3636764995753765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07492998242378235, "rewards/penalized_accuracy_reward/std": 0.16109462827444077, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.08297448605298996, "step": 1437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1862.25, "completions/max_terminated_length": 1474.25, "completions/mean_length": 875.78125, "completions/mean_terminated_length": 792.2113494873047, "completions/min_length": 333.75, "completions/min_terminated_length": 333.75, "epoch": 0.719, "grad_norm": 0.5074045658111572, "kl": 0.039764404296875, "learning_rate": 3.003200076484004e-07, "loss": 0.2392, "num_tokens": 109299307.0, "reward": 
0.5396661460399628, "reward_std": 0.23737742751836777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03741119056940079, "rewards/penalized_accuracy_reward/std": 0.08043165504932404, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1994442492723465, "step": 1438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2004.5, "completions/max_terminated_length": 1890.0, "completions/mean_length": 1001.125, "completions/mean_terminated_length": 932.5156402587891, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.7195, "grad_norm": 0.42740342020988464, "kl": 0.0433349609375, "learning_rate": 2.996669730248628e-07, "loss": 0.2142, "num_tokens": 109372387.0, "reward": 0.466796875, "reward_std": 0.09230193309485912, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18460387364029884, "step": 1439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1747.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 977.8125, "completions/mean_terminated_length": 928.5995788574219, "completions/min_length": 431.75, "completions/min_terminated_length": 431.75, "epoch": 0.72, "grad_norm": 0.32717740535736084, "kl": 0.03863525390625, "learning_rate": 2.9901470095913943e-07, "loss": 0.0498, "num_tokens": 109447015.0, "reward": 0.5781287997961044, "reward_std": 0.21661613881587982, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049806587398052216, "rewards/penalized_accuracy_reward/std": 0.08909673988819122, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1342380754649639, "step": 
1440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1884.25, "completions/max_terminated_length": 1376.5, "completions/mean_length": 997.640625, "completions/mean_terminated_length": 833.260986328125, "completions/min_length": 381.25, "completions/min_terminated_length": 381.25, "epoch": 0.7205, "grad_norm": 0.42204147577285767, "kl": 0.05633544921875, "learning_rate": 2.9836319343816397e-07, "loss": 0.235, "num_tokens": 109522464.0, "reward": 0.5930731892585754, "reward_std": 0.3780023790895939, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07485690247267485, "rewards/penalized_accuracy_reward/std": 0.14547301456332207, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.22226756066083908, "step": 1441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1842.75, "completions/max_terminated_length": 1456.25, "completions/mean_length": 859.234375, "completions/mean_terminated_length": 800.8427429199219, "completions/min_length": 309.25, "completions/min_terminated_length": 309.25, "epoch": 0.721, "grad_norm": 0.9905495047569275, "kl": 0.049102783203125, "learning_rate": 2.977124524465413e-07, "loss": 0.2452, "num_tokens": 109585151.0, "reward": 0.7299788594245911, "reward_std": 0.25040099769830704, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12475505471229553, "rewards/penalized_accuracy_reward/std": 0.0998040959239006, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13048820197582245, "step": 1442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1516.75, "completions/max_terminated_length": 1214.5, "completions/mean_length": 829.546875, "completions/mean_terminated_length": 739.1299285888672, "completions/min_length": 330.25, 
"completions/min_terminated_length": 330.25, "epoch": 0.7215, "grad_norm": 0.5615711212158203, "kl": 0.0682373046875, "learning_rate": 2.9706247996654134e-07, "loss": 0.2041, "num_tokens": 109649634.0, "reward": 0.466796875, "reward_std": 0.07273724302649498, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14547448605298996, "step": 1443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1743.5, "completions/max_terminated_length": 1580.0, "completions/mean_length": 947.484375, "completions/mean_terminated_length": 912.8125152587891, "completions/min_length": 404.75, "completions/min_terminated_length": 404.75, "epoch": 0.722, "grad_norm": 0.4059709906578064, "kl": 0.049346923828125, "learning_rate": 2.964132779780929e-07, "loss": 0.1699, "num_tokens": 109718129.0, "reward": 0.6591650247573853, "reward_std": 0.33799891360104084, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08739501144737005, "rewards/penalized_accuracy_reward/std": 0.14983342215418816, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0766641404479742, "step": 1444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1638.75, "completions/max_terminated_length": 1292.0, "completions/mean_length": 810.15625, "completions/mean_terminated_length": 693.6011505126953, "completions/min_length": 219.25, "completions/min_terminated_length": 219.25, "epoch": 0.7225, "grad_norm": 0.598197877407074, "kl": 0.0645751953125, "learning_rate": 2.9576484845877793e-07, "loss": 0.3228, "num_tokens": 109776811.0, "reward": 0.8315258622169495, "reward_std": 0.48475800454616547, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.18724728375673294, "rewards/penalized_accuracy_reward/std": 0.20219885557889938, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20643002539873123, "step": 1445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1797.25, "completions/max_terminated_length": 1689.75, "completions/mean_length": 836.109375, "completions/mean_terminated_length": 803.0812835693359, "completions/min_length": 309.75, "completions/min_terminated_length": 309.75, "epoch": 0.723, "grad_norm": 0.5022996664047241, "kl": 0.03912353515625, "learning_rate": 2.9511719338382535e-07, "loss": 0.0317, "num_tokens": 109840450.0, "reward": 0.5014487504959106, "reward_std": 0.16231461241841316, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012443129904568195, "rewards/penalized_accuracy_reward/std": 0.04977252334356308, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16395078226923943, "step": 1446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1761.0, "completions/max_terminated_length": 1469.25, "completions/mean_length": 770.625, "completions/mean_terminated_length": 729.4541931152344, "completions/min_length": 328.75, "completions/min_terminated_length": 328.75, "epoch": 0.7235, "grad_norm": 0.5236214399337769, "kl": 0.048828125, "learning_rate": 2.944703147261046e-07, "loss": 0.2123, "num_tokens": 109897338.0, "reward": 0.484375, "reward_std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 1447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1750.0, "completions/max_terminated_length": 1705.0, 
"completions/mean_length": 1023.640625, "completions/mean_terminated_length": 997.8794860839844, "completions/min_length": 430.25, "completions/min_terminated_length": 430.25, "epoch": 0.724, "grad_norm": 0.3686748445034027, "kl": 0.034088134765625, "learning_rate": 2.938242144561201e-07, "loss": -0.1088, "num_tokens": 109970355.0, "reward": 0.6380406618118286, "reward_std": 0.3999178782105446, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07487971428781748, "rewards/penalized_accuracy_reward/std": 0.1985740177333355, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07966229319572449, "step": 1448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1682.5, "completions/max_terminated_length": 1433.75, "completions/mean_length": 865.890625, "completions/mean_terminated_length": 774.6578674316406, "completions/min_length": 368.5, "completions/min_terminated_length": 368.5, "epoch": 0.7245, "grad_norm": 0.4417184591293335, "kl": 0.04742431640625, "learning_rate": 2.931788945420058e-07, "loss": 0.1583, "num_tokens": 110038988.0, "reward": 0.466796875, "reward_std": 0.05750272050499916, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.11500544100999832, "step": 1449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1567.25, "completions/mean_length": 986.109375, "completions/mean_terminated_length": 871.0756988525391, "completions/min_length": 363.25, "completions/min_terminated_length": 363.25, "epoch": 0.725, "grad_norm": 0.3777114152908325, "kl": 0.036376953125, "learning_rate": 2.925343569495178e-07, "loss": 0.2496, "num_tokens": 110110451.0, "reward": 
0.5975950360298157, "reward_std": 0.2592913657426834, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06246939301490784, "rewards/penalized_accuracy_reward/std": 0.09569582343101501, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14839753694832325, "step": 1450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1515.75, "completions/mean_length": 932.515625, "completions/mean_terminated_length": 793.5555572509766, "completions/min_length": 329.25, "completions/min_terminated_length": 329.25, "epoch": 0.7255, "grad_norm": 0.41177237033843994, "kl": 0.047119140625, "learning_rate": 2.918906036420294e-07, "loss": 0.2497, "num_tokens": 110178884.0, "reward": 0.66069296002388, "reward_std": 0.48051881790161133, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0998777262866497, "rewards/penalized_accuracy_reward/std": 0.21376243978738785, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21890868619084358, "step": 1451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1711.0, "completions/max_terminated_length": 1682.75, "completions/mean_length": 1051.859375, "completions/mean_terminated_length": 1024.7901916503906, "completions/min_length": 489.25, "completions/min_terminated_length": 489.25, "epoch": 0.726, "grad_norm": 0.2792389690876007, "kl": 0.0243072509765625, "learning_rate": 2.9124763658052474e-07, "loss": 0.0627, "num_tokens": 110254331.0, "reward": 0.7395821213722229, "reward_std": 0.40268463641405106, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12467387318611145, "rewards/penalized_accuracy_reward/std": 0.1889485865831375, "rewards/tag_count_reward/mean": 0.98046875, 
"rewards/tag_count_reward/std": 0.049575019627809525, "step": 1452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1235.0, "completions/max_terminated_length": 1146.5, "completions/mean_length": 674.484375, "completions/mean_terminated_length": 657.8760528564453, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.7265, "grad_norm": 0.5563107132911682, "kl": 0.04248046875, "learning_rate": 2.9060545772359305e-07, "loss": 0.1583, "num_tokens": 110306362.0, "reward": 1.1933103054761887, "reward_std": 0.7755820900201797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.34958484396338463, "rewards/penalized_accuracy_reward/std": 0.385911226272583, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.033994100987911224, "step": 1453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1681.75, "completions/max_terminated_length": 1494.25, "completions/mean_length": 964.28125, "completions/mean_terminated_length": 847.2569274902344, "completions/min_length": 461.75, "completions/min_terminated_length": 461.75, "epoch": 0.727, "grad_norm": 0.28788813948631287, "kl": 0.04791259765625, "learning_rate": 2.8996406902742267e-07, "loss": 0.1191, "num_tokens": 110376108.0, "reward": 0.5857958495616913, "reward_std": 0.23480867967009544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242917478084564, "rewards/penalized_accuracy_reward/std": 0.09563422203063965, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1237528920173645, "step": 1454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1767.75, "completions/max_terminated_length": 1666.75, "completions/mean_length": 982.890625, "completions/mean_terminated_length": 965.9521026611328, 
"completions/min_length": 460.75, "completions/min_terminated_length": 460.75, "epoch": 0.7275, "grad_norm": 0.6759796142578125, "kl": 0.050628662109375, "learning_rate": 2.893234724457946e-07, "loss": -0.0322, "num_tokens": 110448149.0, "reward": 0.5362275540828705, "reward_std": 0.17137226089835167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02494971640408039, "rewards/penalized_accuracy_reward/std": 0.06817561388015747, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09649410098791122, "step": 1455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1253.25, "completions/mean_length": 985.78125, "completions/mean_terminated_length": 776.8641204833984, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.728, "grad_norm": 0.47660619020462036, "kl": 0.07757568359375, "learning_rate": 2.886836699300771e-07, "loss": 0.321, "num_tokens": 110521271.0, "reward": 0.5716374218463898, "reward_std": 0.40819242782890797, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07488120719790459, "rewards/penalized_accuracy_reward/std": 0.1575085073709488, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.27782439440488815, "step": 1456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1585.75, "completions/max_terminated_length": 1443.0, "completions/mean_length": 862.640625, "completions/mean_terminated_length": 731.7552032470703, "completions/min_length": 336.75, "completions/min_terminated_length": 336.75, "epoch": 0.7285, "grad_norm": 0.4678838849067688, "kl": 0.04925537109375, "learning_rate": 2.8804466342921987e-07, "loss": 0.2023, "num_tokens": 110584800.0, "reward": 0.7239449620246887, "reward_std": 0.4335031621158123, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13736310973763466, "rewards/penalized_accuracy_reward/std": 0.18373863399028778, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.14336910098791122, "step": 1457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1933.5, "completions/max_terminated_length": 1924.0, "completions/mean_length": 1050.375, "completions/mean_terminated_length": 1028.6272583007812, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.729, "grad_norm": 0.20350545644760132, "kl": 0.032806396484375, "learning_rate": 2.874064548897472e-07, "loss": -0.0296, "num_tokens": 110660664.0, "reward": 0.8109149634838104, "reward_std": 0.32025760412216187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16229341737926006, "rewards/penalized_accuracy_reward/std": 0.16379327327013016, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.06887998431921005, "step": 1458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1873.0, "completions/max_terminated_length": 1391.25, "completions/mean_length": 754.921875, "completions/mean_terminated_length": 673.5611877441406, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.7295, "grad_norm": 0.5646243691444397, "kl": 0.057464599609375, "learning_rate": 2.86769046255753e-07, "loss": 0.3311, "num_tokens": 110717203.0, "reward": 0.8527767360210419, "reward_std": 0.2759924903512001, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18713055737316608, "rewards/penalized_accuracy_reward/std": 0.11801060661673546, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12808074057102203, "step": 1459 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1454.25, "completions/max_terminated_length": 1156.5, "completions/mean_length": 623.578125, "completions/mean_terminated_length": 580.8937683105469, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.73, "grad_norm": 0.4084329307079315, "kl": 0.043914794921875, "learning_rate": 2.8613243946889477e-07, "loss": 0.0997, "num_tokens": 110765672.0, "reward": 0.6263240873813629, "reward_std": 0.2520482875406742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07488079369068146, "rewards/penalized_accuracy_reward/std": 0.09984106570482254, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14040156453847885, "step": 1460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1841.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 907.921875, "completions/mean_terminated_length": 782.373046875, "completions/min_length": 307.25, "completions/min_terminated_length": 307.25, "epoch": 0.7305, "grad_norm": 0.38695257902145386, "kl": 0.03997802734375, "learning_rate": 2.854966364683872e-07, "loss": 0.1991, "num_tokens": 110833507.0, "reward": 0.5147695094347, "reward_std": 0.2037464790046215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024962877854704857, "rewards/penalized_accuracy_reward/std": 0.06821159273386002, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16109948605298996, "step": 1461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1641.5, "completions/max_terminated_length": 1218.5, "completions/mean_length": 760.15625, "completions/mean_terminated_length": 714.7437591552734, "completions/min_length": 206.25, "completions/min_terminated_length": 206.25, "epoch": 
0.731, "grad_norm": 0.6431498527526855, "kl": 0.05084228515625, "learning_rate": 2.848616391909959e-07, "loss": 0.2961, "num_tokens": 110891373.0, "reward": 0.482421875, "reward_std": 0.058317553251981735, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11663510836660862, "step": 1462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1637.75, "completions/max_terminated_length": 1364.75, "completions/mean_length": 787.109375, "completions/mean_terminated_length": 723.1732330322266, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7315, "grad_norm": 0.35708510875701904, "kl": 0.0374755859375, "learning_rate": 2.842274495710335e-07, "loss": 0.137, "num_tokens": 110953172.0, "reward": 1.1024740040302277, "reward_std": 0.7249516770243645, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3119791792705655, "rewards/penalized_accuracy_reward/std": 0.3544031009078026, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11534032225608826, "step": 1463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1391.25, "completions/max_terminated_length": 1364.75, "completions/mean_length": 737.3125, "completions/mean_terminated_length": 724.8760681152344, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.732, "grad_norm": 0.30297812819480896, "kl": 0.039306640625, "learning_rate": 2.835940695403512e-07, "loss": 0.0576, "num_tokens": 111014488.0, "reward": 0.484375, "reward_std": 0.04081955552101135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0816391110420227, "step": 1464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1520.75, "completions/max_terminated_length": 1326.25, "completions/mean_length": 935.265625, "completions/mean_terminated_length": 852.4411315917969, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.7325, "grad_norm": 0.4586212933063507, "kl": 0.042877197265625, "learning_rate": 2.829615010283344e-07, "loss": 0.1331, "num_tokens": 111085017.0, "reward": 0.6416227221488953, "reward_std": 0.36108206026256084, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741291984915733, "rewards/penalized_accuracy_reward/std": 0.14986519142985344, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.12270338460803032, "step": 1465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1453.5, "completions/max_terminated_length": 1237.5, "completions/mean_length": 691.0625, "completions/mean_terminated_length": 668.7166748046875, "completions/min_length": 310.25, "completions/min_terminated_length": 310.25, "epoch": 0.733, "grad_norm": 0.24801822006702423, "kl": 0.04534912109375, "learning_rate": 2.8232974596189653e-07, "loss": 0.0526, "num_tokens": 111137517.0, "reward": 0.5690807700157166, "reward_std": 0.16566407680511475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037470072507858276, "rewards/penalized_accuracy_reward/std": 0.08055824041366577, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1875.5, "completions/max_terminated_length": 1654.0, "completions/mean_length": 
928.296875, "completions/mean_terminated_length": 843.9516143798828, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.7335, "grad_norm": 0.40530070662498474, "kl": 0.038909912109375, "learning_rate": 2.8169880626547283e-07, "loss": 0.1896, "num_tokens": 111207888.0, "reward": 0.7683427035808563, "reward_std": 0.2513731084764004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14979636669158936, "rewards/penalized_accuracy_reward/std": 0.08932136744260788, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.14546075090765953, "step": 1467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1347.75, "completions/max_terminated_length": 1202.5, "completions/mean_length": 625.796875, "completions/mean_terminated_length": 603.7802124023438, "completions/min_length": 284.25, "completions/min_terminated_length": 284.25, "epoch": 0.734, "grad_norm": 0.5572853088378906, "kl": 0.06512451171875, "learning_rate": 2.8106868386101545e-07, "loss": 0.083, "num_tokens": 111258019.0, "reward": 0.6072855293750763, "reward_std": 0.24237230233848095, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06243182718753815, "rewards/penalized_accuracy_reward/std": 0.09563830494880676, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10219132527709007, "step": 1468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1658.25, "completions/max_terminated_length": 1491.25, "completions/mean_length": 1015.15625, "completions/mean_terminated_length": 940.5485076904297, "completions/min_length": 520.25, "completions/min_terminated_length": 520.25, "epoch": 0.7345, "grad_norm": 0.2985619008541107, "kl": 0.04583740234375, "learning_rate": 2.8043938066798645e-07, "loss": 0.2235, "num_tokens": 111330861.0, 
"reward": 0.6415592432022095, "reward_std": 0.26536176539957523, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.12140204757452011, "step": 1469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 672.171875, "completions/mean_terminated_length": 672.171875, "completions/min_length": 343.5, "completions/min_terminated_length": 343.5, "epoch": 0.735, "grad_norm": 0.3850691020488739, "kl": 0.03741455078125, "learning_rate": 2.7981089860335225e-07, "loss": 0.0341, "num_tokens": 111384696.0, "reward": 0.6189433634281158, "reward_std": 0.20832183957099915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06240135803818703, "rewards/penalized_accuracy_reward/std": 0.09559161216020584, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1244.75, "completions/max_terminated_length": 1244.75, "completions/mean_length": 597.09375, "completions/mean_terminated_length": 597.09375, "completions/min_length": 224.5, "completions/min_terminated_length": 224.5, "epoch": 0.7355, "grad_norm": 0.3940978944301605, "kl": 0.040557861328125, "learning_rate": 2.791832395815782e-07, "loss": 0.0163, "num_tokens": 111431038.0, "reward": 0.49609375, "reward_std": 0.015625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, 
"completions/max_length": 1739.5, "completions/max_terminated_length": 1739.5, "completions/mean_length": 903.640625, "completions/mean_terminated_length": 903.640625, "completions/min_length": 455.25, "completions/min_terminated_length": 455.25, "epoch": 0.736, "grad_norm": 0.12680335342884064, "kl": 0.03070068359375, "learning_rate": 2.7855640551462287e-07, "loss": 0.0216, "num_tokens": 111499623.0, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1767.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 818.3125, "completions/mean_terminated_length": 718.4244384765625, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.7365, "grad_norm": 0.5195584893226624, "kl": 0.04205322265625, "learning_rate": 2.7793039831193133e-07, "loss": 0.3513, "num_tokens": 111561099.0, "reward": 0.845478892326355, "reward_std": 0.1805976778268814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1873878836631775, "rewards/penalized_accuracy_reward/std": 0.04997013136744499, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16131485998630524, "step": 1473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1621.75, "completions/max_terminated_length": 1419.75, "completions/mean_length": 960.828125, "completions/mean_terminated_length": 894.4047393798828, "completions/min_length": 382.5, "completions/min_terminated_length": 382.5, "epoch": 0.737, "grad_norm": 0.3492174446582794, "kl": 0.033355712890625, "learning_rate": 2.773052198804301e-07, "loss": 
0.1834, "num_tokens": 111632496.0, "reward": 0.7991430759429932, "reward_std": 0.22253437712788582, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16226685047149658, "rewards/penalized_accuracy_reward/std": 0.08050691336393356, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.12304110452532768, "step": 1474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1835.25, "completions/max_terminated_length": 1518.25, "completions/mean_length": 1085.953125, "completions/mean_terminated_length": 955.4562683105469, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7375, "grad_norm": 0.31642189621925354, "kl": 0.037200927734375, "learning_rate": 2.766808721245211e-07, "loss": 0.0755, "num_tokens": 111709981.0, "reward": 0.5029232203960419, "reward_std": 0.2511560320854187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02489911112934351, "rewards/penalized_accuracy_reward/std": 0.09959645196795464, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1981821097433567, "step": 1475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1569.25, "completions/max_terminated_length": 1375.25, "completions/mean_length": 889.421875, "completions/mean_terminated_length": 822.2678985595703, "completions/min_length": 449.25, "completions/min_terminated_length": 449.25, "epoch": 0.738, "grad_norm": 0.39910420775413513, "kl": 0.030181884765625, "learning_rate": 2.760573569460757e-07, "loss": 0.2803, "num_tokens": 111777240.0, "reward": 0.8050296157598495, "reward_std": 0.45091212913393974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16228043287992477, "rewards/penalized_accuracy_reward/std": 0.19877130538225174, 
"rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10673907771706581, "step": 1476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1551.5, "completions/max_terminated_length": 1460.0, "completions/mean_length": 800.390625, "completions/mean_terminated_length": 744.609375, "completions/min_length": 396.75, "completions/min_terminated_length": 396.75, "epoch": 0.7385, "grad_norm": 0.27085673809051514, "kl": 0.03143310546875, "learning_rate": 2.7543467624442956e-07, "loss": 0.0572, "num_tokens": 111839409.0, "reward": 0.4765625, "reward_std": 0.05325498431921005, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1065099686384201, "step": 1477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1870.25, "completions/max_terminated_length": 1308.0, "completions/mean_length": 803.65625, "completions/mean_terminated_length": 724.7250366210938, "completions/min_length": 387.75, "completions/min_terminated_length": 387.75, "epoch": 0.739, "grad_norm": 0.5016718506813049, "kl": 0.03912353515625, "learning_rate": 2.7481283191637605e-07, "loss": 0.2536, "num_tokens": 111900987.0, "reward": 0.48592036962509155, "reward_std": 0.1786623653024435, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491435743868351, "rewards/penalized_accuracy_reward/std": 0.0499657467007637, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.20562516152858734, "step": 1478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1538.0, "completions/max_terminated_length": 1511.5, "completions/mean_length": 856.90625, "completions/mean_terminated_length": 811.1814880371094, 
"completions/min_length": 356.5, "completions/min_terminated_length": 356.5, "epoch": 0.7395, "grad_norm": 0.3368953466415405, "kl": 0.02874755859375, "learning_rate": 2.741918258561607e-07, "loss": 0.2098, "num_tokens": 111964469.0, "reward": 0.8011297583580017, "reward_std": 0.21979805827140808, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16228362917900085, "rewards/penalized_accuracy_reward/std": 0.08051533252000809, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13096532225608826, "step": 1479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1667.0, "completions/max_terminated_length": 1266.25, "completions/mean_length": 883.515625, "completions/mean_terminated_length": 775.7478942871094, "completions/min_length": 371.75, "completions/min_terminated_length": 371.75, "epoch": 0.74, "grad_norm": 0.4916079342365265, "kl": 0.043060302734375, "learning_rate": 2.7357165995547547e-07, "loss": 0.2129, "num_tokens": 112029094.0, "reward": 0.7624778002500534, "reward_std": 0.49067065864801407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14979358483105898, "rewards/penalized_accuracy_reward/std": 0.2203790806233883, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.18270447477698326, "step": 1480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2011.75, "completions/max_terminated_length": 1458.75, "completions/mean_length": 874.03125, "completions/mean_terminated_length": 724.3528594970703, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.7405, "grad_norm": 0.4739307463169098, "kl": 0.07110595703125, "learning_rate": 2.729523361034538e-07, "loss": 0.3212, "num_tokens": 112094344.0, "reward": 0.5740940719842911, "reward_std": 0.35999055206775665, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06243766378611326, "rewards/penalized_accuracy_reward/std": 0.1393023133277893, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2072637416422367, "step": 1481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1903.0, "completions/max_terminated_length": 1365.25, "completions/mean_length": 855.4375, "completions/mean_terminated_length": 768.4701385498047, "completions/min_length": 477.5, "completions/min_terminated_length": 477.5, "epoch": 0.741, "grad_norm": 0.6203644275665283, "kl": 0.05438232421875, "learning_rate": 2.7233385618666315e-07, "loss": 0.2679, "num_tokens": 112162756.0, "reward": 0.5628134310245514, "reward_std": 0.31495633721351624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499613992869854, "rewards/penalized_accuracy_reward/std": 0.13652053475379944, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.14997504279017448, "step": 1482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1747.75, "completions/max_terminated_length": 1722.75, "completions/mean_length": 818.78125, "completions/mean_terminated_length": 805.2885437011719, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.7415, "grad_norm": 0.39767947793006897, "kl": 0.036346435546875, "learning_rate": 2.717162220891007e-07, "loss": 0.0133, "num_tokens": 112224982.0, "reward": 0.5690976828336716, "reward_std": 0.2599314823746681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747852426022291, "rewards/penalized_accuracy_reward/std": 0.11824698746204376, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1483 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.015625, "completions/max_length": 1324.75, "completions/max_terminated_length": 1301.25, "completions/mean_length": 716.03125, "completions/mean_terminated_length": 696.8823089599609, "completions/min_length": 280.25, "completions/min_terminated_length": 280.25, "epoch": 0.742, "grad_norm": 0.3314177691936493, "kl": 0.03900146484375, "learning_rate": 2.7109943569218707e-07, "loss": 0.034, "num_tokens": 112278776.0, "reward": 0.7667887657880783, "reward_std": 0.44659894704818726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13730063196271658, "rewards/penalized_accuracy_reward/std": 0.22120999917387962, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.03608439117670059, "step": 1484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1733.5, "completions/max_terminated_length": 1370.5, "completions/mean_length": 906.671875, "completions/mean_terminated_length": 774.4092407226562, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.7425, "grad_norm": 0.44501450657844543, "kl": 0.0496826171875, "learning_rate": 2.7048349887476037e-07, "loss": 0.3032, "num_tokens": 112348387.0, "reward": 0.45703125, "reward_std": 0.09150741621851921, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18301483243703842, "step": 1485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1963.75, "completions/max_terminated_length": 1729.75, "completions/mean_length": 1048.984375, "completions/mean_terminated_length": 928.7300720214844, "completions/min_length": 463.25, "completions/min_terminated_length": 463.25, "epoch": 0.743, "grad_norm": 0.39518406987190247, "kl": 
0.0369873046875, "learning_rate": 2.698684135130713e-07, "loss": 0.2036, "num_tokens": 112427794.0, "reward": 0.4819978326559067, "reward_std": 0.18587381020188332, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012483292259275913, "rewards/penalized_accuracy_reward/std": 0.04993316903710365, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22017251700162888, "step": 1486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1704.5, "completions/max_terminated_length": 1206.25, "completions/mean_length": 734.828125, "completions/mean_terminated_length": 661.856559753418, "completions/min_length": 257.25, "completions/min_terminated_length": 257.25, "epoch": 0.7435, "grad_norm": 0.3435825705528259, "kl": 0.065643310546875, "learning_rate": 2.692541814807763e-07, "loss": 0.1546, "num_tokens": 112485623.0, "reward": 0.6571137309074402, "reward_std": 0.353252649307251, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873459242284298, "rewards/penalized_accuracy_reward/std": 0.16976428031921387, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 988.234375, "completions/mean_terminated_length": 917.5833587646484, "completions/min_length": 413.25, "completions/min_terminated_length": 413.25, "epoch": 0.744, "grad_norm": 0.47845208644866943, "kl": 0.03363037109375, "learning_rate": 2.686408046489328e-07, "loss": 0.1919, "num_tokens": 112558326.0, "reward": 0.5514298230409622, "reward_std": 0.22938118502497673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, 
"rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.17461910098791122, "step": 1488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1394.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 781.53125, "completions/mean_terminated_length": 727.3581848144531, "completions/min_length": 333.75, "completions/min_terminated_length": 333.75, "epoch": 0.7445, "grad_norm": 0.33030349016189575, "kl": 0.044647216796875, "learning_rate": 2.6802828488599294e-07, "loss": 0.1803, "num_tokens": 112615656.0, "reward": 0.8821984529495239, "reward_std": 0.03788809345132904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19988827407360077, "rewards/penalized_accuracy_reward/std": 4.812537008547224e-05, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 1489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.25, "completions/max_terminated_length": 1150.25, "completions/mean_length": 555.734375, "completions/mean_terminated_length": 555.734375, "completions/min_length": 243.75, "completions/min_terminated_length": 243.75, "epoch": 0.745, "grad_norm": 0.24373461306095123, "kl": 0.04461669921875, "learning_rate": 2.6741662405779796e-07, "loss": 0.0051, "num_tokens": 112660743.0, "reward": 0.7748057544231415, "reward_std": 0.19135020673274994, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13740287721157074, "rewards/penalized_accuracy_reward/std": 0.09567510336637497, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1797.25, "completions/max_terminated_length": 1582.5, "completions/mean_length": 893.28125, 
"completions/mean_terminated_length": 777.5367279052734, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.7455, "grad_norm": 0.36956459283828735, "kl": 0.0439453125, "learning_rate": 2.6680582402757324e-07, "loss": 0.0929, "num_tokens": 112726809.0, "reward": 0.8602784126996994, "reward_std": 0.6305230408906937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1996704414486885, "rewards/penalized_accuracy_reward/std": 0.2909936159849167, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2177521139383316, "step": 1491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1810.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 838.96875, "completions/mean_terminated_length": 732.1927337646484, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.746, "grad_norm": 0.4715869426727295, "kl": 0.075439453125, "learning_rate": 2.661958866559213e-07, "loss": 0.3314, "num_tokens": 112790391.0, "reward": 0.4765625, "reward_std": 0.07127426192164421, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14254852384328842, "step": 1492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1842.5, "completions/max_terminated_length": 1625.25, "completions/mean_length": 901.546875, "completions/mean_terminated_length": 815.921875, "completions/min_length": 337.75, "completions/min_terminated_length": 337.75, "epoch": 0.7465, "grad_norm": 0.3802449405193329, "kl": 0.034942626953125, "learning_rate": 2.655868138008171e-07, "loss": 0.2576, "num_tokens": 112856858.0, "reward": 0.5358599275350571, "reward_std": 0.3005293030291796, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746121469885111, "rewards/penalized_accuracy_reward/std": 0.11819974705576897, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.1550339087843895, "step": 1493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1690.25, "completions/max_terminated_length": 1466.0, "completions/mean_length": 975.25, "completions/mean_terminated_length": 927.8236999511719, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.747, "grad_norm": 0.3629263937473297, "kl": 0.04034423828125, "learning_rate": 2.649786073176025e-07, "loss": 0.0602, "num_tokens": 112928010.0, "reward": 0.7821177989244461, "reward_std": 0.41654495149850845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14984798058867455, "rewards/penalized_accuracy_reward/std": 0.19981054589152336, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1468.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 697.765625, "completions/mean_terminated_length": 682.7593994140625, "completions/min_length": 331.25, "completions/min_terminated_length": 331.25, "epoch": 0.7475, "grad_norm": 0.27327749133110046, "kl": 0.0413818359375, "learning_rate": 2.6437126905897967e-07, "loss": -0.0105, "num_tokens": 112982219.0, "reward": 0.534234344959259, "reward_std": 0.1469617635011673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024929676204919815, "rewards/penalized_accuracy_reward/std": 0.06812085211277008, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08656632527709007, "step": 1495 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1744.25, "completions/max_terminated_length": 1388.5, "completions/mean_length": 826.421875, "completions/mean_terminated_length": 789.0791778564453, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.748, "grad_norm": 0.2421203851699829, "kl": 0.03668212890625, "learning_rate": 2.637648008750062e-07, "loss": 0.0461, "num_tokens": 113044870.0, "reward": 0.5861091166734695, "reward_std": 0.2701255604624748, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0498904949054122, "rewards/penalized_accuracy_reward/std": 0.13033835217356682, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1504.0, "completions/max_terminated_length": 1370.5, "completions/mean_length": 777.5, "completions/mean_terminated_length": 744.0000305175781, "completions/min_length": 315.25, "completions/min_terminated_length": 315.25, "epoch": 0.7485, "grad_norm": 0.33096253871917725, "kl": 0.033477783203125, "learning_rate": 2.631592046130896e-07, "loss": 0.0847, "num_tokens": 113102006.0, "reward": 0.7552451491355896, "reward_std": 0.23827575147151947, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.137388214468956, "rewards/penalized_accuracy_reward/std": 0.09566488116979599, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12048184871673584, "step": 1497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1777.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 1040.265625, "completions/mean_terminated_length": 875.4719085693359, "completions/min_length": 444.25, "completions/min_terminated_length": 444.25, "epoch": 
0.749, "grad_norm": 0.5081402659416199, "kl": 0.0577392578125, "learning_rate": 2.6255448211798103e-07, "loss": 0.2542, "num_tokens": 113176919.0, "reward": 0.45852112770080566, "reward_std": 0.19780654832720757, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012463688850402832, "rewards/penalized_accuracy_reward/std": 0.04985475912690163, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2519031912088394, "step": 1498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1818.75, "completions/max_terminated_length": 1571.0, "completions/mean_length": 916.625, "completions/mean_terminated_length": 845.7388610839844, "completions/min_length": 244.25, "completions/min_terminated_length": 244.25, "epoch": 0.7495, "grad_norm": 0.3234092593193054, "kl": 0.039703369140625, "learning_rate": 2.6195063523177e-07, "loss": 0.1641, "num_tokens": 113243743.0, "reward": 0.5745238810777664, "reward_std": 0.3739490769803524, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995725955814123, "rewards/penalized_accuracy_reward/std": 0.16816630959510803, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14986922964453697, "step": 1499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1920.0, "completions/max_terminated_length": 1668.75, "completions/mean_length": 1113.109375, "completions/mean_terminated_length": 1014.6420440673828, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "epoch": 0.75, "grad_norm": 0.39027631282806396, "kl": 0.041656494140625, "learning_rate": 2.613476657938789e-07, "loss": 0.1633, "num_tokens": 113325174.0, "reward": 0.5220755636692047, "reward_std": 0.24808958545327187, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.03740496560931206, "rewards/penalized_accuracy_reward/std": 0.08041838556528091, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.20040611550211906, "step": 1500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1928.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 974.21875, "completions/mean_terminated_length": 836.9899291992188, "completions/min_length": 400.25, "completions/min_terminated_length": 400.25, "epoch": 0.7505, "grad_norm": 0.44099321961402893, "kl": 0.043182373046875, "learning_rate": 2.6074557564105724e-07, "loss": 0.3101, "num_tokens": 113396564.0, "reward": 0.45703125, "reward_std": 0.09638972207903862, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1927794460207224, "step": 1501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1745.25, "completions/max_terminated_length": 1605.25, "completions/mean_length": 893.703125, "completions/mean_terminated_length": 842.9312896728516, "completions/min_length": 301.25, "completions/min_terminated_length": 301.25, "epoch": 0.751, "grad_norm": 0.3201117217540741, "kl": 0.04547119140625, "learning_rate": 2.6014436660737605e-07, "loss": -0.0026, "num_tokens": 113464321.0, "reward": 0.8278366923332214, "reward_std": 0.5225461721420288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1746605234220624, "rewards/penalized_accuracy_reward/std": 0.25192341580986977, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14832578226923943, "step": 1502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1812.25, 
"completions/max_terminated_length": 1365.25, "completions/mean_length": 988.765625, "completions/mean_terminated_length": 901.1900634765625, "completions/min_length": 431.5, "completions/min_terminated_length": 431.5, "epoch": 0.7515, "grad_norm": 0.32183945178985596, "kl": 0.03424072265625, "learning_rate": 2.595440405242222e-07, "loss": 0.1976, "num_tokens": 113538434.0, "reward": 0.5626450181007385, "reward_std": 0.23949617333710194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049877192825078964, "rewards/penalized_accuracy_reward/std": 0.08922304213047028, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17960558086633682, "step": 1503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1560.5, "completions/max_terminated_length": 1208.25, "completions/mean_length": 741.28125, "completions/mean_terminated_length": 608.9675521850586, "completions/min_length": 278.75, "completions/min_terminated_length": 278.75, "epoch": 0.752, "grad_norm": 0.5607298612594604, "kl": 0.0548095703125, "learning_rate": 2.589445992202931e-07, "loss": 0.328, "num_tokens": 113594692.0, "reward": 0.5897131115198135, "reward_std": 0.34613717906177044, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062434676103293896, "rewards/penalized_accuracy_reward/std": 0.13930541276931763, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.16182629391551018, "step": 1504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1321.5, "completions/max_terminated_length": 1210.0, "completions/mean_length": 847.84375, "completions/mean_terminated_length": 789.4275512695312, "completions/min_length": 435.25, "completions/min_terminated_length": 435.25, "epoch": 0.7525, "grad_norm": 0.5274669528007507, "kl": 0.053314208984375, "learning_rate": 
2.583460445215911e-07, "loss": 0.0422, "num_tokens": 113658794.0, "reward": 0.9704430773854256, "reward_std": 0.5029632076621056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2498699752613902, "rewards/penalized_accuracy_reward/std": 0.24175799265503883, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11102426052093506, "step": 1505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1473.25, "completions/max_terminated_length": 1123.75, "completions/mean_length": 590.859375, "completions/mean_terminated_length": 544.9531402587891, "completions/min_length": 270.75, "completions/min_terminated_length": 270.75, "epoch": 0.753, "grad_norm": 0.49506130814552307, "kl": 0.044189453125, "learning_rate": 2.5774837825141736e-07, "loss": 0.1567, "num_tokens": 113703841.0, "reward": 0.5361989438533783, "reward_std": 0.2349633276462555, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024935407564044, "rewards/penalized_accuracy_reward/std": 0.0997416339814663, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.109375, "step": 1506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1576.75, "completions/max_terminated_length": 1158.25, "completions/mean_length": 628.140625, "completions/mean_terminated_length": 557.2775421142578, "completions/min_length": 296.25, "completions/min_terminated_length": 296.25, "epoch": 0.7535, "grad_norm": 0.5767908692359924, "kl": 0.04803466796875, "learning_rate": 2.571516022303671e-07, "loss": 0.4036, "num_tokens": 113753914.0, "reward": 1.0085208415985107, "reward_std": 0.4364822991192341, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.26207292079925537, "rewards/penalized_accuracy_reward/std": 0.19541556388139725, 
"rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1651.5, "completions/max_terminated_length": 1541.0, "completions/mean_length": 729.328125, "completions/mean_terminated_length": 711.8562622070312, "completions/min_length": 369.5, "completions/min_terminated_length": 369.5, "epoch": 0.754, "grad_norm": 0.527442216873169, "kl": 0.043243408203125, "learning_rate": 2.565557182763235e-07, "loss": 0.2408, "num_tokens": 113808975.0, "reward": 1.090892180800438, "reward_std": 0.5999251902103424, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.29935233294963837, "rewards/penalized_accuracy_reward/std": 0.28703273087739944, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1644.25, "completions/mean_length": 994.96875, "completions/mean_terminated_length": 907.673828125, "completions/min_length": 381.25, "completions/min_terminated_length": 381.25, "epoch": 0.7545, "grad_norm": 0.4380964934825897, "kl": 0.038330078125, "learning_rate": 2.5596072820445254e-07, "loss": 0.246, "num_tokens": 113881101.0, "reward": 0.7626858055591583, "reward_std": 0.3690687380731106, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14989758655428886, "rewards/penalized_accuracy_reward/std": 0.14562434703111649, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.2200700119137764, "step": 1509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1851.25, "completions/max_terminated_length": 1492.25, "completions/mean_length": 814.875, "completions/mean_terminated_length": 
756.8208465576172, "completions/min_length": 317.25, "completions/min_terminated_length": 317.25, "epoch": 0.755, "grad_norm": 0.49230071902275085, "kl": 0.05462646484375, "learning_rate": 2.5536663382719713e-07, "loss": 0.2017, "num_tokens": 113942869.0, "reward": 1.2761230766773224, "reward_std": 0.594248503446579, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3997802883386612, "rewards/penalized_accuracy_reward/std": 0.27238261699676514, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16395078226923943, "step": 1510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1996.75, "completions/max_terminated_length": 1743.0, "completions/mean_length": 1075.140625, "completions/mean_terminated_length": 991.6927185058594, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "epoch": 0.7555, "grad_norm": 0.3516116738319397, "kl": 0.03460693359375, "learning_rate": 2.547734369542718e-07, "loss": 0.2179, "num_tokens": 114020398.0, "reward": 0.6955142319202423, "reward_std": 0.36160567216575146, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11240555066615343, "rewards/penalized_accuracy_reward/std": 0.15314854681491852, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1697557382285595, "step": 1511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1948.25, "completions/max_terminated_length": 1433.25, "completions/mean_length": 857.96875, "completions/mean_terminated_length": 756.1845397949219, "completions/min_length": 353.25, "completions/min_terminated_length": 353.25, "epoch": 0.756, "grad_norm": 0.5034777522087097, "kl": 0.04705810546875, "learning_rate": 2.5418113939265686e-07, "loss": 0.2669, "num_tokens": 114083356.0, "reward": 0.52060666680336, "reward_std": 
0.20511173084378242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02495177462697029, "rewards/penalized_accuracy_reward/std": 0.06818123161792755, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.17055309563875198, "step": 1512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1905.75, "completions/max_terminated_length": 1499.25, "completions/mean_length": 1060.578125, "completions/mean_terminated_length": 921.3032684326172, "completions/min_length": 454.5, "completions/min_terminated_length": 454.5, "epoch": 0.7565, "grad_norm": 0.32802513241767883, "kl": 0.039093017578125, "learning_rate": 2.5358974294659373e-07, "loss": 0.1908, "num_tokens": 114159009.0, "reward": 0.5010482370853424, "reward_std": 0.22244012355804443, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024938184767961502, "rewards/penalized_accuracy_reward/std": 0.06814409792423248, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21050865203142166, "step": 1513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1688.25, "completions/max_terminated_length": 1621.0, "completions/mean_length": 954.15625, "completions/mean_terminated_length": 903.8230895996094, "completions/min_length": 402.75, "completions/min_terminated_length": 402.75, "epoch": 0.757, "grad_norm": 0.4097634255886078, "kl": 0.040374755859375, "learning_rate": 2.5299924941757843e-07, "loss": 0.1245, "num_tokens": 114233179.0, "reward": 0.5802472978830338, "reward_std": 0.2999313697218895, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049889277666807175, "rewards/penalized_accuracy_reward/std": 0.1303708851337433, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 
0.12654344737529755, "step": 1514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1849.25, "completions/max_terminated_length": 1559.0, "completions/mean_length": 1172.078125, "completions/mean_terminated_length": 989.2810363769531, "completions/min_length": 438.75, "completions/min_terminated_length": 438.75, "epoch": 0.7575, "grad_norm": 0.3490559458732605, "kl": 0.0484619140625, "learning_rate": 2.5240966060435674e-07, "loss": 0.2211, "num_tokens": 114321312.0, "reward": 0.4296875, "reward_std": 0.13360027596354485, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2672005519270897, "step": 1515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1904.5, "completions/max_terminated_length": 1327.0, "completions/mean_length": 787.96875, "completions/mean_terminated_length": 722.7208404541016, "completions/min_length": 360.75, "completions/min_terminated_length": 360.75, "epoch": 0.758, "grad_norm": 0.4976147413253784, "kl": 0.03729248046875, "learning_rate": 2.5182097830291824e-07, "loss": 0.2031, "num_tokens": 114381406.0, "reward": 0.6033119559288025, "reward_std": 0.31981780380010605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06239817105233669, "rewards/penalized_accuracy_reward/std": 0.1392539218068123, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.15207063034176826, "step": 1516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1566.75, "completions/max_terminated_length": 1566.75, "completions/mean_length": 825.78125, "completions/mean_terminated_length": 825.78125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7585, 
"grad_norm": 0.2437794804573059, "kl": 0.032440185546875, "learning_rate": 2.512332043064913e-07, "loss": -0.0012, "num_tokens": 114447200.0, "reward": 0.52103191614151, "reward_std": 0.1083884984254837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012469080276787281, "rewards/penalized_accuracy_reward/std": 0.049876321107149124, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1630.25, "completions/max_terminated_length": 1512.75, "completions/mean_length": 939.046875, "completions/mean_terminated_length": 833.5807800292969, "completions/min_length": 386.25, "completions/min_terminated_length": 386.25, "epoch": 0.759, "grad_norm": 0.3863738179206848, "kl": 0.0419921875, "learning_rate": 2.5064634040553767e-07, "loss": 0.1522, "num_tokens": 114517123.0, "reward": 0.7339111566543579, "reward_std": 0.26397200115025043, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13746339082717896, "rewards/penalized_accuracy_reward/std": 0.09571722149848938, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.14507511630654335, "step": 1518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1103.265625, "completions/mean_terminated_length": 848.8004302978516, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.7595, "grad_norm": 0.4440969228744507, "kl": 0.06011962890625, "learning_rate": 2.5006038838774647e-07, "loss": 0.361, "num_tokens": 114598292.0, "reward": 0.4140625, "reward_std": 0.14021281152963638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.28042563050985336, "step": 1519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.75, "completions/mean_length": 1115.40625, "completions/mean_terminated_length": 939.5686492919922, "completions/min_length": 317.25, "completions/min_terminated_length": 317.25, "epoch": 0.76, "grad_norm": 0.4155069887638092, "kl": 0.04638671875, "learning_rate": 2.494753500380291e-07, "loss": 0.3009, "num_tokens": 114682558.0, "reward": 0.5718967169523239, "reward_std": 0.37982504442334175, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062315549701452255, "rewards/penalized_accuracy_reward/std": 0.1484825238585472, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2276088111102581, "step": 1520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1303.5, "completions/mean_length": 841.96875, "completions/mean_terminated_length": 741.8405075073242, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.7605, "grad_norm": 0.5312321186065674, "kl": 0.05499267578125, "learning_rate": 2.488912271385139e-07, "loss": 0.3392, "num_tokens": 114747916.0, "reward": 0.4917479157447815, "reward_std": 0.1804827433079481, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012475521303713322, "rewards/penalized_accuracy_reward/std": 0.04990208521485329, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2036421075463295, "step": 1521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1593.25, "completions/max_terminated_length": 1302.5, 
"completions/mean_length": 995.453125, "completions/mean_terminated_length": 823.7916717529297, "completions/min_length": 463.75, "completions/min_terminated_length": 463.75, "epoch": 0.761, "grad_norm": 0.5015698075294495, "kl": 0.05377197265625, "learning_rate": 2.483080214685404e-07, "loss": 0.1994, "num_tokens": 114824889.0, "reward": 0.4375, "reward_std": 0.11024316772818565, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.2204863429069519, "step": 1522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1700.25, "completions/max_terminated_length": 1569.75, "completions/mean_length": 971.109375, "completions/mean_terminated_length": 920.4361877441406, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.7615, "grad_norm": 0.38381507992744446, "kl": 0.045806884765625, "learning_rate": 2.4772573480465445e-07, "loss": 0.1129, "num_tokens": 114894048.0, "reward": 0.5284202545881271, "reward_std": 0.18741203099489212, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024952314794063568, "rewards/penalized_accuracy_reward/std": 0.06818271428346634, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11575283855199814, "step": 1523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1169.5, "completions/max_terminated_length": 972.25, "completions/mean_length": 559.15625, "completions/mean_terminated_length": 497.0649108886719, "completions/min_length": 230.75, "completions/min_terminated_length": 230.75, "epoch": 0.762, "grad_norm": 0.48720625042915344, "kl": 0.071533203125, "learning_rate": 2.471443689206021e-07, "loss": 0.226, "num_tokens": 114938058.0, "reward": 0.8546810150146484, 
"reward_std": 0.5170491337776184, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1871061297133565, "rewards/penalized_accuracy_reward/std": 0.23896338045597076, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09120866656303406, "step": 1524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1567.25, "completions/max_terminated_length": 1365.0, "completions/mean_length": 809.609375, "completions/mean_terminated_length": 721.823974609375, "completions/min_length": 335.25, "completions/min_terminated_length": 335.25, "epoch": 0.7625, "grad_norm": 0.4748983681201935, "kl": 0.0467529296875, "learning_rate": 2.465639255873246e-07, "loss": 0.1452, "num_tokens": 115000113.0, "reward": 0.6664282828569412, "reward_std": 0.4843336045742035, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09981570020318031, "rewards/penalized_accuracy_reward/std": 0.21961357444524765, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.15754209086298943, "step": 1525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1948.5, "completions/max_terminated_length": 1463.75, "completions/mean_length": 1036.84375, "completions/mean_terminated_length": 985.2010650634766, "completions/min_length": 463.5, "completions/min_terminated_length": 463.5, "epoch": 0.763, "grad_norm": 0.4285919666290283, "kl": 0.04217529296875, "learning_rate": 2.4598440657295286e-07, "loss": 0.2047, "num_tokens": 115077111.0, "reward": 0.5398337244987488, "reward_std": 0.239563612267375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03749498724937439, "rewards/penalized_accuracy_reward/std": 0.08061179518699646, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 
0.19446197524666786, "step": 1526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1788.0, "completions/max_terminated_length": 1362.5, "completions/mean_length": 1147.3125, "completions/mean_terminated_length": 839.5260620117188, "completions/min_length": 359.75, "completions/min_terminated_length": 359.75, "epoch": 0.7635, "grad_norm": 0.3948267102241516, "kl": 0.08148193359375, "learning_rate": 2.454058136428027e-07, "loss": 0.2426, "num_tokens": 115158603.0, "reward": 0.39453125, "reward_std": 0.1155531033873558, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7890625, "rewards/tag_count_reward/std": 0.2311062142252922, "step": 1527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1681.5, "completions/max_terminated_length": 1379.0, "completions/mean_length": 941.6875, "completions/mean_terminated_length": 763.3186645507812, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.764, "grad_norm": 0.3819548487663269, "kl": 0.05706787109375, "learning_rate": 2.4482814855936834e-07, "loss": 0.2658, "num_tokens": 115226423.0, "reward": 0.44921875, "reward_std": 0.09595329686999321, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.19190659746527672, "step": 1528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1849.25, "completions/max_terminated_length": 1548.0, "completions/mean_length": 961.546875, "completions/mean_terminated_length": 912.987548828125, "completions/min_length": 387.25, "completions/min_terminated_length": 387.25, "epoch": 0.7645, "grad_norm": 0.3704184889793396, 
"kl": 0.0423583984375, "learning_rate": 2.4425141308231765e-07, "loss": 0.2003, "num_tokens": 115296394.0, "reward": 0.9760416597127914, "reward_std": 0.5310473740100861, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.24973957613110542, "rewards/penalized_accuracy_reward/std": 0.24430961906909943, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.16769563034176826, "step": 1529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1565.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 806.296875, "completions/mean_terminated_length": 769.9219055175781, "completions/min_length": 372.75, "completions/min_terminated_length": 372.75, "epoch": 0.765, "grad_norm": 0.4126427471637726, "kl": 0.038909912109375, "learning_rate": 2.43675608968487e-07, "loss": 0.1147, "num_tokens": 115357933.0, "reward": 0.4765625, "reward_std": 0.07453316263854504, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14906632527709007, "step": 1530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1250.75, "completions/mean_length": 786.171875, "completions/mean_terminated_length": 702.050048828125, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.7655, "grad_norm": 0.5156564116477966, "kl": 0.045684814453125, "learning_rate": 2.4310073797187573e-07, "loss": 0.3195, "num_tokens": 115415256.0, "reward": 0.5265150517225266, "reward_std": 0.20932862162590027, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024976275861263275, "rewards/penalized_accuracy_reward/std": 
0.0682481899857521, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15932458639144897, "step": 1531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1753.0, "completions/max_terminated_length": 1529.75, "completions/mean_length": 894.609375, "completions/mean_terminated_length": 859.4656372070312, "completions/min_length": 396.75, "completions/min_terminated_length": 396.75, "epoch": 0.766, "grad_norm": 0.30698296427726746, "kl": 0.027801513671875, "learning_rate": 2.4252680184364045e-07, "loss": 0.1639, "num_tokens": 115480927.0, "reward": 0.5303857028484344, "reward_std": 0.18541952222585678, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024958472698926926, "rewards/penalized_accuracy_reward/std": 0.0681995302438736, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13644563034176826, "step": 1532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1266.25, "completions/max_terminated_length": 1109.0, "completions/mean_length": 704.0625, "completions/mean_terminated_length": 668.3906402587891, "completions/min_length": 295.75, "completions/min_terminated_length": 295.75, "epoch": 0.7665, "grad_norm": 0.38731053471565247, "kl": 0.059906005859375, "learning_rate": 2.4195380233209006e-07, "loss": 0.1066, "num_tokens": 115538947.0, "reward": 0.5284379422664642, "reward_std": 0.18725404143333435, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024961166083812714, "rewards/penalized_accuracy_reward/std": 0.06820689141750336, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11534032225608826, "step": 1533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2009.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 984.765625, 
"completions/mean_terminated_length": 870.5731430053711, "completions/min_length": 285.75, "completions/min_terminated_length": 285.75, "epoch": 0.767, "grad_norm": 0.45038458704948425, "kl": 0.0433349609375, "learning_rate": 2.413817411826807e-07, "loss": 0.2771, "num_tokens": 115614020.0, "reward": 0.6108301430940628, "reward_std": 0.3853130042552948, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0749463252723217, "rewards/penalized_accuracy_reward/std": 0.15764248371124268, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.18301483243703842, "step": 1534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1983.0, "completions/max_terminated_length": 1591.25, "completions/mean_length": 1058.296875, "completions/mean_terminated_length": 938.0708465576172, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.7675, "grad_norm": 0.3605099320411682, "kl": 0.04150390625, "learning_rate": 2.408106201380097e-07, "loss": 0.1832, "num_tokens": 115689975.0, "reward": 0.603002279996872, "reward_std": 0.36616807617247105, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0749386353418231, "rewards/penalized_accuracy_reward/std": 0.14559711143374443, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1859467662870884, "step": 1535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1689.75, "completions/max_terminated_length": 1402.0, "completions/mean_length": 898.84375, "completions/mean_terminated_length": 759.8846282958984, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.768, "grad_norm": 2.12255597114563, "kl": 0.04815673828125, "learning_rate": 2.4024044093781063e-07, "loss": 0.1774, "num_tokens": 115758461.0, "reward": 0.453125, "reward_std": 
0.06787987425923347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.13575975596904755, "step": 1536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1910.5, "completions/max_terminated_length": 1179.25, "completions/mean_length": 843.9375, "completions/mean_terminated_length": 700.9375305175781, "completions/min_length": 369.75, "completions/min_terminated_length": 369.75, "epoch": 0.7685, "grad_norm": 0.7053321599960327, "kl": 0.052490234375, "learning_rate": 2.3967120531894857e-07, "loss": 0.3756, "num_tokens": 115823801.0, "reward": 0.5510249137878418, "reward_std": 0.32578254491090775, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049926516599953175, "rewards/penalized_accuracy_reward/std": 0.13036005571484566, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.18278051167726517, "step": 1537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1519.5, "completions/mean_length": 1062.515625, "completions/mean_terminated_length": 879.1663284301758, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.769, "grad_norm": 0.3855244219303131, "kl": 0.0435791015625, "learning_rate": 2.391029150154137e-07, "loss": 0.1206, "num_tokens": 115902378.0, "reward": 0.748810663819313, "reward_std": 0.5585134625434875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14979594945907593, "rewards/penalized_accuracy_reward/std": 0.2573878616094589, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2117716744542122, "step": 1538 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.046875, "completions/max_length": 1633.5, "completions/max_terminated_length": 1531.75, "completions/mean_length": 787.34375, "completions/mean_terminated_length": 729.1430358886719, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7695, "grad_norm": 0.39755675196647644, "kl": 0.035858154296875, "learning_rate": 2.38535571758317e-07, "loss": 0.1469, "num_tokens": 115963568.0, "reward": 0.7302673012018204, "reward_std": 0.3729541450738907, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12489926815032959, "rewards/penalized_accuracy_reward/std": 0.17147202044725418, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.08981313742697239, "step": 1539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1789.5, "completions/max_terminated_length": 1227.25, "completions/mean_length": 707.265625, "completions/mean_terminated_length": 644.4052429199219, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.77, "grad_norm": 0.3992327153682709, "kl": 0.03826904296875, "learning_rate": 2.3796917727588412e-07, "loss": 0.2872, "num_tokens": 116018465.0, "reward": 0.7533612549304962, "reward_std": 0.23721687495708466, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1374228149652481, "rewards/penalized_accuracy_reward/std": 0.09568897634744644, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13344132527709007, "step": 1540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2040.5, "completions/max_terminated_length": 1719.75, "completions/mean_length": 1031.296875, "completions/mean_terminated_length": 873.0828857421875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7705, 
"grad_norm": 0.3862172067165375, "kl": 0.03204345703125, "learning_rate": 2.374037332934512e-07, "loss": 0.1723, "num_tokens": 116092196.0, "reward": 0.5374178290367126, "reward_std": 0.27652762085199356, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995891824364662, "rewards/penalized_accuracy_reward/std": 0.08936924487352371, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.23467692360281944, "step": 1541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1944.75, "completions/max_terminated_length": 1923.25, "completions/mean_length": 1152.140625, "completions/mean_terminated_length": 1048.6811218261719, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.771, "grad_norm": 0.26547765731811523, "kl": 0.026519775390625, "learning_rate": 2.3683924153345854e-07, "loss": 0.1483, "num_tokens": 116174141.0, "reward": 0.5838311612606049, "reward_std": 0.2662739213556051, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06242339685559273, "rewards/penalized_accuracy_reward/std": 0.0956253781914711, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.1860438771545887, "step": 1542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1345.75, "completions/mean_length": 887.53125, "completions/mean_terminated_length": 713.758674621582, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.7715, "grad_norm": 0.39146721363067627, "kl": 0.047271728515625, "learning_rate": 2.36275703715446e-07, "loss": 0.3284, "num_tokens": 116240415.0, "reward": 0.7506806254386902, "reward_std": 0.5379019752144814, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.1497543891891837, "rewards/penalized_accuracy_reward/std": 0.2415616363286972, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.2219314817339182, "step": 1543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1648.5, "completions/max_terminated_length": 1363.5, "completions/mean_length": 755.359375, "completions/mean_terminated_length": 697.9318695068359, "completions/min_length": 275.75, "completions/min_terminated_length": 275.75, "epoch": 0.772, "grad_norm": 0.3845709562301636, "kl": 0.0433349609375, "learning_rate": 2.357131215560474e-07, "loss": 0.1559, "num_tokens": 116301446.0, "reward": 0.7783625721931458, "reward_std": 0.23312324099242687, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14992347359657288, "rewards/penalized_accuracy_reward/std": 0.08939709514379501, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10865810140967369, "step": 1544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1987.75, "completions/max_terminated_length": 1784.25, "completions/mean_length": 1143.375, "completions/mean_terminated_length": 1036.7854614257812, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.7725, "grad_norm": 0.37295541167259216, "kl": 0.04315185546875, "learning_rate": 2.3515149676898552e-07, "loss": 0.2286, "num_tokens": 116383934.0, "reward": 0.451171875, "reward_std": 0.0972979236394167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.1945958510041237, "step": 1545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1689.25, "completions/max_terminated_length": 1505.5, 
"completions/mean_length": 933.5625, "completions/mean_terminated_length": 834.3489685058594, "completions/min_length": 349.5, "completions/min_terminated_length": 349.5, "epoch": 0.773, "grad_norm": 0.8459130525588989, "kl": 0.05218505859375, "learning_rate": 2.3459083106506712e-07, "loss": 0.249, "num_tokens": 116451762.0, "reward": 1.0794943273067474, "reward_std": 0.6202104911208153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3122081235051155, "rewards/penalized_accuracy_reward/std": 0.2943965382874012, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18491514027118683, "step": 1546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1782.25, "completions/max_terminated_length": 1482.5, "completions/mean_length": 845.0625, "completions/mean_terminated_length": 776.2413787841797, "completions/min_length": 307.25, "completions/min_terminated_length": 307.25, "epoch": 0.7735, "grad_norm": 0.3618587553501129, "kl": 0.041778564453125, "learning_rate": 2.3403112615217693e-07, "loss": 0.0985, "num_tokens": 116512694.0, "reward": 0.5534612536430359, "reward_std": 0.19990842044353485, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747281804680824, "rewards/penalized_accuracy_reward/std": 0.08056414872407913, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11534032225608826, "step": 1547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1888.5, "completions/max_terminated_length": 1586.5, "completions/mean_length": 1051.765625, "completions/mean_terminated_length": 888.9218902587891, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.774, "grad_norm": 0.3055844306945801, "kl": 0.041290283203125, "learning_rate": 2.334723837352733e-07, "loss": 0.2667, "num_tokens": 
116593335.0, "reward": 0.49524495005607605, "reward_std": 0.22346559911966324, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024966228753328323, "rewards/penalized_accuracy_reward/std": 0.06822072714567184, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.22642730176448822, "step": 1548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1616.25, "completions/max_terminated_length": 1385.0, "completions/mean_length": 885.65625, "completions/mean_terminated_length": 823.2509918212891, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.7745, "grad_norm": 0.34926602244377136, "kl": 0.031951904296875, "learning_rate": 2.3291460551638237e-07, "loss": 0.1775, "num_tokens": 116659825.0, "reward": 0.5015559643507004, "reward_std": 0.14079342037439346, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012496731244027615, "rewards/penalized_accuracy_reward/std": 0.04998692497611046, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13096532225608826, "step": 1549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1834.0, "completions/max_terminated_length": 1548.25, "completions/mean_length": 1027.4375, "completions/mean_terminated_length": 842.7680358886719, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.775, "grad_norm": 0.37597957253456116, "kl": 0.060028076171875, "learning_rate": 2.3235779319459355e-07, "loss": 0.2541, "num_tokens": 116738109.0, "reward": 0.7021321803331375, "reward_std": 0.42374609410762787, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13719891756772995, "rewards/penalized_accuracy_reward/std": 0.1704377681016922, "rewards/tag_count_reward/mean": 
0.85546875, "rewards/tag_count_reward/std": 0.1657411828637123, "step": 1550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1717.5, "completions/max_terminated_length": 1386.0, "completions/mean_length": 724.640625, "completions/mean_terminated_length": 660.5608673095703, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7755, "grad_norm": 0.4485400319099426, "kl": 0.05206298828125, "learning_rate": 2.3180194846605364e-07, "loss": 0.1565, "num_tokens": 116794742.0, "reward": 0.5225878655910492, "reward_std": 0.1815650351345539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496580220758915, "rewards/penalized_accuracy_reward/std": 0.06821955740451813, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13866610452532768, "step": 1551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1886.25, "completions/max_terminated_length": 1324.5, "completions/mean_length": 989.890625, "completions/mean_terminated_length": 781.0802154541016, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.776, "grad_norm": 0.41176000237464905, "kl": 0.04638671875, "learning_rate": 2.312470730239621e-07, "loss": 0.2429, "num_tokens": 116868191.0, "reward": 0.6891894936561584, "reward_std": 0.2902321182191372, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12486818432807922, "rewards/penalized_accuracy_reward/std": 0.09989456087350845, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.20066209882497787, "step": 1552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1364.25, "completions/max_terminated_length": 1338.25, "completions/mean_length": 722.09375, "completions/mean_terminated_length": 705.9750061035156, 
"completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7765, "grad_norm": 0.4754960238933563, "kl": 0.044677734375, "learning_rate": 2.306931685585657e-07, "loss": -0.0186, "num_tokens": 116924757.0, "reward": 0.6170106679201126, "reward_std": 0.29612988978624344, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06241158954799175, "rewards/penalized_accuracy_reward/std": 0.13917533680796623, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1847.25, "completions/mean_length": 1019.875, "completions/mean_terminated_length": 872.9768218994141, "completions/min_length": 350.25, "completions/min_terminated_length": 350.25, "epoch": 0.777, "grad_norm": 0.4144488573074341, "kl": 0.03216552734375, "learning_rate": 2.3014023675715339e-07, "loss": 0.3213, "num_tokens": 116999261.0, "reward": 0.49527862668037415, "reward_std": 0.23705712519586086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024983063340187073, "rewards/penalized_accuracy_reward/std": 0.06826673448085785, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.2379702813923359, "step": 1554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1714.5, "completions/max_terminated_length": 1252.5, "completions/mean_length": 760.03125, "completions/mean_terminated_length": 651.0460052490234, "completions/min_length": 305.75, "completions/min_terminated_length": 305.75, "epoch": 0.7775, "grad_norm": 0.44868117570877075, "kl": 0.04840087890625, "learning_rate": 2.2958827930405162e-07, "loss": 0.2894, "num_tokens": 117057647.0, "reward": 0.8724334239959717, "reward_std": 0.2503896504640579, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.199888588860631, "rewards/penalized_accuracy_reward/std": 0.09987391531467438, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14943470992147923, "step": 1555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1669.25, "completions/max_terminated_length": 1248.5, "completions/mean_length": 725.734375, "completions/mean_terminated_length": 643.1132659912109, "completions/min_length": 260.25, "completions/min_terminated_length": 260.25, "epoch": 0.778, "grad_norm": 0.461662232875824, "kl": 0.0428466796875, "learning_rate": 2.2903729788061834e-07, "loss": 0.2967, "num_tokens": 117115262.0, "reward": 0.47265625, "reward_std": 0.07592359185218811, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15184719488024712, "step": 1556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1696.5, "completions/max_terminated_length": 1563.0, "completions/mean_length": 879.390625, "completions/mean_terminated_length": 813.0288696289062, "completions/min_length": 340.5, "completions/min_terminated_length": 340.5, "epoch": 0.7785, "grad_norm": 0.3935399353504181, "kl": 0.0416259765625, "learning_rate": 2.2848729416523859e-07, "loss": 0.1408, "num_tokens": 117181863.0, "reward": 0.8219306915998459, "reward_std": 0.36375243589282036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1746372077614069, "rewards/penalized_accuracy_reward/std": 0.15741050988435745, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.12431412376463413, "step": 1557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 
1819.75, "completions/max_terminated_length": 1359.75, "completions/mean_length": 840.953125, "completions/mean_terminated_length": 784.0312805175781, "completions/min_length": 361.5, "completions/min_terminated_length": 361.5, "epoch": 0.779, "grad_norm": 0.5369862914085388, "kl": 0.03460693359375, "learning_rate": 2.2793826983331886e-07, "loss": 0.2202, "num_tokens": 117244548.0, "reward": 0.5073950588703156, "reward_std": 0.15681886672973633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486595660448074, "rewards/penalized_accuracy_reward/std": 0.049946386367082596, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.140625, "step": 1558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1419.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 657.234375, "completions/mean_terminated_length": 595.2546234130859, "completions/min_length": 297.5, "completions/min_terminated_length": 297.5, "epoch": 0.7795, "grad_norm": 0.4614546298980713, "kl": 0.055908203125, "learning_rate": 2.2739022655728277e-07, "loss": 0.193, "num_tokens": 117293651.0, "reward": 0.6495253145694733, "reward_std": 0.4331194795668125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08745796978473663, "rewards/penalized_accuracy_reward/std": 0.19563913345336914, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16053754836320877, "step": 1559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1732.75, "completions/max_terminated_length": 1445.5, "completions/mean_length": 890.59375, "completions/mean_terminated_length": 852.1198120117188, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.78, "grad_norm": 0.42571812868118286, "kl": 0.033660888671875, "learning_rate": 
2.268431660065651e-07, "loss": 0.1539, "num_tokens": 117359849.0, "reward": 0.5093177258968353, "reward_std": 0.14884128049016, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012471364811062813, "rewards/penalized_accuracy_reward/std": 0.04988545924425125, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.11211910098791122, "step": 1560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1772.0, "completions/max_terminated_length": 1509.5, "completions/mean_length": 965.40625, "completions/mean_terminated_length": 827.98291015625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.7805, "grad_norm": 0.4894823729991913, "kl": 0.04913330078125, "learning_rate": 2.2629708984760706e-07, "loss": 0.1744, "num_tokens": 117431539.0, "reward": 0.6047578901052475, "reward_std": 0.38349395617842674, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.074839873239398, "rewards/penalized_accuracy_reward/std": 0.1574038416147232, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18809578567743301, "step": 1561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1914.5, "completions/max_terminated_length": 1663.5, "completions/mean_length": 939.21875, "completions/mean_terminated_length": 882.6835021972656, "completions/min_length": 407.75, "completions/min_terminated_length": 407.75, "epoch": 0.781, "grad_norm": 0.42137160897254944, "kl": 0.0406494140625, "learning_rate": 2.2575199974385144e-07, "loss": 0.1981, "num_tokens": 117501121.0, "reward": 0.5572758764028549, "reward_std": 0.21814535185694695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03742700070142746, "rewards/penalized_accuracy_reward/std": 0.0804656520485878, 
"rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.12774410098791122, "step": 1562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1859.75, "completions/max_terminated_length": 1487.0, "completions/mean_length": 821.3125, "completions/mean_terminated_length": 744.1366424560547, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7815, "grad_norm": 0.30306562781333923, "kl": 0.044189453125, "learning_rate": 2.2520789735573704e-07, "loss": 0.1761, "num_tokens": 117565413.0, "reward": 0.6051426976919174, "reward_std": 0.32810554653406143, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06233697757124901, "rewards/penalized_accuracy_reward/std": 0.14852851629257202, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11361231282353401, "step": 1563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1719.25, "completions/max_terminated_length": 1362.75, "completions/mean_length": 877.34375, "completions/mean_terminated_length": 773.0032043457031, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.782, "grad_norm": 0.4418907165527344, "kl": 0.045867919921875, "learning_rate": 2.2466478434069435e-07, "loss": 0.1083, "num_tokens": 117634395.0, "reward": 0.5706673413515091, "reward_std": 0.24752701073884964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049982111901044846, "rewards/penalized_accuracy_reward/std": 0.08941071480512619, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15037550404667854, "step": 1564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1635.75, "completions/max_terminated_length": 1262.75, "completions/mean_length": 855.203125, 
"completions/mean_terminated_length": 682.2861328125, "completions/min_length": 293.5, "completions/min_terminated_length": 293.5, "epoch": 0.7825, "grad_norm": 0.5063014030456543, "kl": 0.032989501953125, "learning_rate": 2.2412266235313973e-07, "loss": 0.294, "num_tokens": 117698376.0, "reward": 0.6337220221757889, "reward_std": 0.39284541085362434, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08736883476376534, "rewards/penalized_accuracy_reward/std": 0.16380061954259872, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.14308623224496841, "step": 1565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1329.25, "completions/max_terminated_length": 1156.5, "completions/mean_length": 556.765625, "completions/mean_terminated_length": 515.9241180419922, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.783, "grad_norm": 0.5674201846122742, "kl": 0.05584716796875, "learning_rate": 2.2358153304447066e-07, "loss": 0.2053, "num_tokens": 117742825.0, "reward": 0.6283865571022034, "reward_std": 0.24467500671744347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07493546605110168, "rewards/penalized_accuracy_reward/std": 0.09991396963596344, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.0896941740065813, "step": 1566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1754.75, "completions/max_terminated_length": 1551.75, "completions/mean_length": 1046.421875, "completions/mean_terminated_length": 923.3187561035156, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.7835, "grad_norm": 0.36370307207107544, "kl": 0.050537109375, "learning_rate": 2.230413980630609e-07, "loss": 0.1787, "num_tokens": 117819924.0, "reward": 
0.48589712381362915, "reward_std": 0.16674023866653442, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012479809112846851, "rewards/penalized_accuracy_reward/std": 0.049919236451387405, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.13380355387926102, "step": 1567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1975.0, "completions/max_terminated_length": 1564.75, "completions/mean_length": 1166.984375, "completions/mean_terminated_length": 959.3149719238281, "completions/min_length": 448.5, "completions/min_terminated_length": 448.5, "epoch": 0.784, "grad_norm": 0.36239543557167053, "kl": 0.04510498046875, "learning_rate": 2.2250225905425532e-07, "loss": 0.2849, "num_tokens": 117904211.0, "reward": 0.5005933791399002, "reward_std": 0.2759612798690796, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037406064569950104, "rewards/penalized_accuracy_reward/std": 0.08042064309120178, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.27675746753811836, "step": 1568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1495.5, "completions/max_terminated_length": 1330.5, "completions/mean_length": 889.96875, "completions/mean_terminated_length": 796.6800842285156, "completions/min_length": 345.5, "completions/min_terminated_length": 345.5, "epoch": 0.7845, "grad_norm": 0.5827677249908447, "kl": 0.05181884765625, "learning_rate": 2.2196411766036487e-07, "loss": 0.1673, "num_tokens": 117971489.0, "reward": 0.458984375, "reward_std": 0.09448149986565113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18896299973130226, "step": 
1569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1741.5, "completions/max_terminated_length": 1380.5, "completions/mean_length": 913.640625, "completions/mean_terminated_length": 829.0101318359375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.785, "grad_norm": 0.4182393550872803, "kl": 0.03436279296875, "learning_rate": 2.2142697552066142e-07, "loss": 0.1383, "num_tokens": 118038698.0, "reward": 0.666464701294899, "reward_std": 0.47714413329958916, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09983391221612692, "rewards/penalized_accuracy_reward/std": 0.21971141174435616, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.18770276755094528, "step": 1570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1939.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 808.671875, "completions/mean_terminated_length": 653.9061126708984, "completions/min_length": 294.25, "completions/min_terminated_length": 294.25, "epoch": 0.7855, "grad_norm": 0.4211881160736084, "kl": 0.062103271484375, "learning_rate": 2.2089083427137329e-07, "loss": 0.285, "num_tokens": 118099845.0, "reward": 0.5510033518075943, "reward_std": 0.2638176791369915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04991573095321655, "rewards/penalized_accuracy_reward/std": 0.08929198980331421, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21817802637815475, "step": 1571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2034.5, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1210.390625, "completions/mean_terminated_length": 1119.9884338378906, "completions/min_length": 408.0, "completions/min_terminated_length": 
408.0, "epoch": 0.786, "grad_norm": 0.3165477514266968, "kl": 0.0369873046875, "learning_rate": 2.203556955456796e-07, "loss": 0.107, "num_tokens": 118188190.0, "reward": 0.5471592843532562, "reward_std": 0.36797719448804855, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049946827813982964, "rewards/penalized_accuracy_reward/std": 0.16811975091695786, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.2356422208249569, "step": 1572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1219.25, "completions/mean_length": 773.15625, "completions/mean_terminated_length": 688.1666946411133, "completions/min_length": 351.5, "completions/min_terminated_length": 351.5, "epoch": 0.7865, "grad_norm": 0.6183377504348755, "kl": 0.052459716796875, "learning_rate": 2.1982156097370557e-07, "loss": 0.3611, "num_tokens": 118247320.0, "reward": 0.4956870675086975, "reward_std": 0.17899367585778236, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249197032302618, "rewards/penalized_accuracy_reward/std": 0.04996788129210472, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1967380754649639, "step": 1573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1501.0, "completions/max_terminated_length": 1375.5, "completions/mean_length": 671.03125, "completions/mean_terminated_length": 647.2020874023438, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.787, "grad_norm": 0.41104093194007874, "kl": 0.046356201171875, "learning_rate": 2.1928843218251803e-07, "loss": 0.1976, "num_tokens": 118300682.0, "reward": 0.6188899129629135, "reward_std": 0.21453887224197388, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.06237463653087616, "rewards/penalized_accuracy_reward/std": 0.09555068612098694, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2012.25, "completions/max_terminated_length": 1587.75, "completions/mean_length": 1198.46875, "completions/mean_terminated_length": 999.6846313476562, "completions/min_length": 475.25, "completions/min_terminated_length": 475.25, "epoch": 0.7875, "grad_norm": 0.3713396489620209, "kl": 0.054473876953125, "learning_rate": 2.1875631079611956e-07, "loss": 0.2655, "num_tokens": 118386536.0, "reward": 0.4639922231435776, "reward_std": 0.26632998138666153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496485970914364, "rewards/penalized_accuracy_reward/std": 0.06821701675653458, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.2862449511885643, "step": 1575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1281.25, "completions/max_terminated_length": 1281.25, "completions/mean_length": 666.703125, "completions/mean_terminated_length": 666.703125, "completions/min_length": 298.25, "completions/min_terminated_length": 298.25, "epoch": 0.788, "grad_norm": 0.4484412670135498, "kl": 0.0545654296875, "learning_rate": 2.1822519843544422e-07, "loss": 0.0988, "num_tokens": 118438885.0, "reward": 0.7437262535095215, "reward_std": 0.47312864661216736, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12479281239211559, "rewards/penalized_accuracy_reward/std": 0.2347789853811264, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1938.0, 
"completions/max_terminated_length": 1468.5, "completions/mean_length": 962.625, "completions/mean_terminated_length": 827.3081665039062, "completions/min_length": 391.5, "completions/min_terminated_length": 391.5, "epoch": 0.7885, "grad_norm": 0.46979743242263794, "kl": 0.04876708984375, "learning_rate": 2.1769509671835223e-07, "loss": 0.2122, "num_tokens": 118510941.0, "reward": 0.5529985725879669, "reward_std": 0.34459492936730385, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049936795607209206, "rewards/penalized_accuracy_reward/std": 0.136453315615654, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.19009364023804665, "step": 1577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1905.0, "completions/max_terminated_length": 1364.5, "completions/mean_length": 1031.515625, "completions/mean_terminated_length": 752.7092514038086, "completions/min_length": 381.5, "completions/min_terminated_length": 381.5, "epoch": 0.789, "grad_norm": 0.3725079596042633, "kl": 0.06787109375, "learning_rate": 2.1716600725962558e-07, "loss": 0.389, "num_tokens": 118589182.0, "reward": 0.431640625, "reward_std": 0.11674204096198082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.23348408192396164, "step": 1578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1810.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 1066.1875, "completions/mean_terminated_length": 874.8928680419922, "completions/min_length": 433.5, "completions/min_terminated_length": 433.5, "epoch": 0.7895, "grad_norm": 0.42732763290405273, "kl": 0.0478515625, "learning_rate": 2.166379316709625e-07, "loss": 0.3047, "num_tokens": 
118664954.0, "reward": 0.431640625, "reward_std": 0.11703772842884064, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.23407547175884247, "step": 1579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1420.75, "completions/max_terminated_length": 1301.75, "completions/mean_length": 732.71875, "completions/mean_terminated_length": 718.902099609375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.79, "grad_norm": 0.5473921298980713, "kl": 0.05072021484375, "learning_rate": 2.1611087156097267e-07, "loss": 0.137, "num_tokens": 118720232.0, "reward": 0.8896829038858414, "reward_std": 0.5087809711694717, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19972426630556583, "rewards/penalized_accuracy_reward/std": 0.24430356919765472, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.5, "completions/mean_length": 1212.515625, "completions/mean_terminated_length": 1074.5061798095703, "completions/min_length": 432.25, "completions/min_terminated_length": 432.25, "epoch": 0.7905, "grad_norm": 0.34774908423423767, "kl": 0.04248046875, "learning_rate": 2.1558482853517253e-07, "loss": 0.222, "num_tokens": 118806041.0, "reward": 0.5010921061038971, "reward_std": 0.2300056405365467, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496011182665825, "rewards/penalized_accuracy_reward/std": 0.06820404529571533, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 
0.23879868537187576, "step": 1581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1489.75, "completions/mean_length": 975.5625, "completions/mean_terminated_length": 801.2981109619141, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.791, "grad_norm": 0.4288358986377716, "kl": 0.045074462890625, "learning_rate": 2.1505980419598063e-07, "loss": 0.1842, "num_tokens": 118878589.0, "reward": 0.6239686906337738, "reward_std": 0.48065728321671486, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873749703168869, "rewards/penalized_accuracy_reward/std": 0.21692883223295212, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.25693394243717194, "step": 1582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1604.5, "completions/max_terminated_length": 1403.0, "completions/mean_length": 802.125, "completions/mean_terminated_length": 779.3948059082031, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.7915, "grad_norm": 0.2544136941432953, "kl": 0.03875732421875, "learning_rate": 2.1453580014271203e-07, "loss": 0.1199, "num_tokens": 118938437.0, "reward": 0.5381663143634796, "reward_std": 0.1639700960367918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024942530319094658, "rewards/penalized_accuracy_reward/std": 0.06815598160028458, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.05531632527709007, "step": 1583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1552.25, "completions/max_terminated_length": 1349.75, "completions/mean_length": 776.984375, "completions/mean_terminated_length": 740.9646148681641, "completions/min_length": 260.25, 
"completions/min_terminated_length": 260.25, "epoch": 0.792, "grad_norm": 0.4313102960586548, "kl": 0.043212890625, "learning_rate": 2.1401281797157395e-07, "loss": 0.1633, "num_tokens": 118995540.0, "reward": 0.914907306432724, "reward_std": 0.33669982850551605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.212336465716362, "rewards/penalized_accuracy_reward/std": 0.14881866425275803, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1852.25, "completions/max_terminated_length": 1779.75, "completions/mean_length": 1103.25, "completions/mean_terminated_length": 964.0113677978516, "completions/min_length": 449.75, "completions/min_terminated_length": 449.75, "epoch": 0.7925, "grad_norm": 0.2952355444431305, "kl": 0.033538818359375, "learning_rate": 2.134908592756607e-07, "loss": 0.186, "num_tokens": 119077972.0, "reward": 0.5354903042316437, "reward_std": 0.2573624886572361, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049971722066402435, "rewards/penalized_accuracy_reward/std": 0.08939214050769806, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.20022457838058472, "step": 1585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2007.75, "completions/max_terminated_length": 1658.0, "completions/mean_length": 1244.296875, "completions/mean_terminated_length": 1027.2298889160156, "completions/min_length": 493.5, "completions/min_terminated_length": 493.5, "epoch": 0.793, "grad_norm": 0.24755984544754028, "kl": 0.0411376953125, "learning_rate": 2.1296992564494904e-07, "loss": 0.1512, "num_tokens": 119168071.0, "reward": 0.5295222699642181, "reward_std": 0.26307400315999985, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.04991738125681877, "rewards/penalized_accuracy_reward/std": 0.08929493278265, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.18505837209522724, "step": 1586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1948.25, "completions/max_terminated_length": 1615.5, "completions/mean_length": 1037.375, "completions/mean_terminated_length": 970.8257904052734, "completions/min_length": 451.75, "completions/min_terminated_length": 451.75, "epoch": 0.7935, "grad_norm": 0.3693445920944214, "kl": 0.033843994140625, "learning_rate": 2.124500186662932e-07, "loss": 0.1417, "num_tokens": 119244271.0, "reward": 0.48046875, "reward_std": 0.06385000795125961, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.12770001962780952, "step": 1587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1769.5, "completions/max_terminated_length": 1556.75, "completions/mean_length": 1141.765625, "completions/mean_terminated_length": 980.3410034179688, "completions/min_length": 500.75, "completions/min_terminated_length": 500.75, "epoch": 0.794, "grad_norm": 0.3194670081138611, "kl": 0.036651611328125, "learning_rate": 2.1193113992342001e-07, "loss": 0.2483, "num_tokens": 119326624.0, "reward": 0.4987120032310486, "reward_std": 0.27521596662700176, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744193911552429, "rewards/penalized_accuracy_reward/std": 0.08049775660037994, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.27487656474113464, "step": 1588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1504.0, 
"completions/max_terminated_length": 1157.25, "completions/mean_length": 730.921875, "completions/mean_terminated_length": 631.4940032958984, "completions/min_length": 242.25, "completions/min_terminated_length": 242.25, "epoch": 0.7945, "grad_norm": 0.4293665587902069, "kl": 0.05487060546875, "learning_rate": 2.1141329099692406e-07, "loss": 0.1609, "num_tokens": 119384747.0, "reward": 0.7415588051080704, "reward_std": 0.42625200748443604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.137380950152874, "rewards/penalized_accuracy_reward/std": 0.1917479708790779, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1483948826789856, "step": 1589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1848.5, "completions/max_terminated_length": 1465.5, "completions/mean_length": 1118.265625, "completions/mean_terminated_length": 973.1490325927734, "completions/min_length": 625.5, "completions/min_terminated_length": 625.5, "epoch": 0.795, "grad_norm": 0.2701038122177124, "kl": 0.041229248046875, "learning_rate": 2.1089647346426303e-07, "loss": 0.186, "num_tokens": 119466140.0, "reward": 0.5720907151699066, "reward_std": 0.3544735908508301, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06241254881024361, "rewards/penalized_accuracy_reward/std": 0.148722842335701, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1888725757598877, "step": 1590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1939.25, "completions/max_terminated_length": 1550.25, "completions/mean_length": 962.171875, "completions/mean_terminated_length": 893.804931640625, "completions/min_length": 346.25, "completions/min_terminated_length": 346.25, "epoch": 0.7955, "grad_norm": 0.33257752656936646, "kl": 0.04168701171875, "learning_rate": 
2.1038068889975259e-07, "loss": 0.2213, "num_tokens": 119535207.0, "reward": 0.749162346124649, "reward_std": 0.2558934912085533, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13727648556232452, "rewards/penalized_accuracy_reward/std": 0.09558712691068649, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15602656453847885, "step": 1591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1428.25, "completions/max_terminated_length": 1260.25, "completions/mean_length": 762.140625, "completions/mean_terminated_length": 740.6062622070312, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.796, "grad_norm": 0.42674216628074646, "kl": 0.039459228515625, "learning_rate": 2.0986593887456223e-07, "loss": 0.1172, "num_tokens": 119593424.0, "reward": 1.0127876847982407, "reward_std": 0.5430427715182304, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.26225321739912033, "rewards/penalized_accuracy_reward/std": 0.26370515674352646, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.05531632527709007, "step": 1592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1821.25, "completions/max_terminated_length": 1736.75, "completions/mean_length": 1035.21875, "completions/mean_terminated_length": 972.1743927001953, "completions/min_length": 424.75, "completions/min_terminated_length": 424.75, "epoch": 0.7965, "grad_norm": 0.32577383518218994, "kl": 0.03369140625, "learning_rate": 2.0935222495670968e-07, "loss": 0.0768, "num_tokens": 119666750.0, "reward": 0.7664839625358582, "reward_std": 0.4478753674775362, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14984354004263878, "rewards/penalized_accuracy_reward/std": 
0.19798943400382996, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14207998290657997, "step": 1593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1723.5, "completions/max_terminated_length": 1289.0, "completions/mean_length": 855.9375, "completions/mean_terminated_length": 685.9857177734375, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.797, "grad_norm": 0.4110722243785858, "kl": 0.0592041015625, "learning_rate": 2.088395487110566e-07, "loss": 0.3139, "num_tokens": 119731802.0, "reward": 0.8046764135360718, "reward_std": 0.20813175290822983, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1747991442680359, "rewards/penalized_accuracy_reward/std": 0.0682346299290657, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.14332501962780952, "step": 1594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1993.5, "completions/max_terminated_length": 1583.25, "completions/mean_length": 961.96875, "completions/mean_terminated_length": 804.8915100097656, "completions/min_length": 376.5, "completions/min_terminated_length": 376.5, "epoch": 0.7975, "grad_norm": 0.37609490752220154, "kl": 0.046142578125, "learning_rate": 2.0832791169930363e-07, "loss": 0.2596, "num_tokens": 119801976.0, "reward": 0.4991469234228134, "reward_std": 0.2590152621269226, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024964085780084133, "rewards/penalized_accuracy_reward/std": 0.09985634312033653, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2174222618341446, "step": 1595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1619.5, "completions/max_terminated_length": 1263.75, "completions/mean_length": 937.359375, 
"completions/mean_terminated_length": 845.8645477294922, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.798, "grad_norm": 0.378080815076828, "kl": 0.055450439453125, "learning_rate": 2.078173154799861e-07, "loss": 0.1929, "num_tokens": 119872447.0, "reward": 0.5128078013658524, "reward_std": 0.20260559394955635, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024958590045571327, "rewards/penalized_accuracy_reward/std": 0.06819986552000046, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.13241172581911087, "step": 1596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1918.5, "completions/max_terminated_length": 1565.25, "completions/mean_length": 1200.078125, "completions/mean_terminated_length": 940.2377014160156, "completions/min_length": 321.5, "completions/min_terminated_length": 321.5, "epoch": 0.7985, "grad_norm": 0.24200691282749176, "kl": 0.04248046875, "learning_rate": 2.0730776160846853e-07, "loss": 0.1902, "num_tokens": 119956052.0, "reward": 0.565689891576767, "reward_std": 0.3021623156964779, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07483713328838348, "rewards/penalized_accuracy_reward/std": 0.09978287667036057, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.24693026393651962, "step": 1597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1809.0, "completions/max_terminated_length": 1557.5, "completions/mean_length": 877.1875, "completions/mean_terminated_length": 796.5605926513672, "completions/min_length": 350.25, "completions/min_terminated_length": 350.25, "epoch": 0.799, "grad_norm": 0.48458993434906006, "kl": 0.038909912109375, "learning_rate": 2.0679925163694033e-07, "loss": 0.2368, "num_tokens": 120022496.0, "reward": 0.60520139336586, 
"reward_std": 0.3528434894979, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062366314232349396, "rewards/penalized_accuracy_reward/std": 0.1485971137881279, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11129852384328842, "step": 1598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1835.5, "completions/max_terminated_length": 1636.25, "completions/mean_length": 746.375, "completions/mean_terminated_length": 663.1259002685547, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.7995, "grad_norm": 0.46747151017189026, "kl": 0.0594482421875, "learning_rate": 2.0629178711441115e-07, "loss": 0.2706, "num_tokens": 120081080.0, "reward": 0.470703125, "reward_std": 0.07289222255349159, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.14578444883227348, "step": 1599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1527.0, "completions/max_terminated_length": 1460.25, "completions/mean_length": 899.1875, "completions/mean_terminated_length": 833.8541717529297, "completions/min_length": 421.5, "completions/min_terminated_length": 421.5, "epoch": 0.8, "grad_norm": 0.23246806859970093, "kl": 0.0430908203125, "learning_rate": 2.0578536958670574e-07, "loss": 0.114, "num_tokens": 120147380.0, "reward": 0.466796875, "reward_std": 0.06283505633473396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.12567011639475822, "step": 1600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, 
"completions/max_length": 1900.25, "completions/max_terminated_length": 1523.25, "completions/mean_length": 868.578125, "completions/mean_terminated_length": 741.4711761474609, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.8005, "grad_norm": 0.5129444599151611, "kl": 0.0638427734375, "learning_rate": 2.0528000059645995e-07, "loss": 0.3477, "num_tokens": 120212761.0, "reward": 0.6817559003829956, "reward_std": 0.2971675843000412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123623251914978, "rewards/penalized_accuracy_reward/std": 0.10234412550926208, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18495866656303406, "step": 1601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 1111.546875, "completions/mean_terminated_length": 968.0507202148438, "completions/min_length": 430.25, "completions/min_terminated_length": 430.25, "epoch": 0.801, "grad_norm": 0.37825873494148254, "kl": 0.04010009765625, "learning_rate": 2.0477568168311525e-07, "loss": 0.2772, "num_tokens": 120293980.0, "reward": 0.5662158727645874, "reward_std": 0.28833236917853355, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062404815107584, "rewards/penalized_accuracy_reward/std": 0.09559690952301025, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.24016263708472252, "step": 1602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1643.5, "completions/max_terminated_length": 1551.75, "completions/mean_length": 1093.328125, "completions/mean_terminated_length": 992.9281616210938, "completions/min_length": 558.75, "completions/min_terminated_length": 558.75, "epoch": 0.8015, "grad_norm": 0.22275766730308533, "kl": 
0.037445068359375, "learning_rate": 2.042724143829146e-07, "loss": 0.0659, "num_tokens": 120373857.0, "reward": 0.49137668311595917, "reward_std": 0.18484556674957275, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024985216557979584, "rewards/penalized_accuracy_reward/std": 0.06827260553836823, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.13140418380498886, "step": 1603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1341.75, "completions/max_terminated_length": 1285.25, "completions/mean_length": 681.015625, "completions/mean_terminated_length": 662.6125183105469, "completions/min_length": 264.75, "completions/min_terminated_length": 264.75, "epoch": 0.802, "grad_norm": 0.4285629093647003, "kl": 0.03826904296875, "learning_rate": 2.037702002288973e-07, "loss": 0.0594, "num_tokens": 120425922.0, "reward": 0.486328125, "reward_std": 0.0546875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.109375, "step": 1604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1658.0, "completions/max_terminated_length": 1502.5, "completions/mean_length": 714.90625, "completions/mean_terminated_length": 675.0968933105469, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.8025, "grad_norm": 0.4686207175254822, "kl": 0.031341552734375, "learning_rate": 2.032690407508949e-07, "loss": 0.0132, "num_tokens": 120480828.0, "reward": 0.588130921125412, "reward_std": 0.28883786499500275, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04992483742535114, "rewards/penalized_accuracy_reward/std": 0.1304258219897747, 
"rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1709.0, "completions/max_terminated_length": 1423.25, "completions/mean_length": 774.734375, "completions/mean_terminated_length": 728.8058166503906, "completions/min_length": 302.5, "completions/min_terminated_length": 302.5, "epoch": 0.803, "grad_norm": 0.3631879687309265, "kl": 0.0465087890625, "learning_rate": 2.027689374755261e-07, "loss": 0.176, "num_tokens": 120539947.0, "reward": 0.48046875, "reward_std": 0.04920881241559982, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09841762483119965, "step": 1606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1415.75, "completions/max_terminated_length": 1325.25, "completions/mean_length": 798.578125, "completions/mean_terminated_length": 754.6573181152344, "completions/min_length": 437.25, "completions/min_terminated_length": 437.25, "epoch": 0.8035, "grad_norm": 0.27396097779273987, "kl": 0.055816650390625, "learning_rate": 2.0226989192619204e-07, "loss": 0.0689, "num_tokens": 120600448.0, "reward": 0.5881462842226028, "reward_std": 0.20633341372013092, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04993251711130142, "rewards/penalized_accuracy_reward/std": 0.0893220379948616, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 1607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1653.5, "completions/max_terminated_length": 1436.0, "completions/mean_length": 785.265625, "completions/mean_terminated_length": 744.7823028564453, 
"completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.804, "grad_norm": 0.3616463243961334, "kl": 0.029510498046875, "learning_rate": 2.0177190562307224e-07, "loss": 0.2462, "num_tokens": 120658721.0, "reward": 1.3290909826755524, "reward_std": 0.6464227735996246, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.4243110967800021, "rewards/penalized_accuracy_reward/std": 0.3167586885392666, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14336910098791122, "step": 1608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.75, "completions/max_terminated_length": 1010.75, "completions/mean_length": 571.1875, "completions/mean_terminated_length": 571.1875, "completions/min_length": 290.25, "completions/min_terminated_length": 290.25, "epoch": 0.8045, "grad_norm": 0.23292414844036102, "kl": 0.03466796875, "learning_rate": 2.0127498008311922e-07, "loss": 0.0245, "num_tokens": 120702237.0, "reward": 0.498046875, "reward_std": 0.0078125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1724.0, "completions/max_terminated_length": 1343.25, "completions/mean_length": 757.34375, "completions/mean_terminated_length": 717.9500274658203, "completions/min_length": 322.5, "completions/min_terminated_length": 322.5, "epoch": 0.805, "grad_norm": 0.38230082392692566, "kl": 0.0374755859375, "learning_rate": 2.0077911682005428e-07, "loss": 0.1294, "num_tokens": 120761475.0, "reward": 0.7400734424591064, "reward_std": 0.21923255920410156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.12491953372955322, "rewards/penalized_accuracy_reward/std": 0.0999356284737587, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06403729319572449, "step": 1610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1646.75, "completions/mean_length": 1051.46875, "completions/mean_terminated_length": 873.2849426269531, "completions/min_length": 324.5, "completions/min_terminated_length": 324.5, "epoch": 0.8055, "grad_norm": 0.44662803411483765, "kl": 0.04364013671875, "learning_rate": 2.0028431734436308e-07, "loss": 0.3718, "num_tokens": 120838337.0, "reward": 0.458984375, "reward_std": 0.10528737679123878, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21057476103305817, "step": 1611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1893.5, "completions/max_terminated_length": 1622.0, "completions/mean_length": 967.78125, "completions/mean_terminated_length": 857.4126892089844, "completions/min_length": 410.75, "completions/min_terminated_length": 410.75, "epoch": 0.806, "grad_norm": 0.4308742880821228, "kl": 0.04217529296875, "learning_rate": 1.9979058316329055e-07, "loss": 0.247, "num_tokens": 120907139.0, "reward": 0.462890625, "reward_std": 0.08748093992471695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1749618947505951, "step": 1612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1305.5, "completions/max_terminated_length": 1268.25, 
"completions/mean_length": 692.5625, "completions/mean_terminated_length": 655.2834930419922, "completions/min_length": 312.5, "completions/min_terminated_length": 312.5, "epoch": 0.8065, "grad_norm": 0.5370123386383057, "kl": 0.04022216796875, "learning_rate": 1.9929791578083655e-07, "loss": 0.1274, "num_tokens": 120959799.0, "reward": 0.8340779393911362, "reward_std": 0.43033580482006073, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17485148832201958, "rewards/penalized_accuracy_reward/std": 0.1980515792965889, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.06846532225608826, "step": 1613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1672.75, "completions/max_terminated_length": 1469.5, "completions/mean_length": 767.734375, "completions/mean_terminated_length": 746.902099609375, "completions/min_length": 243.75, "completions/min_terminated_length": 243.75, "epoch": 0.807, "grad_norm": 0.33478647470474243, "kl": 0.040283203125, "learning_rate": 1.9880631669775162e-07, "loss": 0.1351, "num_tokens": 121018470.0, "reward": 0.6072531938552856, "reward_std": 0.21105670928955078, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062415655702352524, "rewards/penalized_accuracy_reward/std": 0.0956135094165802, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.09179970622062683, "step": 1614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1737.5, "completions/max_terminated_length": 1504.25, "completions/mean_length": 889.3125, "completions/mean_terminated_length": 819.6290283203125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.8075, "grad_norm": 0.39450863003730774, "kl": 0.0380859375, "learning_rate": 1.9831578741153155e-07, "loss": 0.1887, "num_tokens": 
121084522.0, "reward": 0.7534008026123047, "reward_std": 0.24482230842113495, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13744258880615234, "rewards/penalized_accuracy_reward/std": 0.09570274502038956, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10683366656303406, "step": 1615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1948.75, "completions/max_terminated_length": 1664.75, "completions/mean_length": 1053.484375, "completions/mean_terminated_length": 962.975830078125, "completions/min_length": 376.25, "completions/min_terminated_length": 376.25, "epoch": 0.808, "grad_norm": 0.3270496726036072, "kl": 0.039947509765625, "learning_rate": 1.9782632941641375e-07, "loss": 0.2107, "num_tokens": 121158697.0, "reward": 0.6416932940483093, "reward_std": 0.27419157698750496, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08744820952415466, "rewards/penalized_accuracy_reward/std": 0.10240887105464935, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.17703444883227348, "step": 1616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1802.5, "completions/max_terminated_length": 1483.25, "completions/mean_length": 1131.265625, "completions/mean_terminated_length": 951.1623992919922, "completions/min_length": 595.25, "completions/min_terminated_length": 595.25, "epoch": 0.8085, "grad_norm": 0.339140385389328, "kl": 0.043731689453125, "learning_rate": 1.9733794420337213e-07, "loss": 0.2264, "num_tokens": 121240906.0, "reward": 0.5946398079395294, "reward_std": 0.3907005079090595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08735896274447441, "rewards/penalized_accuracy_reward/std": 0.1497948281466961, "rewards/tag_count_reward/mean": 
0.83984375, "rewards/tag_count_reward/std": 0.24642379954457283, "step": 1617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1535.75, "completions/max_terminated_length": 1408.5, "completions/mean_length": 901.140625, "completions/mean_terminated_length": 869.0535888671875, "completions/min_length": 392.5, "completions/min_terminated_length": 392.5, "epoch": 0.809, "grad_norm": 0.3981139361858368, "kl": 0.030975341796875, "learning_rate": 1.9685063326011263e-07, "loss": 0.126, "num_tokens": 121308323.0, "reward": 0.9468206763267517, "reward_std": 0.45608462765812874, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.23708222061395645, "rewards/penalized_accuracy_reward/std": 0.19869723170995712, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.15448210388422012, "step": 1618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1521.25, "completions/mean_length": 982.515625, "completions/mean_terminated_length": 854.5815887451172, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.8095, "grad_norm": 0.5639104843139648, "kl": 0.04754638671875, "learning_rate": 1.9636439807106912e-07, "loss": 0.3813, "num_tokens": 121381732.0, "reward": 0.458984375, "reward_std": 0.10694237425923347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.21388475596904755, "step": 1619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1884.0, "completions/max_terminated_length": 1504.25, "completions/mean_length": 917.390625, "completions/mean_terminated_length": 784.1869659423828, "completions/min_length": 329.0, 
"completions/min_terminated_length": 329.0, "epoch": 0.81, "grad_norm": 0.3243507146835327, "kl": 0.05810546875, "learning_rate": 1.9587924011739826e-07, "loss": 0.2717, "num_tokens": 121449389.0, "reward": 0.608805924654007, "reward_std": 0.27405208721756935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491076737642288, "rewards/penalized_accuracy_reward/std": 0.09988103061914444, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18350879102945328, "step": 1620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1963.0, "completions/max_terminated_length": 1599.75, "completions/mean_length": 1093.359375, "completions/mean_terminated_length": 958.1419982910156, "completions/min_length": 477.75, "completions/min_terminated_length": 477.75, "epoch": 0.8105, "grad_norm": 0.38288024067878723, "kl": 0.03741455078125, "learning_rate": 1.9539516087697517e-07, "loss": 0.2898, "num_tokens": 121528468.0, "reward": 0.5588054805994034, "reward_std": 0.2588450722396374, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04991055279970169, "rewards/penalized_accuracy_reward/std": 0.08928271383047104, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19564661011099815, "step": 1621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1844.25, "completions/max_terminated_length": 1644.5, "completions/mean_length": 1109.203125, "completions/mean_terminated_length": 974.7871704101562, "completions/min_length": 475.25, "completions/min_terminated_length": 475.25, "epoch": 0.811, "grad_norm": 0.3287237286567688, "kl": 0.052764892578125, "learning_rate": 1.9491216182438926e-07, "loss": 0.2107, "num_tokens": 121609409.0, "reward": 0.451171875, "reward_std": 0.09418966248631477, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.18837933614850044, "step": 1622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1880.5, "completions/max_terminated_length": 1654.5, "completions/mean_length": 958.015625, "completions/mean_terminated_length": 858.4666900634766, "completions/min_length": 307.75, "completions/min_terminated_length": 307.75, "epoch": 0.8115, "grad_norm": 0.34344589710235596, "kl": 0.04608154296875, "learning_rate": 1.944302444309393e-07, "loss": 0.2057, "num_tokens": 121680850.0, "reward": 0.451171875, "reward_std": 0.0901102963835001, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.1802205964922905, "step": 1623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1387.75, "completions/max_terminated_length": 1234.25, "completions/mean_length": 622.921875, "completions/mean_terminated_length": 585.9441986083984, "completions/min_length": 241.75, "completions/min_terminated_length": 241.75, "epoch": 0.812, "grad_norm": 0.42770227789878845, "kl": 0.0572509765625, "learning_rate": 1.9394941016462947e-07, "loss": 0.0526, "num_tokens": 121729453.0, "reward": 0.6341843008995056, "reward_std": 0.3134754002094269, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490464672446251, "rewards/penalized_accuracy_reward/std": 0.1455613598227501, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08656632527709007, "step": 1624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1758.75, "completions/mean_length": 997.890625, "completions/mean_terminated_length": 839.8250274658203, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.8125, "grad_norm": 0.47731223702430725, "kl": 0.06451416015625, "learning_rate": 1.934696604901642e-07, "loss": 0.3538, "num_tokens": 121801862.0, "reward": 0.5276405215263367, "reward_std": 0.27583594992756844, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04995308071374893, "rewards/penalized_accuracy_reward/std": 0.08935879170894623, "rewards/tag_count_reward/mean": 0.85546875, "rewards/tag_count_reward/std": 0.2333349771797657, "step": 1625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1765.75, "completions/max_terminated_length": 1515.0, "completions/mean_length": 922.15625, "completions/mean_terminated_length": 813.1883697509766, "completions/min_length": 304.75, "completions/min_terminated_length": 304.75, "epoch": 0.813, "grad_norm": 0.4631711542606354, "kl": 0.061492919921875, "learning_rate": 1.929909968689442e-07, "loss": 0.3419, "num_tokens": 121871600.0, "reward": 0.6817079186439514, "reward_std": 0.2987259402871132, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11233832687139511, "rewards/penalized_accuracy_reward/std": 0.10232225805521011, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.2072916179895401, "step": 1626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1559.0, "completions/max_terminated_length": 1520.25, "completions/mean_length": 751.46875, "completions/mean_terminated_length": 733.3291778564453, "completions/min_length": 316.25, "completions/min_terminated_length": 316.25, "epoch": 0.8135, "grad_norm": 0.264401376247406, "kl": 0.0372314453125, "learning_rate": 
1.9251342075906179e-07, "loss": 0.0885, "num_tokens": 121927502.0, "reward": 0.8685068339109421, "reward_std": 0.3928215205669403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18718310818076134, "rewards/penalized_accuracy_reward/std": 0.1954539716243744, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 974.765625, "completions/mean_terminated_length": 793.0875396728516, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.814, "grad_norm": 0.45089641213417053, "kl": 0.0523681640625, "learning_rate": 1.9203693361529687e-07, "loss": 0.3681, "num_tokens": 121996879.0, "reward": 0.46830521523952484, "reward_std": 0.20544306188821793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012472921051084995, "rewards/penalized_accuracy_reward/std": 0.04989168420433998, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.24983498826622963, "step": 1628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1851.5, "completions/max_terminated_length": 1589.0, "completions/mean_length": 931.390625, "completions/mean_terminated_length": 891.9500122070312, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.8145, "grad_norm": 0.3222494125366211, "kl": 0.032440185546875, "learning_rate": 1.915615368891117e-07, "loss": 0.167, "num_tokens": 122067912.0, "reward": 0.6281232237815857, "reward_std": 0.25086526200175285, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07480380684137344, "rewards/penalized_accuracy_reward/std": 0.0997384637594223, 
"rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10277670249342918, "step": 1629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1429.25, "completions/max_terminated_length": 1133.0, "completions/mean_length": 766.390625, "completions/mean_terminated_length": 670.5936126708984, "completions/min_length": 346.25, "completions/min_terminated_length": 346.25, "epoch": 0.815, "grad_norm": 0.36423465609550476, "kl": 0.051422119140625, "learning_rate": 1.9108723202864723e-07, "loss": 0.2415, "num_tokens": 122125409.0, "reward": 0.6203851699829102, "reward_std": 0.2616867069154978, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07484101504087448, "rewards/penalized_accuracy_reward/std": 0.09978803247213364, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12422125414013863, "step": 1630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.5, "completions/mean_length": 1013.578125, "completions/mean_terminated_length": 874.2167205810547, "completions/min_length": 313.75, "completions/min_terminated_length": 313.75, "epoch": 0.8155, "grad_norm": 0.4071907103061676, "kl": 0.03594970703125, "learning_rate": 1.9061402047871833e-07, "loss": 0.2805, "num_tokens": 122200790.0, "reward": 0.5166478306055069, "reward_std": 0.21284692734479904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02492547780275345, "rewards/penalized_accuracy_reward/std": 0.06810938566923141, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19165603816509247, "step": 1631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1486.25, "completions/max_terminated_length": 1338.5, "completions/mean_length": 764.25, 
"completions/mean_terminated_length": 685.4598236083984, "completions/min_length": 266.5, "completions/min_terminated_length": 266.5, "epoch": 0.816, "grad_norm": 0.4317670464515686, "kl": 0.042205810546875, "learning_rate": 1.9014190368080924e-07, "loss": 0.1987, "num_tokens": 122261270.0, "reward": 0.7511638402938843, "reward_std": 0.24842410162091255, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13730067014694214, "rewards/penalized_accuracy_reward/std": 0.09560392051935196, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11443255841732025, "step": 1632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1591.5, "completions/max_terminated_length": 1350.0, "completions/mean_length": 764.671875, "completions/mean_terminated_length": 741.8448028564453, "completions/min_length": 368.75, "completions/min_terminated_length": 368.75, "epoch": 0.8165, "grad_norm": 0.44478920102119446, "kl": 0.042449951171875, "learning_rate": 1.8967088307307e-07, "loss": 0.1247, "num_tokens": 122317521.0, "reward": 0.5842029750347137, "reward_std": 0.298153068870306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04991398751735687, "rewards/penalized_accuracy_reward/std": 0.13040689006447792, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1956.25, "completions/max_terminated_length": 1640.5, "completions/mean_length": 923.625, "completions/mean_terminated_length": 760.46875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.817, "grad_norm": 0.42773908376693726, "kl": 0.0615234375, "learning_rate": 1.8920096009031072e-07, "loss": 0.2953, "num_tokens": 122387321.0, "reward": 0.4663439393043518, "reward_std": 
0.17436645925045013, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012468849308788776, "rewards/penalized_accuracy_reward/std": 0.049875400960445404, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.2049461379647255, "step": 1634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1468.75, "completions/max_terminated_length": 1134.25, "completions/mean_length": 663.84375, "completions/mean_terminated_length": 623.3041839599609, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.8175, "grad_norm": 0.4538942575454712, "kl": 0.03631591796875, "learning_rate": 1.887321361639985e-07, "loss": 0.1887, "num_tokens": 122438431.0, "reward": 0.46875, "reward_std": 0.06494186259806156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.12988372892141342, "step": 1635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1404.75, "completions/max_terminated_length": 1256.75, "completions/mean_length": 840.75, "completions/mean_terminated_length": 709.7812652587891, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.818, "grad_norm": 0.6162127256393433, "kl": 0.0615234375, "learning_rate": 1.8826441272225225e-07, "loss": 0.1872, "num_tokens": 122499407.0, "reward": 0.5050078332424164, "reward_std": 0.20102158188819885, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496485784649849, "rewards/penalized_accuracy_reward/std": 0.06821697950363159, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1834489107131958, "step": 1636 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.0625, "completions/max_length": 1582.75, "completions/max_terminated_length": 1274.25, "completions/mean_length": 771.21875, "completions/mean_terminated_length": 705.4310150146484, "completions/min_length": 423.5, "completions/min_terminated_length": 423.5, "epoch": 0.8185, "grad_norm": 0.5629516839981079, "kl": 0.04241943359375, "learning_rate": 1.8779779118983867e-07, "loss": 0.1904, "num_tokens": 122557101.0, "reward": 0.9741870760917664, "reward_std": 0.4425227865576744, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.24978885054588318, "rewards/penalized_accuracy_reward/std": 0.19254561513662338, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11486320197582245, "step": 1637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1572.75, "completions/max_terminated_length": 1245.0, "completions/mean_length": 683.625, "completions/mean_terminated_length": 641.6656341552734, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.819, "grad_norm": 0.5450237393379211, "kl": 0.0546875, "learning_rate": 1.8733227298816794e-07, "loss": 0.1706, "num_tokens": 122613429.0, "reward": 0.7052513062953949, "reward_std": 0.464660219848156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11239127442240715, "rewards/penalized_accuracy_reward/std": 0.21812903881072998, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1421622931957245, "step": 1638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1540.75, "completions/max_terminated_length": 1345.75, "completions/mean_length": 847.84375, "completions/mean_terminated_length": 789.8819122314453, "completions/min_length": 391.5, "completions/min_terminated_length": 391.5, "epoch": 0.8195, "grad_norm": 
0.39989835023880005, "kl": 0.032745361328125, "learning_rate": 1.8686785953528922e-07, "loss": 0.1437, "num_tokens": 122676091.0, "reward": 0.8241048157215118, "reward_std": 0.30712801218032837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1747477175667882, "rewards/penalized_accuracy_reward/std": 0.130440391600132, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.09249448962509632, "step": 1639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1684.25, "completions/max_terminated_length": 1461.0, "completions/mean_length": 769.515625, "completions/mean_terminated_length": 728.2209930419922, "completions/min_length": 328.75, "completions/min_terminated_length": 328.75, "epoch": 0.82, "grad_norm": 0.3426930606365204, "kl": 0.04241943359375, "learning_rate": 1.8640455224588636e-07, "loss": 0.2097, "num_tokens": 122732332.0, "reward": 0.7032299786806107, "reward_std": 0.40966543927788734, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11235718056559563, "rewards/penalized_accuracy_reward/std": 0.18494025617837906, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12928754836320877, "step": 1640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1939.25, "completions/max_terminated_length": 1553.5, "completions/mean_length": 908.390625, "completions/mean_terminated_length": 826.2182922363281, "completions/min_length": 380.75, "completions/min_terminated_length": 380.75, "epoch": 0.8205, "grad_norm": 0.4732940196990967, "kl": 0.04937744140625, "learning_rate": 1.8594235253127372e-07, "loss": 0.2308, "num_tokens": 122800949.0, "reward": 0.470703125, "reward_std": 0.08293715119361877, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16587430611252785, "step": 1641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1676.25, "completions/mean_length": 1144.96875, "completions/mean_terminated_length": 1037.6595764160156, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.821, "grad_norm": 0.3714974522590637, "kl": 0.03955078125, "learning_rate": 1.8548126179939188e-07, "loss": 0.2542, "num_tokens": 122881667.0, "reward": 0.45703125, "reward_std": 0.09634622558951378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19269245862960815, "step": 1642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1568.75, "completions/max_terminated_length": 1508.5, "completions/mean_length": 864.78125, "completions/mean_terminated_length": 834.7167053222656, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.8215, "grad_norm": 0.3106871545314789, "kl": 0.033843994140625, "learning_rate": 1.850212814548031e-07, "loss": 0.1039, "num_tokens": 122947189.0, "reward": 0.7051256746053696, "reward_std": 0.4045782834291458, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11232845857739449, "rewards/penalized_accuracy_reward/std": 0.1849103793501854, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11245574057102203, "step": 1643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1809.25, "completions/max_terminated_length": 895.75, "completions/mean_length": 621.734375, 
"completions/mean_terminated_length": 525.3111801147461, "completions/min_length": 303.25, "completions/min_terminated_length": 303.25, "epoch": 0.822, "grad_norm": 0.7197040319442749, "kl": 0.0631103515625, "learning_rate": 1.8456241289868718e-07, "loss": 0.3874, "num_tokens": 122996996.0, "reward": 0.747212141752243, "reward_std": 0.42741358280181885, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13727794215083122, "rewards/penalized_accuracy_reward/std": 0.18360115587711334, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18904344737529755, "step": 1644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1310.5, "completions/mean_length": 879.921875, "completions/mean_terminated_length": 802.0500335693359, "completions/min_length": 334.75, "completions/min_terminated_length": 334.75, "epoch": 0.8225, "grad_norm": 0.45504671335220337, "kl": 0.04473876953125, "learning_rate": 1.8410465752883758e-07, "loss": 0.3192, "num_tokens": 123061039.0, "reward": 0.49763795733451843, "reward_std": 0.1611078567802906, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01249085832387209, "rewards/penalized_accuracy_reward/std": 0.049963437020778656, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17731912061572075, "step": 1645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1656.0, "completions/max_terminated_length": 1080.5, "completions/mean_length": 835.015625, "completions/mean_terminated_length": 679.4207000732422, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "epoch": 0.823, "grad_norm": 0.39002057909965515, "kl": 0.063018798828125, "learning_rate": 1.8364801673965642e-07, "loss": 0.2535, "num_tokens": 123123200.0, "reward": 
0.6567376852035522, "reward_std": 0.35932663455605507, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09985320549458265, "rewards/penalized_accuracy_reward/std": 0.1522696241736412, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.18516581133008003, "step": 1646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1890.25, "completions/max_terminated_length": 1638.75, "completions/mean_length": 1049.578125, "completions/mean_terminated_length": 935.653076171875, "completions/min_length": 365.5, "completions/min_terminated_length": 365.5, "epoch": 0.8235, "grad_norm": 0.39580097794532776, "kl": 0.04107666015625, "learning_rate": 1.8319249192215055e-07, "loss": 0.2081, "num_tokens": 123199013.0, "reward": 0.44921875, "reward_std": 0.10206014290452003, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.20412028953433037, "step": 1647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 1066.984375, "completions/mean_terminated_length": 866.2034301757812, "completions/min_length": 347.5, "completions/min_terminated_length": 347.5, "epoch": 0.824, "grad_norm": 0.32808083295822144, "kl": 0.05499267578125, "learning_rate": 1.8273808446392785e-07, "loss": 0.1888, "num_tokens": 123280980.0, "reward": 0.6294776052236557, "reward_std": 0.48822786659002304, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09989506844431162, "rewards/penalized_accuracy_reward/std": 0.21381840854883194, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2730148062109947, "step": 
1648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1574.0, "completions/max_terminated_length": 1233.5, "completions/mean_length": 771.0, "completions/mean_terminated_length": 582.9361724853516, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.8245, "grad_norm": 0.5446438789367676, "kl": 0.0577392578125, "learning_rate": 1.822847957491922e-07, "loss": 0.2596, "num_tokens": 123339844.0, "reward": 0.7930573672056198, "reward_std": 0.3906508535146713, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17484899796545506, "rewards/penalized_accuracy_reward/std": 0.15762928873300552, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.1986480914056301, "step": 1649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.75, "completions/mean_length": 1040.6875, "completions/mean_terminated_length": 851.6833801269531, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.825, "grad_norm": 0.4591538608074188, "kl": 0.038726806640625, "learning_rate": 1.8183262715873938e-07, "loss": 0.2049, "num_tokens": 123415728.0, "reward": 0.5084511041641235, "reward_std": 0.30429515428841114, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03742868360131979, "rewards/penalized_accuracy_reward/std": 0.11809965968132019, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2399933561682701, "step": 1650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 1109.171875, "completions/mean_terminated_length": 892.1106567382812, "completions/min_length": 396.5, "completions/min_terminated_length": 396.5, "epoch": 
0.8255, "grad_norm": 0.3288770616054535, "kl": 0.045806884765625, "learning_rate": 1.8138158006995363e-07, "loss": 0.3462, "num_tokens": 123496443.0, "reward": 0.423828125, "reward_std": 0.13311711698770523, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.84765625, "rewards/tag_count_reward/std": 0.26623423770070076, "step": 1651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1972.75, "completions/max_terminated_length": 1526.25, "completions/mean_length": 1055.921875, "completions/mean_terminated_length": 878.8489685058594, "completions/min_length": 384.25, "completions/min_terminated_length": 384.25, "epoch": 0.826, "grad_norm": 0.2944071888923645, "kl": 0.04742431640625, "learning_rate": 1.8093165585680253e-07, "loss": 0.2614, "num_tokens": 123572454.0, "reward": 0.7542482018470764, "reward_std": 0.2623208202421665, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16228033602237701, "rewards/penalized_accuracy_reward/std": 0.08051362633705139, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.2220250517129898, "step": 1652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1688.0, "completions/max_terminated_length": 1546.25, "completions/mean_length": 944.75, "completions/mean_terminated_length": 894.7625122070312, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 0.8265, "grad_norm": 0.31737247109413147, "kl": 0.04180908203125, "learning_rate": 1.804828558898332e-07, "loss": 0.1496, "num_tokens": 123642502.0, "reward": 0.47265625, "reward_std": 0.06933305040001869, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13866610452532768, "step": 1653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1776.0, "completions/max_terminated_length": 1604.5, "completions/mean_length": 929.015625, "completions/mean_terminated_length": 876.4168548583984, "completions/min_length": 336.75, "completions/min_terminated_length": 336.75, "epoch": 0.827, "grad_norm": 0.40092766284942627, "kl": 0.030670166015625, "learning_rate": 1.800351815361682e-07, "loss": 0.1121, "num_tokens": 123708951.0, "reward": 0.7530007213354111, "reward_std": 0.4014486111700535, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13724254816770554, "rewards/penalized_accuracy_reward/std": 0.17044097185134888, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 1654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1917.75, "completions/max_terminated_length": 1787.25, "completions/mean_length": 918.578125, "completions/mean_terminated_length": 868.2751770019531, "completions/min_length": 332.75, "completions/min_terminated_length": 332.75, "epoch": 0.8275, "grad_norm": 0.3680141270160675, "kl": 0.0394287109375, "learning_rate": 1.7958863415950112e-07, "loss": 0.185, "num_tokens": 123777532.0, "reward": 0.474609375, "reward_std": 0.06685471162199974, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13370942324399948, "step": 1655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1938.75, "completions/max_terminated_length": 1675.0, "completions/mean_length": 997.140625, 
"completions/mean_terminated_length": 912.0987091064453, "completions/min_length": 386.25, "completions/min_terminated_length": 386.25, "epoch": 0.828, "grad_norm": 0.40684443712234497, "kl": 0.038055419921875, "learning_rate": 1.7914321512009296e-07, "loss": 0.2067, "num_tokens": 123850901.0, "reward": 0.474609375, "reward_std": 0.07530180923640728, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15060361847281456, "step": 1656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1301.5, "completions/max_terminated_length": 1077.5, "completions/mean_length": 707.234375, "completions/mean_terminated_length": 668.6741180419922, "completions/min_length": 252.25, "completions/min_terminated_length": 252.25, "epoch": 0.8285, "grad_norm": 0.5048791170120239, "kl": 0.047698974609375, "learning_rate": 1.7869892577476722e-07, "loss": 0.1981, "num_tokens": 123904852.0, "reward": 0.9761841595172882, "reward_std": 0.22992957010865211, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2498108297586441, "rewards/penalized_accuracy_reward/std": 0.08941372909612255, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1023404598236084, "step": 1657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1863.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 1041.15625, "completions/mean_terminated_length": 894.1174621582031, "completions/min_length": 512.75, "completions/min_terminated_length": 512.75, "epoch": 0.829, "grad_norm": 0.43050938844680786, "kl": 0.045928955078125, "learning_rate": 1.782557674769063e-07, "loss": 0.2398, "num_tokens": 123983422.0, "reward": 0.4453125, "reward_std": 0.1089695356786251, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21793907321989536, "step": 1658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1888.0, "completions/max_terminated_length": 1390.75, "completions/mean_length": 741.4375, "completions/mean_terminated_length": 672.7985305786133, "completions/min_length": 297.75, "completions/min_terminated_length": 297.75, "epoch": 0.8295, "grad_norm": 0.5017850995063782, "kl": 0.041168212890625, "learning_rate": 1.7781374157644713e-07, "loss": 0.2719, "num_tokens": 124038314.0, "reward": 0.482421875, "reward_std": 0.055459219962358475, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11091844737529755, "step": 1659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1576.25, "completions/max_terminated_length": 1450.5, "completions/mean_length": 962.953125, "completions/mean_terminated_length": 857.9776916503906, "completions/min_length": 354.25, "completions/min_terminated_length": 354.25, "epoch": 0.83, "grad_norm": 3.254981756210327, "kl": 0.071441650390625, "learning_rate": 1.773728494198775e-07, "loss": 0.2343, "num_tokens": 124108151.0, "reward": 0.7586442828178406, "reward_std": 0.4426162876188755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14982995018363, "rewards/penalized_accuracy_reward/std": 0.18290239572525024, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.17952607572078705, "step": 1660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, 
"completions/max_length": 1466.5, "completions/max_terminated_length": 1331.5, "completions/mean_length": 709.640625, "completions/mean_terminated_length": 672.183349609375, "completions/min_length": 328.5, "completions/min_terminated_length": 328.5, "epoch": 0.8305, "grad_norm": 0.3852446675300598, "kl": 0.051513671875, "learning_rate": 1.7693309235023127e-07, "loss": 0.1494, "num_tokens": 124163568.0, "reward": 0.486328125, "reward_std": 0.047643646597862244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1833.25, "completions/max_terminated_length": 1303.75, "completions/mean_length": 820.640625, "completions/mean_terminated_length": 760.7521057128906, "completions/min_length": 342.25, "completions/min_terminated_length": 342.25, "epoch": 0.831, "grad_norm": 0.5445142984390259, "kl": 0.05511474609375, "learning_rate": 1.7649447170708466e-07, "loss": 0.2682, "num_tokens": 124224937.0, "reward": 0.470703125, "reward_std": 0.09192858636379242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.18385717645287514, "step": 1662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1840.5, "completions/max_terminated_length": 1302.25, "completions/mean_length": 735.421875, "completions/mean_terminated_length": 605.659912109375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8315, "grad_norm": 0.5138114094734192, "kl": 0.0572509765625, "learning_rate": 1.7605698882655233e-07, "loss": 0.4242, 
"num_tokens": 124280532.0, "reward": 0.6816791296005249, "reward_std": 0.29992271959781647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11232394725084305, "rewards/penalized_accuracy_reward/std": 0.10230934619903564, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20950419828295708, "step": 1663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1650.75, "completions/max_terminated_length": 1255.75, "completions/mean_length": 942.140625, "completions/mean_terminated_length": 764.2253875732422, "completions/min_length": 276.5, "completions/min_terminated_length": 276.5, "epoch": 0.832, "grad_norm": 0.2768719792366028, "kl": 0.064422607421875, "learning_rate": 1.7562064504128281e-07, "loss": 0.1879, "num_tokens": 124350285.0, "reward": 0.5221194177865982, "reward_std": 0.2951732650399208, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03742690198123455, "rewards/penalized_accuracy_reward/std": 0.1180531196296215, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1844867467880249, "step": 1664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1249.0, "completions/max_terminated_length": 1119.75, "completions/mean_length": 769.78125, "completions/mean_terminated_length": 736.9017944335938, "completions/min_length": 383.75, "completions/min_terminated_length": 383.75, "epoch": 0.8325, "grad_norm": 0.44222697615623474, "kl": 0.042449951171875, "learning_rate": 1.7518544168045524e-07, "loss": 0.0984, "num_tokens": 124407503.0, "reward": 1.0752758383750916, "reward_std": 0.4081740342080593, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2993566691875458, "rewards/penalized_accuracy_reward/std": 0.1785019040107727, 
"rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1023404598236084, "step": 1665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1650.25, "completions/max_terminated_length": 1354.5, "completions/mean_length": 903.53125, "completions/mean_terminated_length": 836.17333984375, "completions/min_length": 491.5, "completions/min_terminated_length": 491.5, "epoch": 0.833, "grad_norm": 0.4460863471031189, "kl": 0.041351318359375, "learning_rate": 1.7475138006977437e-07, "loss": 0.1843, "num_tokens": 124473073.0, "reward": 0.6142880320549011, "reward_std": 0.2679305225610733, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07472214102745056, "rewards/penalized_accuracy_reward/std": 0.09962952882051468, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.15789926052093506, "step": 1666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2030.25, "completions/max_terminated_length": 1762.5, "completions/mean_length": 898.421875, "completions/mean_terminated_length": 823.3446655273438, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.8335, "grad_norm": 0.4741263687610626, "kl": 0.0577392578125, "learning_rate": 1.743184615314671e-07, "loss": 0.3788, "num_tokens": 124539020.0, "reward": 0.6471695303916931, "reward_std": 0.28537512943148613, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08725664019584656, "rewards/penalized_accuracy_reward/std": 0.10218454897403717, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17379852384328842, "step": 1667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1737.25, "completions/max_terminated_length": 1011.75, "completions/mean_length": 685.921875, 
"completions/mean_terminated_length": 600.0233764648438, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.834, "grad_norm": 0.6717742681503296, "kl": 0.06414794921875, "learning_rate": 1.7388668738427847e-07, "loss": 0.3202, "num_tokens": 124592087.0, "reward": 0.7762331515550613, "reward_std": 0.42147646844387054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14983532577753067, "rewards/penalized_accuracy_reward/std": 0.1829221472144127, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15779344737529755, "step": 1668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1448.25, "completions/max_terminated_length": 1445.75, "completions/mean_length": 695.5625, "completions/mean_terminated_length": 674.277099609375, "completions/min_length": 315.25, "completions/min_terminated_length": 315.25, "epoch": 0.8345, "grad_norm": 0.3446996212005615, "kl": 0.04937744140625, "learning_rate": 1.7345605894346726e-07, "loss": 0.0693, "num_tokens": 124646075.0, "reward": 0.740029975771904, "reward_std": 0.3682616055011749, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.124897800385952, "rewards/penalized_accuracy_reward/std": 0.18290869891643524, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.06403729319572449, "step": 1669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1628.0, "completions/max_terminated_length": 1426.25, "completions/mean_length": 663.828125, "completions/mean_terminated_length": 613.7031326293945, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.835, "grad_norm": 0.3171371817588806, "kl": 0.041473388671875, "learning_rate": 1.7302657752080258e-07, "loss": 0.0998, "num_tokens": 124696640.0, "reward": 
0.6322199702262878, "reward_std": 0.2281547337770462, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07489904761314392, "rewards/penalized_accuracy_reward/std": 0.0998653993010521, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.0914958082139492, "step": 1670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1806.0, "completions/max_terminated_length": 1650.5, "completions/mean_length": 936.265625, "completions/mean_terminated_length": 918.5718841552734, "completions/min_length": 324.75, "completions/min_terminated_length": 324.75, "epoch": 0.8355, "grad_norm": 0.4448389410972595, "kl": 0.0443115234375, "learning_rate": 1.7259824442455923e-07, "loss": 0.0118, "num_tokens": 124766849.0, "reward": 0.8530537188053131, "reward_std": 0.536459356546402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18726905435323715, "rewards/penalized_accuracy_reward/std": 0.2669881135225296, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14832578226923943, "step": 1671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1489.75, "completions/max_terminated_length": 1457.75, "completions/mean_length": 807.9375, "completions/mean_terminated_length": 773.1584930419922, "completions/min_length": 345.25, "completions/min_terminated_length": 345.25, "epoch": 0.836, "grad_norm": 0.1757274866104126, "kl": 0.04296875, "learning_rate": 1.7217106095951412e-07, "loss": 0.0606, "num_tokens": 124827549.0, "reward": 0.482421875, "reward_std": 0.030122904106974602, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.060245808213949203, "step": 1672 
}, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1917.25, "completions/max_terminated_length": 1507.0, "completions/mean_length": 903.375, "completions/mean_terminated_length": 823.6796722412109, "completions/min_length": 293.25, "completions/min_terminated_length": 293.25, "epoch": 0.8365, "grad_norm": 0.5187534689903259, "kl": 0.058013916015625, "learning_rate": 1.7174502842694212e-07, "loss": 0.2591, "num_tokens": 124893621.0, "reward": 0.5206732898950577, "reward_std": 0.20515315607190132, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498508058488369, "rewards/penalized_accuracy_reward/std": 0.06827224791049957, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13721735030412674, "step": 1673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1920.75, "completions/max_terminated_length": 1354.25, "completions/mean_length": 888.703125, "completions/mean_terminated_length": 809.3092498779297, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "epoch": 0.837, "grad_norm": 0.3740379512310028, "kl": 0.042633056640625, "learning_rate": 1.7132014812461227e-07, "loss": 0.1926, "num_tokens": 124959810.0, "reward": 0.5015309751033783, "reward_std": 0.15955640748143196, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012484240345656872, "rewards/penalized_accuracy_reward/std": 0.04993696138262749, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.15779344737529755, "step": 1674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1876.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 733.21875, "completions/mean_terminated_length": 650.8945159912109, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, 
"epoch": 0.8375, "grad_norm": 0.4680139720439911, "kl": 0.05242919921875, "learning_rate": 1.7089642134678364e-07, "loss": 0.3209, "num_tokens": 125016528.0, "reward": 0.7491114437580109, "reward_std": 0.25585826486349106, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13725104928016663, "rewards/penalized_accuracy_reward/std": 0.09556940943002701, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15602656453847885, "step": 1675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 435.828125, "completions/mean_terminated_length": 435.828125, "completions/min_length": 210.75, "completions/min_terminated_length": 210.75, "epoch": 0.838, "grad_norm": 0.6414474844932556, "kl": 0.06549072265625, "learning_rate": 1.704738493842015e-07, "loss": 0.1149, "num_tokens": 125056117.0, "reward": 0.9725464284420013, "reward_std": 0.5036818459630013, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.23724977858364582, "rewards/penalized_accuracy_reward/std": 0.24793468415737152, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2029.0, "completions/max_terminated_length": 1664.25, "completions/mean_length": 944.203125, "completions/mean_terminated_length": 862.8757781982422, "completions/min_length": 378.75, "completions/min_terminated_length": 378.75, "epoch": 0.8385, "grad_norm": 0.555585503578186, "kl": 0.0562744140625, "learning_rate": 1.7005243352409333e-07, "loss": 0.3327, "num_tokens": 125125138.0, "reward": 0.451171875, "reward_std": 0.10287893377244473, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20575786754488945, "step": 1677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1483.75, "completions/max_terminated_length": 1229.0, "completions/mean_length": 776.8125, "completions/mean_terminated_length": 676.8656311035156, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.839, "grad_norm": 0.2614339590072632, "kl": 0.0634765625, "learning_rate": 1.6963217505016475e-07, "loss": 0.1213, "num_tokens": 125183814.0, "reward": 0.5357427895069122, "reward_std": 0.20878159254789352, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03740264102816582, "rewards/penalized_accuracy_reward/std": 0.08041330426931381, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.13368403166532516, "step": 1678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1772.5, "completions/max_terminated_length": 1403.25, "completions/mean_length": 956.734375, "completions/mean_terminated_length": 815.5286560058594, "completions/min_length": 423.25, "completions/min_terminated_length": 423.25, "epoch": 0.8395, "grad_norm": 0.3856779932975769, "kl": 0.0418701171875, "learning_rate": 1.6921307524259625e-07, "loss": 0.2538, "num_tokens": 125252789.0, "reward": 0.455078125, "reward_std": 0.08816255815327168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17632511630654335, "step": 1679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1779.0, "completions/max_terminated_length": 1315.5, "completions/mean_length": 934.5, 
"completions/mean_terminated_length": 676.6274261474609, "completions/min_length": 258.5, "completions/min_terminated_length": 258.5, "epoch": 0.84, "grad_norm": 0.3671688437461853, "kl": 0.06207275390625, "learning_rate": 1.6879513537803839e-07, "loss": 0.2771, "num_tokens": 125322613.0, "reward": 0.42578125, "reward_std": 0.1021735928952694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2043471857905388, "step": 1680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1738.5, "completions/max_terminated_length": 1487.25, "completions/mean_length": 949.234375, "completions/mean_terminated_length": 805.0685119628906, "completions/min_length": 245.5, "completions/min_terminated_length": 245.5, "epoch": 0.8405, "grad_norm": 0.5789071917533875, "kl": 0.04632568359375, "learning_rate": 1.6837835672960831e-07, "loss": 0.1035, "num_tokens": 125398916.0, "reward": 0.7833090722560883, "reward_std": 0.34032438322901726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.19304947182536125, "step": 1681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1472.25, "completions/max_terminated_length": 1162.0, "completions/mean_length": 735.953125, "completions/mean_terminated_length": 679.2434692382812, "completions/min_length": 270.25, "completions/min_terminated_length": 270.25, "epoch": 0.841, "grad_norm": 0.4798482060432434, "kl": 0.04498291015625, "learning_rate": 1.6796274056688637e-07, "loss": 0.0289, "num_tokens": 125454657.0, "reward": 0.7129774540662766, "reward_std": 0.571827307343483, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11234810762107372, "rewards/penalized_accuracy_reward/std": 0.28516414016485214, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07966229319572449, "step": 1682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1796.25, "completions/max_terminated_length": 1430.25, "completions/mean_length": 1059.953125, "completions/mean_terminated_length": 877.6598968505859, "completions/min_length": 338.25, "completions/min_terminated_length": 338.25, "epoch": 0.8415, "grad_norm": 0.3851536810398102, "kl": 0.05029296875, "learning_rate": 1.6754828815591131e-07, "loss": 0.179, "num_tokens": 125531022.0, "reward": 0.46246451139450073, "reward_std": 0.19341251254081726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248226035386324, "rewards/penalized_accuracy_reward/std": 0.049929045140743256, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.23912455141544342, "step": 1683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1566.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 675.53125, "completions/mean_terminated_length": 611.7933197021484, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.842, "grad_norm": 0.43869245052337646, "kl": 0.051177978515625, "learning_rate": 1.6713500075917694e-07, "loss": 0.2926, "num_tokens": 125582128.0, "reward": 0.5764593183994293, "reward_std": 0.23841168358922005, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994840919971466, "rewards/penalized_accuracy_reward/std": 0.08935043215751648, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14491254836320877, "step": 1684 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.03125, "completions/max_length": 1769.75, "completions/max_terminated_length": 1609.5, "completions/mean_length": 845.8125, "completions/mean_terminated_length": 804.7410736083984, "completions/min_length": 299.25, "completions/min_terminated_length": 299.25, "epoch": 0.8425, "grad_norm": 0.45642900466918945, "kl": 0.0487060546875, "learning_rate": 1.6672287963562852e-07, "loss": 0.1643, "num_tokens": 125645012.0, "reward": 0.5801907926797867, "reward_std": 0.22824589908123016, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04986102133989334, "rewards/penalized_accuracy_reward/std": 0.08919411897659302, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09971532225608826, "step": 1685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1660.75, "completions/max_terminated_length": 1299.25, "completions/mean_length": 764.40625, "completions/mean_terminated_length": 722.7968902587891, "completions/min_length": 290.5, "completions/min_terminated_length": 290.5, "epoch": 0.843, "grad_norm": 0.39869430661201477, "kl": 0.04150390625, "learning_rate": 1.6631192604065852e-07, "loss": 0.1884, "num_tokens": 125702366.0, "reward": 0.7107043117284775, "reward_std": 0.39122800529003143, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11218808591365814, "rewards/penalized_accuracy_reward/std": 0.18010788410902023, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1566.0, "completions/max_terminated_length": 1443.5, "completions/mean_length": 784.625, "completions/mean_terminated_length": 744.7901916503906, "completions/min_length": 338.75, "completions/min_terminated_length": 338.75, "epoch": 0.8435, 
"grad_norm": 0.4340851306915283, "kl": 0.052276611328125, "learning_rate": 1.659021412261026e-07, "loss": 0.1404, "num_tokens": 125761302.0, "reward": 0.5783020555973053, "reward_std": 0.2235366404056549, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04989320784807205, "rewards/penalized_accuracy_reward/std": 0.08925169706344604, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11534032225608826, "step": 1687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1626.75, "completions/max_terminated_length": 1442.25, "completions/mean_length": 916.15625, "completions/mean_terminated_length": 853.3847961425781, "completions/min_length": 368.75, "completions/min_terminated_length": 368.75, "epoch": 0.844, "grad_norm": 0.39641186594963074, "kl": 0.03466796875, "learning_rate": 1.6549352644023668e-07, "loss": 0.1688, "num_tokens": 125829872.0, "reward": 0.7802441269159317, "reward_std": 0.3248737845569849, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.149887690320611, "rewards/penalized_accuracy_reward/std": 0.145635187625885, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.10263093188405037, "step": 1688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1565.0, "completions/max_terminated_length": 1401.75, "completions/mean_length": 785.9375, "completions/mean_terminated_length": 752.0875244140625, "completions/min_length": 310.75, "completions/min_terminated_length": 310.75, "epoch": 0.8445, "grad_norm": 0.44982245564460754, "kl": 0.03948974609375, "learning_rate": 1.6508608292777203e-07, "loss": 0.155, "num_tokens": 125895324.0, "reward": 0.8132099211215973, "reward_std": 0.20808477699756622, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.16246432065963745, "rewards/penalized_accuracy_reward/std": 0.08060488104820251, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1940.5, "completions/max_terminated_length": 1667.5, "completions/mean_length": 1036.5625, "completions/mean_terminated_length": 846.2443695068359, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.845, "grad_norm": 0.39846768975257874, "kl": 0.034698486328125, "learning_rate": 1.646798119298523e-07, "loss": 0.2438, "num_tokens": 125971216.0, "reward": 0.447265625, "reward_std": 0.08483916148543358, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.16967833414673805, "step": 1690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 1060.71875, "completions/mean_terminated_length": 797.4889068603516, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8455, "grad_norm": 0.32049524784088135, "kl": 0.05877685546875, "learning_rate": 1.6427471468404952e-07, "loss": 0.2301, "num_tokens": 126048158.0, "reward": 0.5926349759101868, "reward_std": 0.43950963020324707, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08733310922980309, "rewards/penalized_accuracy_reward/std": 0.16975773125886917, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.27323222905397415, "step": 1691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1775.25, "completions/max_terminated_length": 1238.25, "completions/mean_length": 
739.078125, "completions/mean_terminated_length": 658.7732391357422, "completions/min_length": 296.5, "completions/min_terminated_length": 296.5, "epoch": 0.846, "grad_norm": 0.5277993679046631, "kl": 0.04473876953125, "learning_rate": 1.6387079242435995e-07, "loss": 0.3125, "num_tokens": 126103155.0, "reward": 0.8820023387670517, "reward_std": 0.37219712138175964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.11894455552101135, "step": 1692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1547.0, "completions/max_terminated_length": 1507.75, "completions/mean_length": 881.09375, "completions/mean_terminated_length": 864.3448028564453, "completions/min_length": 417.25, "completions/min_terminated_length": 417.25, "epoch": 0.8465, "grad_norm": 0.42669275403022766, "kl": 0.0372314453125, "learning_rate": 1.6346804638120098e-07, "loss": 0.0561, "num_tokens": 126166505.0, "reward": 0.6630760133266449, "reward_std": 0.32977263256907463, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08739738538861275, "rewards/penalized_accuracy_reward/std": 0.1498088240623474, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 1693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1731.75, "completions/max_terminated_length": 1608.5, "completions/mean_length": 876.8125, "completions/mean_terminated_length": 820.5479431152344, "completions/min_length": 327.25, "completions/min_terminated_length": 327.25, "epoch": 0.847, "grad_norm": 0.36026543378829956, "kl": 0.038330078125, "learning_rate": 1.6306647778140697e-07, "loss": 0.1453, "num_tokens": 126232605.0, "reward": 0.478515625, "reward_std": 
0.06346176192164421, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.12692352384328842, "step": 1694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1476.25, "completions/max_terminated_length": 1228.5, "completions/mean_length": 666.140625, "completions/mean_terminated_length": 645.8697967529297, "completions/min_length": 302.5, "completions/min_terminated_length": 302.5, "epoch": 0.8475, "grad_norm": 0.6824751496315002, "kl": 0.0634765625, "learning_rate": 1.6266608784822542e-07, "loss": 0.1683, "num_tokens": 126284822.0, "reward": 0.5093502402305603, "reward_std": 0.14901446551084518, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012487619183957577, "rewards/penalized_accuracy_reward/std": 0.04995047673583031, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 1695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1803.5, "completions/max_terminated_length": 1563.5, "completions/mean_length": 1014.34375, "completions/mean_terminated_length": 933.0041961669922, "completions/min_length": 348.5, "completions/min_terminated_length": 348.5, "epoch": 0.848, "grad_norm": 0.49288031458854675, "kl": 0.04449462890625, "learning_rate": 1.6226687780131337e-07, "loss": 0.1524, "num_tokens": 126357532.0, "reward": 0.5647389739751816, "reward_std": 0.3277532607316971, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994761198759079, "rewards/penalized_accuracy_reward/std": 0.1364828646183014, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1875, "step": 1696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.046875, "completions/max_length": 1889.75, "completions/max_terminated_length": 1619.0, "completions/mean_length": 837.5, "completions/mean_terminated_length": 781.4135589599609, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8485, "grad_norm": 0.43097954988479614, "kl": 0.04217529296875, "learning_rate": 1.6186884885673413e-07, "loss": 0.218, "num_tokens": 126421996.0, "reward": 0.807132750749588, "reward_std": 0.38489823043346405, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623554304242134, "rewards/penalized_accuracy_reward/std": 0.16394022852182388, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.140625, "step": 1697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1865.5, "completions/max_terminated_length": 1484.0, "completions/mean_length": 1175.53125, "completions/mean_terminated_length": 911.2999725341797, "completions/min_length": 447.5, "completions/min_terminated_length": 447.5, "epoch": 0.849, "grad_norm": 0.33761635422706604, "kl": 0.06005859375, "learning_rate": 1.6147200222695275e-07, "loss": 0.3201, "num_tokens": 126507998.0, "reward": 0.6157532632350922, "reward_std": 0.31923810578882694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09986881911754608, "rewards/penalized_accuracy_reward/std": 0.1031440868973732, "rewards/tag_count_reward/mean": 0.83203125, "rewards/tag_count_reward/std": 0.2466974686831236, "step": 1698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1655.5, "completions/max_terminated_length": 1641.0, "completions/mean_length": 833.21875, "completions/mean_terminated_length": 819.5343933105469, "completions/min_length": 321.5, "completions/min_terminated_length": 321.5, "epoch": 0.8495, "grad_norm": 0.3561137616634369, "kl": 0.041748046875, 
"learning_rate": 1.610763391208329e-07, "loss": 0.0176, "num_tokens": 126570764.0, "reward": 0.5651548206806183, "reward_std": 0.18586163967847824, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037460215389728546, "rewards/penalized_accuracy_reward/std": 0.08053706586360931, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.049575019627809525, "step": 1699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1608.75, "completions/max_terminated_length": 1314.5, "completions/mean_length": 856.75, "completions/mean_terminated_length": 741.0444946289062, "completions/min_length": 281.25, "completions/min_terminated_length": 281.25, "epoch": 0.85, "grad_norm": 0.3654150366783142, "kl": 0.04278564453125, "learning_rate": 1.6068186074363307e-07, "loss": 0.2212, "num_tokens": 126636444.0, "reward": 0.6377193033695221, "reward_std": 0.2783883959054947, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741434663534164, "rewards/penalized_accuracy_reward/std": 0.10236921161413193, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.14729997515678406, "step": 1700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1401.5, "completions/max_terminated_length": 1366.75, "completions/mean_length": 809.21875, "completions/mean_terminated_length": 737.0044708251953, "completions/min_length": 337.75, "completions/min_terminated_length": 337.75, "epoch": 0.8505, "grad_norm": 0.46931272745132446, "kl": 0.040496826171875, "learning_rate": 1.6028856829700258e-07, "loss": 0.1684, "num_tokens": 126696986.0, "reward": 0.9971396625041962, "reward_std": 0.3483956716954708, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2622416913509369, 
"rewards/penalized_accuracy_reward/std": 0.1498163565993309, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1212318204343319, "step": 1701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1596.25, "completions/max_terminated_length": 1366.75, "completions/mean_length": 917.609375, "completions/mean_terminated_length": 826.7529449462891, "completions/min_length": 370.5, "completions/min_terminated_length": 370.5, "epoch": 0.851, "grad_norm": 0.3254840075969696, "kl": 0.04180908203125, "learning_rate": 1.5989646297897876e-07, "loss": 0.1875, "num_tokens": 126764737.0, "reward": 0.47265625, "reward_std": 0.06507972255349159, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13015944883227348, "step": 1702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1677.0, "completions/max_terminated_length": 1551.5, "completions/mean_length": 923.734375, "completions/mean_terminated_length": 798.3025665283203, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.8515, "grad_norm": 0.43866443634033203, "kl": 0.04046630859375, "learning_rate": 1.5950554598398228e-07, "loss": 0.2381, "num_tokens": 126833344.0, "reward": 0.8664971590042114, "reward_std": 0.27084274031221867, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.199850139208138, "rewards/penalized_accuracy_reward/std": 0.09990137815475464, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14207998290657997, "step": 1703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1315.5, "completions/max_terminated_length": 1315.5, "completions/mean_length": 721.59375, 
"completions/mean_terminated_length": 721.59375, "completions/min_length": 267.5, "completions/min_terminated_length": 267.5, "epoch": 0.852, "grad_norm": 0.4159834682941437, "kl": 0.0465087890625, "learning_rate": 1.59115818502814e-07, "loss": 0.0303, "num_tokens": 126888982.0, "reward": 0.490234375, "reward_std": 0.0390625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2042.75, "completions/max_terminated_length": 1937.25, "completions/mean_length": 1228.140625, "completions/mean_terminated_length": 1117.4695434570312, "completions/min_length": 463.75, "completions/min_terminated_length": 463.75, "epoch": 0.8525, "grad_norm": 0.22214633226394653, "kl": 0.029998779296875, "learning_rate": 1.5872728172265146e-07, "loss": 0.1448, "num_tokens": 126975391.0, "reward": 0.7067909687757492, "reward_std": 0.43657156080007553, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1248798593878746, "rewards/penalized_accuracy_reward/std": 0.1892523616552353, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1760583184659481, "step": 1705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1817.75, "completions/max_terminated_length": 1542.25, "completions/mean_length": 1139.515625, "completions/mean_terminated_length": 966.2406311035156, "completions/min_length": 445.5, "completions/min_terminated_length": 445.5, "epoch": 0.853, "grad_norm": 0.35865503549575806, "kl": 0.055389404296875, "learning_rate": 1.5833993682704515e-07, "loss": 0.2028, "num_tokens": 127059952.0, "reward": 0.447265625, "reward_std": 0.08449020981788635, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.1689804196357727, "step": 1706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1935.5, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1040.328125, "completions/mean_terminated_length": 903.2481536865234, "completions/min_length": 404.25, "completions/min_terminated_length": 404.25, "epoch": 0.8535, "grad_norm": 0.3951480984687805, "kl": 0.047607421875, "learning_rate": 1.579537849959148e-07, "loss": 0.1174, "num_tokens": 127136133.0, "reward": 0.5589132159948349, "reward_std": 0.31781119108200073, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0499644186347723, "rewards/penalized_accuracy_reward/std": 0.13053050264716148, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.19252349436283112, "step": 1707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1758.75, "completions/max_terminated_length": 1431.5, "completions/mean_length": 828.359375, "completions/mean_terminated_length": 753.1528625488281, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.854, "grad_norm": 0.4093847870826721, "kl": 0.05072021484375, "learning_rate": 1.5756882740554578e-07, "loss": 0.1076, "num_tokens": 127198172.0, "reward": 0.6744543015956879, "reward_std": 0.38979583233594894, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09992246329784393, "rewards/penalized_accuracy_reward/std": 0.17621616274118423, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1361413449048996, "step": 1708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, 
"completions/max_length": 1728.5, "completions/max_terminated_length": 1595.25, "completions/mean_length": 758.078125, "completions/mean_terminated_length": 737.5437622070312, "completions/min_length": 347.5, "completions/min_terminated_length": 347.5, "epoch": 0.8545, "grad_norm": 0.34123337268829346, "kl": 0.03790283203125, "learning_rate": 1.5718506522858572e-07, "loss": 0.1042, "num_tokens": 127253697.0, "reward": 0.48828125, "reward_std": 0.039831146597862244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07966229319572449, "step": 1709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1444.25, "completions/mean_length": 848.59375, "completions/mean_terminated_length": 768.6333618164062, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.855, "grad_norm": 0.47688549757003784, "kl": 0.035064697265625, "learning_rate": 1.5680249963404065e-07, "loss": 0.1828, "num_tokens": 127318471.0, "reward": 0.5225120931863785, "reward_std": 0.2554154470562935, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024927922524511814, "rewards/penalized_accuracy_reward/std": 0.09971169382333755, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18231988325715065, "step": 1710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1391.25, "completions/max_terminated_length": 1207.0, "completions/mean_length": 715.515625, "completions/mean_terminated_length": 672.390625, "completions/min_length": 241.25, "completions/min_terminated_length": 241.25, "epoch": 0.8555, "grad_norm": 0.21076053380966187, "kl": 0.05169677734375, "learning_rate": 
1.5642113178727193e-07, "loss": 0.0745, "num_tokens": 127370376.0, "reward": 0.5860991179943085, "reward_std": 0.18921209871768951, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049885496497154236, "rewards/penalized_accuracy_reward/std": 0.08923793584108353, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.06442352384328842, "step": 1711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1934.5, "completions/max_terminated_length": 1543.25, "completions/mean_length": 855.78125, "completions/mean_terminated_length": 699.2010955810547, "completions/min_length": 284.25, "completions/min_terminated_length": 284.25, "epoch": 0.856, "grad_norm": 0.3755073845386505, "kl": 0.050537109375, "learning_rate": 1.56040962849992e-07, "loss": 0.2029, "num_tokens": 127433690.0, "reward": 0.529877208173275, "reward_std": 0.28180408105254173, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037399536930024624, "rewards/penalized_accuracy_reward/std": 0.1180017776787281, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.17873795703053474, "step": 1712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.25, "completions/mean_length": 1208.71875, "completions/mean_terminated_length": 1080.2567901611328, "completions/min_length": 499.75, "completions/min_terminated_length": 499.75, "epoch": 0.8565, "grad_norm": 0.3166237771511078, "kl": 0.046478271484375, "learning_rate": 1.5566199398026147e-07, "loss": 0.2104, "num_tokens": 127521560.0, "reward": 0.514374852180481, "reward_std": 0.2652791030704975, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03746086731553078, "rewards/penalized_accuracy_reward/std": 
0.08053845912218094, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2586735598742962, "step": 1713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1746.0, "completions/max_terminated_length": 1317.5, "completions/mean_length": 965.015625, "completions/mean_terminated_length": 833.68408203125, "completions/min_length": 339.25, "completions/min_terminated_length": 339.25, "epoch": 0.857, "grad_norm": 0.5647670030593872, "kl": 0.064788818359375, "learning_rate": 1.5528422633248516e-07, "loss": 0.3124, "num_tokens": 127595065.0, "reward": 0.5181989818811417, "reward_std": 0.2423611283302307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03741980344057083, "rewards/penalized_accuracy_reward/std": 0.0804501622915268, "rewards/tag_count_reward/mean": 0.88671875, "rewards/tag_count_reward/std": 0.2203122302889824, "step": 1714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 763.3125, "completions/mean_terminated_length": 677.6666870117188, "completions/min_length": 268.25, "completions/min_terminated_length": 268.25, "epoch": 0.8575, "grad_norm": 0.6431763768196106, "kl": 0.0526123046875, "learning_rate": 1.5490766105740876e-07, "loss": 0.466, "num_tokens": 127653069.0, "reward": 0.4765625, "reward_std": 0.09375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1875, "step": 1715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2029.25, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1131.703125, "completions/mean_terminated_length": 1038.594223022461, 
"completions/min_length": 424.75, "completions/min_terminated_length": 424.75, "epoch": 0.858, "grad_norm": 0.48722654581069946, "kl": 0.04998779296875, "learning_rate": 1.5453229930211563e-07, "loss": 0.1987, "num_tokens": 127733354.0, "reward": 0.45703125, "reward_std": 0.09527009166777134, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.19054018706083298, "step": 1716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1667.5, "completions/max_terminated_length": 1405.0, "completions/mean_length": 980.8125, "completions/mean_terminated_length": 895.5706634521484, "completions/min_length": 436.5, "completions/min_terminated_length": 436.5, "epoch": 0.8585, "grad_norm": 0.3933636546134949, "kl": 0.051116943359375, "learning_rate": 1.5415814221002265e-07, "loss": 0.2194, "num_tokens": 127807806.0, "reward": 0.455078125, "reward_std": 0.08748093992471695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.1749618947505951, "step": 1717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2037.25, "completions/max_terminated_length": 1463.75, "completions/mean_length": 1031.921875, "completions/mean_terminated_length": 741.2337951660156, "completions/min_length": 295.5, "completions/min_terminated_length": 295.5, "epoch": 0.859, "grad_norm": 0.38837525248527527, "kl": 0.08258056640625, "learning_rate": 1.5378519092087712e-07, "loss": 0.3151, "num_tokens": 127882409.0, "reward": 0.471720814704895, "reward_std": 0.2591165639460087, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.024922902695834637, "rewards/penalized_accuracy_reward/std": 0.09969161450862885, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.20067723840475082, "step": 1718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1474.5, "completions/max_terminated_length": 1342.25, "completions/mean_length": 785.734375, "completions/mean_terminated_length": 714.9955596923828, "completions/min_length": 361.25, "completions/min_terminated_length": 361.25, "epoch": 0.8595, "grad_norm": 0.2874058485031128, "kl": 0.04754638671875, "learning_rate": 1.5341344657075354e-07, "loss": 0.1541, "num_tokens": 127943304.0, "reward": 0.7744099497795105, "reward_std": 0.23632361367344856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14990028738975525, "rewards/penalized_accuracy_reward/std": 0.08938327431678772, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.12846697121858597, "step": 1719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1751.5, "completions/max_terminated_length": 1392.0, "completions/mean_length": 786.96875, "completions/mean_terminated_length": 662.7701110839844, "completions/min_length": 335.25, "completions/min_terminated_length": 335.25, "epoch": 0.86, "grad_norm": 0.41100335121154785, "kl": 0.039459228515625, "learning_rate": 1.5304291029204954e-07, "loss": 0.249, "num_tokens": 128002326.0, "reward": 0.7547144144773483, "reward_std": 0.5765300095081329, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14981814846396446, "rewards/penalized_accuracy_reward/std": 0.2654919773340225, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.18802428990602493, "step": 1720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, 
"completions/max_length": 1786.25, "completions/max_terminated_length": 1638.75, "completions/mean_length": 1041.9375, "completions/mean_terminated_length": 946.8765258789062, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.8605, "grad_norm": 0.34651312232017517, "kl": 0.04901123046875, "learning_rate": 1.5267358321348285e-07, "loss": 0.1249, "num_tokens": 128077954.0, "reward": 0.6435872316360474, "reward_std": 0.2726282961666584, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741861581802368, "rewards/penalized_accuracy_reward/std": 0.10237421840429306, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.13575975596904755, "step": 1721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1791.0, "completions/max_terminated_length": 1307.25, "completions/mean_length": 765.75, "completions/mean_terminated_length": 723.3094024658203, "completions/min_length": 335.75, "completions/min_terminated_length": 335.75, "epoch": 0.861, "grad_norm": 0.5901057124137878, "kl": 0.0379638671875, "learning_rate": 1.5230546646008792e-07, "loss": 0.3101, "num_tokens": 128136834.0, "reward": 0.48046875, "reward_std": 0.07168455049395561, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14336910098791122, "step": 1722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1771.0, "completions/max_terminated_length": 1494.75, "completions/mean_length": 893.984375, "completions/mean_terminated_length": 856.5333557128906, "completions/min_length": 407.25, "completions/min_terminated_length": 407.25, "epoch": 0.8615, "grad_norm": 0.33108922839164734, "kl": 0.043426513671875, "learning_rate": 
1.5193856115321224e-07, "loss": 0.1344, "num_tokens": 128204305.0, "reward": 0.8338512778282166, "reward_std": 0.3449161648750305, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17473813705146313, "rewards/penalized_accuracy_reward/std": 0.15754414349794388, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1165.890625, "completions/mean_terminated_length": 981.7938690185547, "completions/min_length": 377.25, "completions/min_terminated_length": 377.25, "epoch": 0.862, "grad_norm": 0.46433863043785095, "kl": 0.0533447265625, "learning_rate": 1.5157286841051285e-07, "loss": 0.3643, "num_tokens": 128290618.0, "reward": 0.454657718539238, "reward_std": 0.20945000275969505, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485108338296413, "rewards/penalized_accuracy_reward/std": 0.049940433353185654, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.25900739803910255, "step": 1724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1349.25, "completions/max_terminated_length": 1176.0, "completions/mean_length": 729.015625, "completions/mean_terminated_length": 709.9468841552734, "completions/min_length": 341.25, "completions/min_terminated_length": 341.25, "epoch": 0.8625, "grad_norm": 0.3498450815677643, "kl": 0.03155517578125, "learning_rate": 1.5120838934595337e-07, "loss": 0.105, "num_tokens": 128346059.0, "reward": 0.8420459628105164, "reward_std": 0.16545987129211426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17492923140525818, "rewards/penalized_accuracy_reward/std": 
0.06828539073467255, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1823.25, "completions/max_terminated_length": 1634.0, "completions/mean_length": 821.09375, "completions/mean_terminated_length": 782.9656524658203, "completions/min_length": 289.5, "completions/min_terminated_length": 289.5, "epoch": 0.863, "grad_norm": 0.4455412030220032, "kl": 0.0313720703125, "learning_rate": 1.5084512506980023e-07, "loss": 0.176, "num_tokens": 128409601.0, "reward": 0.6342005431652069, "reward_std": 0.33433034271001816, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0749127734452486, "rewards/penalized_accuracy_reward/std": 0.14552021771669388, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 1726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 538.171875, "completions/mean_terminated_length": 538.171875, "completions/min_length": 178.25, "completions/min_terminated_length": 178.25, "epoch": 0.8635, "grad_norm": 0.48198074102401733, "kl": 0.040008544921875, "learning_rate": 1.5048307668861947e-07, "loss": 0.0302, "num_tokens": 128452716.0, "reward": 0.7435535043478012, "reward_std": 0.37730398774147034, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1247064471244812, "rewards/penalized_accuracy_reward/std": 0.1826888546347618, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.046875, "step": 1727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1872.5, "completions/max_terminated_length": 1584.25, "completions/mean_length": 986.90625, "completions/mean_terminated_length": 852.388427734375, 
"completions/min_length": 339.5, "completions/min_terminated_length": 339.5, "epoch": 0.864, "grad_norm": 0.3632856011390686, "kl": 0.038787841796875, "learning_rate": 1.5012224530527297e-07, "loss": 0.159, "num_tokens": 128527270.0, "reward": 0.72196364402771, "reward_std": 0.54066326841712, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13734901417046785, "rewards/penalized_accuracy_reward/std": 0.2391614131629467, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23145903646945953, "step": 1728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1666.5, "completions/max_terminated_length": 1386.25, "completions/mean_length": 749.265625, "completions/mean_terminated_length": 653.9925842285156, "completions/min_length": 273.75, "completions/min_terminated_length": 273.75, "epoch": 0.8645, "grad_norm": 0.6181725859642029, "kl": 0.05645751953125, "learning_rate": 1.4976263201891613e-07, "loss": 0.3516, "num_tokens": 128582791.0, "reward": 0.7550170123577118, "reward_std": 0.25608809664845467, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372741311788559, "rewards/penalized_accuracy_reward/std": 0.09558545053005219, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1298343911767006, "step": 1729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1549.5, "completions/max_terminated_length": 1280.0, "completions/mean_length": 893.296875, "completions/mean_terminated_length": 814.5031127929688, "completions/min_length": 409.75, "completions/min_terminated_length": 409.75, "epoch": 0.865, "grad_norm": 0.459372878074646, "kl": 0.04852294921875, "learning_rate": 1.4940423792499306e-07, "loss": 0.191, "num_tokens": 128647466.0, "reward": 0.4897863119840622, "reward_std": 0.18195927888154984, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012471279129385948, "rewards/penalized_accuracy_reward/std": 0.04988512024283409, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.18491152673959732, "step": 1730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 1258.90625, "completions/mean_terminated_length": 1113.8938903808594, "completions/min_length": 420.75, "completions/min_terminated_length": 420.75, "epoch": 0.8655, "grad_norm": 0.31238678097724915, "kl": 0.040863037109375, "learning_rate": 1.4904706411523448e-07, "loss": 0.1929, "num_tokens": 128737956.0, "reward": 0.47223612666130066, "reward_std": 0.1873704195022583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485251761972904, "rewards/penalized_accuracy_reward/std": 0.04994100332260132, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.21359620988368988, "step": 1731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1869.75, "completions/max_terminated_length": 1314.75, "completions/mean_length": 870.796875, "completions/mean_terminated_length": 774.501220703125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.866, "grad_norm": 0.4689647853374481, "kl": 0.033172607421875, "learning_rate": 1.4869111167765372e-07, "loss": 0.2715, "num_tokens": 128802695.0, "reward": 0.505451887845993, "reward_std": 0.1518510114401579, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012491564266383648, "rewards/penalized_accuracy_reward/std": 0.04996625706553459, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11781632527709007, "step": 1732 }, 
{ "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.75, "completions/mean_length": 985.234375, "completions/mean_terminated_length": 813.2035980224609, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.8665, "grad_norm": 0.4230544865131378, "kl": 0.0555419921875, "learning_rate": 1.483363816965435e-07, "loss": 0.3319, "num_tokens": 128878534.0, "reward": 0.8390200734138489, "reward_std": 0.41786642745137215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19978347048163414, "rewards/penalized_accuracy_reward/std": 0.16091382503509521, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2577107436954975, "step": 1733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1864.0, "completions/max_terminated_length": 1580.5, "completions/mean_length": 1178.046875, "completions/mean_terminated_length": 1002.1742095947266, "completions/min_length": 517.25, "completions/min_terminated_length": 517.25, "epoch": 0.867, "grad_norm": 0.3214801251888275, "kl": 0.0653076171875, "learning_rate": 1.479828752524731e-07, "loss": 0.2074, "num_tokens": 128964537.0, "reward": 0.419921875, "reward_std": 0.10167980566620827, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.83984375, "rewards/tag_count_reward/std": 0.20335961878299713, "step": 1734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1821.75, "completions/max_terminated_length": 1782.75, "completions/mean_length": 937.09375, "completions/mean_terminated_length": 905.5489959716797, "completions/min_length": 368.75, "completions/min_terminated_length": 368.75, "epoch": 0.8675, "grad_norm": 
0.4075459837913513, "kl": 0.0379638671875, "learning_rate": 1.4763059342228434e-07, "loss": 0.1399, "num_tokens": 129036255.0, "reward": 0.486328125, "reward_std": 0.0546875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.109375, "step": 1735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2031.75, "completions/max_terminated_length": 1656.5, "completions/mean_length": 1076.265625, "completions/mean_terminated_length": 957.7595520019531, "completions/min_length": 454.5, "completions/min_terminated_length": 454.5, "epoch": 0.868, "grad_norm": 0.3985852003097534, "kl": 0.0450439453125, "learning_rate": 1.4727953727908877e-07, "loss": 0.1249, "num_tokens": 129114464.0, "reward": 0.531940832734108, "reward_std": 0.3033214472234249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745478671044111, "rewards/penalized_accuracy_reward/std": 0.11816532537341118, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.206581711769104, "step": 1736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1748.25, "completions/max_terminated_length": 1546.25, "completions/mean_length": 833.796875, "completions/mean_terminated_length": 736.85107421875, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.8685, "grad_norm": 0.38919925689697266, "kl": 0.038482666015625, "learning_rate": 1.469297078922642e-07, "loss": 0.2533, "num_tokens": 129178051.0, "reward": 0.49567410349845886, "reward_std": 0.1681503802537918, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012485492043197155, "rewards/penalized_accuracy_reward/std": 
0.04994196817278862, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1749618947505951, "step": 1737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1864.25, "completions/max_terminated_length": 1750.0, "completions/mean_length": 1048.734375, "completions/mean_terminated_length": 928.8311157226562, "completions/min_length": 438.25, "completions/min_terminated_length": 438.25, "epoch": 0.869, "grad_norm": 0.3018134832382202, "kl": 0.03741455078125, "learning_rate": 1.4658110632745174e-07, "loss": 0.2099, "num_tokens": 129253122.0, "reward": 0.4800626188516617, "reward_std": 0.1761697456240654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012492245994508266, "rewards/penalized_accuracy_reward/std": 0.049968987703323364, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.19475741311907768, "step": 1738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1835.75, "completions/max_terminated_length": 1633.75, "completions/mean_length": 842.171875, "completions/mean_terminated_length": 824.4322967529297, "completions/min_length": 428.75, "completions/min_terminated_length": 428.75, "epoch": 0.8695, "grad_norm": 0.4454421103000641, "kl": 0.04327392578125, "learning_rate": 1.4623373364655223e-07, "loss": 0.0721, "num_tokens": 129316093.0, "reward": 0.6572834253311157, "reward_std": 0.3804809395223856, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08743077144026756, "rewards/penalized_accuracy_reward/std": 0.16993193328380585, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10650964826345444, "step": 1739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1496.5, "completions/max_terminated_length": 1463.75, "completions/mean_length": 
810.09375, "completions/mean_terminated_length": 794.2041931152344, "completions/min_length": 305.25, "completions/min_terminated_length": 305.25, "epoch": 0.87, "grad_norm": 0.3455018699169159, "kl": 0.040802001953125, "learning_rate": 1.45887590907723e-07, "loss": 0.1199, "num_tokens": 129375619.0, "reward": 0.7858345657587051, "reward_std": 0.4948125183582306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14975322503596544, "rewards/penalized_accuracy_reward/std": 0.24147918447852135, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08582578226923943, "step": 1740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2012.25, "completions/max_terminated_length": 1697.25, "completions/mean_length": 1011.40625, "completions/mean_terminated_length": 935.6171417236328, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.8705, "grad_norm": 0.43993285298347473, "kl": 0.035003662109375, "learning_rate": 1.4554267916537495e-07, "loss": 0.2191, "num_tokens": 129448717.0, "reward": 0.470703125, "reward_std": 0.07714555040001869, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.15429110452532768, "step": 1741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1394.75, "completions/max_terminated_length": 1297.75, "completions/mean_length": 709.609375, "completions/mean_terminated_length": 674.6406402587891, "completions/min_length": 339.75, "completions/min_terminated_length": 339.75, "epoch": 0.871, "grad_norm": 0.399268239736557, "kl": 0.04034423828125, "learning_rate": 1.4519899947016888e-07, "loss": 0.2608, "num_tokens": 129504100.0, "reward": 1.0548031628131866, "reward_std": 
0.3988531865179539, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2871672213077545, "rewards/penalized_accuracy_reward/std": 0.18042071908712387, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09579972177743912, "step": 1742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1623.75, "completions/max_terminated_length": 1542.5, "completions/mean_length": 897.5625, "completions/mean_terminated_length": 844.8281402587891, "completions/min_length": 338.75, "completions/min_terminated_length": 338.75, "epoch": 0.8715, "grad_norm": 0.4278159737586975, "kl": 0.038787841796875, "learning_rate": 1.448565528690129e-07, "loss": 0.0447, "num_tokens": 129570664.0, "reward": 0.5572739839553833, "reward_std": 0.33720025420188904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374260526150465, "rewards/penalized_accuracy_reward/std": 0.1497042141854763, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 1743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1591.25, "completions/max_terminated_length": 1541.75, "completions/mean_length": 834.578125, "completions/mean_terminated_length": 816.0177154541016, "completions/min_length": 388.5, "completions/min_terminated_length": 388.5, "epoch": 0.872, "grad_norm": 0.4215010106563568, "kl": 0.03314208984375, "learning_rate": 1.4451534040505881e-07, "loss": 0.1338, "num_tokens": 129633677.0, "reward": 0.490234375, "reward_std": 0.0390625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.078125, "step": 1744 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.015625, "completions/max_length": 1356.75, "completions/max_terminated_length": 1192.75, "completions/mean_length": 601.328125, "completions/mean_terminated_length": 579.4322967529297, "completions/min_length": 234.5, "completions/min_terminated_length": 234.5, "epoch": 0.8725, "grad_norm": 0.7785678505897522, "kl": 0.05694580078125, "learning_rate": 1.4417536311769885e-07, "loss": 0.1188, "num_tokens": 129680530.0, "reward": 0.645385205745697, "reward_std": 0.39257217943668365, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08734103292226791, "rewards/penalized_accuracy_reward/std": 0.16974373906850815, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1749618910253048, "step": 1745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1175.5, "completions/max_terminated_length": 977.25, "completions/mean_length": 574.015625, "completions/mean_terminated_length": 512.1190032958984, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.873, "grad_norm": 0.5078064799308777, "kl": 0.063507080078125, "learning_rate": 1.438366220425628e-07, "loss": 0.1281, "num_tokens": 129726451.0, "reward": 0.48046875, "reward_std": 0.04560433328151703, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09120866656303406, "step": 1746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1580.5, "completions/max_terminated_length": 1396.0, "completions/mean_length": 880.875, "completions/mean_terminated_length": 781.9888458251953, "completions/min_length": 361.5, "completions/min_terminated_length": 361.5, "epoch": 0.8735, "grad_norm": 0.44908225536346436, "kl": 
0.0408935546875, "learning_rate": 1.4349911821151462e-07, "loss": 0.1058, "num_tokens": 129791947.0, "reward": 0.6453630030155182, "reward_std": 0.3246275335550308, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08732994366437197, "rewards/penalized_accuracy_reward/std": 0.14969022199511528, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12808074057102203, "step": 1747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1702.25, "completions/max_terminated_length": 1487.25, "completions/mean_length": 867.21875, "completions/mean_terminated_length": 832.3448181152344, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.874, "grad_norm": 0.42614537477493286, "kl": 0.035919189453125, "learning_rate": 1.4316285265264978e-07, "loss": 0.1133, "num_tokens": 129858121.0, "reward": 0.484375, "reward_std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.125, "step": 1748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1901.25, "completions/max_terminated_length": 1761.75, "completions/mean_length": 983.484375, "completions/mean_terminated_length": 917.5417022705078, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "epoch": 0.8745, "grad_norm": 0.41686686873435974, "kl": 0.04913330078125, "learning_rate": 1.4282782639029128e-07, "loss": 0.1098, "num_tokens": 129931096.0, "reward": 0.8664588034152985, "reward_std": 0.4714139439165592, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19983097165822983, "rewards/penalized_accuracy_reward/std": 0.20476645231246948, 
"rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.21236922964453697, "step": 1749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1885.0, "completions/max_terminated_length": 1698.75, "completions/mean_length": 927.96875, "completions/mean_terminated_length": 868.9537353515625, "completions/min_length": 417.5, "completions/min_terminated_length": 417.5, "epoch": 0.875, "grad_norm": 0.46479490399360657, "kl": 0.0439453125, "learning_rate": 1.4249404044498727e-07, "loss": 0.1683, "num_tokens": 129999958.0, "reward": 0.5705967396497726, "reward_std": 0.2249294761568308, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994680732488632, "rewards/penalized_accuracy_reward/std": 0.08934757858514786, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.1366250328719616, "step": 1750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1374.5, "completions/max_terminated_length": 1287.5, "completions/mean_length": 776.90625, "completions/mean_terminated_length": 742.8303680419922, "completions/min_length": 346.25, "completions/min_terminated_length": 346.25, "epoch": 0.8755, "grad_norm": 0.5316336750984192, "kl": 0.03717041015625, "learning_rate": 1.4216149583350755e-07, "loss": 0.1026, "num_tokens": 130057952.0, "reward": 0.7341199219226837, "reward_std": 0.3961492609232664, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12487246841192245, "rewards/penalized_accuracy_reward/std": 0.18290875107049942, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.08656632527709007, "step": 1751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.25, "completions/max_terminated_length": 1336.25, "completions/mean_length": 673.4375, 
"completions/mean_terminated_length": 673.4375, "completions/min_length": 311.75, "completions/min_terminated_length": 311.75, "epoch": 0.876, "grad_norm": 0.36058440804481506, "kl": 0.031585693359375, "learning_rate": 1.418301935688408e-07, "loss": 0.0121, "num_tokens": 130112732.0, "reward": 0.7459887862205505, "reward_std": 0.2155410349369049, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1459.5, "completions/mean_length": 1118.875, "completions/mean_terminated_length": 1020.2512359619141, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.8765, "grad_norm": 0.429909348487854, "kl": 0.0511474609375, "learning_rate": 1.4150013466019114e-07, "loss": 0.2612, "num_tokens": 130194308.0, "reward": 0.5186948925256729, "reward_std": 0.22037488594651222, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024972444400191307, "rewards/penalized_accuracy_reward/std": 0.06823771446943283, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.20620574057102203, "step": 1753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1757.5, "completions/max_terminated_length": 1458.5, "completions/mean_length": 884.8125, "completions/mean_terminated_length": 796.9622802734375, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.877, "grad_norm": 0.48586803674697876, "kl": 0.0439453125, "learning_rate": 1.4117132011297528e-07, "loss": 0.2376, "num_tokens": 130263784.0, "reward": 0.474609375, "reward_std": 0.06904183328151703, "rewards/format_reward/mean": 
0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13808366656303406, "step": 1754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1935.75, "completions/max_terminated_length": 1520.75, "completions/mean_length": 1100.828125, "completions/mean_terminated_length": 986.8168792724609, "completions/min_length": 528.75, "completions/min_terminated_length": 528.75, "epoch": 0.8775, "grad_norm": 0.3251938819885254, "kl": 0.0404052734375, "learning_rate": 1.4084375092881917e-07, "loss": 0.1725, "num_tokens": 130342397.0, "reward": 0.550902709364891, "reward_std": 0.26972730457782745, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049865420907735825, "rewards/penalized_accuracy_reward/std": 0.08920205384492874, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.21867242455482483, "step": 1755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1965.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 885.015625, "completions/mean_terminated_length": 806.1199722290039, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.878, "grad_norm": 0.6754069328308105, "kl": 0.0484619140625, "learning_rate": 1.405174281055556e-07, "loss": 0.2863, "num_tokens": 130411054.0, "reward": 0.5112781673669815, "reward_std": 0.14804722741246223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012475023046135902, "rewards/penalized_accuracy_reward/std": 0.04990009218454361, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09649410098791122, "step": 1756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, 
"completions/max_length": 1824.75, "completions/max_terminated_length": 1423.5, "completions/mean_length": 986.078125, "completions/mean_terminated_length": 779.6867370605469, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8785, "grad_norm": 0.44803667068481445, "kl": 0.047515869140625, "learning_rate": 1.4019235263722034e-07, "loss": 0.2975, "num_tokens": 130484595.0, "reward": 0.5064851939678192, "reward_std": 0.27284732460975647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03742228075861931, "rewards/penalized_accuracy_reward/std": 0.0804554894566536, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.24977383017539978, "step": 1757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1631.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 858.46875, "completions/mean_terminated_length": 826.3727722167969, "completions/min_length": 331.5, "completions/min_terminated_length": 331.5, "epoch": 0.879, "grad_norm": 0.451206237077713, "kl": 0.032867431640625, "learning_rate": 1.3986852551404962e-07, "loss": 0.1083, "num_tokens": 130547969.0, "reward": 0.484375, "reward_std": 0.035690465942025185, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.07138093188405037, "step": 1758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1481.25, "completions/max_terminated_length": 1392.5, "completions/mean_length": 693.53125, "completions/mean_terminated_length": 671.0020904541016, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.8795, "grad_norm": 0.3816147446632385, "kl": 0.0390625, "learning_rate": 1.395459477224772e-07, 
"loss": 0.0244, "num_tokens": 130599555.0, "reward": 1.30295792222023, "reward_std": 0.570308580994606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.41222115978598595, "rewards/penalized_accuracy_reward/std": 0.2849810943007469, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1285141110420227, "step": 1759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1734.0, "completions/max_terminated_length": 1572.25, "completions/mean_length": 813.5, "completions/mean_terminated_length": 780.2458648681641, "completions/min_length": 364.25, "completions/min_terminated_length": 364.25, "epoch": 0.88, "grad_norm": 0.34859058260917664, "kl": 0.0394287109375, "learning_rate": 1.3922462024513075e-07, "loss": 0.0912, "num_tokens": 130658707.0, "reward": 0.5362727344036102, "reward_std": 0.16551832482218742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024972304701805115, "rewards/penalized_accuracy_reward/std": 0.06823733448982239, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09649410098791122, "step": 1760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1629.25, "completions/mean_length": 1113.890625, "completions/mean_terminated_length": 957.4199829101562, "completions/min_length": 327.5, "completions/min_terminated_length": 327.5, "epoch": 0.8805, "grad_norm": 0.3969188332557678, "kl": 0.0487060546875, "learning_rate": 1.3890454406082956e-07, "loss": 0.2888, "num_tokens": 130738236.0, "reward": 0.573944091796875, "reward_std": 0.29836826771497726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0623626671731472, "rewards/penalized_accuracy_reward/std": 0.0955323874950409, 
"rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2506019398570061, "step": 1761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 976.28125, "completions/mean_terminated_length": 789.0262145996094, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.881, "grad_norm": 0.4201774001121521, "kl": 0.0765380859375, "learning_rate": 1.385857201445813e-07, "loss": 0.3359, "num_tokens": 130813102.0, "reward": 0.456602543592453, "reward_std": 0.21644016355276108, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480954639613628, "rewards/penalized_accuracy_reward/std": 0.049923818558454514, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.27547289803624153, "step": 1762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.5, "completions/max_terminated_length": 1237.5, "completions/mean_length": 737.78125, "completions/mean_terminated_length": 737.78125, "completions/min_length": 307.75, "completions/min_terminated_length": 307.75, "epoch": 0.8815, "grad_norm": 0.16001388430595398, "kl": 0.06182861328125, "learning_rate": 1.3826814946757888e-07, "loss": 0.0476, "num_tokens": 130870752.0, "reward": 0.5499261617660522, "reward_std": 0.1364242434501648, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024963080883026123, "rewards/penalized_accuracy_reward/std": 0.068212129175663, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1969.0, "completions/max_terminated_length": 1675.75, "completions/mean_length": 888.734375, "completions/mean_terminated_length": 815.5799407958984, 
"completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.882, "grad_norm": 0.4240720868110657, "kl": 0.0633544921875, "learning_rate": 1.3795183299719753e-07, "loss": 0.1877, "num_tokens": 130936671.0, "reward": 0.622576504945755, "reward_std": 0.41730960831046104, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07496012840420008, "rewards/penalized_accuracy_reward/std": 0.18932750076055527, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.17165156453847885, "step": 1764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1874.75, "completions/max_terminated_length": 1540.0, "completions/mean_length": 987.765625, "completions/mean_terminated_length": 824.3189392089844, "completions/min_length": 387.25, "completions/min_terminated_length": 387.25, "epoch": 0.8825, "grad_norm": 0.35194921493530273, "kl": 0.029937744140625, "learning_rate": 1.3763677169699217e-07, "loss": 0.2504, "num_tokens": 131010736.0, "reward": 0.7451083362102509, "reward_std": 0.47184447199106216, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14989791810512543, "rewards/penalized_accuracy_reward/std": 0.19256839901208878, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.21751796454191208, "step": 1765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1833.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 886.3125, "completions/mean_terminated_length": 688.2966156005859, "completions/min_length": 336.75, "completions/min_terminated_length": 336.75, "epoch": 0.883, "grad_norm": 0.3938741385936737, "kl": 0.066162109375, "learning_rate": 1.3732296652669417e-07, "loss": 0.3013, "num_tokens": 131074708.0, "reward": 0.5602752566337585, "reward_std": 0.2836132273077965, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06236419081687927, "rewards/penalized_accuracy_reward/std": 0.09553467482328415, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.21784568578004837, "step": 1766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1571.5, "completions/mean_length": 1025.1875, "completions/mean_terminated_length": 902.1976776123047, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "epoch": 0.8835, "grad_norm": 0.44696924090385437, "kl": 0.04290771484375, "learning_rate": 1.370104184422085e-07, "loss": 0.2821, "num_tokens": 131150608.0, "reward": 0.5319229066371918, "reward_std": 0.2536583412438631, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.037445832043886185, "rewards/penalized_accuracy_reward/std": 0.08050614595413208, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.23181551322340965, "step": 1767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1831.5, "completions/max_terminated_length": 1529.25, "completions/mean_length": 1013.53125, "completions/mean_terminated_length": 909.7359161376953, "completions/min_length": 380.5, "completions/min_terminated_length": 380.5, "epoch": 0.884, "grad_norm": 0.40804824233055115, "kl": 0.030487060546875, "learning_rate": 1.3669912839561083e-07, "loss": 0.2261, "num_tokens": 131224802.0, "reward": 0.4761486202478409, "reward_std": 0.17973065562546253, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248837262392044, "rewards/penalized_accuracy_reward/std": 0.04995349049568176, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.20893577113747597, "step": 1768 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1366.25, "completions/max_terminated_length": 1248.25, "completions/mean_length": 799.625, "completions/mean_terminated_length": 784.5656433105469, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "epoch": 0.8845, "grad_norm": 0.3929818868637085, "kl": 0.026580810546875, "learning_rate": 1.3638909733514452e-07, "loss": 0.0792, "num_tokens": 131284474.0, "reward": 0.8419571220874786, "reward_std": 0.1677861362695694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17488481104373932, "rewards/penalized_accuracy_reward/std": 0.0682680681347847, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1853.5, "completions/max_terminated_length": 1467.0, "completions/mean_length": 853.140625, "completions/mean_terminated_length": 727.8532409667969, "completions/min_length": 294.75, "completions/min_terminated_length": 294.75, "epoch": 0.885, "grad_norm": 0.4517180919647217, "kl": 0.05877685546875, "learning_rate": 1.3608032620521803e-07, "loss": 0.2851, "num_tokens": 131347539.0, "reward": 0.5339466035366058, "reward_std": 0.2414295356720686, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748110681772232, "rewards/penalized_accuracy_reward/std": 0.08058197051286697, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.20110077410936356, "step": 1770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1854.25, "completions/max_terminated_length": 1546.75, "completions/mean_length": 969.78125, "completions/mean_terminated_length": 844.3359222412109, "completions/min_length": 401.25, "completions/min_terminated_length": 401.25, "epoch": 0.8855, 
"grad_norm": 0.4061007797718048, "kl": 0.04254150390625, "learning_rate": 1.3577281594640182e-07, "loss": 0.3077, "num_tokens": 131418693.0, "reward": 0.451171875, "reward_std": 0.09105735644698143, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.18211472034454346, "step": 1771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1775.75, "completions/mean_length": 1029.625, "completions/mean_terminated_length": 920.8000640869141, "completions/min_length": 418.75, "completions/min_terminated_length": 418.75, "epoch": 0.886, "grad_norm": 0.4052397310733795, "kl": 0.046630859375, "learning_rate": 1.354665674954255e-07, "loss": 0.2784, "num_tokens": 131494333.0, "reward": 0.5378632843494415, "reward_std": 0.28719787672162056, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03748633246868849, "rewards/penalized_accuracy_reward/std": 0.11826841160655022, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17957578226923943, "step": 1772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1526.0, "completions/max_terminated_length": 1268.5, "completions/mean_length": 771.046875, "completions/mean_terminated_length": 737.0718994140625, "completions/min_length": 250.25, "completions/min_terminated_length": 250.25, "epoch": 0.8865, "grad_norm": 0.4952690601348877, "kl": 0.050933837890625, "learning_rate": 1.351615817851748e-07, "loss": 0.0757, "num_tokens": 131555088.0, "reward": 0.5533467233181, "reward_std": 0.20771993696689606, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03741554915904999, 
"rewards/penalized_accuracy_reward/std": 0.08044102787971497, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1342380754649639, "step": 1773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1375.25, "completions/max_terminated_length": 1353.0, "completions/mean_length": 883.71875, "completions/mean_terminated_length": 759.5, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.887, "grad_norm": 0.2939152121543884, "kl": 0.03936767578125, "learning_rate": 1.3485785974468913e-07, "loss": 0.0829, "num_tokens": 131620622.0, "reward": 0.6626444011926651, "reward_std": 0.3953387476503849, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09987688437104225, "rewards/penalized_accuracy_reward/std": 0.1761554405093193, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.08605579286813736, "step": 1774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2028.5, "completions/max_terminated_length": 1632.25, "completions/mean_length": 1031.640625, "completions/mean_terminated_length": 967.6366424560547, "completions/min_length": 470.5, "completions/min_terminated_length": 470.5, "epoch": 0.8875, "grad_norm": 0.46328920125961304, "kl": 0.0369873046875, "learning_rate": 1.345554022991586e-07, "loss": 0.2661, "num_tokens": 131697319.0, "reward": 0.8067963123321533, "reward_std": 0.22025743685662746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17488254606723785, "rewards/penalized_accuracy_reward/std": 0.06826716661453247, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.20292926207184792, "step": 1775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1735.25, "completions/max_terminated_length": 1535.5, 
"completions/mean_length": 966.625, "completions/mean_terminated_length": 831.5528869628906, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.888, "grad_norm": 0.3281888961791992, "kl": 0.04486083984375, "learning_rate": 1.3425421036992097e-07, "loss": 0.1908, "num_tokens": 131769967.0, "reward": 0.5089123547077179, "reward_std": 0.2513622045516968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024963993579149246, "rewards/penalized_accuracy_reward/std": 0.09985597431659698, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18209363520145416, "step": 1776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1555.5, "completions/max_terminated_length": 1450.75, "completions/mean_length": 817.609375, "completions/mean_terminated_length": 789.1227722167969, "completions/min_length": 420.5, "completions/min_terminated_length": 420.5, "epoch": 0.8885, "grad_norm": 0.4437038004398346, "kl": 0.04510498046875, "learning_rate": 1.3395428487445914e-07, "loss": 0.0648, "num_tokens": 131833046.0, "reward": 0.4765625, "reward_std": 0.05920084938406944, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11840170249342918, "step": 1777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1981.25, "completions/max_terminated_length": 1573.25, "completions/mean_length": 921.6875, "completions/mean_terminated_length": 845.9661102294922, "completions/min_length": 349.25, "completions/min_terminated_length": 349.25, "epoch": 0.889, "grad_norm": 0.3728310763835907, "kl": 0.046966552734375, "learning_rate": 1.3365562672639807e-07, "loss": 0.2554, "num_tokens": 131902290.0, "reward": 0.5244703143835068, 
"reward_std": 0.19034098088741302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024930469691753387, "rewards/penalized_accuracy_reward/std": 0.06812302023172379, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14659032225608826, "step": 1778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1832.25, "completions/max_terminated_length": 1587.75, "completions/mean_length": 1085.28125, "completions/mean_terminated_length": 986.2942657470703, "completions/min_length": 450.25, "completions/min_terminated_length": 450.25, "epoch": 0.8895, "grad_norm": 0.4050956666469574, "kl": 0.0404052734375, "learning_rate": 1.3335823683550237e-07, "loss": 0.0817, "num_tokens": 131982644.0, "reward": 0.48197653889656067, "reward_std": 0.17596234381198883, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012472649104893208, "rewards/penalized_accuracy_reward/std": 0.04989060014486313, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1865071840584278, "step": 1779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1905.75, "completions/max_terminated_length": 1354.5, "completions/mean_length": 1014.890625, "completions/mean_terminated_length": 870.7447357177734, "completions/min_length": 458.5, "completions/min_terminated_length": 458.5, "epoch": 0.89, "grad_norm": 0.3864828646183014, "kl": 0.069793701171875, "learning_rate": 1.3306211610767327e-07, "loss": 0.1782, "num_tokens": 132062781.0, "reward": 0.5970903038978577, "reward_std": 0.3723519127815962, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07491233944892883, "rewards/penalized_accuracy_reward/std": 0.15755056589841843, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 
0.21475254371762276, "step": 1780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1405.25, "completions/max_terminated_length": 1269.0, "completions/mean_length": 668.796875, "completions/mean_terminated_length": 604.9814453125, "completions/min_length": 336.75, "completions/min_terminated_length": 336.75, "epoch": 0.8905, "grad_norm": 0.2894628047943115, "kl": 0.05523681640625, "learning_rate": 1.3276726544494571e-07, "loss": 0.0659, "num_tokens": 132113776.0, "reward": 0.578400194644928, "reward_std": 0.2653609365224838, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1072557382285595, "step": 1781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1947.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 1216.640625, "completions/mean_terminated_length": 1057.1702575683594, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.891, "grad_norm": 0.3302861154079437, "kl": 0.04541015625, "learning_rate": 1.3247368574548605e-07, "loss": 0.2026, "num_tokens": 132202249.0, "reward": 0.4717601090669632, "reward_std": 0.2921537086367607, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024942555464804173, "rewards/penalized_accuracy_reward/std": 0.09977022558450699, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.23707153648138046, "step": 1782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1636.5, "completions/max_terminated_length": 1210.5, "completions/mean_length": 768.25, "completions/mean_terminated_length": 710.5901947021484, "completions/min_length": 282.75, "completions/min_terminated_length": 282.75, "epoch": 
0.8915, "grad_norm": 0.4889187216758728, "kl": 0.0504150390625, "learning_rate": 1.3218137790358892e-07, "loss": 0.2541, "num_tokens": 132259833.0, "reward": 0.5054457038640976, "reward_std": 0.15222473815083504, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012488475069403648, "rewards/penalized_accuracy_reward/std": 0.04995390400290489, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11861307546496391, "step": 1783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1233.75, "completions/max_terminated_length": 1189.5, "completions/mean_length": 598.796875, "completions/mean_terminated_length": 579.0937652587891, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.892, "grad_norm": 0.36594751477241516, "kl": 0.044189453125, "learning_rate": 1.3189034280967474e-07, "loss": 0.1186, "num_tokens": 132306556.0, "reward": 0.6479334533214569, "reward_std": 0.20766127109527588, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07494328916072845, "rewards/penalized_accuracy_reward/std": 0.09992438554763794, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1662.5, "completions/max_terminated_length": 1475.5, "completions/mean_length": 914.75, "completions/mean_terminated_length": 881.1354370117188, "completions/min_length": 484.5, "completions/min_terminated_length": 484.5, "epoch": 0.8925, "grad_norm": 0.27852168679237366, "kl": 0.032135009765625, "learning_rate": 1.316005813502869e-07, "loss": 0.1372, "num_tokens": 132374892.0, "reward": 0.685776948928833, "reward_std": 0.33270052820444107, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.09972441289573908, "rewards/penalized_accuracy_reward/std": 0.15213682129979134, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1851.0, "completions/max_terminated_length": 1512.75, "completions/mean_length": 997.96875, "completions/mean_terminated_length": 747.1592254638672, "completions/min_length": 304.5, "completions/min_terminated_length": 304.5, "epoch": 0.893, "grad_norm": 0.35800206661224365, "kl": 0.04937744140625, "learning_rate": 1.31312094408089e-07, "loss": 0.2492, "num_tokens": 132450250.0, "reward": 0.4967542067170143, "reward_std": 0.30778704956173897, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743960242718458, "rewards/penalized_accuracy_reward/std": 0.11813011392951012, "rewards/tag_count_reward/mean": 0.84375, "rewards/tag_count_reward/std": 0.2503516525030136, "step": 1786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1544.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 865.953125, "completions/mean_terminated_length": 763.2031402587891, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.8935, "grad_norm": 0.4470963180065155, "kl": 0.043701171875, "learning_rate": 1.3102488286186234e-07, "loss": 0.199, "num_tokens": 132513031.0, "reward": 0.8394617289304733, "reward_std": 0.41720813140273094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18730898946523666, "rewards/penalized_accuracy_reward/std": 0.16984587162733078, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.17125719040632248, "step": 1787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1800.25, 
"completions/max_terminated_length": 1470.5, "completions/mean_length": 1056.21875, "completions/mean_terminated_length": 886.4520263671875, "completions/min_length": 421.5, "completions/min_terminated_length": 421.5, "epoch": 0.894, "grad_norm": 0.3742600083351135, "kl": 0.040985107421875, "learning_rate": 1.30738947586503e-07, "loss": 0.2921, "num_tokens": 132589749.0, "reward": 0.7141496539115906, "reward_std": 0.2992166355252266, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1373482644557953, "rewards/penalized_accuracy_reward/std": 0.09563708305358887, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.2158849686384201, "step": 1788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1353.75, "completions/max_terminated_length": 1317.25, "completions/mean_length": 816.953125, "completions/mean_terminated_length": 766.8489685058594, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.8945, "grad_norm": 0.28234317898750305, "kl": 0.043701171875, "learning_rate": 1.3045428945301953e-07, "loss": -0.0143, "num_tokens": 132654562.0, "reward": 0.6763743013143539, "reward_std": 0.3537498265504837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09990590065717697, "rewards/penalized_accuracy_reward/std": 0.16809627413749695, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.07739239931106567, "step": 1789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1468.0, "completions/max_terminated_length": 1340.75, "completions/mean_length": 673.96875, "completions/mean_terminated_length": 636.1875152587891, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.895, "grad_norm": 0.44559916853904724, "kl": 0.03948974609375, "learning_rate": 
1.3017090932852998e-07, "loss": 0.0288, "num_tokens": 132706528.0, "reward": 0.5132064670324326, "reward_std": 0.1348012574017048, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012462608516216278, "rewards/penalized_accuracy_reward/std": 0.049850430339574814, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07020078226923943, "step": 1790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1586.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 960.484375, "completions/mean_terminated_length": 845.9591369628906, "completions/min_length": 360.25, "completions/min_terminated_length": 360.25, "epoch": 0.8955, "grad_norm": 0.3740074038505554, "kl": 0.03631591796875, "learning_rate": 1.2988880807625927e-07, "loss": 0.1919, "num_tokens": 132776399.0, "reward": 0.7316182255744934, "reward_std": 0.4672193303704262, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372934877872467, "rewards/penalized_accuracy_reward/std": 0.19544658064842224, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.15265236794948578, "step": 1791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1409.5, "completions/max_terminated_length": 1112.75, "completions/mean_length": 630.4375, "completions/mean_terminated_length": 591.1396026611328, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.896, "grad_norm": 0.4015541672706604, "kl": 0.055023193359375, "learning_rate": 1.2960798655553673e-07, "loss": 0.057, "num_tokens": 132825627.0, "reward": 0.7359817028045654, "reward_std": 0.48157593607902527, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12482680100947618, "rewards/penalized_accuracy_reward/std": 
0.2348819486796856, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09528729319572449, "step": 1792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1331.5, "completions/max_terminated_length": 1250.25, "completions/mean_length": 673.203125, "completions/mean_terminated_length": 636.9040222167969, "completions/min_length": 287.25, "completions/min_terminated_length": 287.25, "epoch": 0.8965, "grad_norm": 0.508729100227356, "kl": 0.03948974609375, "learning_rate": 1.2932844562179352e-07, "loss": 0.055, "num_tokens": 132879640.0, "reward": 0.6842745989561081, "reward_std": 0.41593846678733826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09994979668408632, "rewards/penalized_accuracy_reward/std": 0.19987228140234947, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10145078226923943, "step": 1793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1857.5, "completions/max_terminated_length": 1625.5, "completions/mean_length": 1036.265625, "completions/mean_terminated_length": 940.5845489501953, "completions/min_length": 443.75, "completions/min_terminated_length": 443.75, "epoch": 0.897, "grad_norm": 0.3831500709056854, "kl": 0.0401611328125, "learning_rate": 1.2905018612655974e-07, "loss": 0.2099, "num_tokens": 132954553.0, "reward": 0.462890625, "reward_std": 0.08403083682060242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.16806168109178543, "step": 1794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1469.5, "completions/mean_length": 1085.875, "completions/mean_terminated_length": 
875.8591613769531, "completions/min_length": 367.25, "completions/min_terminated_length": 367.25, "epoch": 0.8975, "grad_norm": 0.3434215784072876, "kl": 0.054840087890625, "learning_rate": 1.2877320891746201e-07, "loss": 0.2373, "num_tokens": 133031569.0, "reward": 0.8907873183488846, "reward_std": 0.5841770600527525, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22469054907560349, "rewards/penalized_accuracy_reward/std": 0.2654283121228218, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.24314142018556595, "step": 1795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1876.25, "completions/max_terminated_length": 1554.0, "completions/mean_length": 983.125, "completions/mean_terminated_length": 875.7786865234375, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.898, "grad_norm": 0.4114736020565033, "kl": 0.04248046875, "learning_rate": 1.284975148382211e-07, "loss": 0.2674, "num_tokens": 133106089.0, "reward": 0.9163414537906647, "reward_std": 0.37141886726021767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.22477228939533234, "rewards/penalized_accuracy_reward/std": 0.15757505595684052, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1667410470545292, "step": 1796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1052.5, "completions/max_terminated_length": 943.75, "completions/mean_length": 546.65625, "completions/mean_terminated_length": 507.76116943359375, "completions/min_length": 209.5, "completions/min_terminated_length": 209.5, "epoch": 0.8985, "grad_norm": 0.68525630235672, "kl": 0.08599853515625, "learning_rate": 1.2822310472864885e-07, "loss": 0.161, "num_tokens": 133158947.0, "reward": 1.0105388164520264, "reward_std": 0.5359440520405769, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2621053336188197, "rewards/penalized_accuracy_reward/std": 0.26373251527547836, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.06442352384328842, "step": 1797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1363.5, "completions/max_terminated_length": 1086.5, "completions/mean_length": 622.609375, "completions/mean_terminated_length": 561.78662109375, "completions/min_length": 253.75, "completions/min_terminated_length": 253.75, "epoch": 0.899, "grad_norm": 0.3244628310203552, "kl": 0.06707763671875, "learning_rate": 1.2794997942464603e-07, "loss": 0.2044, "num_tokens": 133208506.0, "reward": 0.48046875, "reward_std": 0.05622786656022072, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11245574057102203, "step": 1798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1439.5, "completions/mean_length": 968.28125, "completions/mean_terminated_length": 862.9357452392578, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.8995, "grad_norm": 0.4807436466217041, "kl": 0.05780029296875, "learning_rate": 1.2767813975819983e-07, "loss": 0.2623, "num_tokens": 133284172.0, "reward": 0.5319762229919434, "reward_std": 0.243245093151927, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747248649597168, "rewards/penalized_accuracy_reward/std": 0.08056342601776123, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22523242980241776, "step": 1799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 
0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.5, "completions/mean_length": 831.484375, "completions/mean_terminated_length": 750.3833923339844, "completions/min_length": 331.25, "completions/min_terminated_length": 331.25, "epoch": 0.9, "grad_norm": 0.4270942211151123, "kl": 0.043609619140625, "learning_rate": 1.274075865573809e-07, "loss": 0.2005, "num_tokens": 133348747.0, "reward": 0.701343908905983, "reward_std": 0.48249319195747375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11239070072770119, "rewards/penalized_accuracy_reward/std": 0.22612550109624863, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1577934455126524, "step": 1800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 1040.25, "completions/mean_terminated_length": 951.5702819824219, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.9005, "grad_norm": 0.37978866696357727, "kl": 0.037994384765625, "learning_rate": 1.2713832064634125e-07, "loss": 0.2241, "num_tokens": 133423675.0, "reward": 0.6744925379753113, "reward_std": 0.26325249299407005, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09994157403707504, "rewards/penalized_accuracy_reward/std": 0.10321921855211258, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1383691541850567, "step": 1801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1427.5, "completions/max_terminated_length": 1300.25, "completions/mean_length": 775.421875, "completions/mean_terminated_length": 709.3385467529297, "completions/min_length": 303.75, "completions/min_terminated_length": 303.75, "epoch": 0.901, "grad_norm": 0.3945537507534027, "kl": 
0.04150390625, "learning_rate": 1.2687034284531145e-07, "loss": 0.086, "num_tokens": 133484870.0, "reward": 0.470703125, "reward_std": 0.0703125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.140625, "step": 1802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1730.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 1072.921875, "completions/mean_terminated_length": 872.1666259765625, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.9015, "grad_norm": 0.47532209753990173, "kl": 0.050140380859375, "learning_rate": 1.2660365397059856e-07, "loss": 0.2354, "num_tokens": 133563985.0, "reward": 0.610160231590271, "reward_std": 0.31283559277653694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0873066708445549, "rewards/penalized_accuracy_reward/std": 0.10224319994449615, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.21669844537973404, "step": 1803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1614.75, "completions/max_terminated_length": 1491.75, "completions/mean_length": 864.25, "completions/mean_terminated_length": 819.853759765625, "completions/min_length": 384.5, "completions/min_terminated_length": 384.5, "epoch": 0.902, "grad_norm": 0.5611650347709656, "kl": 0.050140380859375, "learning_rate": 1.263382548345829e-07, "loss": 0.2011, "num_tokens": 133625537.0, "reward": 0.6744885146617889, "reward_std": 0.2745417393743992, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09993956983089447, "rewards/penalized_accuracy_reward/std": 0.10321715474128723, 
"rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13621489331126213, "step": 1804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1457.25, "completions/max_terminated_length": 1039.5, "completions/mean_length": 771.015625, "completions/mean_terminated_length": 650.7633972167969, "completions/min_length": 328.5, "completions/min_terminated_length": 328.5, "epoch": 0.9025, "grad_norm": 0.3821370005607605, "kl": 0.0528564453125, "learning_rate": 1.260741462457165e-07, "loss": 0.2092, "num_tokens": 133686882.0, "reward": 0.5088581293821335, "reward_std": 0.19515696913003922, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024936877191066742, "rewards/penalized_accuracy_reward/std": 0.06814054399728775, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.15985829010605812, "step": 1805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1849.75, "completions/max_terminated_length": 1460.75, "completions/mean_length": 976.375, "completions/mean_terminated_length": 808.4732208251953, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.903, "grad_norm": 0.41842272877693176, "kl": 0.06951904296875, "learning_rate": 1.258113290085197e-07, "loss": 0.2632, "num_tokens": 133762698.0, "reward": 0.5410977900028229, "reward_std": 0.2789996564388275, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04984577000141144, "rewards/penalized_accuracy_reward/std": 0.08916690945625305, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.24030320346355438, "step": 1806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1496.5, "completions/max_terminated_length": 1235.0, "completions/mean_length": 772.421875, 
"completions/mean_terminated_length": 697.1674194335938, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.9035, "grad_norm": 0.4354444742202759, "kl": 0.0323486328125, "learning_rate": 1.2554980392357956e-07, "loss": 0.176, "num_tokens": 133820245.0, "reward": 0.8032313883304596, "reward_std": 0.44592560827732086, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623578928411007, "rewards/penalized_accuracy_reward/std": 0.21811755374073982, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11157561466097832, "step": 1807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1789.5, "completions/max_terminated_length": 1461.0, "completions/mean_length": 1016.734375, "completions/mean_terminated_length": 805.3042144775391, "completions/min_length": 229.25, "completions/min_terminated_length": 229.25, "epoch": 0.904, "grad_norm": 0.35827839374542236, "kl": 0.062774658203125, "learning_rate": 1.2528957178754676e-07, "loss": 0.341, "num_tokens": 133893524.0, "reward": 0.435546875, "reward_std": 0.11175746656954288, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.22351493313908577, "step": 1808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1559.25, "completions/mean_length": 1051.96875, "completions/mean_terminated_length": 865.5980987548828, "completions/min_length": 360.25, "completions/min_terminated_length": 360.25, "epoch": 0.9045, "grad_norm": 0.35464179515838623, "kl": 0.064788818359375, "learning_rate": 1.2503063339313356e-07, "loss": 0.3711, "num_tokens": 133970082.0, "reward": 0.5814170241355896, "reward_std": 
0.3169991672039032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0748881846666336, "rewards/penalized_accuracy_reward/std": 0.09985092282295227, "rewards/tag_count_reward/mean": 0.86328125, "rewards/tag_count_reward/std": 0.26952263712882996, "step": 1809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1732.5, "completions/max_terminated_length": 1179.0, "completions/mean_length": 748.59375, "completions/mean_terminated_length": 667.2617797851562, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.905, "grad_norm": 0.5462590456008911, "kl": 0.06231689453125, "learning_rate": 1.2477298952911116e-07, "loss": 0.2117, "num_tokens": 134029976.0, "reward": 0.6685159653425217, "reward_std": 0.4051365293562412, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988298639655113, "rewards/penalized_accuracy_reward/std": 0.17867614328861237, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.15779344737529755, "step": 1810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1509.0, "completions/max_terminated_length": 1049.5, "completions/mean_length": 617.90625, "completions/mean_terminated_length": 574.2135467529297, "completions/min_length": 279.5, "completions/min_terminated_length": 279.5, "epoch": 0.9055, "grad_norm": 0.36875492334365845, "kl": 0.05767822265625, "learning_rate": 1.2451664098030743e-07, "loss": 0.1908, "num_tokens": 134078370.0, "reward": 0.9129830896854401, "reward_std": 0.2639647424221039, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21235091984272003, "rewards/penalized_accuracy_reward/std": 0.11814567446708679, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1811 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1598.25, "completions/mean_length": 1127.890625, "completions/mean_terminated_length": 976.6702880859375, "completions/min_length": 432.25, "completions/min_terminated_length": 432.25, "epoch": 0.906, "grad_norm": 1.0142102241516113, "kl": 0.047393798828125, "learning_rate": 1.242615885276046e-07, "loss": 0.2523, "num_tokens": 134162443.0, "reward": 0.7277182340621948, "reward_std": 0.2684874702244997, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372966170310974, "rewards/penalized_accuracy_reward/std": 0.09560119360685349, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20581265538930893, "step": 1812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1858.75, "completions/mean_length": 1237.6875, "completions/mean_terminated_length": 1038.3617858886719, "completions/min_length": 475.75, "completions/min_terminated_length": 475.75, "epoch": 0.9065, "grad_norm": 0.33445027470588684, "kl": 0.040771484375, "learning_rate": 1.2400783294793668e-07, "loss": 0.3161, "num_tokens": 134253511.0, "reward": 0.42578125, "reward_std": 0.1429324559867382, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8515625, "rewards/tag_count_reward/std": 0.2858649119734764, "step": 1813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 1094.1875, "completions/mean_terminated_length": 839.1881332397461, "completions/min_length": 345.5, "completions/min_terminated_length": 345.5, "epoch": 0.907, "grad_norm": 
0.37979182600975037, "kl": 0.0654296875, "learning_rate": 1.2375537501428706e-07, "loss": 0.4181, "num_tokens": 134331811.0, "reward": 0.4523247182369232, "reward_std": 0.25603992491960526, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024990487843751907, "rewards/penalized_accuracy_reward/std": 0.06828701496124268, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2773413583636284, "step": 1814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1961.25, "completions/max_terminated_length": 1536.5, "completions/mean_length": 1024.796875, "completions/mean_terminated_length": 932.7344055175781, "completions/min_length": 345.5, "completions/min_terminated_length": 345.5, "epoch": 0.9075, "grad_norm": 0.3982563316822052, "kl": 0.0321044921875, "learning_rate": 1.235042154956865e-07, "loss": 0.2473, "num_tokens": 134406006.0, "reward": 0.4995442032814026, "reward_std": 0.17010588198900223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012467412278056145, "rewards/penalized_accuracy_reward/std": 0.04986965283751488, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.14073317870497704, "step": 1815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1891.75, "completions/max_terminated_length": 1729.25, "completions/mean_length": 1087.078125, "completions/mean_terminated_length": 955.0948028564453, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.908, "grad_norm": 0.29283174872398376, "kl": 0.0360107421875, "learning_rate": 1.232543551572103e-07, "loss": 0.1146, "num_tokens": 134487819.0, "reward": 0.950231060385704, "reward_std": 0.6169909462332726, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.24952960014343262, "rewards/penalized_accuracy_reward/std": 0.29085899144411087, "rewards/tag_count_reward/mean": 0.90234375, "rewards/tag_count_reward/std": 0.16472551971673965, "step": 1816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1735.75, "completions/max_terminated_length": 1690.25, "completions/mean_length": 1044.78125, "completions/mean_terminated_length": 989.9291839599609, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "epoch": 0.9085, "grad_norm": 0.40060877799987793, "kl": 0.0360107421875, "learning_rate": 1.2300579475997657e-07, "loss": 0.1448, "num_tokens": 134565709.0, "reward": 0.5704883486032486, "reward_std": 0.3002987168729305, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04989260993897915, "rewards/penalized_accuracy_reward/std": 0.1303800642490387, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.12774410098791122, "step": 1817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1403.5, "completions/max_terminated_length": 1236.75, "completions/mean_length": 740.25, "completions/mean_terminated_length": 675.4542083740234, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.909, "grad_norm": 0.6446665525436401, "kl": 0.05706787109375, "learning_rate": 1.227585350611433e-07, "loss": 0.2751, "num_tokens": 134621677.0, "reward": 0.6493648290634155, "reward_std": 0.3551836460828781, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08737772796303034, "rewards/penalized_accuracy_reward/std": 0.14976344630122185, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13808366656303406, "step": 1818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, 
"completions/max_terminated_length": 1447.75, "completions/mean_length": 1054.140625, "completions/mean_terminated_length": 882.3184432983398, "completions/min_length": 359.25, "completions/min_terminated_length": 359.25, "epoch": 0.9095, "grad_norm": 0.42936089634895325, "kl": 0.048583984375, "learning_rate": 1.2251257681390645e-07, "loss": 0.2554, "num_tokens": 134702118.0, "reward": 0.7219992130994797, "reward_std": 0.377786235883832, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1373667884618044, "rewards/penalized_accuracy_reward/std": 0.14988181740045547, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.21914836391806602, "step": 1819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1626.0, "completions/max_terminated_length": 1413.25, "completions/mean_length": 865.234375, "completions/mean_terminated_length": 825.3683166503906, "completions/min_length": 370.25, "completions/min_terminated_length": 370.25, "epoch": 0.91, "grad_norm": 0.4382741451263428, "kl": 0.05731201171875, "learning_rate": 1.2226792076749734e-07, "loss": 0.0821, "num_tokens": 134766917.0, "reward": 0.7185255587100983, "reward_std": 0.3455353993922472, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12488777749240398, "rewards/penalized_accuracy_reward/std": 0.15232344716787338, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1732971966266632, "step": 1820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1406.25, "completions/max_terminated_length": 1368.5, "completions/mean_length": 669.421875, "completions/mean_terminated_length": 649.3510589599609, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9105, "grad_norm": 0.3899904787540436, "kl": 0.0582275390625, "learning_rate": 
1.220245676671809e-07, "loss": 0.2176, "num_tokens": 134818000.0, "reward": 0.6648014783859253, "reward_std": 0.3521810695528984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08728355169296265, "rewards/penalized_accuracy_reward/std": 0.1636967733502388, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.049575019627809525, "step": 1821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1616.0, "completions/max_terminated_length": 1372.5, "completions/mean_length": 732.71875, "completions/mean_terminated_length": 695.2052307128906, "completions/min_length": 317.75, "completions/min_terminated_length": 317.75, "epoch": 0.911, "grad_norm": 0.47030141949653625, "kl": 0.04925537109375, "learning_rate": 1.2178251825425282e-07, "loss": 0.2692, "num_tokens": 134875774.0, "reward": 0.8297276198863983, "reward_std": 0.5714181661605835, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17462944239377975, "rewards/penalized_accuracy_reward/std": 0.2719387114048004, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13270078226923943, "step": 1822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1903.5, "completions/max_terminated_length": 1636.75, "completions/mean_length": 1014.421875, "completions/mean_terminated_length": 920.1888580322266, "completions/min_length": 293.75, "completions/min_terminated_length": 293.75, "epoch": 0.9115, "grad_norm": 0.36763858795166016, "kl": 0.03985595703125, "learning_rate": 1.2154177326603763e-07, "loss": 0.093, "num_tokens": 134947769.0, "reward": 0.6415869444608688, "reward_std": 0.46894972026348114, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08739503845572472, "rewards/penalized_accuracy_reward/std": 
0.2169739305973053, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19255755841732025, "step": 1823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1780.5, "completions/max_terminated_length": 1681.5, "completions/mean_length": 995.6875, "completions/mean_terminated_length": 917.9721984863281, "completions/min_length": 347.75, "completions/min_terminated_length": 347.75, "epoch": 0.912, "grad_norm": 0.31404247879981995, "kl": 0.05462646484375, "learning_rate": 1.2130233343588623e-07, "loss": 0.1918, "num_tokens": 135023221.0, "reward": 0.453125, "reward_std": 0.08741116523742676, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.17482233233749866, "step": 1824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1591.5, "completions/max_terminated_length": 1224.5, "completions/mean_length": 831.28125, "completions/mean_terminated_length": 722.1689147949219, "completions/min_length": 366.25, "completions/min_terminated_length": 366.25, "epoch": 0.9125, "grad_norm": 0.2585456669330597, "kl": 0.04150390625, "learning_rate": 1.2106419949317388e-07, "loss": 0.1392, "num_tokens": 135084759.0, "reward": 0.5764546990394592, "reward_std": 0.21602513268589973, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.049946095794439316, "rewards/penalized_accuracy_reward/std": 0.08934629708528519, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1103317141532898, "step": 1825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1753.5, "completions/max_terminated_length": 1343.0, "completions/mean_length": 874.1875, "completions/mean_terminated_length": 
768.454231262207, "completions/min_length": 369.75, "completions/min_terminated_length": 369.75, "epoch": 0.913, "grad_norm": 0.40502265095710754, "kl": 0.05126953125, "learning_rate": 1.2082737216329792e-07, "loss": 0.2205, "num_tokens": 135151299.0, "reward": 0.5108525156974792, "reward_std": 0.2014442514628172, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024957504123449326, "rewards/penalized_accuracy_reward/std": 0.06819688528776169, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17700446024537086, "step": 1826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1888.75, "completions/max_terminated_length": 1668.25, "completions/mean_length": 1161.609375, "completions/mean_terminated_length": 1061.8058166503906, "completions/min_length": 494.75, "completions/min_terminated_length": 494.75, "epoch": 0.9135, "grad_norm": 0.20262955129146576, "kl": 0.040771484375, "learning_rate": 1.2059185216767543e-07, "loss": 0.1271, "num_tokens": 135234698.0, "reward": 0.7546769827604294, "reward_std": 0.5665292590856552, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1497994251549244, "rewards/penalized_accuracy_reward/std": 0.2654423117637634, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.2062118947505951, "step": 1827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1637.75, "completions/max_terminated_length": 1266.0, "completions/mean_length": 605.640625, "completions/mean_terminated_length": 560.6760559082031, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.914, "grad_norm": 0.5561500787734985, "kl": 0.04443359375, "learning_rate": 1.203576402237412e-07, "loss": 0.2435, "num_tokens": 135284307.0, "reward": 0.48828125, "reward_std": 0.046875, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.5, "completions/max_terminated_length": 1145.5, "completions/mean_length": 520.734375, "completions/mean_terminated_length": 520.734375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9145, "grad_norm": 0.372089147567749, "kl": 0.057342529296875, "learning_rate": 1.2012473704494537e-07, "loss": 0.146, "num_tokens": 135326610.0, "reward": 0.8745193779468536, "reward_std": 0.5737199932336807, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.18725967407226562, "rewards/penalized_accuracy_reward/std": 0.28686001151800156, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1891.5, "completions/max_terminated_length": 1826.0, "completions/mean_length": 1039.515625, "completions/mean_terminated_length": 1024.4146118164062, "completions/min_length": 466.5, "completions/min_terminated_length": 466.5, "epoch": 0.915, "grad_norm": 0.3106885850429535, "kl": 0.037628173828125, "learning_rate": 1.1989314334075144e-07, "loss": 0.0914, "num_tokens": 135404355.0, "reward": 0.7168805003166199, "reward_std": 0.23095833882689476, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11234650760889053, "rewards/penalized_accuracy_reward/std": 0.10232971608638763, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 1830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 
2048.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 997.953125, "completions/mean_terminated_length": 869.6476593017578, "completions/min_length": 275.75, "completions/min_terminated_length": 275.75, "epoch": 0.9155, "grad_norm": 0.428654283285141, "kl": 0.04803466796875, "learning_rate": 1.1966285981663407e-07, "loss": 0.345, "num_tokens": 135481040.0, "reward": 0.453125, "reward_std": 0.11481597647070885, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.229631956666708, "step": 1831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1933.5, "completions/max_terminated_length": 1600.75, "completions/mean_length": 800.796875, "completions/mean_terminated_length": 763.7292022705078, "completions/min_length": 297.75, "completions/min_terminated_length": 297.75, "epoch": 0.916, "grad_norm": 0.44329625368118286, "kl": 0.03973388671875, "learning_rate": 1.1943388717407668e-07, "loss": 0.0729, "num_tokens": 135540899.0, "reward": 0.5073224604129791, "reward_std": 0.15109636262059212, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012450290843844414, "rewards/penalized_accuracy_reward/std": 0.049801163375377655, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10298807546496391, "step": 1832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1828.75, "completions/max_terminated_length": 1546.25, "completions/mean_length": 889.15625, "completions/mean_terminated_length": 808.4287719726562, "completions/min_length": 389.75, "completions/min_terminated_length": 389.75, "epoch": 0.9165, "grad_norm": 0.2813776433467865, "kl": 0.040740966796875, "learning_rate": 1.1920622611056974e-07, "loss": 
0.198, "num_tokens": 135605853.0, "reward": 0.474609375, "reward_std": 0.06152055040001869, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.12304110452532768, "step": 1833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1628.5, "completions/max_terminated_length": 1193.0, "completions/mean_length": 690.390625, "completions/mean_terminated_length": 607.9273376464844, "completions/min_length": 212.5, "completions/min_terminated_length": 212.5, "epoch": 0.917, "grad_norm": 0.4850649833679199, "kl": 0.061248779296875, "learning_rate": 1.1897987731960835e-07, "loss": 0.2912, "num_tokens": 135658406.0, "reward": 0.49370649456977844, "reward_std": 0.18053072318434715, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012478250078856945, "rewards/penalized_accuracy_reward/std": 0.04991300404071808, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16140944883227348, "step": 1834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1824.5, "completions/max_terminated_length": 1555.75, "completions/mean_length": 1098.8125, "completions/mean_terminated_length": 1015.3921813964844, "completions/min_length": 481.25, "completions/min_terminated_length": 481.25, "epoch": 0.9175, "grad_norm": 0.29524850845336914, "kl": 0.02593994140625, "learning_rate": 1.1875484149069004e-07, "loss": 0.1982, "num_tokens": 135737402.0, "reward": 0.6184686571359634, "reward_std": 0.37258465588092804, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07485932856798172, "rewards/penalized_accuracy_reward/std": 0.15747936815023422, "rewards/tag_count_reward/mean": 0.9375, 
"rewards/tag_count_reward/std": 0.16375876776874065, "step": 1835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1969.75, "completions/max_terminated_length": 1626.0, "completions/mean_length": 1066.4375, "completions/mean_terminated_length": 845.3166809082031, "completions/min_length": 202.5, "completions/min_terminated_length": 202.5, "epoch": 0.918, "grad_norm": 0.3282805383205414, "kl": 0.03253173828125, "learning_rate": 1.1853111930931312e-07, "loss": 0.3341, "num_tokens": 135816294.0, "reward": 0.4375, "reward_std": 0.12032167986035347, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.24064337462186813, "step": 1836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 1094.671875, "completions/mean_terminated_length": 953.5370483398438, "completions/min_length": 411.5, "completions/min_terminated_length": 411.5, "epoch": 0.9185, "grad_norm": 0.3681339621543884, "kl": 0.053436279296875, "learning_rate": 1.1830871145697412e-07, "loss": 0.3059, "num_tokens": 135894225.0, "reward": 0.524108499288559, "reward_std": 0.265484930947423, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744486719369888, "rewards/penalized_accuracy_reward/std": 0.08050405234098434, "rewards/tag_count_reward/mean": 0.8984375, "rewards/tag_count_reward/std": 0.2495192214846611, "step": 1837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2009.5, "completions/max_terminated_length": 1569.75, "completions/mean_length": 1089.21875, "completions/mean_terminated_length": 794.1367340087891, "completions/min_length": 320.25, 
"completions/min_terminated_length": 320.25, "epoch": 0.919, "grad_norm": 0.4107822775840759, "kl": 0.05584716796875, "learning_rate": 1.1808761861116589e-07, "loss": 0.3149, "num_tokens": 135973903.0, "reward": 0.4140625, "reward_std": 0.1238720752298832, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.2477441541850567, "step": 1838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1767.25, "completions/max_terminated_length": 1420.5, "completions/mean_length": 958.453125, "completions/mean_terminated_length": 816.3691711425781, "completions/min_length": 351.5, "completions/min_terminated_length": 351.5, "epoch": 0.9195, "grad_norm": 0.38246390223503113, "kl": 0.04388427734375, "learning_rate": 1.1786784144537563e-07, "loss": 0.2863, "num_tokens": 136045036.0, "reward": 0.47811491787433624, "reward_std": 0.18671990185976028, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012494957074522972, "rewards/penalized_accuracy_reward/std": 0.04997983202338219, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1875, "step": 1839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1524.25, "completions/max_terminated_length": 1497.0, "completions/mean_length": 932.96875, "completions/mean_terminated_length": 904.2924194335938, "completions/min_length": 535.75, "completions/min_terminated_length": 535.75, "epoch": 0.92, "grad_norm": 0.24093912541866302, "kl": 0.034271240234375, "learning_rate": 1.1764938062908261e-07, "loss": 0.1394, "num_tokens": 136113514.0, "reward": 0.9527411460876465, "reward_std": 0.6167395114898682, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.23711274564266205, "rewards/penalized_accuracy_reward/std": 0.3019707724452019, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11534032225608826, "step": 1840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1503.25, "completions/max_terminated_length": 1386.25, "completions/mean_length": 863.53125, "completions/mean_terminated_length": 804.9638824462891, "completions/min_length": 434.25, "completions/min_terminated_length": 434.25, "epoch": 0.9205, "grad_norm": 0.5171255469322205, "kl": 0.046142578125, "learning_rate": 1.1743223682775649e-07, "loss": 0.1689, "num_tokens": 136180028.0, "reward": 0.7280073761940002, "reward_std": 0.38466376066207886, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12474589049816132, "rewards/penalized_accuracy_reward/std": 0.17126566171646118, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13137168437242508, "step": 1841 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1107.5, "completions/max_terminated_length": 963.25, "completions/mean_length": 564.109375, "completions/mean_terminated_length": 525.9107208251953, "completions/min_length": 246.25, "completions/min_terminated_length": 246.25, "epoch": 0.921, "grad_norm": 0.4665762186050415, "kl": 0.044464111328125, "learning_rate": 1.172164107028549e-07, "loss": 0.1058, "num_tokens": 136225971.0, "reward": 0.5921727120876312, "reward_std": 0.20515788719058037, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04999260604381561, "rewards/penalized_accuracy_reward/std": 0.08942949771881104, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.05259781517088413, "step": 1842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, 
"completions/max_length": 2048.0, "completions/max_terminated_length": 1441.75, "completions/mean_length": 972.609375, "completions/mean_terminated_length": 900.9167175292969, "completions/min_length": 450.75, "completions/min_terminated_length": 450.75, "epoch": 0.9215, "grad_norm": 0.3621518909931183, "kl": 0.0318603515625, "learning_rate": 1.1700190291182158e-07, "loss": 0.2397, "num_tokens": 136297162.0, "reward": 0.6551006138324738, "reward_std": 0.27029917761683464, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08731593191623688, "rewards/penalized_accuracy_reward/std": 0.10225396603345871, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.14336910098791122, "step": 1843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1662.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 831.734375, "completions/mean_terminated_length": 777.2717437744141, "completions/min_length": 323.5, "completions/min_terminated_length": 323.5, "epoch": 0.922, "grad_norm": 0.45296475291252136, "kl": 0.04766845703125, "learning_rate": 1.1678871410808454e-07, "loss": 0.1817, "num_tokens": 136359353.0, "reward": 0.6704075485467911, "reward_std": 0.37033533677458763, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09985220432281494, "rewards/penalized_accuracy_reward/std": 0.16803470253944397, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.13993912376463413, "step": 1844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1835.0, "completions/max_terminated_length": 1562.5, "completions/mean_length": 1036.921875, "completions/mean_terminated_length": 975.6448211669922, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.9225, "grad_norm": 0.2814290523529053, "kl": 
0.0341796875, "learning_rate": 1.1657684494105386e-07, "loss": 0.112, "num_tokens": 136435348.0, "reward": 0.668352484703064, "reward_std": 0.38499604538083076, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09980123676359653, "rewards/penalized_accuracy_reward/std": 0.16796905547380447, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17116425558924675, "step": 1845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1655.5, "completions/max_terminated_length": 1437.5, "completions/mean_length": 850.203125, "completions/mean_terminated_length": 769.3656311035156, "completions/min_length": 325.75, "completions/min_terminated_length": 325.75, "epoch": 0.923, "grad_norm": 0.533707320690155, "kl": 0.039825439453125, "learning_rate": 1.1636629605611966e-07, "loss": 0.168, "num_tokens": 136501809.0, "reward": 0.5128141343593597, "reward_std": 0.1995985396206379, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.17259745672345161, "step": 1846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1893.0, "completions/max_terminated_length": 1361.75, "completions/mean_length": 820.921875, "completions/mean_terminated_length": 752.2126617431641, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.9235, "grad_norm": 0.4053635001182556, "kl": 0.052215576171875, "learning_rate": 1.1615706809465051e-07, "loss": 0.2287, "num_tokens": 136562636.0, "reward": 0.4996021240949631, "reward_std": 0.15555289387702942, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012496372684836388, "rewards/penalized_accuracy_reward/std": 
0.04998549446463585, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.15933074057102203, "step": 1847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1806.0, "completions/max_terminated_length": 1634.25, "completions/mean_length": 1081.5625, "completions/mean_terminated_length": 968.8341674804688, "completions/min_length": 327.5, "completions/min_terminated_length": 327.5, "epoch": 0.924, "grad_norm": 0.3505668640136719, "kl": 0.049102783203125, "learning_rate": 1.1594916169399087e-07, "loss": 0.2663, "num_tokens": 136640176.0, "reward": 0.485914871096611, "reward_std": 0.1886262260377407, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248868741095066, "rewards/penalized_accuracy_reward/std": 0.04995474964380264, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17743347585201263, "step": 1848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.25, "completions/max_terminated_length": 1087.25, "completions/mean_length": 576.203125, "completions/mean_terminated_length": 576.203125, "completions/min_length": 245.25, "completions/min_terminated_length": 245.25, "epoch": 0.9245, "grad_norm": 0.5811783075332642, "kl": 0.046234130859375, "learning_rate": 1.1574257748745986e-07, "loss": 0.1193, "num_tokens": 136686317.0, "reward": 0.8975013047456741, "reward_std": 0.39018112421035767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19972721859812737, "rewards/penalized_accuracy_reward/std": 0.19118431955575943, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1706.75, "completions/max_terminated_length": 1452.0, "completions/mean_length": 999.015625, 
"completions/mean_terminated_length": 834.1177978515625, "completions/min_length": 375.75, "completions/min_terminated_length": 375.75, "epoch": 0.925, "grad_norm": 0.4406484365463257, "kl": 0.05169677734375, "learning_rate": 1.1553731610434876e-07, "loss": 0.2202, "num_tokens": 136764462.0, "reward": 0.8660583198070526, "reward_std": 0.4106885977089405, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2123260349035263, "rewards/penalized_accuracy_reward/std": 0.16994646191596985, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.17486442625522614, "step": 1850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1557.75, "completions/max_terminated_length": 1260.0, "completions/mean_length": 870.21875, "completions/mean_terminated_length": 770.5320281982422, "completions/min_length": 414.25, "completions/min_terminated_length": 414.25, "epoch": 0.9255, "grad_norm": 0.40197622776031494, "kl": 0.03955078125, "learning_rate": 1.1533337816991931e-07, "loss": 0.0943, "num_tokens": 136828652.0, "reward": 0.6297838240861893, "reward_std": 0.36806635558605194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08735284954309464, "rewards/penalized_accuracy_reward/std": 0.16378191113471985, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.16614429652690887, "step": 1851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 1251.4375, "completions/mean_terminated_length": 903.1648406982422, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.926, "grad_norm": 0.3267115652561188, "kl": 0.067657470703125, "learning_rate": 1.1513076430540177e-07, "loss": 0.3247, "num_tokens": 136919320.0, "reward": 
0.47331403195858, "reward_std": 0.2868846654891968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03743826597929001, "rewards/penalized_accuracy_reward/std": 0.08048985153436661, "rewards/tag_count_reward/mean": 0.796875, "rewards/tag_count_reward/std": 0.29237477108836174, "step": 1852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1889.75, "completions/max_terminated_length": 1368.5, "completions/mean_length": 863.46875, "completions/mean_terminated_length": 719.4335479736328, "completions/min_length": 333.25, "completions/min_terminated_length": 333.25, "epoch": 0.9265, "grad_norm": 0.40407121181488037, "kl": 0.052825927734375, "learning_rate": 1.1492947512799328e-07, "loss": 0.2078, "num_tokens": 136985206.0, "reward": 0.5838504582643509, "reward_std": 0.3326832018792629, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062433045357465744, "rewards/penalized_accuracy_reward/std": 0.13931019976735115, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.18350879102945328, "step": 1853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1572.0, "completions/max_terminated_length": 1236.75, "completions/mean_length": 847.421875, "completions/mean_terminated_length": 779.9521102905273, "completions/min_length": 330.5, "completions/min_terminated_length": 330.5, "epoch": 0.927, "grad_norm": 0.5767349600791931, "kl": 0.037109375, "learning_rate": 1.1472951125085547e-07, "loss": 0.276, "num_tokens": 137048417.0, "reward": 0.7071432769298553, "reward_std": 0.24747087061405182, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11236070096492767, "rewards/penalized_accuracy_reward/std": 0.1023426428437233, "rewards/tag_count_reward/mean": 0.96484375, 
"rewards/tag_count_reward/std": 0.08557119965553284, "step": 1854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1343.25, "completions/max_terminated_length": 1330.75, "completions/mean_length": 722.765625, "completions/mean_terminated_length": 685.2566986083984, "completions/min_length": 344.25, "completions/min_terminated_length": 344.25, "epoch": 0.9275, "grad_norm": 0.259004145860672, "kl": 0.031982421875, "learning_rate": 1.1453087328311299e-07, "loss": 0.1189, "num_tokens": 137104482.0, "reward": 0.6609156131744385, "reward_std": 0.30484603345394135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08729374129325151, "rewards/penalized_accuracy_reward/std": 0.14968567714095116, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 1855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1803.5, "completions/max_terminated_length": 1551.0, "completions/mean_length": 893.21875, "completions/mean_terminated_length": 726.6170654296875, "completions/min_length": 351.75, "completions/min_terminated_length": 351.75, "epoch": 0.928, "grad_norm": 0.39557382464408875, "kl": 0.042694091796875, "learning_rate": 1.1433356182985158e-07, "loss": 0.2141, "num_tokens": 137171552.0, "reward": 0.439453125, "reward_std": 0.1015625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.203125, "step": 1856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1206.0, "completions/max_terminated_length": 1023.5, "completions/mean_length": 553.234375, "completions/mean_terminated_length": 533.5645904541016, "completions/min_length": 250.25, 
"completions/min_terminated_length": 250.25, "epoch": 0.9285, "grad_norm": 0.6586804389953613, "kl": 0.04962158203125, "learning_rate": 1.1413757749211602e-07, "loss": 0.2088, "num_tokens": 137216719.0, "reward": 0.7590121477842331, "reward_std": 0.40328139066696167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13731857389211655, "rewards/penalized_accuracy_reward/std": 0.18363189697265625, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09858439117670059, "step": 1857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1773.25, "completions/max_terminated_length": 1530.0, "completions/mean_length": 937.53125, "completions/mean_terminated_length": 789.9958343505859, "completions/min_length": 311.25, "completions/min_terminated_length": 311.25, "epoch": 0.929, "grad_norm": 0.4038904011249542, "kl": 0.037109375, "learning_rate": 1.1394292086690874e-07, "loss": 0.2547, "num_tokens": 137286577.0, "reward": 0.55306476354599, "reward_std": 0.2623554654419422, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996988922357559, "rewards/penalized_accuracy_reward/std": 0.08938885480165482, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.16715551167726517, "step": 1858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1565.5, "completions/max_terminated_length": 1471.25, "completions/mean_length": 1026.703125, "completions/mean_terminated_length": 972.2969055175781, "completions/min_length": 538.5, "completions/min_terminated_length": 538.5, "epoch": 0.9295, "grad_norm": 0.2871192395687103, "kl": 0.031646728515625, "learning_rate": 1.137495925471875e-07, "loss": 0.1595, "num_tokens": 137361230.0, "reward": 0.6224416643381119, "reward_std": 0.4131038021296263, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07489270158112049, "rewards/penalized_accuracy_reward/std": 0.18919712677598, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13991425558924675, "step": 1859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1460.75, "completions/max_terminated_length": 1287.5, "completions/mean_length": 678.953125, "completions/mean_terminated_length": 660.4354248046875, "completions/min_length": 312.25, "completions/min_terminated_length": 312.25, "epoch": 0.93, "grad_norm": 0.6013730764389038, "kl": 0.05596923828125, "learning_rate": 1.1355759312186396e-07, "loss": 0.1201, "num_tokens": 137412651.0, "reward": 0.5053999423980713, "reward_std": 0.1487371325492859, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01246559340506792, "rewards/penalized_accuracy_reward/std": 0.04986237362027168, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13644563034176826, "step": 1860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1726.75, "completions/max_terminated_length": 1624.75, "completions/mean_length": 874.3125, "completions/mean_terminated_length": 837.5083618164062, "completions/min_length": 368.25, "completions/min_terminated_length": 368.25, "epoch": 0.9305, "grad_norm": 0.3777771592140198, "kl": 0.035369873046875, "learning_rate": 1.1336692317580158e-07, "loss": 0.1618, "num_tokens": 137480383.0, "reward": 0.482421875, "reward_std": 0.04863205552101135, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.0972641110420227, "step": 1861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, 
"completions/max_length": 1857.5, "completions/max_terminated_length": 1779.5, "completions/mean_length": 1258.65625, "completions/mean_terminated_length": 1091.8295288085938, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.931, "grad_norm": 0.2036210149526596, "kl": 0.02862548828125, "learning_rate": 1.1317758328981414e-07, "loss": 0.0727, "num_tokens": 137568553.0, "reward": 0.5640323609113693, "reward_std": 0.35895562544465065, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06228961423039436, "rewards/penalized_accuracy_reward/std": 0.148439921438694, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.19135022163391113, "step": 1862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2042.5, "completions/max_terminated_length": 1829.5, "completions/mean_length": 1118.78125, "completions/mean_terminated_length": 1045.596420288086, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.9315, "grad_norm": 0.5367610454559326, "kl": 0.046051025390625, "learning_rate": 1.1298957404066381e-07, "loss": 0.2113, "num_tokens": 137650379.0, "reward": 0.466796875, "reward_std": 0.0926229041069746, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1852458082139492, "step": 1863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1546.75, "completions/max_terminated_length": 1332.25, "completions/mean_length": 1114.8125, "completions/mean_terminated_length": 890.2708587646484, "completions/min_length": 537.75, "completions/min_terminated_length": 537.75, "epoch": 0.932, "grad_norm": 0.142255961894989, "kl": 0.0258941650390625, "learning_rate": 
1.1280289600105928e-07, "loss": 0.1137, "num_tokens": 137731807.0, "reward": 0.435546875, "reward_std": 0.0783807747066021, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.1567615494132042, "step": 1864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1468.75, "completions/max_terminated_length": 1284.25, "completions/mean_length": 718.71875, "completions/mean_terminated_length": 683.9937591552734, "completions/min_length": 303.75, "completions/min_terminated_length": 303.75, "epoch": 0.9325, "grad_norm": 0.33958905935287476, "kl": 0.051727294921875, "learning_rate": 1.1261754973965422e-07, "loss": 0.0862, "num_tokens": 137786317.0, "reward": 0.6839649975299835, "reward_std": 0.3634471073746681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09979500249028206, "rewards/penalized_accuracy_reward/std": 0.1679992973804474, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10519563034176826, "step": 1865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1466.25, "completions/max_terminated_length": 1219.0, "completions/mean_length": 765.703125, "completions/mean_terminated_length": 655.6490783691406, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.933, "grad_norm": 0.5690696835517883, "kl": 0.0611572265625, "learning_rate": 1.1243353582104555e-07, "loss": 0.2651, "num_tokens": 137843610.0, "reward": 0.7586462497711182, "reward_std": 0.2634542379528284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": NaN, "rewards/penalized_accuracy_reward/std": NaN, "rewards/tag_count_reward/mean": 0.91796875, 
"rewards/tag_count_reward/std": 0.16954081133008003, "step": 1866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1749.25, "completions/max_terminated_length": 1378.0, "completions/mean_length": 811.984375, "completions/mean_terminated_length": 760.9604187011719, "completions/min_length": 322.75, "completions/min_terminated_length": 322.75, "epoch": 0.9335, "grad_norm": 0.43287089467048645, "kl": 0.041748046875, "learning_rate": 1.1225085480577158e-07, "loss": 0.2659, "num_tokens": 137903321.0, "reward": 0.6301680505275726, "reward_std": 0.25524837896227837, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07484965026378632, "rewards/penalized_accuracy_reward/std": 0.09979956597089767, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11129852384328842, "step": 1867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1810.75, "completions/max_terminated_length": 1574.25, "completions/mean_length": 953.984375, "completions/mean_terminated_length": 848.3646087646484, "completions/min_length": 338.25, "completions/min_terminated_length": 338.25, "epoch": 0.934, "grad_norm": 0.32116466760635376, "kl": 0.033355712890625, "learning_rate": 1.1206950725031034e-07, "loss": 0.2134, "num_tokens": 137972616.0, "reward": 0.5817942321300507, "reward_std": 0.2571815885603428, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06238148361444473, "rewards/penalized_accuracy_reward/std": 0.09556122124195099, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.16958122327923775, "step": 1868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1693.0, "completions/max_terminated_length": 1289.5, "completions/mean_length": 735.609375, "completions/mean_terminated_length": 694.4666748046875, 
"completions/min_length": 349.5, "completions/min_terminated_length": 349.5, "epoch": 0.9345, "grad_norm": 0.44572803378105164, "kl": 0.04583740234375, "learning_rate": 1.1188949370707787e-07, "loss": 0.1945, "num_tokens": 138029391.0, "reward": 0.486328125, "reward_std": 0.0546875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.109375, "step": 1869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1797.75, "completions/max_terminated_length": 1657.5, "completions/mean_length": 940.0625, "completions/mean_terminated_length": 891.2768096923828, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.935, "grad_norm": 0.28431087732315063, "kl": 0.057830810546875, "learning_rate": 1.117108147244268e-07, "loss": 0.1312, "num_tokens": 138099971.0, "reward": 0.749193012714386, "reward_std": 0.34196142107248306, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13729181606322527, "rewards/penalized_accuracy_reward/std": 0.14977150037884712, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11859130859375, "step": 1870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1560.5, "completions/max_terminated_length": 1407.25, "completions/mean_length": 853.5, "completions/mean_terminated_length": 818.6000213623047, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.9355, "grad_norm": 0.2757273018360138, "kl": 0.030670166015625, "learning_rate": 1.1153347084664419e-07, "loss": 0.1324, "num_tokens": 138167843.0, "reward": 0.478515625, "reward_std": 0.0481650996953249, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.0963301993906498, "step": 1871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1599.5, "completions/max_terminated_length": 1278.75, "completions/mean_length": 793.03125, "completions/mean_terminated_length": 756.6875305175781, "completions/min_length": 336.75, "completions/min_terminated_length": 336.75, "epoch": 0.936, "grad_norm": 0.28084537386894226, "kl": 0.0419921875, "learning_rate": 1.1135746261395021e-07, "loss": 0.134, "num_tokens": 138228277.0, "reward": 0.5113125741481781, "reward_std": 0.12764739990234375, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0124922264367342, "rewards/penalized_accuracy_reward/std": 0.0499689094722271, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08957063034176826, "step": 1872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1322.75, "completions/mean_length": 1013.6875, "completions/mean_terminated_length": 791.0500335693359, "completions/min_length": 333.75, "completions/min_terminated_length": 333.75, "epoch": 0.9365, "grad_norm": 0.5407847762107849, "kl": 0.050933837890625, "learning_rate": 1.1118279056249653e-07, "loss": 0.4239, "num_tokens": 138301505.0, "reward": 0.4296875, "reward_std": 0.12318200245499611, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.24636401236057281, "step": 1873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1724.5, "completions/max_terminated_length": 1465.25, 
"completions/mean_length": 1103.234375, "completions/mean_terminated_length": 997.7212066650391, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.937, "grad_norm": 0.3018345534801483, "kl": 0.0427398681640625, "learning_rate": 1.1100945522436453e-07, "loss": 0.1473, "num_tokens": 138380912.0, "reward": 0.5163517147302628, "reward_std": 0.2549353539943695, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03747272863984108, "rewards/penalized_accuracy_reward/std": 0.08056395500898361, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.23818545043468475, "step": 1874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1532.0, "completions/max_terminated_length": 1384.5, "completions/mean_length": 785.046875, "completions/mean_terminated_length": 765.5218811035156, "completions/min_length": 348.5, "completions/min_terminated_length": 348.5, "epoch": 0.9375, "grad_norm": 0.36669543385505676, "kl": 0.04217529296875, "learning_rate": 1.1083745712756364e-07, "loss": 0.135, "num_tokens": 138439203.0, "reward": 0.7836558222770691, "reward_std": 0.3209949880838394, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14964042603969574, "rewards/penalized_accuracy_reward/std": 0.14544828236103058, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10519563034176826, "step": 1875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1914.25, "completions/max_terminated_length": 1222.75, "completions/mean_length": 803.78125, "completions/mean_terminated_length": 674.4120025634766, "completions/min_length": 351.5, "completions/min_terminated_length": 351.5, "epoch": 0.938, "grad_norm": 0.5101845264434814, "kl": 0.0526123046875, "learning_rate": 1.1066679679602998e-07, "loss": 0.2914, "num_tokens": 
138498565.0, "reward": 0.6146612018346786, "reward_std": 0.38377730548381805, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07490871846675873, "rewards/penalized_accuracy_reward/std": 0.15754349529743195, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.1734122931957245, "step": 1876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1468.5, "completions/max_terminated_length": 1248.5, "completions/mean_length": 668.96875, "completions/mean_terminated_length": 649.4322967529297, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9385, "grad_norm": 0.45488715171813965, "kl": 0.06109619140625, "learning_rate": 1.1049747474962444e-07, "loss": 0.077, "num_tokens": 138549155.0, "reward": 0.5343277454376221, "reward_std": 0.23100632801651955, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024976379238069057, "rewards/penalized_accuracy_reward/std": 0.09990552067756653, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.10101010836660862, "step": 1877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.75, "completions/max_terminated_length": 1278.75, "completions/mean_length": 726.828125, "completions/mean_terminated_length": 726.828125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.939, "grad_norm": 0.3965708911418915, "kl": 0.045440673828125, "learning_rate": 1.1032949150413137e-07, "loss": 0.0829, "num_tokens": 138604232.0, "reward": 0.8454647958278656, "reward_std": 0.41887331008911133, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.174685537815094, "rewards/penalized_accuracy_reward/std": 0.20457080006599426, "rewards/tag_count_reward/mean": 0.9921875, 
"rewards/tag_count_reward/std": 0.03125, "step": 1878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1738.5, "completions/max_terminated_length": 1631.25, "completions/mean_length": 953.59375, "completions/mean_terminated_length": 902.9327087402344, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "epoch": 0.9395, "grad_norm": 0.45416077971458435, "kl": 0.041107177734375, "learning_rate": 1.1016284757125685e-07, "loss": 0.1299, "num_tokens": 138673966.0, "reward": 0.49958810210227966, "reward_std": 0.15061897411942482, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012489361688494682, "rewards/penalized_accuracy_reward/std": 0.04995744675397873, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1281814817339182, "step": 1879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1744.0, "completions/max_terminated_length": 1453.5, "completions/mean_length": 934.0625, "completions/mean_terminated_length": 831.0393371582031, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.94, "grad_norm": 0.31931453943252563, "kl": 0.0340576171875, "learning_rate": 1.099975434586272e-07, "loss": 0.2161, "num_tokens": 138743122.0, "reward": 0.46875, "reward_std": 0.06351631693542004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.12703263387084007, "step": 1880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1776.0, "completions/max_terminated_length": 1446.75, "completions/mean_length": 796.953125, "completions/mean_terminated_length": 757.9166870117188, "completions/min_length": 309.75, 
"completions/min_terminated_length": 309.75, "epoch": 0.9405, "grad_norm": 0.2523519992828369, "kl": 0.0361328125, "learning_rate": 1.0983357966978745e-07, "loss": 0.1101, "num_tokens": 138803007.0, "reward": 0.7820095717906952, "reward_std": 0.38299502432346344, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1497938632965088, "rewards/penalized_accuracy_reward/std": 0.18283477425575256, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10298807546496391, "step": 1881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1781.25, "completions/max_terminated_length": 1626.0, "completions/mean_length": 819.109375, "completions/mean_terminated_length": 779.8646087646484, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.941, "grad_norm": 0.4158669710159302, "kl": 0.04632568359375, "learning_rate": 1.096709567041997e-07, "loss": 0.2113, "num_tokens": 138865062.0, "reward": 0.7880860567092896, "reward_std": 0.2256440669298172, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14990240335464478, "rewards/penalized_accuracy_reward/std": 0.0893845409154892, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.09375, "step": 1882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 1223.953125, "completions/mean_terminated_length": 915.6323852539062, "completions/min_length": 364.25, "completions/min_terminated_length": 364.25, "epoch": 0.9415, "grad_norm": 0.34909141063690186, "kl": 0.0616455078125, "learning_rate": 1.0950967505724175e-07, "loss": 0.3328, "num_tokens": 138953715.0, "reward": 0.5963025093078613, "reward_std": 0.4763478524982929, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 
0.0, "rewards/penalized_accuracy_reward/mean": 0.09990905970335007, "rewards/penalized_accuracy_reward/std": 0.17872276157140732, "rewards/tag_count_reward/mean": 0.79296875, "rewards/tag_count_reward/std": 0.3285861909389496, "step": 1883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1998.0, "completions/max_terminated_length": 1599.75, "completions/mean_length": 796.1875, "completions/mean_terminated_length": 715.8906707763672, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.942, "grad_norm": 0.37232106924057007, "kl": 0.05560302734375, "learning_rate": 1.0934973522020537e-07, "loss": 0.2597, "num_tokens": 139016511.0, "reward": 0.6915637850761414, "reward_std": 0.27178284898400307, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11238345503807068, "rewards/penalized_accuracy_reward/std": 0.10236336290836334, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1648966744542122, "step": 1884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1974.5, "completions/max_terminated_length": 1871.75, "completions/mean_length": 1080.859375, "completions/mean_terminated_length": 924.0708160400391, "completions/min_length": 297.25, "completions/min_terminated_length": 297.25, "epoch": 0.9425, "grad_norm": 0.4578389823436737, "kl": 0.05377197265625, "learning_rate": 1.0919113768029517e-07, "loss": 0.3278, "num_tokens": 139097910.0, "reward": 0.439453125, "reward_std": 0.12957576662302017, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.25915154069662094, "step": 1885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1512.0, 
"completions/max_terminated_length": 1482.5, "completions/mean_length": 896.109375, "completions/mean_terminated_length": 885.6291809082031, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.943, "grad_norm": 0.5330392122268677, "kl": 0.037200927734375, "learning_rate": 1.0903388292062668e-07, "loss": 0.0984, "num_tokens": 139162621.0, "reward": 0.48828125, "reward_std": 0.04192390665411949, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08384781517088413, "step": 1886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1741.25, "completions/max_terminated_length": 1444.0, "completions/mean_length": 887.015625, "completions/mean_terminated_length": 825.0309753417969, "completions/min_length": 406.5, "completions/min_terminated_length": 406.5, "epoch": 0.9435, "grad_norm": 0.4821576178073883, "kl": 0.052764892578125, "learning_rate": 1.0887797142022521e-07, "loss": 0.2639, "num_tokens": 139233806.0, "reward": 0.7837911993265152, "reward_std": 0.3295186348259449, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14970810059458017, "rewards/penalized_accuracy_reward/std": 0.14549259096384048, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.09529344737529755, "step": 1887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1671.0, "completions/max_terminated_length": 1313.75, "completions/mean_length": 879.890625, "completions/mean_terminated_length": 816.9921875, "completions/min_length": 510.75, "completions/min_terminated_length": 510.75, "epoch": 0.944, "grad_norm": 0.4569546580314636, "kl": 0.04364013671875, "learning_rate": 1.0872340365402415e-07, "loss": 0.161, "num_tokens": 
139301975.0, "reward": 0.5607878714799881, "reward_std": 0.2495763637125492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04992518201470375, "rewards/penalized_accuracy_reward/std": 0.08930891752243042, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.14191709086298943, "step": 1888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1601.5, "completions/max_terminated_length": 1592.25, "completions/mean_length": 774.375, "completions/mean_terminated_length": 715.2115478515625, "completions/min_length": 265.25, "completions/min_terminated_length": 265.25, "epoch": 0.9445, "grad_norm": 0.8788064122200012, "kl": 0.06024169921875, "learning_rate": 1.0857018009286381e-07, "loss": 0.1528, "num_tokens": 139363535.0, "reward": 0.493741512298584, "reward_std": 0.15828853100538254, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012495753355324268, "rewards/penalized_accuracy_reward/std": 0.04998301342129707, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.1508433148264885, "step": 1889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1877.25, "completions/max_terminated_length": 1698.5, "completions/mean_length": 1099.90625, "completions/mean_terminated_length": 925.6907348632812, "completions/min_length": 416.75, "completions/min_terminated_length": 416.75, "epoch": 0.945, "grad_norm": 0.3660215139389038, "kl": 0.05291748046875, "learning_rate": 1.0841830120348969e-07, "loss": 0.2928, "num_tokens": 139443833.0, "reward": 0.6642884910106659, "reward_std": 0.39420953579247, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11241766810417175, "rewards/penalized_accuracy_reward/std": 0.15316616371273994, "rewards/tag_count_reward/mean": 0.87890625, 
"rewards/tag_count_reward/std": 0.2410547249019146, "step": 1890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1650.75, "completions/max_terminated_length": 1252.0, "completions/mean_length": 817.421875, "completions/mean_terminated_length": 639.1652526855469, "completions/min_length": 279.25, "completions/min_terminated_length": 279.25, "epoch": 0.9455, "grad_norm": 0.3187362551689148, "kl": 0.0457763671875, "learning_rate": 1.0826776744855121e-07, "loss": 0.2658, "num_tokens": 139506996.0, "reward": 0.5030632764101028, "reward_std": 0.21160516142845154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496914006769657, "rewards/penalized_accuracy_reward/std": 0.06822868436574936, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.16778605617582798, "step": 1891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1581.0, "completions/mean_length": 960.75, "completions/mean_terminated_length": 821.1774139404297, "completions/min_length": 308.5, "completions/min_terminated_length": 308.5, "epoch": 0.946, "grad_norm": 0.5047162175178528, "kl": 0.075286865234375, "learning_rate": 1.0811857928660037e-07, "loss": 0.2748, "num_tokens": 139579988.0, "reward": 0.45847849547863007, "reward_std": 0.18077382072806358, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012442371807992458, "rewards/penalized_accuracy_reward/std": 0.04976949095726013, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.23675650358200073, "step": 1892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1288.0, "completions/max_terminated_length": 1265.5, "completions/mean_length": 718.421875, "completions/mean_terminated_length": 660.2932739257812, 
"completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9465, "grad_norm": 0.33029091358184814, "kl": 0.04339599609375, "learning_rate": 1.0797073717209013e-07, "loss": 0.1593, "num_tokens": 139639487.0, "reward": 0.7821521162986755, "reward_std": 0.3289592117071152, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.14986512251198292, "rewards/penalized_accuracy_reward/std": 0.14558369293808937, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 1893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1826.5, "completions/max_terminated_length": 1510.25, "completions/mean_length": 966.65625, "completions/mean_terminated_length": 919.6771087646484, "completions/min_length": 270.75, "completions/min_terminated_length": 270.75, "epoch": 0.947, "grad_norm": 0.4707791805267334, "kl": 0.04180908203125, "learning_rate": 1.0782424155537314e-07, "loss": 0.1616, "num_tokens": 139708537.0, "reward": 0.822418600320816, "reward_std": 0.4713806491345167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17488118261098862, "rewards/penalized_accuracy_reward/std": 0.20479991286993027, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18031632527709007, "step": 1894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1605.5, "completions/max_terminated_length": 1364.25, "completions/mean_length": 899.109375, "completions/mean_terminated_length": 773.1156463623047, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.9475, "grad_norm": 0.5125336647033691, "kl": 0.06097412109375, "learning_rate": 1.0767909288270063e-07, "loss": 0.2017, "num_tokens": 139774992.0, "reward": 0.8854554295539856, "reward_std": 0.4727371819317341, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2122589647769928, "rewards/penalized_accuracy_reward/std": 0.20221014320850372, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.13663379102945328, "step": 1895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1852.5, "completions/max_terminated_length": 1215.0, "completions/mean_length": 783.703125, "completions/mean_terminated_length": 719.2031478881836, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.948, "grad_norm": 0.523749589920044, "kl": 0.064422607421875, "learning_rate": 1.0753529159622047e-07, "loss": 0.2947, "num_tokens": 139833837.0, "reward": 0.8241613954305649, "reward_std": 0.4359930530190468, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.17477600648999214, "rewards/penalized_accuracy_reward/std": 0.18923313170671463, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.16923292353749275, "step": 1896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1664.75, "completions/max_terminated_length": 1461.75, "completions/mean_length": 888.515625, "completions/mean_terminated_length": 830.7342376708984, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.9485, "grad_norm": 0.4327208995819092, "kl": 0.0340576171875, "learning_rate": 1.0739283813397639e-07, "loss": 0.1523, "num_tokens": 139899246.0, "reward": 0.8071930408477783, "reward_std": 0.20925819501280785, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.16238558292388916, "rewards/penalized_accuracy_reward/std": 0.08056582510471344, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.10717359744012356, "step": 1897 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1599.5, "completions/mean_length": 1331.875, "completions/mean_terminated_length": 1103.440185546875, "completions/min_length": 523.75, "completions/min_terminated_length": 523.75, "epoch": 0.949, "grad_norm": 0.26282086968421936, "kl": 0.0284423828125, "learning_rate": 1.0725173292990626e-07, "loss": 0.225, "num_tokens": 139992566.0, "reward": 0.46791917085647583, "reward_std": 0.2286122851073742, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024975210428237915, "rewards/penalized_accuracy_reward/std": 0.06824526935815811, "rewards/tag_count_reward/mean": 0.8359375, "rewards/tag_count_reward/std": 0.24034369736909866, "step": 1898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1681.25, "completions/max_terminated_length": 1548.5, "completions/mean_length": 1014.484375, "completions/mean_terminated_length": 952.2010498046875, "completions/min_length": 512.5, "completions/min_terminated_length": 512.5, "epoch": 0.9495, "grad_norm": 0.4251072406768799, "kl": 0.039459228515625, "learning_rate": 1.0711197641384115e-07, "loss": -0.0329, "num_tokens": 140068325.0, "reward": 0.5514430850744247, "reward_std": 0.26513219624757767, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03744029812514782, "rewards/penalized_accuracy_reward/std": 0.11811894550919533, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.09306412376463413, "step": 1899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1649.0, "completions/max_terminated_length": 1569.25, "completions/mean_length": 995.828125, "completions/mean_terminated_length": 926.109375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 
0.95, "grad_norm": 0.30850842595100403, "kl": 0.040618896484375, "learning_rate": 1.0697356901150353e-07, "loss": 0.1696, "num_tokens": 140142682.0, "reward": 0.5456032902002335, "reward_std": 0.21384471654891968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.03745008260011673, "rewards/penalized_accuracy_reward/std": 0.08051525801420212, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.11894455552101135, "step": 1900 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1431.0, "completions/max_terminated_length": 1089.25, "completions/mean_length": 658.140625, "completions/mean_terminated_length": 632.0781326293945, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.9505, "grad_norm": 0.2995031774044037, "kl": 0.0439453125, "learning_rate": 1.068365111445064e-07, "loss": 0.0442, "num_tokens": 140193411.0, "reward": 0.5460569262504578, "reward_std": 0.202090784907341, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02498159557580948, "rewards/penalized_accuracy_reward/std": 0.09992638602852821, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1462.25, "completions/mean_length": 1110.71875, "completions/mean_terminated_length": 981.0452728271484, "completions/min_length": 425.25, "completions/min_terminated_length": 425.25, "epoch": 0.951, "grad_norm": 0.3772423267364502, "kl": 0.0316162109375, "learning_rate": 1.0670080323035176e-07, "loss": 0.2973, "num_tokens": 140274289.0, "reward": 0.43359375, "reward_std": 0.1251726783812046, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, 
"rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8671875, "rewards/tag_count_reward/std": 0.2503453716635704, "step": 1902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1783.75, "completions/max_terminated_length": 1333.5, "completions/mean_length": 752.03125, "completions/mean_terminated_length": 693.7864837646484, "completions/min_length": 327.5, "completions/min_terminated_length": 327.5, "epoch": 0.9515, "grad_norm": 0.5489780902862549, "kl": 0.05120849609375, "learning_rate": 1.0656644568242946e-07, "loss": 0.2984, "num_tokens": 140331283.0, "reward": 0.49952687323093414, "reward_std": 0.17537017539143562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012458749115467072, "rewards/penalized_accuracy_reward/std": 0.049835000187158585, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.1514003686606884, "step": 1903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1732.0, "completions/max_terminated_length": 1663.25, "completions/mean_length": 866.921875, "completions/mean_terminated_length": 849.8260498046875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.952, "grad_norm": 0.2798708975315094, "kl": 0.0335693359375, "learning_rate": 1.0643343891001591e-07, "loss": 0.0688, "num_tokens": 140395134.0, "reward": 0.5210449546575546, "reward_std": 0.11542981117963791, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012475602328777313, "rewards/penalized_accuracy_reward/std": 0.04990240931510925, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1552.0, "completions/max_terminated_length": 1397.25, "completions/mean_length": 
915.609375, "completions/mean_terminated_length": 838.0292053222656, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.9525, "grad_norm": 0.2710053026676178, "kl": 0.037078857421875, "learning_rate": 1.063017833182728e-07, "loss": 0.1316, "num_tokens": 140465237.0, "reward": 0.474609375, "reward_std": 0.0546875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.109375, "step": 1905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1356.25, "completions/mean_length": 811.90625, "completions/mean_terminated_length": 687.7381134033203, "completions/min_length": 285.75, "completions/min_terminated_length": 285.75, "epoch": 0.953, "grad_norm": 0.5801121592521667, "kl": 0.050689697265625, "learning_rate": 1.0617147930824586e-07, "loss": 0.479, "num_tokens": 140525839.0, "reward": 0.466796875, "reward_std": 0.1004345752298832, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.2008691541850567, "step": 1906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1805.25, "completions/max_terminated_length": 1617.25, "completions/mean_length": 1039.28125, "completions/mean_terminated_length": 940.2969207763672, "completions/min_length": 465.5, "completions/min_terminated_length": 465.5, "epoch": 0.9535, "grad_norm": 0.3560361862182617, "kl": 0.03631591796875, "learning_rate": 1.0604252727686379e-07, "loss": 0.24, "num_tokens": 140602689.0, "reward": 0.7412828505039215, "reward_std": 0.27401842176914215, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13724300265312195, "rewards/penalized_accuracy_reward/std": 0.09556381404399872, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1838192492723465, "step": 1907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1699.75, "completions/max_terminated_length": 1659.75, "completions/mean_length": 898.890625, "completions/mean_terminated_length": 852.4568634033203, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.954, "grad_norm": 0.30909109115600586, "kl": 0.04730224609375, "learning_rate": 1.0591492761693674e-07, "loss": 0.0352, "num_tokens": 140670026.0, "reward": 0.6130154877901077, "reward_std": 0.3047690913081169, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06236712820827961, "rewards/penalized_accuracy_reward/std": 0.13918372988700867, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.08086910098791122, "step": 1908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1516.0, "completions/max_terminated_length": 1325.25, "completions/mean_length": 760.75, "completions/mean_terminated_length": 737.7958374023438, "completions/min_length": 311.25, "completions/min_terminated_length": 311.25, "epoch": 0.9545, "grad_norm": 0.42106497287750244, "kl": 0.0335693359375, "learning_rate": 1.0578868071715544e-07, "loss": 0.191, "num_tokens": 140728522.0, "reward": 0.5901178121566772, "reward_std": 0.20634789019823074, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04994171857833862, "rewards/penalized_accuracy_reward/std": 0.08933846652507782, "rewards/tag_count_reward/mean": 0.98046875, "rewards/tag_count_reward/std": 0.05534191615879536, "step": 1909 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.046875, "completions/max_length": 1918.25, "completions/max_terminated_length": 1695.0, "completions/mean_length": 831.78125, "completions/mean_terminated_length": 771.4541931152344, "completions/min_length": 344.75, "completions/min_terminated_length": 344.75, "epoch": 0.955, "grad_norm": 0.2352658212184906, "kl": 0.04168701171875, "learning_rate": 1.0566378696208987e-07, "loss": 0.0975, "num_tokens": 140794092.0, "reward": 0.7357940375804901, "reward_std": 0.3902829438447952, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12473293766379356, "rewards/penalized_accuracy_reward/std": 0.18902970850467682, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.08957063034176826, "step": 1910 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1874.0, "completions/max_terminated_length": 1473.75, "completions/mean_length": 812.390625, "completions/mean_terminated_length": 687.25, "completions/min_length": 301.5, "completions/min_terminated_length": 301.5, "epoch": 0.9555, "grad_norm": 0.45959585905075073, "kl": 0.05780029296875, "learning_rate": 1.0554024673218806e-07, "loss": 0.255, "num_tokens": 140856293.0, "reward": 0.9566523432731628, "reward_std": 0.47341587394475937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.24981053173542023, "rewards/penalized_accuracy_reward/std": 0.1980709210038185, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1755845732986927, "step": 1911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1769.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 869.0625, "completions/mean_terminated_length": 777.9775543212891, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.956, "grad_norm": 
0.44001051783561707, "kl": 0.048919677734375, "learning_rate": 1.054180604037749e-07, "loss": 0.2689, "num_tokens": 140921801.0, "reward": 0.6682433784008026, "reward_std": 0.29370689392089844, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0997466892004013, "rewards/penalized_accuracy_reward/std": 0.10301795601844788, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.17534197121858597, "step": 1912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1476.0, "completions/max_terminated_length": 1160.5, "completions/mean_length": 641.03125, "completions/mean_terminated_length": 573.1433258056641, "completions/min_length": 268.75, "completions/min_terminated_length": 268.75, "epoch": 0.9565, "grad_norm": 0.41762450337409973, "kl": 0.04473876953125, "learning_rate": 1.0529722834905125e-07, "loss": 0.153, "num_tokens": 140970267.0, "reward": 0.5265355110168457, "reward_std": 0.18585428968071938, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024986501783132553, "rewards/penalized_accuracy_reward/std": 0.06827612221240997, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14040156453847885, "step": 1913 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1717.75, "completions/max_terminated_length": 1507.0, "completions/mean_length": 941.8125, "completions/mean_terminated_length": 879.1830596923828, "completions/min_length": 394.75, "completions/min_terminated_length": 394.75, "epoch": 0.957, "grad_norm": 0.3531741201877594, "kl": 0.0489501953125, "learning_rate": 1.0517775093609241e-07, "loss": 0.2211, "num_tokens": 141039551.0, "reward": 0.7723992168903351, "reward_std": 0.5094284228980541, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.14987148344516754, "rewards/penalized_accuracy_reward/std": 0.23365110903978348, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1551513709127903, "step": 1914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1569.25, "completions/max_terminated_length": 1404.0, "completions/mean_length": 773.03125, "completions/mean_terminated_length": 717.3076934814453, "completions/min_length": 282.5, "completions/min_terminated_length": 282.5, "epoch": 0.9575, "grad_norm": 0.3789839446544647, "kl": 0.0360107421875, "learning_rate": 1.0505962852884739e-07, "loss": 0.1346, "num_tokens": 141096065.0, "reward": 0.6800389587879181, "reward_std": 0.4582808166742325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0997850950807333, "rewards/penalized_accuracy_reward/std": 0.2196327969431877, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09120866656303406, "step": 1915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1300.75, "completions/max_terminated_length": 1041.25, "completions/mean_length": 651.59375, "completions/mean_terminated_length": 586.9315032958984, "completions/min_length": 262.5, "completions/min_terminated_length": 262.5, "epoch": 0.958, "grad_norm": 0.4078820049762726, "kl": 0.04608154296875, "learning_rate": 1.0494286148713744e-07, "loss": 0.1308, "num_tokens": 141154071.0, "reward": 0.5823614299297333, "reward_std": 0.21656915545463562, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996977746486664, "rewards/penalized_accuracy_reward/std": 0.0893886610865593, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.07558366656303406, "step": 1916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1180.75, 
"completions/max_terminated_length": 1110.5, "completions/mean_length": 644.640625, "completions/mean_terminated_length": 629.2562561035156, "completions/min_length": 277.25, "completions/min_terminated_length": 277.25, "epoch": 0.9585, "grad_norm": 0.4985881447792053, "kl": 0.038818359375, "learning_rate": 1.0482745016665526e-07, "loss": 0.0666, "num_tokens": 141206096.0, "reward": 0.7878142148256302, "reward_std": 0.42131292074918747, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1497664824128151, "rewards/penalized_accuracy_reward/std": 0.19792576879262924, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07394563034176826, "step": 1917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1812.0, "completions/max_terminated_length": 1420.75, "completions/mean_length": 997.890625, "completions/mean_terminated_length": 925.7323913574219, "completions/min_length": 346.75, "completions/min_terminated_length": 346.75, "epoch": 0.959, "grad_norm": 0.5050830245018005, "kl": 0.04315185546875, "learning_rate": 1.0471339491896373e-07, "loss": 0.2647, "num_tokens": 141278297.0, "reward": 0.5109195411205292, "reward_std": 0.22131051123142242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024991022422909737, "rewards/penalized_accuracy_reward/std": 0.06828847527503967, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.186958285048604, "step": 1918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1667.5, "completions/mean_length": 973.28125, "completions/mean_terminated_length": 826.2936248779297, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.9595, "grad_norm": 0.45894622802734375, "kl": 0.05792236328125, "learning_rate": 
1.0460069609149496e-07, "loss": 0.3542, "num_tokens": 141350251.0, "reward": 0.4453125, "reward_std": 0.12369456328451633, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.890625, "rewards/tag_count_reward/std": 0.24738912656903267, "step": 1919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1640.25, "completions/max_terminated_length": 1531.75, "completions/mean_length": 841.90625, "completions/mean_terminated_length": 766.7968902587891, "completions/min_length": 385.25, "completions/min_terminated_length": 385.25, "epoch": 0.96, "grad_norm": 0.5495268702507019, "kl": 0.04461669921875, "learning_rate": 1.044893540275491e-07, "loss": 0.15, "num_tokens": 141413061.0, "reward": 1.0679390132427216, "reward_std": 0.5252410545945168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2995945056900382, "rewards/penalized_accuracy_reward/std": 0.23487866297364235, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.13575975596904755, "step": 1920 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1654.75, "completions/max_terminated_length": 1480.25, "completions/mean_length": 820.640625, "completions/mean_terminated_length": 713.9278411865234, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.9605, "grad_norm": 0.4226078391075134, "kl": 0.04034423828125, "learning_rate": 1.0437936906629334e-07, "loss": 0.1808, "num_tokens": 141475166.0, "reward": 0.48980917036533356, "reward_std": 0.14272979274392128, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482712045311928, "rewards/penalized_accuracy_reward/std": 0.04993084818124771, "rewards/tag_count_reward/mean": 
0.9296875, "rewards/tag_count_reward/std": 0.1470555067062378, "step": 1921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1743.5, "completions/max_terminated_length": 1716.5, "completions/mean_length": 1032.921875, "completions/mean_terminated_length": 978.2627868652344, "completions/min_length": 407.75, "completions/min_terminated_length": 407.75, "epoch": 0.961, "grad_norm": 0.1589377075433731, "kl": 0.028472900390625, "learning_rate": 1.0427074154276104e-07, "loss": -0.03, "num_tokens": 141553689.0, "reward": 0.49957361817359924, "reward_std": 0.11942166835069656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012482122518122196, "rewards/penalized_accuracy_reward/std": 0.049928490072488785, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.08596643060445786, "step": 1922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.75, "completions/mean_length": 1134.4375, "completions/mean_terminated_length": 990.2982177734375, "completions/min_length": 515.25, "completions/min_terminated_length": 515.25, "epoch": 0.9615, "grad_norm": 0.4006868600845337, "kl": 0.0416259765625, "learning_rate": 1.0416347178785039e-07, "loss": 0.2113, "num_tokens": 141635925.0, "reward": 0.47217318415641785, "reward_std": 0.19509724713861942, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012453784234821796, "rewards/penalized_accuracy_reward/std": 0.049815140664577484, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23320727795362473, "step": 1923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1780.75, "completions/max_terminated_length": 1379.75, "completions/mean_length": 783.890625, "completions/mean_terminated_length": 
723.7021026611328, "completions/min_length": 292.75, "completions/min_terminated_length": 292.75, "epoch": 0.962, "grad_norm": 0.38628053665161133, "kl": 0.040069580078125, "learning_rate": 1.0405756012832367e-07, "loss": 0.2438, "num_tokens": 141696094.0, "reward": 0.478515625, "reward_std": 0.06899145990610123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.13798292353749275, "step": 1924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1940.75, "completions/max_terminated_length": 1378.75, "completions/mean_length": 737.5, "completions/mean_terminated_length": 649.6897583007812, "completions/min_length": 323.25, "completions/min_terminated_length": 323.25, "epoch": 0.9625, "grad_norm": 0.4802060127258301, "kl": 0.0498046875, "learning_rate": 1.0395300688680625e-07, "loss": 0.1954, "num_tokens": 141751966.0, "reward": 0.5975539088249207, "reward_std": 0.3918266035616398, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0624488340690732, "rewards/penalized_accuracy_reward/std": 0.1804736852645874, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14854013174772263, "step": 1925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1879.75, "completions/max_terminated_length": 1405.5, "completions/mean_length": 808.359375, "completions/mean_terminated_length": 703.3880844116211, "completions/min_length": 263.5, "completions/min_terminated_length": 263.5, "epoch": 0.963, "grad_norm": 0.47741931676864624, "kl": 0.04730224609375, "learning_rate": 1.0384981238178533e-07, "loss": 0.3203, "num_tokens": 141812549.0, "reward": 0.466796875, "reward_std": 0.07949705049395561, "rewards/format_reward/mean": 0.0, 
"rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.15899410098791122, "step": 1926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1597.25, "completions/max_terminated_length": 1428.5, "completions/mean_length": 861.078125, "completions/mean_terminated_length": 738.5686645507812, "completions/min_length": 321.25, "completions/min_terminated_length": 321.25, "epoch": 0.9635, "grad_norm": 0.41842731833457947, "kl": 0.048095703125, "learning_rate": 1.0374797692760933e-07, "loss": 0.1874, "num_tokens": 141876250.0, "reward": 1.0316555947065353, "reward_std": 0.5080892816185951, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.28731219563633204, "rewards/penalized_accuracy_reward/std": 0.23294464871287346, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.1393338106572628, "step": 1927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1822.25, "completions/max_terminated_length": 1176.0, "completions/mean_length": 939.296875, "completions/mean_terminated_length": 760.7567749023438, "completions/min_length": 327.5, "completions/min_terminated_length": 327.5, "epoch": 0.964, "grad_norm": 0.41457995772361755, "kl": 0.04681396484375, "learning_rate": 1.036475008344867e-07, "loss": 0.3104, "num_tokens": 141946637.0, "reward": 0.44140625, "reward_std": 0.107691815122962, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8828125, "rewards/tag_count_reward/std": 0.215383630245924, "step": 1928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1930.75, 
"completions/max_terminated_length": 1551.75, "completions/mean_length": 1035.296875, "completions/mean_terminated_length": 949.4107055664062, "completions/min_length": 381.25, "completions/min_terminated_length": 381.25, "epoch": 0.9645, "grad_norm": 0.2752316892147064, "kl": 0.03851318359375, "learning_rate": 1.0354838440848501e-07, "loss": 0.1899, "num_tokens": 142024928.0, "reward": 0.6512117981910706, "reward_std": 0.25398828834295273, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08732465654611588, "rewards/penalized_accuracy_reward/std": 0.10226422548294067, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13264633901417255, "step": 1929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1723.25, "completions/max_terminated_length": 1643.25, "completions/mean_length": 966.53125, "completions/mean_terminated_length": 884.2855834960938, "completions/min_length": 415.5, "completions/min_terminated_length": 415.5, "epoch": 0.965, "grad_norm": 0.36248159408569336, "kl": 0.036956787109375, "learning_rate": 1.0345062795153009e-07, "loss": 0.1902, "num_tokens": 142098674.0, "reward": 0.466796875, "reward_std": 0.0854385532438755, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.1708771139383316, "step": 1930 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1866.25, "completions/max_terminated_length": 1457.0, "completions/mean_length": 994.265625, "completions/mean_terminated_length": 858.5823059082031, "completions/min_length": 360.25, "completions/min_terminated_length": 360.25, "epoch": 0.9655, "grad_norm": 0.3476272523403168, "kl": 0.041015625, "learning_rate": 1.0335423176140511e-07, "loss": 0.2407, 
"num_tokens": 142172275.0, "reward": 0.6277927756309509, "reward_std": 0.27188458666205406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08733389526605606, "rewards/penalized_accuracy_reward/std": 0.10227501392364502, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.1656232737004757, "step": 1931 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1833.75, "completions/max_terminated_length": 1530.5, "completions/mean_length": 981.828125, "completions/mean_terminated_length": 885.2083740234375, "completions/min_length": 477.5, "completions/min_terminated_length": 477.5, "epoch": 0.966, "grad_norm": 0.2944320738315582, "kl": 0.0506591796875, "learning_rate": 1.0325919613174951e-07, "loss": 0.1059, "num_tokens": 142244360.0, "reward": 0.6666059046983719, "reward_std": 0.37957875803112984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09990452416241169, "rewards/penalized_accuracy_reward/std": 0.16816505789756775, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.16108575090765953, "step": 1932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1463.25, "completions/max_terminated_length": 1299.0, "completions/mean_length": 780.671875, "completions/mean_terminated_length": 740.4330444335938, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.9665, "grad_norm": 0.3905867040157318, "kl": 0.036895751953125, "learning_rate": 1.0316552135205837e-07, "loss": 0.1155, "num_tokens": 142302931.0, "reward": 0.484375, "reward_std": 0.055456146597862244, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 
0.11091229319572449, "step": 1933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1657.25, "completions/max_terminated_length": 1538.0, "completions/mean_length": 840.359375, "completions/mean_terminated_length": 789.1107330322266, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "epoch": 0.967, "grad_norm": 0.4805154502391815, "kl": 0.039031982421875, "learning_rate": 1.0307320770768129e-07, "loss": 0.2056, "num_tokens": 142364090.0, "reward": 0.678282618522644, "reward_std": 0.27740270271897316, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09988349676132202, "rewards/penalized_accuracy_reward/std": 0.10315924882888794, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.14216844737529755, "step": 1934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1882.75, "completions/max_terminated_length": 1387.75, "completions/mean_length": 794.828125, "completions/mean_terminated_length": 683.5739898681641, "completions/min_length": 321.5, "completions/min_terminated_length": 321.5, "epoch": 0.9675, "grad_norm": 0.6653538942337036, "kl": 0.0606689453125, "learning_rate": 1.029822554798216e-07, "loss": 0.3825, "num_tokens": 142423519.0, "reward": 0.462890625, "reward_std": 0.08795566484332085, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1759113371372223, "step": 1935 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1427.25, "completions/max_terminated_length": 1117.5, "completions/mean_length": 734.109375, "completions/mean_terminated_length": 698.1708526611328, "completions/min_length": 284.5, "completions/min_terminated_length": 284.5, 
"epoch": 0.968, "grad_norm": 0.3095092177391052, "kl": 0.03546142578125, "learning_rate": 1.0289266494553565e-07, "loss": 0.0851, "num_tokens": 142483926.0, "reward": 0.8779945373535156, "reward_std": 0.30731876753270626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.199739471077919, "rewards/penalized_accuracy_reward/std": 0.13644694536924362, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.1072557382285595, "step": 1936 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1742.25, "completions/max_terminated_length": 1548.75, "completions/mean_length": 904.46875, "completions/mean_terminated_length": 867.7008972167969, "completions/min_length": 463.25, "completions/min_terminated_length": 463.25, "epoch": 0.9685, "grad_norm": 0.3406147360801697, "kl": 0.037200927734375, "learning_rate": 1.0280443637773163e-07, "loss": 0.0636, "num_tokens": 142553316.0, "reward": 0.7302394062280655, "reward_std": 0.33590345829725266, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12488533742725849, "rewards/penalized_accuracy_reward/std": 0.15232695639133453, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.09971532225608826, "step": 1937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1817.25, "completions/max_terminated_length": 1652.5, "completions/mean_length": 1049.703125, "completions/mean_terminated_length": 996.9760589599609, "completions/min_length": 557.5, "completions/min_terminated_length": 557.5, "epoch": 0.969, "grad_norm": 0.35623347759246826, "kl": 0.04071044921875, "learning_rate": 1.0271757004516918e-07, "loss": 0.1419, "num_tokens": 142631633.0, "reward": 0.8067503869533539, "reward_std": 0.2111483048647642, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.17485958337783813, "rewards/penalized_accuracy_reward/std": 0.06825821846723557, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.15828263387084007, "step": 1938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1881.5, "completions/max_terminated_length": 1585.75, "completions/mean_length": 880.859375, "completions/mean_terminated_length": 802.8018035888672, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.9695, "grad_norm": 0.384662002325058, "kl": 0.0458984375, "learning_rate": 1.0263206621245807e-07, "loss": 0.2208, "num_tokens": 142699816.0, "reward": 0.48585939407348633, "reward_std": 0.16899413987994194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012460951693356037, "rewards/penalized_accuracy_reward/std": 0.04984381049871445, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.2025844343006611, "step": 1939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1593.25, "completions/max_terminated_length": 1490.25, "completions/mean_length": 815.984375, "completions/mean_terminated_length": 782.4419708251953, "completions/min_length": 358.25, "completions/min_terminated_length": 358.25, "epoch": 0.97, "grad_norm": 0.3055869936943054, "kl": 0.035247802734375, "learning_rate": 1.0254792514005792e-07, "loss": 0.0567, "num_tokens": 142758327.0, "reward": 0.7379419505596161, "reward_std": 0.48003968596458435, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12483035400509834, "rewards/penalized_accuracy_reward/std": 0.2363334223628044, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.07020078226923943, "step": 1940 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, 
"completions/max_length": 2048.0, "completions/max_terminated_length": 1315.5, "completions/mean_length": 909.015625, "completions/mean_terminated_length": 775.9741058349609, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.9705, "grad_norm": 0.4204767644405365, "kl": 0.04888916015625, "learning_rate": 1.0246514708427701e-07, "loss": 0.3347, "num_tokens": 142824296.0, "reward": 0.4937055855989456, "reward_std": 0.17598766088485718, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012477792799472809, "rewards/penalized_accuracy_reward/std": 0.049911174923181534, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.18039370700716972, "step": 1941 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1754.5, "completions/max_terminated_length": 1524.25, "completions/mean_length": 936.703125, "completions/mean_terminated_length": 849.2489776611328, "completions/min_length": 385.5, "completions/min_terminated_length": 385.5, "epoch": 0.971, "grad_norm": 0.41121894121170044, "kl": 0.041900634765625, "learning_rate": 1.0238373229727166e-07, "loss": 0.1466, "num_tokens": 142893317.0, "reward": 1.1618311256170273, "reward_std": 0.7107425481081009, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.3494702344760299, "rewards/penalized_accuracy_reward/std": 0.3451894111931324, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1533849686384201, "step": 1942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1822.75, "completions/max_terminated_length": 1455.5, "completions/mean_length": 999.390625, "completions/mean_terminated_length": 928.6135559082031, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 0.9715, "grad_norm": 0.3350200653076172, "kl": 
0.045562744140625, "learning_rate": 1.0230368102704531e-07, "loss": 0.1399, "num_tokens": 142965870.0, "reward": 0.5128311216831207, "reward_std": 0.2008742168545723, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024970244616270065, "rewards/penalized_accuracy_reward/std": 0.06823170185089111, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.16703152284026146, "step": 1943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1836.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 1005.171875, "completions/mean_terminated_length": 879.3904724121094, "completions/min_length": 426.75, "completions/min_terminated_length": 426.75, "epoch": 0.972, "grad_norm": 0.3094707429409027, "kl": 0.039459228515625, "learning_rate": 1.022249935174482e-07, "loss": 0.163, "num_tokens": 143039769.0, "reward": 0.7024128884077072, "reward_std": 0.40497380308806896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12464394420385361, "rewards/penalized_accuracy_reward/std": 0.17115728557109833, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.20241425558924675, "step": 1944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 1033.3125, "completions/mean_terminated_length": 856.0433807373047, "completions/min_length": 382.25, "completions/min_terminated_length": 382.25, "epoch": 0.9725, "grad_norm": 0.4362603425979614, "kl": 0.04473876953125, "learning_rate": 1.0214767000817596e-07, "loss": 0.3093, "num_tokens": 143121373.0, "reward": 0.6853384673595428, "reward_std": 0.3152725622057915, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.12489579617977142, 
"rewards/penalized_accuracy_reward/std": 0.09991665929555893, "rewards/tag_count_reward/mean": 0.87109375, "rewards/tag_count_reward/std": 0.25619275867938995, "step": 1945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2041.75, "completions/max_terminated_length": 1804.25, "completions/mean_length": 1037.84375, "completions/mean_terminated_length": 950.2295227050781, "completions/min_length": 388.25, "completions/min_terminated_length": 388.25, "epoch": 0.973, "grad_norm": 0.2919071614742279, "kl": 0.02972412109375, "learning_rate": 1.0207171073476951e-07, "loss": 0.157, "num_tokens": 143195027.0, "reward": 0.49368998408317566, "reward_std": 0.16328173130750656, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012469987384974957, "rewards/penalized_accuracy_reward/std": 0.04987994581460953, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16546630859375, "step": 1946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1266.0, "completions/max_terminated_length": 1251.25, "completions/mean_length": 777.96875, "completions/mean_terminated_length": 767.9677124023438, "completions/min_length": 285.25, "completions/min_terminated_length": 285.25, "epoch": 0.9735, "grad_norm": 0.4041730463504791, "kl": 0.037689208984375, "learning_rate": 1.01997115928614e-07, "loss": 0.0981, "num_tokens": 143257729.0, "reward": 0.8956117630004883, "reward_std": 0.3980771452188492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.19975900277495384, "rewards/penalized_accuracy_reward/std": 0.191226065158844, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.03125, "step": 1947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1814.0, "completions/max_terminated_length": 1728.5, 
"completions/mean_length": 1010.765625, "completions/mean_terminated_length": 936.1205596923828, "completions/min_length": 421.25, "completions/min_terminated_length": 421.25, "epoch": 0.974, "grad_norm": 0.37778371572494507, "kl": 0.0328369140625, "learning_rate": 1.0192388581693806e-07, "loss": 0.1516, "num_tokens": 143330626.0, "reward": 0.5974540859460831, "reward_std": 0.35282833874225616, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062398917973041534, "rewards/penalized_accuracy_reward/std": 0.14868637174367905, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14812761545181274, "step": 1948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1831.75, "completions/max_terminated_length": 1690.5, "completions/mean_length": 1133.234375, "completions/mean_terminated_length": 1044.3638458251953, "completions/min_length": 409.75, "completions/min_terminated_length": 409.75, "epoch": 0.9745, "grad_norm": 0.258340060710907, "kl": 0.026702880859375, "learning_rate": 1.0185202062281336e-07, "loss": 0.1287, "num_tokens": 143410689.0, "reward": 0.5973173528909683, "reward_std": 0.246453445404768, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06233055517077446, "rewards/penalized_accuracy_reward/std": 0.09548314660787582, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.13556399568915367, "step": 1949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1539.25, "completions/max_terminated_length": 1370.75, "completions/mean_length": 731.640625, "completions/mean_terminated_length": 651.2611694335938, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.975, "grad_norm": 0.4943680167198181, "kl": 0.0648193359375, "learning_rate": 1.0178152056515371e-07, "loss": 0.2212, 
"num_tokens": 143467098.0, "reward": 0.5054306834936142, "reward_std": 0.1426471285521984, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012480965815484524, "rewards/penalized_accuracy_reward/std": 0.049923866987228394, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.11366254836320877, "step": 1950 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1360.5, "completions/max_terminated_length": 1200.75, "completions/mean_length": 705.25, "completions/mean_terminated_length": 688.7666778564453, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.9755, "grad_norm": 0.3746451735496521, "kl": 0.045562744140625, "learning_rate": 1.017123858587145e-07, "loss": -0.0076, "num_tokens": 143524138.0, "reward": 0.91673943400383, "reward_std": 0.41885001957416534, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21227596700191498, "rewards/penalized_accuracy_reward/std": 0.20223180204629898, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1721.5, "completions/max_terminated_length": 1693.0, "completions/mean_length": 1125.703125, "completions/mean_terminated_length": 1063.4520263671875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.976, "grad_norm": 0.23969756066799164, "kl": 0.0504150390625, "learning_rate": 1.0164461671409212e-07, "loss": 0.029, "num_tokens": 143604551.0, "reward": 0.487400621175766, "reward_std": 0.20121632516384125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024950314313173294, "rewards/penalized_accuracy_reward/std": 0.0681772455573082, "rewards/tag_count_reward/mean": 0.875, 
"rewards/tag_count_reward/std": 0.17757852375507355, "step": 1952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1830.75, "completions/max_terminated_length": 1742.25, "completions/mean_length": 872.890625, "completions/mean_terminated_length": 819.1469116210938, "completions/min_length": 313.5, "completions/min_terminated_length": 313.5, "epoch": 0.9765, "grad_norm": 0.3700663149356842, "kl": 0.0404052734375, "learning_rate": 1.0157821333772304e-07, "loss": 0.2394, "num_tokens": 143673024.0, "reward": 0.4765625, "reward_std": 0.06849108636379242, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13698217645287514, "step": 1953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1677.5, "completions/max_terminated_length": 1559.25, "completions/mean_length": 1070.1875, "completions/mean_terminated_length": 926.4666748046875, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.977, "grad_norm": 0.33474260568618774, "kl": 0.046661376953125, "learning_rate": 1.0151317593188354e-07, "loss": 0.0451, "num_tokens": 143757596.0, "reward": 0.4546605944633484, "reward_std": 0.17104469425976276, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012486547231674194, "rewards/penalized_accuracy_reward/std": 0.04994618892669678, "rewards/tag_count_reward/mean": 0.859375, "rewards/tag_count_reward/std": 0.1807340607047081, "step": 1954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1726.75, "completions/max_terminated_length": 1290.5, "completions/mean_length": 809.140625, "completions/mean_terminated_length": 751.9885711669922, "completions/min_length": 322.25, 
"completions/min_terminated_length": 322.25, "epoch": 0.9775, "grad_norm": 0.37127023935317993, "kl": 0.039093017578125, "learning_rate": 1.014495046946888e-07, "loss": 0.1848, "num_tokens": 143820005.0, "reward": 0.5054348260164261, "reward_std": 0.15047287940979004, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012483036145567894, "rewards/penalized_accuracy_reward/std": 0.049932144582271576, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.1292813941836357, "step": 1955 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1587.75, "completions/max_terminated_length": 1375.0, "completions/mean_length": 726.734375, "completions/mean_terminated_length": 649.712516784668, "completions/min_length": 334.75, "completions/min_terminated_length": 334.75, "epoch": 0.978, "grad_norm": 0.40491196513175964, "kl": 0.045745849609375, "learning_rate": 1.0138719982009242e-07, "loss": 0.2056, "num_tokens": 143877636.0, "reward": 0.46875, "reward_std": 0.06332463026046753, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.12664926052093506, "step": 1956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1755.25, "completions/max_terminated_length": 1579.25, "completions/mean_length": 837.71875, "completions/mean_terminated_length": 794.8885650634766, "completions/min_length": 360.5, "completions/min_terminated_length": 360.5, "epoch": 0.9785, "grad_norm": 0.42361852526664734, "kl": 0.03924560546875, "learning_rate": 1.013262614978859e-07, "loss": 0.1913, "num_tokens": 143942530.0, "reward": 0.782113254070282, "reward_std": 0.2246968299150467, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, 
"rewards/penalized_accuracy_reward/mean": 0.149845689535141, "rewards/penalized_accuracy_reward/std": 0.08935072273015976, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.1265372931957245, "step": 1957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1397.5, "completions/mean_length": 918.578125, "completions/mean_terminated_length": 792.9272308349609, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.979, "grad_norm": 1.9808452129364014, "kl": 0.14605712890625, "learning_rate": 1.0126668991369792e-07, "loss": 0.2111, "num_tokens": 144015783.0, "reward": 0.6779564023017883, "reward_std": 0.37859016843140125, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11241569928824902, "rewards/penalized_accuracy_reward/std": 0.1531279906630516, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.22092687711119652, "step": 1958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 634.265625, "completions/mean_terminated_length": 634.265625, "completions/min_length": 306.25, "completions/min_terminated_length": 306.25, "epoch": 0.9795, "grad_norm": 0.42828506231307983, "kl": 0.0447998046875, "learning_rate": 1.0120848524899386e-07, "loss": 0.0322, "num_tokens": 144065224.0, "reward": 0.9222714602947235, "reward_std": 0.24417207390069962, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.21211227774620056, "rewards/penalized_accuracy_reward/std": 0.1181797906756401, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1959 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1863.75, 
"completions/max_terminated_length": 1534.0, "completions/mean_length": 898.75, "completions/mean_terminated_length": 791.657470703125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.98, "grad_norm": 0.4703972339630127, "kl": 0.049407958984375, "learning_rate": 1.0115164768107522e-07, "loss": 0.1782, "num_tokens": 144132312.0, "reward": 0.6475081294775009, "reward_std": 0.4581069052219391, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08742593973875046, "rewards/penalized_accuracy_reward/std": 0.2170758992433548, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1320282220840454, "step": 1960 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1620.25, "completions/max_terminated_length": 1210.75, "completions/mean_length": 863.625, "completions/mean_terminated_length": 763.4976196289062, "completions/min_length": 241.75, "completions/min_terminated_length": 241.75, "epoch": 0.9805, "grad_norm": 0.5170933604240417, "kl": 0.04486083984375, "learning_rate": 1.0109617738307911e-07, "loss": 0.2857, "num_tokens": 144197584.0, "reward": 0.447265625, "reward_std": 0.11911221593618393, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.23822444677352905, "step": 1961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1696.75, "completions/max_terminated_length": 1571.75, "completions/mean_length": 843.578125, "completions/mean_terminated_length": 702.9551696777344, "completions/min_length": 299.25, "completions/min_terminated_length": 299.25, "epoch": 0.981, "grad_norm": 0.4504857361316681, "kl": 0.041717529296875, "learning_rate": 1.0104207452397761e-07, "loss": 0.2523, 
"num_tokens": 144259173.0, "reward": 0.5338906347751617, "reward_std": 0.3088317532092333, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0374531289562583, "rewards/penalized_accuracy_reward/std": 0.1181471012532711, "rewards/tag_count_reward/mean": 0.91796875, "rewards/tag_count_reward/std": 0.14507511630654335, "step": 1962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1356.75, "completions/max_terminated_length": 1340.5, "completions/mean_length": 705.84375, "completions/mean_terminated_length": 678.9910888671875, "completions/min_length": 273.25, "completions/min_terminated_length": 273.25, "epoch": 0.9815, "grad_norm": 0.4036239981651306, "kl": 0.051971435546875, "learning_rate": 1.0098933926857752e-07, "loss": 0.1537, "num_tokens": 144314395.0, "reward": 0.6284204721450806, "reward_std": 0.25012707710266113, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07495242357254028, "rewards/penalized_accuracy_reward/std": 0.09993656724691391, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10050791501998901, "step": 1963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1968.5, "completions/max_terminated_length": 1572.5, "completions/mean_length": 929.109375, "completions/mean_terminated_length": 810.5323028564453, "completions/min_length": 335.5, "completions/min_terminated_length": 335.5, "epoch": 0.982, "grad_norm": 0.2683984041213989, "kl": 0.031097412109375, "learning_rate": 1.0093797177751944e-07, "loss": 0.165, "num_tokens": 144381298.0, "reward": 0.5186110883951187, "reward_std": 0.22502024844288826, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024930539540946484, "rewards/penalized_accuracy_reward/std": 0.09972215443849564, 
"rewards/tag_count_reward/mean": 0.9375, "rewards/tag_count_reward/std": 0.16600050404667854, "step": 1964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1777.25, "completions/max_terminated_length": 1465.25, "completions/mean_length": 831.6875, "completions/mean_terminated_length": 777.1461486816406, "completions/min_length": 344.5, "completions/min_terminated_length": 344.5, "epoch": 0.9825, "grad_norm": 0.27796077728271484, "kl": 0.0477294921875, "learning_rate": 1.0088797220727779e-07, "loss": 0.0878, "num_tokens": 144444286.0, "reward": 0.8611047863960266, "reward_std": 0.42728395760059357, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1873883157968521, "rewards/penalized_accuracy_reward/std": 0.2056298926472664, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.07966844737529755, "step": 1965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1491.5, "completions/mean_length": 1060.828125, "completions/mean_terminated_length": 932.6218414306641, "completions/min_length": 362.5, "completions/min_terminated_length": 362.5, "epoch": 0.983, "grad_norm": 0.36677777767181396, "kl": 0.0379638671875, "learning_rate": 1.0083934071015988e-07, "loss": 0.1396, "num_tokens": 144525123.0, "reward": 0.6568500101566315, "reward_std": 0.4745064228773117, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09990938473492861, "rewards/penalized_accuracy_reward/std": 0.2138427197933197, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.21156632527709007, "step": 1966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.75, "completions/mean_length": 1073.953125, 
"completions/mean_terminated_length": 935.9809875488281, "completions/min_length": 419.5, "completions/min_terminated_length": 419.5, "epoch": 0.9835, "grad_norm": 0.41381824016571045, "kl": 0.051666259765625, "learning_rate": 1.007920774343056e-07, "loss": 0.2373, "num_tokens": 144604192.0, "reward": 0.48199114203453064, "reward_std": 0.19363365322351456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012479949742555618, "rewards/penalized_accuracy_reward/std": 0.04991980269551277, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.22601484507322311, "step": 1967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1548.5, "completions/mean_length": 967.359375, "completions/mean_terminated_length": 895.3167266845703, "completions/min_length": 501.25, "completions/min_terminated_length": 501.25, "epoch": 0.984, "grad_norm": 0.38944780826568604, "kl": 0.037994384765625, "learning_rate": 1.0074618252368726e-07, "loss": 0.2198, "num_tokens": 144674647.0, "reward": 0.8222467005252838, "reward_std": 0.4384080693125725, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1747952215373516, "rewards/penalized_accuracy_reward/std": 0.18922331184148788, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.18485792353749275, "step": 1968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1615.75, "completions/max_terminated_length": 1474.75, "completions/mean_length": 1045.4375, "completions/mean_terminated_length": 935.0701599121094, "completions/min_length": 391.5, "completions/min_terminated_length": 391.5, "epoch": 0.9845, "grad_norm": 0.35110339522361755, "kl": 0.035125732421875, "learning_rate": 1.0070165611810855e-07, "loss": 0.0422, "num_tokens": 144750195.0, "reward": 
0.5550077557563782, "reward_std": 0.2530374713242054, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.04996481537818909, "rewards/penalized_accuracy_reward/std": 0.08937978744506836, "rewards/tag_count_reward/mean": 0.91015625, "rewards/tag_count_reward/std": 0.14855579286813736, "step": 1969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 1063.3125, "completions/mean_terminated_length": 889.3868560791016, "completions/min_length": 387.75, "completions/min_terminated_length": 387.75, "epoch": 0.985, "grad_norm": 0.38913989067077637, "kl": 0.05078125, "learning_rate": 1.0065849835320473e-07, "loss": 0.3817, "num_tokens": 144829879.0, "reward": 0.439453125, "reward_std": 0.12783927842974663, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.87890625, "rewards/tag_count_reward/std": 0.25567856058478355, "step": 1970 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1678.25, "completions/max_terminated_length": 1608.25, "completions/mean_length": 1031.46875, "completions/mean_terminated_length": 976.3258972167969, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.9855, "grad_norm": 0.3001694977283478, "kl": 0.0499267578125, "learning_rate": 1.0061670936044178e-07, "loss": 0.0791, "num_tokens": 144904213.0, "reward": 0.6764492392539978, "reward_std": 0.2633681483566761, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0999433621764183, "rewards/penalized_accuracy_reward/std": 0.10322106629610062, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.14370574057102203, "step": 1971 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 577.15625, "completions/mean_terminated_length": 577.15625, "completions/min_length": 229.5, "completions/min_terminated_length": 229.5, "epoch": 0.986, "grad_norm": 0.3734094798564911, "kl": 0.04119873046875, "learning_rate": 1.0057628926711624e-07, "loss": 0.005, "num_tokens": 144950703.0, "reward": 0.699758768081665, "reward_std": 0.35227689146995544, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09987938404083252, "rewards/penalized_accuracy_reward/std": 0.17613844573497772, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 1972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1865.25, "completions/max_terminated_length": 1381.0, "completions/mean_length": 875.53125, "completions/mean_terminated_length": 728.3396911621094, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.9865, "grad_norm": 0.42760616540908813, "kl": 0.042999267578125, "learning_rate": 1.005372381963547e-07, "loss": 0.3289, "num_tokens": 145015793.0, "reward": 0.462890625, "reward_std": 0.0835045725107193, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.92578125, "rewards/tag_count_reward/std": 0.1670091450214386, "step": 1973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1653.0, "completions/max_terminated_length": 1471.25, "completions/mean_length": 961.90625, "completions/mean_terminated_length": 863.7477722167969, "completions/min_length": 348.25, "completions/min_terminated_length": 348.25, "epoch": 0.987, "grad_norm": 0.30295440554618835, "kl": 0.049468994140625, 
"learning_rate": 1.0049955626711354e-07, "loss": 0.1011, "num_tokens": 145085979.0, "reward": 0.6857324242591858, "reward_std": 0.26998716592788696, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1123974621295929, "rewards/penalized_accuracy_reward/std": 0.10237612575292587, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.158807672560215, "step": 1974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1756.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 960.234375, "completions/mean_terminated_length": 903.2908630371094, "completions/min_length": 412.25, "completions/min_terminated_length": 412.25, "epoch": 0.9875, "grad_norm": 0.29088762402534485, "kl": 0.043212890625, "learning_rate": 1.0046324359417842e-07, "loss": 0.1553, "num_tokens": 145159722.0, "reward": 0.6033071577548981, "reward_std": 0.3278902657330036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.06239577103406191, "rewards/penalized_accuracy_reward/std": 0.13922719657421112, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.11702133901417255, "step": 1975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1838.0, "completions/max_terminated_length": 1488.5, "completions/mean_length": 987.515625, "completions/mean_terminated_length": 888.5932006835938, "completions/min_length": 447.75, "completions/min_terminated_length": 447.75, "epoch": 0.988, "grad_norm": 0.4740748703479767, "kl": 0.040771484375, "learning_rate": 1.0042830028816399e-07, "loss": 0.2002, "num_tokens": 145232059.0, "reward": 0.7815995067358017, "reward_std": 0.4843611679971218, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.162284130230546, "rewards/penalized_accuracy_reward/std": 
0.2178894616663456, "rewards/tag_count_reward/mean": 0.9140625, "rewards/tag_count_reward/std": 0.17919977009296417, "step": 1976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1685.25, "completions/max_terminated_length": 1368.0, "completions/mean_length": 858.328125, "completions/mean_terminated_length": 771.2781524658203, "completions/min_length": 358.5, "completions/min_terminated_length": 358.5, "epoch": 0.9885, "grad_norm": 0.4532969295978546, "kl": 0.04364013671875, "learning_rate": 1.0039472645551372e-07, "loss": 0.2568, "num_tokens": 145297168.0, "reward": 0.466796875, "reward_std": 0.07416288927197456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.14832578226923943, "step": 1977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1666.25, "completions/max_terminated_length": 1429.75, "completions/mean_length": 801.921875, "completions/mean_terminated_length": 721.0960006713867, "completions/min_length": 300.25, "completions/min_terminated_length": 300.25, "epoch": 0.989, "grad_norm": 0.5136459469795227, "kl": 0.046905517578125, "learning_rate": 1.0036252219849932e-07, "loss": 0.1232, "num_tokens": 145360923.0, "reward": 0.7490964829921722, "reward_std": 0.5628696978092194, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1372435437515378, "rewards/penalized_accuracy_reward/std": 0.2703750617802143, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.12428925558924675, "step": 1978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.25, "completions/max_terminated_length": 1583.25, "completions/mean_length": 800.6875, "completions/mean_terminated_length": 
800.6875, "completions/min_length": 350.25, "completions/min_terminated_length": 350.25, "epoch": 0.9895, "grad_norm": 0.25955474376678467, "kl": 0.026763916015625, "learning_rate": 1.0033168761522048e-07, "loss": 0.0488, "num_tokens": 145425607.0, "reward": 0.5479851216077805, "reward_std": 0.14426977932453156, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.02496912330389023, "rewards/penalized_accuracy_reward/std": 0.06822864711284637, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.015625, "step": 1979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1362.5, "completions/max_terminated_length": 1253.75, "completions/mean_length": 755.125, "completions/mean_terminated_length": 703.3822174072266, "completions/min_length": 315.5, "completions/min_terminated_length": 315.5, "epoch": 0.99, "grad_norm": 0.3532673716545105, "kl": 0.0321044921875, "learning_rate": 1.0030222279960469e-07, "loss": 0.1853, "num_tokens": 145482975.0, "reward": 0.9527267515659332, "reward_std": 0.4957783818244934, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.2371055632829666, "rewards/penalized_accuracy_reward/std": 0.24145737290382385, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.05915529653429985, "step": 1980 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1637.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 785.046875, "completions/mean_terminated_length": 769.2406311035156, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.9905, "grad_norm": 0.3694373071193695, "kl": 0.040924072265625, "learning_rate": 1.002741278414069e-07, "loss": -0.0012, "num_tokens": 145542802.0, "reward": 0.5420609414577484, "reward_std": 0.16753040254116058, 
"rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.024936722591519356, "rewards/penalized_accuracy_reward/std": 0.06814020127058029, "rewards/tag_count_reward/mean": 0.984375, "rewards/tag_count_reward/std": 0.0625, "step": 1981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1711.5, "completions/max_terminated_length": 1428.0, "completions/mean_length": 765.609375, "completions/mean_terminated_length": 679.8900909423828, "completions/min_length": 231.25, "completions/min_terminated_length": 231.25, "epoch": 0.991, "grad_norm": 0.3685739040374756, "kl": 0.05029296875, "learning_rate": 1.002474028262093e-07, "loss": 0.221, "num_tokens": 145603785.0, "reward": 0.6741072535514832, "reward_std": 0.33355167880654335, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09974895138293505, "rewards/penalized_accuracy_reward/std": 0.1521521806716919, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11166805773973465, "step": 1982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1607.25, "completions/max_terminated_length": 1520.75, "completions/mean_length": 893.265625, "completions/mean_terminated_length": 833.46875, "completions/min_length": 334.5, "completions/min_terminated_length": 334.5, "epoch": 0.9915, "grad_norm": 0.3258056640625, "kl": 0.035125732421875, "learning_rate": 1.0022204783542078e-07, "loss": 0.0189, "num_tokens": 145670938.0, "reward": 0.7915712594985962, "reward_std": 0.20317840576171875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.1623871922492981, "rewards/penalized_accuracy_reward/std": 0.08056662976741791, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.08409032225608826, "step": 1983 }, { "clip_ratio": 0.0, 
"completions/clipped_ratio": 0.234375, "completions/max_length": 1784.25, "completions/max_terminated_length": 1627.25, "completions/mean_length": 1267.4375, "completions/mean_terminated_length": 1095.6042175292969, "completions/min_length": 452.25, "completions/min_terminated_length": 452.25, "epoch": 0.992, "grad_norm": 0.38896510004997253, "kl": 0.0343017578125, "learning_rate": 1.001980629462772e-07, "loss": 0.1647, "num_tokens": 145762822.0, "reward": 0.5888992547988892, "reward_std": 0.3173318710178137, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.08741836994886398, "rewards/penalized_accuracy_reward/std": 0.10237392783164978, "rewards/tag_count_reward/mean": 0.828125, "rewards/tag_count_reward/std": 0.22516803070902824, "step": 1984 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 921.890625, "completions/mean_terminated_length": 846.8166961669922, "completions/min_length": 358.75, "completions/min_terminated_length": 358.75, "epoch": 0.9925, "grad_norm": 0.3853187561035156, "kl": 0.033294677734375, "learning_rate": 1.0017544823184055e-07, "loss": 0.1901, "num_tokens": 145830975.0, "reward": 0.7471796125173569, "reward_std": 0.4388892501592636, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.13726169522851706, "rewards/penalized_accuracy_reward/std": 0.20208602026104927, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.1811130754649639, "step": 1985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1886.75, "completions/max_terminated_length": 1559.5, "completions/mean_length": 1125.328125, "completions/mean_terminated_length": 1054.1536102294922, "completions/min_length": 580.5, "completions/min_terminated_length": 580.5, "epoch": 0.993, 
"grad_norm": 0.3723486363887787, "kl": 0.032684326171875, "learning_rate": 1.0015420376099923e-07, "loss": 0.1862, "num_tokens": 145913700.0, "reward": 0.466796875, "reward_std": 0.09548348188400269, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.93359375, "rewards/tag_count_reward/std": 0.19096697121858597, "step": 1986 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1768.0, "completions/max_terminated_length": 1555.5, "completions/mean_length": 962.296875, "completions/mean_terminated_length": 929.3458709716797, "completions/min_length": 394.75, "completions/min_terminated_length": 394.75, "epoch": 0.9935, "grad_norm": 0.3600040376186371, "kl": 0.0474853515625, "learning_rate": 1.001343295984676e-07, "loss": 0.1089, "num_tokens": 145985239.0, "reward": 0.49763423204421997, "reward_std": 0.15407377108931541, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012488989159464836, "rewards/penalized_accuracy_reward/std": 0.049955952912569046, "rewards/tag_count_reward/mean": 0.9453125, "rewards/tag_count_reward/std": 0.14684643223881721, "step": 1987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1752.75, "completions/max_terminated_length": 1346.75, "completions/mean_length": 914.75, "completions/mean_terminated_length": 830.8531341552734, "completions/min_length": 410.25, "completions/min_terminated_length": 410.25, "epoch": 0.994, "grad_norm": 0.31669333577156067, "kl": 0.032379150390625, "learning_rate": 1.0011582580478576e-07, "loss": 0.2376, "num_tokens": 146054087.0, "reward": 0.5856518149375916, "reward_std": 0.2650383375585079, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.062357161194086075, 
"rewards/penalized_accuracy_reward/std": 0.09552392363548279, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.17242243513464928, "step": 1988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1740.75, "completions/max_terminated_length": 1111.5, "completions/mean_length": 665.625, "completions/mean_terminated_length": 576.4902038574219, "completions/min_length": 261.25, "completions/min_terminated_length": 261.25, "epoch": 0.9945, "grad_norm": 0.6467880010604858, "kl": 0.0562744140625, "learning_rate": 1.0009869243631952e-07, "loss": 0.2889, "num_tokens": 146106527.0, "reward": 0.6514272093772888, "reward_std": 0.3754263184964657, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.087432362139225, "rewards/penalized_accuracy_reward/std": 0.16391616314649582, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.13698217645287514, "step": 1989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1808.75, "completions/max_terminated_length": 1612.5, "completions/mean_length": 1064.21875, "completions/mean_terminated_length": 926.5541839599609, "completions/min_length": 483.25, "completions/min_terminated_length": 483.25, "epoch": 0.995, "grad_norm": 0.3939071595668793, "kl": 0.04058837890625, "learning_rate": 1.000829295452601e-07, "loss": 0.1412, "num_tokens": 146183821.0, "reward": 0.6619738712906837, "reward_std": 0.3720012791454792, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11223692912608385, "rewards/penalized_accuracy_reward/std": 0.1529499925673008, "rewards/tag_count_reward/mean": 0.875, "rewards/tag_count_reward/std": 0.20616771467030048, "step": 1990 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1879.5, "completions/max_terminated_length": 1629.75, 
"completions/mean_length": 1056.390625, "completions/mean_terminated_length": 899.9548034667969, "completions/min_length": 278.75, "completions/min_terminated_length": 278.75, "epoch": 0.9955, "grad_norm": 2.521959066390991, "kl": 0.091949462890625, "learning_rate": 1.0006853717962393e-07, "loss": 0.2452, "num_tokens": 146262326.0, "reward": 0.4722284972667694, "reward_std": 0.20950599759817123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012481438927352428, "rewards/penalized_accuracy_reward/std": 0.04992575943470001, "rewards/tag_count_reward/mean": 0.89453125, "rewards/tag_count_reward/std": 0.21930895745754242, "step": 1991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1724.5, "completions/max_terminated_length": 1608.5, "completions/mean_length": 1056.140625, "completions/mean_terminated_length": 1008.3125152587891, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.996, "grad_norm": 0.34282875061035156, "kl": 0.02734375, "learning_rate": 1.0005551538325274e-07, "loss": 0.0884, "num_tokens": 146341327.0, "reward": 0.4765625, "reward_std": 0.07493153586983681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.1498630754649639, "step": 1992 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1609.5, "completions/max_terminated_length": 1314.25, "completions/mean_length": 769.390625, "completions/mean_terminated_length": 713.6526947021484, "completions/min_length": 301.25, "completions/min_terminated_length": 301.25, "epoch": 0.9965, "grad_norm": 0.332943320274353, "kl": 0.03485107421875, "learning_rate": 1.000438641958131e-07, "loss": 0.1892, "num_tokens": 146399336.0, "reward": 0.474609375, 
"reward_std": 0.052480507642030716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.10496101714670658, "step": 1993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1705.0, "completions/max_terminated_length": 1649.25, "completions/mean_length": 948.640625, "completions/mean_terminated_length": 901.1742858886719, "completions/min_length": 431.25, "completions/min_terminated_length": 431.25, "epoch": 0.997, "grad_norm": 0.25855687260627747, "kl": 0.028472900390625, "learning_rate": 1.0003358365279661e-07, "loss": 0.0427, "num_tokens": 146469457.0, "reward": 0.6841753274202347, "reward_std": 0.37248852849006653, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.09990017116069794, "rewards/penalized_accuracy_reward/std": 0.17870686948299408, "rewards/tag_count_reward/mean": 0.96875, "rewards/tag_count_reward/std": 0.0816391110420227, "step": 1994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1870.25, "completions/max_terminated_length": 1695.25, "completions/mean_length": 923.40625, "completions/mean_terminated_length": 834.8125305175781, "completions/min_length": 339.5, "completions/min_terminated_length": 339.5, "epoch": 0.9975, "grad_norm": 0.4169415831565857, "kl": 0.038818359375, "learning_rate": 1.0002467378551954e-07, "loss": 0.2124, "num_tokens": 146539019.0, "reward": 0.69937963783741, "reward_std": 0.42838911712169647, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.11238513141870499, "rewards/penalized_accuracy_reward/std": 0.18500491231679916, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.11675865203142166, "step": 1995 }, { 
"clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1856.0, "completions/max_terminated_length": 1635.5, "completions/mean_length": 1066.21875, "completions/mean_terminated_length": 972.6739807128906, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.998, "grad_norm": 0.36258208751678467, "kl": 0.028717041015625, "learning_rate": 1.000171346211229e-07, "loss": 0.2105, "num_tokens": 146615609.0, "reward": 0.470703125, "reward_std": 0.08104817569255829, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.0, "rewards/penalized_accuracy_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.94140625, "rewards/tag_count_reward/std": 0.16209635883569717, "step": 1996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1849.5, "completions/max_terminated_length": 1672.5, "completions/mean_length": 781.109375, "completions/mean_terminated_length": 743.13232421875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9985, "grad_norm": 0.24123264849185944, "kl": 0.04876708984375, "learning_rate": 1.0001096618257236e-07, "loss": 0.1429, "num_tokens": 146674048.0, "reward": 0.5015375763177872, "reward_std": 0.1361654158681631, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.01248753909021616, "rewards/penalized_accuracy_reward/std": 0.04995015636086464, "rewards/tag_count_reward/mean": 0.953125, "rewards/tag_count_reward/std": 0.11482159048318863, "step": 1997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1311.25, "completions/max_terminated_length": 1176.5, "completions/mean_length": 700.640625, "completions/mean_terminated_length": 640.7271728515625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.999, "grad_norm": 
0.30017977952957153, "kl": 0.05914306640625, "learning_rate": 1.0000616848865797e-07, "loss": 0.0812, "num_tokens": 146726361.0, "reward": 0.5034531205892563, "reward_std": 0.12659092992544174, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.012468747794628143, "rewards/penalized_accuracy_reward/std": 0.04987499117851257, "rewards/tag_count_reward/mean": 0.95703125, "rewards/tag_count_reward/std": 0.10683366656303406, "step": 1998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1781.0, "completions/max_terminated_length": 1380.5, "completions/mean_length": 818.328125, "completions/mean_terminated_length": 776.5208511352539, "completions/min_length": 376.75, "completions/min_terminated_length": 376.75, "epoch": 0.9995, "grad_norm": 0.5131229758262634, "kl": 0.05377197265625, "learning_rate": 1.0000274155399433e-07, "loss": 0.2563, "num_tokens": 146786606.0, "reward": 0.6302158236503601, "reward_std": 0.3753454424440861, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 0.07487353309988976, "rewards/penalized_accuracy_reward/std": 0.16097326576709747, "rewards/tag_count_reward/mean": 0.9609375, "rewards/tag_count_reward/std": 0.13270078226923943, "step": 1999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1285.0, "completions/max_terminated_length": 1274.25, "completions/mean_length": 711.859375, "completions/mean_terminated_length": 629.7073974609375, "completions/min_length": 245.25, "completions/min_terminated_length": 245.25, "epoch": 1.0, "grad_norm": 0.704089343547821, "kl": 0.051727294921875, "learning_rate": 1.0000068538902053e-07, "loss": 0.1203, "num_tokens": 146841285.0, "reward": 0.5397269874811172, "reward_std": 0.2798202782869339, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/penalized_accuracy_reward/mean": 
0.0374416122213006, "rewards/penalized_accuracy_reward/std": 0.11813940852880478, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.113535076379776, "step": 2000 }, { "epoch": 1.0, "step": 2000, "total_flos": 0.0, "train_loss": 0.19940190527349477, "train_runtime": 102758.6793, "train_samples_per_second": 0.078, "train_steps_per_second": 0.019 } ], "logging_steps": 1, "max_steps": 2000, "num_input_tokens_seen": 146841285, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }