{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9952755905511812, "eval_steps": 100, "global_step": 316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 514.859375, "completions/mean_terminated_length": 486.98406982421875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.006299212598425197, "grad_norm": 0.45115670561790466, "kl": 9.506940841674805e-05, "learning_rate": 0.0, "loss": -0.0368, "num_tokens": 586082.0, "reward": -0.5526061058044434, "reward_std": 0.764463484287262, "rewards/cosine_scaled_reward/mean": -0.5526061058044434, "rewards/cosine_scaled_reward/std": 1.0664217472076416, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025669642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 535.5926513671875, "completions/mean_terminated_length": 495.74688720703125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.012598425196850394, "grad_norm": 0.430998295545578, "kl": 9.66787338256836e-05, "learning_rate": 3.125e-08, "loss": -0.0952, "num_tokens": 1196373.0, "reward": -0.4628583788871765, "reward_std": 0.8271132707595825, "rewards/cosine_scaled_reward/mean": -0.4628583788871765, "rewards/cosine_scaled_reward/std": 1.1489886045455933, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 586.6060791015625, "completions/mean_terminated_length": 551.5325927734375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.01889763779527559, "grad_norm": 0.4537774324417114, "kl": 9.518861770629883e-05, "learning_rate": 6.25e-08, "loss": -0.1136, "num_tokens": 1863732.0, "reward": -0.6168839931488037, "reward_std": 0.7393009066581726, "rewards/cosine_scaled_reward/mean": -0.6168839335441589, "rewards/cosine_scaled_reward/std": 0.9996686577796936, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 484.872802734375, "completions/mean_terminated_length": 452.826904296875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.025196850393700787, "grad_norm": 0.5108857750892639, "kl": 0.0001074075698852539, "learning_rate": 9.375e-08, "loss": -0.0851, "num_tokens": 2449778.0, "reward": -0.5494452714920044, "reward_std": 0.7850840091705322, "rewards/cosine_scaled_reward/mean": -0.5494452118873596, "rewards/cosine_scaled_reward/std": 1.0697675943374634, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 530.9732666015625, "completions/mean_terminated_length": 496.337890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.031496062992125984, "grad_norm": 0.4788239300251007, "kl": 0.00010323524475097656, "learning_rate": 1.25e-07, "loss": -0.0865, "num_tokens": 3063498.0, "reward": -0.49671319127082825, "reward_std": 0.8431764841079712, "rewards/cosine_scaled_reward/mean": -0.49671316146850586, "rewards/cosine_scaled_reward/std": 1.1200929880142212, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 486.044677734375, "completions/mean_terminated_length": 459.4506530761719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.03779527559055118, "grad_norm": 0.4317552149295807, "kl": 0.00010275840759277344, "learning_rate": 1.5624999999999999e-07, "loss": -0.1117, "num_tokens": 3632018.0, "reward": -0.5126838684082031, "reward_std": 0.8028110265731812, "rewards/cosine_scaled_reward/mean": -0.5126838684082031, "rewards/cosine_scaled_reward/std": 1.1045653820037842, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 531.9910888671875, "completions/mean_terminated_length": 495.6068420410156, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.04409448818897638, "grad_norm": 0.4166410565376282, "kl": 9.614229202270508e-05, "learning_rate": 1.875e-07, "loss": -0.0529, "num_tokens": 4239450.0, "reward": -0.5730382800102234, "reward_std": 0.7830650210380554, "rewards/cosine_scaled_reward/mean": -0.5730382204055786, "rewards/cosine_scaled_reward/std": 1.0463130474090576, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 558.1350708007812, "completions/mean_terminated_length": 515.3719482421875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.050393700787401574, "grad_norm": 0.3572291433811188, "kl": 0.0001131296157836914, "learning_rate": 2.1875e-07, "loss": -0.0693, "num_tokens": 4878691.0, "reward": -0.5359418392181396, "reward_std": 0.8269107341766357, "rewards/cosine_scaled_reward/mean": -0.5359417796134949, "rewards/cosine_scaled_reward/std": 1.0827312469482422, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 532.286865234375, "completions/mean_terminated_length": 495.9096984863281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.05669291338582677, "grad_norm": 0.44510260224342346, "kl": 0.00010901689529418945, "learning_rate": 2.5e-07, "loss": -0.0958, "num_tokens": 5486228.0, "reward": -0.5363856554031372, "reward_std": 0.765136182308197, "rewards/cosine_scaled_reward/mean": -0.5363856554031372, "rewards/cosine_scaled_reward/std": 1.0828957557678223, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 498.3594055175781, "completions/mean_terminated_length": 470.18408203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.06299212598425197, "grad_norm": 0.49168774485588074, "kl": 0.00010883808135986328, "learning_rate": 2.8125e-07, "loss": -0.1151, "num_tokens": 6062822.0, "reward": -0.5703827738761902, "reward_std": 0.7068774700164795, "rewards/cosine_scaled_reward/mean": -0.5703827142715454, "rewards/cosine_scaled_reward/std": 1.0497106313705444, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 562.6730346679688, "completions/mean_terminated_length": 525.2848510742188, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.06929133858267716, "grad_norm": 0.41304177045822144, "kl": 0.00014287233352661133, "learning_rate": 3.1249999999999997e-07, "loss": -0.1104, "num_tokens": 6701873.0, "reward": -0.6360986828804016, "reward_std": 0.6729649305343628, "rewards/cosine_scaled_reward/mean": -0.6360986828804016, "rewards/cosine_scaled_reward/std": 0.9765380620956421, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025669642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 578.0670166015625, "completions/mean_terminated_length": 539.3402099609375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.07559055118110236, "grad_norm": 0.31683093309402466, "kl": 0.0001493692398071289, "learning_rate": 3.4375e-07, "loss": -0.0419, "num_tokens": 7368189.0, "reward": -0.5699460506439209, "reward_std": 0.7894014716148376, "rewards/cosine_scaled_reward/mean": -0.5699459910392761, "rewards/cosine_scaled_reward/std": 1.0498303174972534, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 514.8705444335938, "completions/mean_terminated_length": 472.6742858886719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.08188976377952756, "grad_norm": 0.4763113856315613, "kl": 0.00022745132446289062, "learning_rate": 3.75e-07, "loss": -0.1156, "num_tokens": 7978633.0, "reward": -0.5255608558654785, "reward_std": 0.8026651740074158, "rewards/cosine_scaled_reward/mean": -0.5255607962608337, "rewards/cosine_scaled_reward/std": 1.0921350717544556, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 508.7734680175781, "completions/mean_terminated_length": 471.8320007324219, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.08818897637795275, "grad_norm": 0.671411395072937, "kl": 0.0003235340118408203, "learning_rate": 4.0625e-07, "loss": -0.0157, "num_tokens": 8570062.0, "reward": -0.31583818793296814, "reward_std": 1.0193272829055786, "rewards/cosine_scaled_reward/mean": -0.31583812832832336, "rewards/cosine_scaled_reward/std": 1.2581169605255127, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 559.8739013671875, "completions/mean_terminated_length": 513.6375122070312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.09448818897637795, "grad_norm": 0.35042640566825867, "kl": 0.0003719329833984375, "learning_rate": 4.375e-07, "loss": -0.0833, "num_tokens": 9216429.0, "reward": -0.45582443475723267, "reward_std": 0.8441293239593506, "rewards/cosine_scaled_reward/mean": -0.4558244049549103, "rewards/cosine_scaled_reward/std": 1.1542800664901733, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 564.8192138671875, "completions/mean_terminated_length": 518.7365112304688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.10078740157480315, "grad_norm": 0.5082417726516724, "kl": 0.0004266500473022461, "learning_rate": 4.6874999999999996e-07, "loss": -0.0501, "num_tokens": 9835243.0, "reward": -0.44934120774269104, "reward_std": 0.8662951588630676, "rewards/cosine_scaled_reward/mean": -0.4493412375450134, "rewards/cosine_scaled_reward/std": 1.1598572731018066, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036830357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 600.6685791015625, "completions/mean_terminated_length": 545.324462890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.10708661417322834, "grad_norm": 0.41793763637542725, "kl": 0.0005254745483398438, "learning_rate": 5e-07, "loss": -0.0947, "num_tokens": 10517874.0, "reward": -0.3498678505420685, "reward_std": 0.8900845646858215, "rewards/cosine_scaled_reward/mean": -0.3498677909374237, "rewards/cosine_scaled_reward/std": 1.23600435256958, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025669642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 532.1439819335938, "completions/mean_terminated_length": 492.2073669433594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.11338582677165354, "grad_norm": 0.5198326706886292, "kl": 0.0006380081176757812, "learning_rate": 5.3125e-07, "loss": -0.0733, "num_tokens": 11121267.0, "reward": -0.44612544775009155, "reward_std": 0.8814043998718262, "rewards/cosine_scaled_reward/mean": -0.44612544775009155, "rewards/cosine_scaled_reward/std": 1.162795901298523, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036830357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 609.1004638671875, "completions/mean_terminated_length": 554.0787963867188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.11968503937007874, "grad_norm": 0.4639606475830078, "kl": 0.0006771087646484375, "learning_rate": 5.625e-07, "loss": -0.0639, "num_tokens": 11791677.0, "reward": -0.2794720232486725, "reward_std": 0.9824929237365723, "rewards/cosine_scaled_reward/mean": -0.2794720232486725, "rewards/cosine_scaled_reward/std": 1.281421184539795, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 575.9866333007812, "completions/mean_terminated_length": 542.3789672851562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.12598425196850394, "grad_norm": 0.4229482412338257, "kl": 0.0009136199951171875, "learning_rate": 5.937499999999999e-07, "loss": -0.059, "num_tokens": 12453601.0, "reward": -0.43330615758895874, "reward_std": 0.8788204789161682, "rewards/cosine_scaled_reward/mean": -0.43330615758895874, "rewards/cosine_scaled_reward/std": 1.1737666130065918, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 595.1663208007812, "completions/mean_terminated_length": 553.4661254882812, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.13228346456692913, "grad_norm": 0.35286158323287964, "kl": 0.0011720657348632812, "learning_rate": 6.249999999999999e-07, "loss": -0.0754, "num_tokens": 13124934.0, "reward": -0.16925115883350372, "reward_std": 1.009746789932251, "rewards/cosine_scaled_reward/mean": -0.16925112903118134, "rewards/cosine_scaled_reward/std": 1.3426754474639893, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0345982142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 607.3404541015625, "completions/mean_terminated_length": 555.7098388671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.13858267716535433, "grad_norm": 0.37835246324539185, "kl": 0.0016326904296875, "learning_rate": 6.5625e-07, "loss": -0.0973, "num_tokens": 13804535.0, "reward": -0.21277326345443726, "reward_std": 0.9656177163124084, "rewards/cosine_scaled_reward/mean": -0.21277324855327606, "rewards/cosine_scaled_reward/std": 1.319942593574524, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0424107142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 676.90625, "completions/mean_terminated_length": 616.1818237304688, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.14488188976377953, "grad_norm": 0.30840951204299927, "kl": 0.0016546249389648438, "learning_rate": 6.875e-07, "loss": -0.0202, "num_tokens": 14550627.0, "reward": -0.038872163742780685, "reward_std": 1.0010526180267334, "rewards/cosine_scaled_reward/mean": -0.038872163742780685, "rewards/cosine_scaled_reward/std": 1.4004237651824951, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 614.700927734375, "completions/mean_terminated_length": 568.4654541015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.15118110236220472, "grad_norm": 1.1502492427825928, "kl": 0.0022287368774414062, "learning_rate": 7.1875e-07, "loss": -0.034, "num_tokens": 15218759.0, "reward": -0.03518987447023392, "reward_std": 1.0530657768249512, "rewards/cosine_scaled_reward/mean": -0.03518987074494362, "rewards/cosine_scaled_reward/std": 1.4014222621917725, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 588.9777221679688, "completions/mean_terminated_length": 552.251708984375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.15748031496062992, "grad_norm": 0.32569238543510437, "kl": 0.00287628173828125, "learning_rate": 7.5e-07, "loss": -0.0104, "num_tokens": 15853619.0, "reward": 0.004624083638191223, "reward_std": 1.038998007774353, "rewards/cosine_scaled_reward/mean": 0.004624092020094395, "rewards/cosine_scaled_reward/std": 1.4162551164627075, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0502232142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 756.7924194335938, "completions/mean_terminated_length": 688.5146484375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.16377952755905512, "grad_norm": 0.3726414144039154, "kl": 0.00331878662109375, "learning_rate": 7.812499999999999e-07, "loss": 0.0287, "num_tokens": 16648345.0, "reward": -0.18634870648384094, "reward_std": 0.8561702966690063, "rewards/cosine_scaled_reward/mean": -0.18634869158267975, "rewards/cosine_scaled_reward/std": 1.3344650268554688, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 682.4074096679688, "completions/mean_terminated_length": 631.8298950195312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.1700787401574803, "grad_norm": 0.27346354722976685, "kl": 0.00455474853515625, "learning_rate": 8.125e-07, "loss": 0.0598, "num_tokens": 17386646.0, "reward": 0.3427794575691223, "reward_std": 1.1637052297592163, "rewards/cosine_scaled_reward/mean": 0.3427794277667999, "rewards/cosine_scaled_reward/std": 1.4923925399780273, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 707.4174194335938, "completions/mean_terminated_length": 651.2999877929688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1763779527559055, "grad_norm": 0.25242871046066284, "kl": 0.0053730010986328125, "learning_rate": 8.4375e-07, "loss": 0.0511, "num_tokens": 18169068.0, "reward": 0.07486817985773087, "reward_std": 1.024709701538086, "rewards/cosine_scaled_reward/mean": 0.07486817985773087, "rewards/cosine_scaled_reward/std": 1.4391454458236694, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 734.8471069335938, "completions/mean_terminated_length": 681.4668579101562, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1826771653543307, "grad_norm": 0.393325537443161, "kl": 0.006855010986328125, "learning_rate": 8.75e-07, "loss": 0.0531, "num_tokens": 18953107.0, "reward": 0.42649638652801514, "reward_std": 1.0319130420684814, "rewards/cosine_scaled_reward/mean": 0.42649635672569275, "rewards/cosine_scaled_reward/std": 1.498863935470581, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 709.2064819335938, "completions/mean_terminated_length": 653.1639404296875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1889763779527559, "grad_norm": 0.32169538736343384, "kl": 0.007747650146484375, "learning_rate": 9.0625e-07, "loss": 0.0762, "num_tokens": 19721804.0, "reward": 0.5034300088882446, "reward_std": 1.0488739013671875, "rewards/cosine_scaled_reward/mean": 0.5034299492835999, "rewards/cosine_scaled_reward/std": 1.5007421970367432, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0502232142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 800.1741333007812, "completions/mean_terminated_length": 734.1903686523438, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.1952755905511811, "grad_norm": 0.22457779943943024, "kl": 0.007617950439453125, "learning_rate": 9.374999999999999e-07, "loss": 0.0583, "num_tokens": 20562328.0, "reward": 0.6339239478111267, "reward_std": 1.1418627500534058, "rewards/cosine_scaled_reward/mean": 0.6339239478111267, "rewards/cosine_scaled_reward/std": 1.494827151298523, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 672.171875, "completions/mean_terminated_length": 629.4246215820312, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2015748031496063, "grad_norm": 0.2857438027858734, "kl": 0.009197235107421875, "learning_rate": 9.6875e-07, "loss": 0.0746, "num_tokens": 21281746.0, "reward": 0.7780371308326721, "reward_std": 1.0006595849990845, "rewards/cosine_scaled_reward/mean": 0.7780370712280273, "rewards/cosine_scaled_reward/std": 1.4746843576431274, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 820.8381958007812, "completions/mean_terminated_length": 745.2310791015625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.2078740157480315, "grad_norm": 0.353129118680954, "kl": 0.008514404296875, "learning_rate": 1e-06, "loss": 0.0844, "num_tokens": 22153649.0, "reward": 0.3895089328289032, "reward_std": 0.9308316111564636, "rewards/cosine_scaled_reward/mean": 0.3895089328289032, "rewards/cosine_scaled_reward/std": 1.4967604875564575, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0558035714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 765.2767944335938, "completions/mean_terminated_length": 689.4656982421875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2141732283464567, "grad_norm": 0.3190731406211853, "kl": 0.0099639892578125, "learning_rate": 9.999694086498248e-07, "loss": 0.0808, "num_tokens": 22977081.0, "reward": 0.7411263585090637, "reward_std": 1.0240404605865479, "rewards/cosine_scaled_reward/mean": 0.7411263585090637, "rewards/cosine_scaled_reward/std": 1.4812393188476562, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036830357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 716.021240234375, "completions/mean_terminated_length": 665.0880737304688, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2204724409448819, "grad_norm": 0.27622923254966736, "kl": 0.010406494140625, "learning_rate": 9.998776383426216e-07, "loss": 0.0507, "num_tokens": 23758268.0, "reward": 0.6139054298400879, "reward_std": 0.8962022662162781, "rewards/cosine_scaled_reward/mean": 0.6139054298400879, "rewards/cosine_scaled_reward/std": 1.4964288473129272, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 759.5558471679688, "completions/mean_terminated_length": 699.3480834960938, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.22677165354330708, "grad_norm": 0.31751304864883423, "kl": 0.01029205322265625, "learning_rate": 9.997247003079009e-07, "loss": 0.0489, "num_tokens": 24579358.0, "reward": 0.47326162457466125, "reward_std": 0.9961800575256348, "rewards/cosine_scaled_reward/mean": 0.47326159477233887, "rewards/cosine_scaled_reward/std": 1.5005344152450562, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 690.1194458007812, "completions/mean_terminated_length": 647.9298095703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.23307086614173228, "grad_norm": 0.3456534147262573, "kl": 0.011474609375, "learning_rate": 9.995106132599868e-07, "loss": 0.0696, "num_tokens": 25341609.0, "reward": 0.8717383146286011, "reward_std": 0.797817587852478, "rewards/cosine_scaled_reward/mean": 0.8717382550239563, "rewards/cosine_scaled_reward/std": 1.4538819789886475, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 678.0201416015625, "completions/mean_terminated_length": 633.8272094726562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.23937007874015748, "grad_norm": 0.4764210283756256, "kl": 0.01166534423828125, "learning_rate": 9.992354033957265e-07, "loss": 0.0301, "num_tokens": 26068747.0, "reward": 0.757907509803772, "reward_std": 0.8109303712844849, "rewards/cosine_scaled_reward/mean": 0.7579074501991272, "rewards/cosine_scaled_reward/std": 1.4783906936645508, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 694.2020263671875, "completions/mean_terminated_length": 661.7108764648438, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.24566929133858267, "grad_norm": 6.102512359619141, "kl": 0.010040283203125, "learning_rate": 9.988991043912856e-07, "loss": 0.0905, "num_tokens": 26821280.0, "reward": 0.697593629360199, "reward_std": 0.9140774011611938, "rewards/cosine_scaled_reward/mean": 0.697593629360199, "rewards/cosine_scaled_reward/std": 1.4877097606658936, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041294642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 763.3047485351562, "completions/mean_terminated_length": 707.9685668945312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.25196850393700787, "grad_norm": 1.1193389892578125, "kl": 0.01035308837890625, "learning_rate": 9.98501757398026e-07, "loss": 0.068, "num_tokens": 27626545.0, "reward": 0.7076338529586792, "reward_std": 0.9347448348999023, "rewards/cosine_scaled_reward/mean": 0.7076338529586792, "rewards/cosine_scaled_reward/std": 1.486343502998352, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 721.7701416015625, "completions/mean_terminated_length": 685.268310546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.25826771653543307, "grad_norm": 0.28604307770729065, "kl": 0.011211395263671875, "learning_rate": 9.980434110374724e-07, "loss": 0.0462, "num_tokens": 28407091.0, "reward": 0.6906989216804504, "reward_std": 0.9639256596565247, "rewards/cosine_scaled_reward/mean": 0.6906989216804504, "rewards/cosine_scaled_reward/std": 1.488455891609192, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0345982142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 795.6417846679688, "completions/mean_terminated_length": 750.759521484375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.26456692913385826, "grad_norm": 0.23319000005722046, "kl": 0.0089263916015625, "learning_rate": 9.975241213953604e-07, "loss": 0.0443, "num_tokens": 29252562.0, "reward": 0.5669840574264526, "reward_std": 0.8921679258346558, "rewards/cosine_scaled_reward/mean": 0.5669840574264526, "rewards/cosine_scaled_reward/std": 1.4992791414260864, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033482142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 729.4832763671875, "completions/mean_terminated_length": 683.80712890625, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.27086614173228346, "grad_norm": 0.3877314627170563, "kl": 0.011043548583984375, "learning_rate": 9.969439520147753e-07, "loss": 0.0633, "num_tokens": 30032467.0, "reward": 0.6205415725708008, "reward_std": 0.9956651329994202, "rewards/cosine_scaled_reward/mean": 0.6205415725708008, "rewards/cosine_scaled_reward/std": 1.4959126710891724, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 708.482177734375, "completions/mean_terminated_length": 671.6146240234375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.27716535433070866, "grad_norm": 0.3526562750339508, "kl": 0.010223388671875, "learning_rate": 9.96302973888376e-07, "loss": 0.0391, "num_tokens": 30792435.0, "reward": 0.8092110753059387, "reward_std": 0.8714210391044617, "rewards/cosine_scaled_reward/mean": 0.8239240050315857, "rewards/cosine_scaled_reward/std": 1.465368628501892, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029017857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 747.982177734375, "completions/mean_terminated_length": 709.1310424804688, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.28346456692913385, "grad_norm": 0.2751273512840271, "kl": 0.0096893310546875, "learning_rate": 9.956012654497072e-07, "loss": 0.0289, "num_tokens": 31581763.0, "reward": 0.5770221948623657, "reward_std": 1.057499647140503, "rewards/cosine_scaled_reward/mean": 0.577022135257721, "rewards/cosine_scaled_reward/std": 1.4988446235656738, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029017857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 680.700927734375, "completions/mean_terminated_length": 639.8390502929688, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.28976377952755905, "grad_norm": 0.835785984992981, "kl": 0.01076507568359375, "learning_rate": 9.948389125636038e-07, "loss": 0.0516, "num_tokens": 32316247.0, "reward": 0.6774722933769226, "reward_std": 0.7520989179611206, "rewards/cosine_scaled_reward/mean": 0.6774722337722778, "rewards/cosine_scaled_reward/std": 1.4901094436645508, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0345982142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 743.0714721679688, "completions/mean_terminated_length": 696.30517578125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.29606299212598425, "grad_norm": 0.4167815148830414, "kl": 0.0100250244140625, "learning_rate": 9.940160085156819e-07, "loss": 0.0426, "num_tokens": 33120535.0, "reward": 0.5268194079399109, "reward_std": 0.9155340790748596, "rewards/cosine_scaled_reward/mean": 0.5268194079399109, "rewards/cosine_scaled_reward/std": 1.5005640983581543, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 695.7154541015625, "completions/mean_terminated_length": 652.0933227539062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.30236220472440944, "grad_norm": 0.8342124819755554, "kl": 0.0113067626953125, "learning_rate": 9.931326540009253e-07, "loss": 0.0437, "num_tokens": 33879336.0, "reward": 0.6574720740318298, "reward_std": 0.9341281056404114, "rewards/cosine_scaled_reward/mean": 0.6574720740318298, "rewards/cosine_scaled_reward/std": 1.4924215078353882, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025669642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 720.7813110351562, "completions/mean_terminated_length": 685.814453125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.30866141732283464, "grad_norm": 0.5337599515914917, "kl": 0.010944366455078125, "learning_rate": 9.921889571113627e-07, "loss": 0.0304, "num_tokens": 34674036.0, "reward": 0.6473553776741028, "reward_std": 0.7692996859550476, "rewards/cosine_scaled_reward/mean": 0.6473553776741028, "rewards/cosine_scaled_reward/std": 1.4935177564620972, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 701.8359985351562, "completions/mean_terminated_length": 671.1015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.31496062992125984, "grad_norm": 0.6662896871566772, "kl": 0.01065826416015625, "learning_rate": 9.911850333228427e-07, "loss": 0.0542, "num_tokens": 35430705.0, "reward": 0.821460485458374, "reward_std": 0.9691373705863953, "rewards/cosine_scaled_reward/mean": 0.8214603662490845, "rewards/cosine_scaled_reward/std": 1.4658869504928589, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 697.9620971679688, "completions/mean_terminated_length": 671.8521118164062, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.32125984251968503, "grad_norm": 0.9534047842025757, "kl": 0.01047515869140625, "learning_rate": 9.901210054809014e-07, "loss": 0.0367, "num_tokens": 36186687.0, "reward": 0.6305912137031555, "reward_std": 0.8517847657203674, "rewards/cosine_scaled_reward/mean": 0.6305912137031555, "rewards/cosine_scaled_reward/std": 1.4951274394989014, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 707.9364013671875, "completions/mean_terminated_length": 682.0193481445312, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.32755905511811023, "grad_norm": 0.3993193507194519, "kl": 0.010395050048828125, "learning_rate": 9.889970037857323e-07, "loss": 0.0396, "num_tokens": 36967334.0, "reward": 0.5871036052703857, "reward_std": 0.9707032442092896, "rewards/cosine_scaled_reward/mean": 0.587103545665741, "rewards/cosine_scaled_reward/std": 1.4982444047927856, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 690.3069458007812, "completions/mean_terminated_length": 670.3182373046875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.33385826771653543, "grad_norm": 0.4851699769496918, "kl": 0.010189056396484375, "learning_rate": 9.878131657762535e-07, "loss": 0.0056, "num_tokens": 37729289.0, "reward": 0.623904824256897, "reward_std": 0.9286167621612549, "rewards/cosine_scaled_reward/mean": 0.6239047646522522, "rewards/cosine_scaled_reward/std": 1.4956852197647095, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 700.4330444335938, "completions/mean_terminated_length": 675.9318237304688, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3401574803149606, "grad_norm": 1.0800269842147827, "kl": 0.010822296142578125, "learning_rate": 9.865696363132768e-07, "loss": 0.0377, "num_tokens": 38493613.0, "reward": 0.5335128307342529, "reward_std": 0.8787680268287659, "rewards/cosine_scaled_reward/mean": 0.5335127711296082, "rewards/cosine_scaled_reward/std": 1.5004278421401978, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 752.1897583007812, "completions/mean_terminated_length": 724.1162719726562, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3464566929133858, "grad_norm": 0.2947627902030945, "kl": 0.00998687744140625, "learning_rate": 9.852665675617837e-07, "loss": 0.0258, "num_tokens": 39296823.0, "reward": 0.8013721108436584, "reward_std": 0.88521409034729, "rewards/cosine_scaled_reward/mean": 0.8013721108436584, "rewards/cosine_scaled_reward/std": 1.4701893329620361, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 730.2120971679688, "completions/mean_terminated_length": 703.1959228515625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.352755905511811, "grad_norm": 0.29348793625831604, "kl": 0.01019287109375, "learning_rate": 9.83904118972304e-07, "loss": 0.0265, "num_tokens": 40101221.0, "reward": 0.5870950818061829, "reward_std": 0.9105805158615112, "rewards/cosine_scaled_reward/mean": 0.5870950818061829, "rewards/cosine_scaled_reward/std": 1.49825918674469, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 676.7489013671875, "completions/mean_terminated_length": 651.8170166015625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.3590551181102362, "grad_norm": 1.3304539918899536, "kl": 0.01004791259765625, "learning_rate": 9.82482457261405e-07, "loss": 0.0235, "num_tokens": 40832692.0, "reward": 0.7879303097724915, "reward_std": 0.986963152885437, "rewards/cosine_scaled_reward/mean": 0.7879303097724915, "rewards/cosine_scaled_reward/std": 1.4728718996047974, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 700.677490234375, "completions/mean_terminated_length": 671.488037109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.3653543307086614, "grad_norm": 0.2850182354450226, "kl": 0.00983428955078125, "learning_rate": 9.81001756391292e-07, "loss": 0.0345, "num_tokens": 41588563.0, "reward": 0.6406442523002625, "reward_std": 0.868269145488739, "rewards/cosine_scaled_reward/mean": 0.6406441926956177, "rewards/cosine_scaled_reward/std": 1.494200348854065, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 613.5480346679688, "completions/mean_terminated_length": 592.42919921875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3716535433070866, "grad_norm": 0.4392086863517761, "kl": 0.011688232421875, "learning_rate": 9.7946219754852e-07, "loss": 0.0472, "num_tokens": 42260142.0, "reward": 0.8915125727653503, "reward_std": 0.8179810643196106, "rewards/cosine_scaled_reward/mean": 0.8915125131607056, "rewards/cosine_scaled_reward/std": 1.4483691453933716, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 688.6529541015625, "completions/mean_terminated_length": 660.7847900390625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3779527559055118, "grad_norm": 0.3848814070224762, "kl": 0.009918212890625, "learning_rate": 9.77863969121824e-07, "loss": 0.0307, "num_tokens": 43001975.0, "reward": 0.744490385055542, "reward_std": 0.9112673997879028, "rewards/cosine_scaled_reward/mean": 0.7444903254508972, "rewards/cosine_scaled_reward/std": 1.4806958436965942, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 677.6473388671875, "completions/mean_terminated_length": 649.5535278320312, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.384251968503937, "grad_norm": 0.35118257999420166, "kl": 0.0096893310546875, "learning_rate": 9.762072666790656e-07, "loss": -0.0035, "num_tokens": 43752059.0, "reward": 0.5971543192863464, "reward_std": 0.843102216720581, "rewards/cosine_scaled_reward/mean": 0.5971542596817017, "rewards/cosine_scaled_reward/std": 1.4975931644439697, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 692.1629638671875, "completions/mean_terminated_length": 664.3667602539062, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.3905511811023622, "grad_norm": 0.3855595886707306, "kl": 0.00948333740234375, "learning_rate": 9.744922929433033e-07, "loss": 0.0581, "num_tokens": 44502845.0, "reward": 0.5435569882392883, "reward_std": 0.8974959850311279, "rewards/cosine_scaled_reward/mean": 0.5435569882392883, "rewards/cosine_scaled_reward/std": 1.5001307725906372, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 677.5748291015625, "completions/mean_terminated_length": 649.4795532226562, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.3968503937007874, "grad_norm": 0.5436669588088989, "kl": 0.00952911376953125, "learning_rate": 9.72719257767985e-07, "loss": 0.0235, "num_tokens": 45237840.0, "reward": 0.6974921822547913, "reward_std": 0.985985279083252, "rewards/cosine_scaled_reward/mean": 0.6974921226501465, "rewards/cosine_scaled_reward/std": 1.4876641035079956, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 769.7176513671875, "completions/mean_terminated_length": 730.0011596679688, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.4031496062992126, "grad_norm": 0.590516984462738, "kl": 0.008392333984375, "learning_rate": 9.70888378111271e-07, "loss": 0.0478, "num_tokens": 46067475.0, "reward": 0.33929741382598877, "reward_std": 0.9476777911186218, "rewards/cosine_scaled_reward/mean": 0.339297354221344, "rewards/cosine_scaled_reward/std": 1.4921878576278687, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 628.677490234375, "completions/mean_terminated_length": 607.7814331054688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4094488188976378, "grad_norm": 0.34324246644973755, "kl": 0.011089324951171875, "learning_rate": 9.689998780094837e-07, "loss": 0.0601, "num_tokens": 46761986.0, "reward": 0.925286591053009, "reward_std": 1.0149413347244263, "rewards/cosine_scaled_reward/mean": 0.925286591053009, "rewards/cosine_scaled_reward/std": 1.439137578010559, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 699.3839721679688, "completions/mean_terminated_length": 668.5935668945312, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.415748031496063, "grad_norm": 0.39818695187568665, "kl": 0.009059906005859375, "learning_rate": 9.67053988549695e-07, "loss": 0.0456, "num_tokens": 47514154.0, "reward": 0.7109607458114624, "reward_std": 1.0165566205978394, "rewards/cosine_scaled_reward/mean": 0.7109607458114624, "rewards/cosine_scaled_reward/std": 1.4858678579330444, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 710.5413208007812, "completions/mean_terminated_length": 680.0056762695312, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4220472440944882, "grad_norm": 0.4072217643260956, "kl": 0.009124755859375, "learning_rate": 9.650509478414482e-07, "loss": 0.0476, "num_tokens": 48275359.0, "reward": 0.7946111559867859, "reward_std": 0.9021484851837158, "rewards/cosine_scaled_reward/mean": 0.7946110963821411, "rewards/cosine_scaled_reward/std": 1.4715018272399902, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 695.6283569335938, "completions/mean_terminated_length": 663.1714477539062, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.4283464566929134, "grad_norm": 0.32094287872314453, "kl": 0.00888824462890625, "learning_rate": 9.62991000987622e-07, "loss": 0.0311, "num_tokens": 49047650.0, "reward": 0.6975319981575012, "reward_std": 0.9495226740837097, "rewards/cosine_scaled_reward/mean": 0.6975319981575012, "rewards/cosine_scaled_reward/std": 1.487709879875183, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029017857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 668.6741333007812, "completions/mean_terminated_length": 627.452880859375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.4346456692913386, "grad_norm": 0.35820212960243225, "kl": 0.01102447509765625, "learning_rate": 9.60874400054439e-07, "loss": 0.0277, "num_tokens": 49783358.0, "reward": 0.7410476207733154, "reward_std": 0.9227878451347351, "rewards/cosine_scaled_reward/mean": 0.7410475015640259, "rewards/cosine_scaled_reward/std": 1.481183409690857, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 695.1785888671875, "completions/mean_terminated_length": 662.7108764648438, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.4409448818897638, "grad_norm": 0.2676938772201538, "kl": 0.008640289306640625, "learning_rate": 9.587014040406206e-07, "loss": 0.0365, "num_tokens": 50540622.0, "reward": 0.5602699518203735, "reward_std": 0.8083215951919556, "rewards/cosine_scaled_reward/mean": 0.5602698922157288, "rewards/cosine_scaled_reward/std": 1.4995226860046387, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 766.4799194335938, "completions/mean_terminated_length": 735.7234497070312, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.44724409448818897, "grad_norm": 0.25637087225914, "kl": 0.008380889892578125, "learning_rate": 9.564722788456943e-07, "loss": 0.0424, "num_tokens": 51390028.0, "reward": 0.3091750741004944, "reward_std": 0.9855142831802368, "rewards/cosine_scaled_reward/mean": 0.309175044298172, "rewards/cosine_scaled_reward/std": 1.4886199235916138, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 664.40625, "completions/mean_terminated_length": 631.2000122070312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.45354330708661417, "grad_norm": 0.3117295205593109, "kl": 0.010120391845703125, "learning_rate": 9.541872972374582e-07, "loss": 0.0579, "num_tokens": 52123992.0, "reward": 0.8448864817619324, "reward_std": 0.961277186870575, "rewards/cosine_scaled_reward/mean": 0.8448864221572876, "rewards/cosine_scaled_reward/std": 1.4605894088745117, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 659.1674194335938, "completions/mean_terminated_length": 635.5210571289062, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.45984251968503936, "grad_norm": 0.34628939628601074, "kl": 0.008678436279296875, "learning_rate": 9.518467388186019e-07, "loss": 0.0462, "num_tokens": 52841694.0, "reward": 0.7478297352790833, "reward_std": 0.8671966791152954, "rewards/cosine_scaled_reward/mean": 0.7478297352790833, "rewards/cosine_scaled_reward/std": 1.4801048040390015, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 769.2879638671875, "completions/mean_terminated_length": 734.093994140625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.46614173228346456, "grad_norm": 0.28708598017692566, "kl": 0.00865936279296875, "learning_rate": 9.494508899924947e-07, "loss": 0.0203, "num_tokens": 53685440.0, "reward": 0.41296184062957764, "reward_std": 1.0075267553329468, "rewards/cosine_scaled_reward/mean": 0.41296181082725525, "rewards/cosine_scaled_reward/std": 1.4982719421386719, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 734.6864013671875, "completions/mean_terminated_length": 696.9907836914062, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.47244094488188976, "grad_norm": 0.22522306442260742, "kl": 0.0083770751953125, "learning_rate": 9.470000439281378e-07, "loss": 0.0213, "num_tokens": 54483543.0, "reward": 0.43300533294677734, "reward_std": 0.853945791721344, "rewards/cosine_scaled_reward/mean": 0.43300530314445496, "rewards/cosine_scaled_reward/std": 1.4992822408676147, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 721.5346069335938, "completions/mean_terminated_length": 694.340576171875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.47874015748031495, "grad_norm": 0.40116333961486816, "kl": 0.0081329345703125, "learning_rate": 9.444945005242923e-07, "loss": 0.0563, "num_tokens": 55285750.0, "reward": 0.3962365686893463, "reward_std": 0.8454629778862, "rewards/cosine_scaled_reward/mean": 0.3962365686893463, "rewards/cosine_scaled_reward/std": 1.4971959590911865, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 661.125, "completions/mean_terminated_length": 637.511962890625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.48503937007874015, "grad_norm": 0.24201875925064087, "kl": 0.009799957275390625, "learning_rate": 9.419345663727804e-07, "loss": 0.0013, "num_tokens": 56000966.0, "reward": 0.6039526462554932, "reward_std": 0.9591172933578491, "rewards/cosine_scaled_reward/mean": 0.6039525866508484, "rewards/cosine_scaled_reward/std": 1.497071623802185, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 667.388427734375, "completions/mean_terminated_length": 637.4777221679688, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.49133858267716535, "grad_norm": 0.4878818392753601, "kl": 0.00946044921875, "learning_rate": 9.393205547209708e-07, "loss": 0.0725, "num_tokens": 56714002.0, "reward": 0.7847359776496887, "reward_std": 0.8636373281478882, "rewards/cosine_scaled_reward/mean": 0.7847359776496887, "rewards/cosine_scaled_reward/std": 1.473402976989746, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 629.6796875, "completions/mean_terminated_length": 598.9520874023438, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.49763779527559054, "grad_norm": 0.509618878364563, "kl": 0.00946044921875, "learning_rate": 9.366527854334462e-07, "loss": 0.0226, "num_tokens": 57416579.0, "reward": 0.6808841228485107, "reward_std": 0.9489257335662842, "rewards/cosine_scaled_reward/mean": 0.680884063243866, "rewards/cosine_scaled_reward/std": 1.4896831512451172, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 645.3270263671875, "completions/mean_terminated_length": 629.4954833984375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.5039370078740157, "grad_norm": 0.25900280475616455, "kl": 0.009029388427734375, "learning_rate": 9.339315849528649e-07, "loss": 0.0104, "num_tokens": 58139288.0, "reward": 0.7511017322540283, "reward_std": 0.8574750423431396, "rewards/cosine_scaled_reward/mean": 0.7511016726493835, "rewards/cosine_scaled_reward/std": 1.4796228408813477, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 636.7020263671875, "completions/mean_terminated_length": 614.3004760742188, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.510236220472441, "grad_norm": 0.2857658267021179, "kl": 0.009235382080078125, "learning_rate": 9.311572862600138e-07, "loss": 0.0175, "num_tokens": 58831197.0, "reward": 0.9521499872207642, "reward_std": 0.9827561378479004, "rewards/cosine_scaled_reward/mean": 0.9521499276161194, "rewards/cosine_scaled_reward/std": 1.4308584928512573, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 664.6350708007812, "completions/mean_terminated_length": 647.440673828125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5165354330708661, "grad_norm": 0.19333045184612274, "kl": 0.008487701416015625, "learning_rate": 9.283302288330643e-07, "loss": 0.0009, "num_tokens": 59560566.0, "reward": 0.7243615984916687, "reward_std": 0.8989306688308716, "rewards/cosine_scaled_reward/mean": 0.7243615388870239, "rewards/cosine_scaled_reward/std": 1.4838638305664062, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 717.2957763671875, "completions/mean_terminated_length": 685.3588256835938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5228346456692914, "grad_norm": 0.7998607754707336, "kl": 0.0081939697265625, "learning_rate": 9.25450758606031e-07, "loss": -0.0049, "num_tokens": 60337039.0, "reward": 0.38290420174598694, "reward_std": 0.8364748358726501, "rewards/cosine_scaled_reward/mean": 0.38290414214134216, "rewards/cosine_scaled_reward/std": 1.496158242225647, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 657.6328125, "completions/mean_terminated_length": 632.3533935546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.5291338582677165, "grad_norm": 0.2160445600748062, "kl": 0.008991241455078125, "learning_rate": 9.225192279264422e-07, "loss": 0.0412, "num_tokens": 61047318.0, "reward": 0.7544872164726257, "reward_std": 0.7835718393325806, "rewards/cosine_scaled_reward/mean": 0.754487156867981, "rewards/cosine_scaled_reward/std": 1.4789953231811523, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 613.2232666015625, "completions/mean_terminated_length": 590.448974609375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5354330708661418, "grad_norm": 0.3396480083465576, "kl": 0.009510040283203125, "learning_rate": 9.195359955122243e-07, "loss": 0.0483, "num_tokens": 61716782.0, "reward": 0.9754770398139954, "reward_std": 0.7877938747406006, "rewards/cosine_scaled_reward/mean": 0.9754770994186401, "rewards/cosine_scaled_reward/std": 1.4231003522872925, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 678.0457763671875, "completions/mean_terminated_length": 654.7208251953125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5417322834645669, "grad_norm": 0.3173322379589081, "kl": 0.009063720703125, "learning_rate": 9.165014264078068e-07, "loss": 0.0434, "num_tokens": 62462775.0, "reward": 0.7376777529716492, "reward_std": 0.8068321943283081, "rewards/cosine_scaled_reward/mean": 0.7376776933670044, "rewards/cosine_scaled_reward/std": 1.4817646741867065, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 698.255615234375, "completions/mean_terminated_length": 672.1513061523438, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.5480314960629922, "grad_norm": 0.381440669298172, "kl": 0.00875091552734375, "learning_rate": 9.134158919394544e-07, "loss": -0.0019, "num_tokens": 63246604.0, "reward": 0.6875295042991638, "reward_std": 0.8792651891708374, "rewards/cosine_scaled_reward/mean": 0.6875295042991638, "rewards/cosine_scaled_reward/std": 1.4889642000198364, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 677.9765625, "completions/mean_terminated_length": 640.2694702148438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5543307086614173, "grad_norm": 0.31262969970703125, "kl": 0.009006500244140625, "learning_rate": 9.102797696698283e-07, "loss": 0.0392, "num_tokens": 63992103.0, "reward": 0.6004459261894226, "reward_std": 0.7837154269218445, "rewards/cosine_scaled_reward/mean": 0.6004458665847778, "rewards/cosine_scaled_reward/std": 1.4973416328430176, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 699.6752319335938, "completions/mean_terminated_length": 670.4640502929688, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5606299212598426, "grad_norm": 0.2140110731124878, "kl": 0.008007049560546875, "learning_rate": 9.070934433517872e-07, "loss": -0.0011, "num_tokens": 64751652.0, "reward": 0.6506852507591248, "reward_std": 0.9374973773956299, "rewards/cosine_scaled_reward/mean": 0.6506852507591248, "rewards/cosine_scaled_reward/std": 1.4932117462158203, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 648.125, "completions/mean_terminated_length": 616.1643676757812, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5669291338582677, "grad_norm": 0.2540406286716461, "kl": 0.009563446044921875, "learning_rate": 9.038573028814271e-07, "loss": 0.0493, "num_tokens": 65504420.0, "reward": 0.8448294401168823, "reward_std": 0.9804660081863403, "rewards/cosine_scaled_reward/mean": 0.8448294401168823, "rewards/cosine_scaled_reward/std": 1.4603689908981323, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 672.9933471679688, "completions/mean_terminated_length": 633.5269775390625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.573228346456693, "grad_norm": 0.4330436885356903, "kl": 0.008884429931640625, "learning_rate": 9.005717442503739e-07, "loss": 0.05, "num_tokens": 66254270.0, "reward": 0.7009679675102234, "reward_std": 0.9250093698501587, "rewards/cosine_scaled_reward/mean": 0.7009679079055786, "rewards/cosine_scaled_reward/std": 1.4871753454208374, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 653.8158569335938, "completions/mean_terminated_length": 618.721923828125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5795275590551181, "grad_norm": 0.32978150248527527, "kl": 0.009395599365234375, "learning_rate": 8.972371694973261e-07, "loss": 0.0342, "num_tokens": 66965097.0, "reward": 0.7579172849655151, "reward_std": 0.791741132736206, "rewards/cosine_scaled_reward/mean": 0.7579172849655151, "rewards/cosine_scaled_reward/std": 1.4783599376678467, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0323660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 685.1964721679688, "completions/mean_terminated_length": 639.6124267578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5858267716535434, "grad_norm": 0.3639669120311737, "kl": 0.0100860595703125, "learning_rate": 8.938539866588592e-07, "loss": 0.048, "num_tokens": 67714169.0, "reward": 0.7544926404953003, "reward_std": 0.9491753578186035, "rewards/cosine_scaled_reward/mean": 0.7544926404953003, "rewards/cosine_scaled_reward/std": 1.4788836240768433, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.036830357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 747.8225708007812, "completions/mean_terminated_length": 698.1054077148438, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5921259842519685, "grad_norm": 0.29422667622566223, "kl": 0.008636474609375, "learning_rate": 8.904226097194969e-07, "loss": 0.028, "num_tokens": 68528394.0, "reward": 0.446431040763855, "reward_std": 0.7203037142753601, "rewards/cosine_scaled_reward/mean": 0.4464310109615326, "rewards/cosine_scaled_reward/std": 1.4998756647109985, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0379464285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 733.1272583007812, "completions/mean_terminated_length": 681.2644653320312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5984251968503937, "grad_norm": 0.3252141773700714, "kl": 0.009124755859375, "learning_rate": 8.869434585610534e-07, "loss": 0.048, "num_tokens": 69309772.0, "reward": 0.6976136565208435, "reward_std": 0.9922178983688354, "rewards/cosine_scaled_reward/mean": 0.6976136565208435, "rewards/cosine_scaled_reward/std": 1.4876292943954468, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 674.083740234375, "completions/mean_terminated_length": 634.648681640625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6047244094488189, "grad_norm": 0.28459686040878296, "kl": 0.009387969970703125, "learning_rate": 8.834169589112543e-07, "loss": 0.058, "num_tokens": 70038327.0, "reward": 0.8014413714408875, "reward_std": 0.9954638481140137, "rewards/cosine_scaled_reward/mean": 0.8014413118362427, "rewards/cosine_scaled_reward/std": 1.4701112508773804, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0379464285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 734.513427734375, "completions/mean_terminated_length": 682.705322265625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6110236220472441, "grad_norm": 0.40386196970939636, "kl": 0.009552001953125, "learning_rate": 8.798435422916423e-07, "loss": 0.0509, "num_tokens": 70831219.0, "reward": 0.6741692423820496, "reward_std": 0.8956859111785889, "rewards/cosine_scaled_reward/mean": 0.6741691827774048, "rewards/cosine_scaled_reward/std": 1.490605354309082, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 729.880615234375, "completions/mean_terminated_length": 688.9263305664062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6173228346456693, "grad_norm": 0.3040317893028259, "kl": 0.00994873046875, "learning_rate": 8.762236459647743e-07, "loss": 0.0398, "num_tokens": 71620504.0, "reward": 0.39625483751296997, "reward_std": 0.7587441802024841, "rewards/cosine_scaled_reward/mean": 0.3962548077106476, "rewards/cosine_scaled_reward/std": 1.4971867799758911, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 720.888427734375, "completions/mean_terminated_length": 684.3623657226562, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6236220472440945, "grad_norm": 0.34343069791793823, "kl": 0.009426116943359375, "learning_rate": 8.725577128807142e-07, "loss": 0.0646, "num_tokens": 72394724.0, "reward": 0.570304811000824, "reward_std": 0.9204005002975464, "rewards/cosine_scaled_reward/mean": 0.5703047513961792, "rewards/cosine_scaled_reward/std": 1.4990601539611816, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033482142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 699.3717041015625, "completions/mean_terminated_length": 652.6524047851562, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6299212598425197, "grad_norm": 0.45011547207832336, "kl": 0.01010894775390625, "learning_rate": 8.688461916228332e-07, "loss": 0.0548, "num_tokens": 73170145.0, "reward": 0.7544993758201599, "reward_std": 0.8879099488258362, "rewards/cosine_scaled_reward/mean": 0.7544994354248047, "rewards/cosine_scaled_reward/std": 1.4789304733276367, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025669642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 646.4163208007812, "completions/mean_terminated_length": 609.4902954101562, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.6362204724409449, "grad_norm": 0.5061929821968079, "kl": 0.0099029541015625, "learning_rate": 8.650895363529172e-07, "loss": 0.0783, "num_tokens": 73878262.0, "reward": 0.8282220363616943, "reward_std": 1.0407150983810425, "rewards/cosine_scaled_reward/mean": 0.8282219767570496, "rewards/cosine_scaled_reward/std": 1.4642788171768188, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 719.0000610351562, "completions/mean_terminated_length": 664.9755859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6425196850393701, "grad_norm": 0.49533170461654663, "kl": 0.0106964111328125, "learning_rate": 8.612882067555933e-07, "loss": 0.0511, "num_tokens": 74669942.0, "reward": 0.5335639119148254, "reward_std": 0.7676834464073181, "rewards/cosine_scaled_reward/mean": 0.5335639119148254, "rewards/cosine_scaled_reward/std": 1.5003803968429565, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 770.5859985351562, "completions/mean_terminated_length": 698.2794799804688, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.6488188976377953, "grad_norm": 0.4521535038948059, "kl": 0.0103912353515625, "learning_rate": 8.574426679820813e-07, "loss": 0.1075, "num_tokens": 75488659.0, "reward": 0.4699151813983917, "reward_std": 0.9292130470275879, "rewards/cosine_scaled_reward/mean": 0.46991515159606934, "rewards/cosine_scaled_reward/std": 1.5004866123199463, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0479910714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 752.2734985351562, "completions/mean_terminated_length": 686.9554443359375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.6551181102362205, "grad_norm": 0.7238117456436157, "kl": 0.01027679443359375, "learning_rate": 8.535533905932737e-07, "loss": 0.0409, "num_tokens": 76295112.0, "reward": 0.6177164316177368, "reward_std": 0.9820640683174133, "rewards/cosine_scaled_reward/mean": 0.617716372013092, "rewards/cosine_scaled_reward/std": 1.495753526687622, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0323660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 663.9910888671875, "completions/mean_terminated_length": 617.6978149414062, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.6614173228346457, "grad_norm": 0.44373902678489685, "kl": 0.01088714599609375, "learning_rate": 8.49620850502157e-07, "loss": 0.1424, "num_tokens": 77017296.0, "reward": 0.7043907642364502, "reward_std": 0.8915003538131714, "rewards/cosine_scaled_reward/mean": 0.7043907046318054, "rewards/cosine_scaled_reward/std": 1.4866788387298584, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 702.9989013671875, "completions/mean_terminated_length": 648.3240356445312, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.6677165354330709, "grad_norm": 0.43054643273353577, "kl": 0.0108642578125, "learning_rate": 8.45645528915575e-07, "loss": 0.0957, "num_tokens": 77775551.0, "reward": 0.6707912087440491, "reward_std": 0.8435035943984985, "rewards/cosine_scaled_reward/mean": 0.6707910895347595, "rewards/cosine_scaled_reward/std": 1.491044044494629, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0401785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 692.9453735351562, "completions/mean_terminated_length": 636.2221069335938, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6740157480314961, "grad_norm": 0.4474175274372101, "kl": 0.011871337890625, "learning_rate": 8.416279122753466e-07, "loss": 0.055, "num_tokens": 78514990.0, "reward": 0.8312375545501709, "reward_std": 0.986255407333374, "rewards/cosine_scaled_reward/mean": 0.8312374949455261, "rewards/cosine_scaled_reward/std": 1.4634435176849365, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041294642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 754.786865234375, "completions/mean_terminated_length": 699.0838012695312, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6803149606299213, "grad_norm": 0.5195061564445496, "kl": 0.010135650634765625, "learning_rate": 8.375684921987421e-07, "loss": 0.0418, "num_tokens": 79327503.0, "reward": 0.6640704274177551, "reward_std": 0.9568787217140198, "rewards/cosine_scaled_reward/mean": 0.6640704274177551, "rewards/cosine_scaled_reward/std": 1.4918031692504883, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052455357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 766.427490234375, "completions/mean_terminated_length": 695.4805908203125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.6866141732283465, "grad_norm": 2.433952569961548, "kl": 0.01102447509765625, "learning_rate": 8.334677654183253e-07, "loss": 0.1123, "num_tokens": 80162590.0, "reward": 0.6239715814590454, "reward_std": 0.9540754556655884, "rewards/cosine_scaled_reward/mean": 0.6239715218544006, "rewards/cosine_scaled_reward/std": 1.4956040382385254, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044642857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 733.6551513671875, "completions/mean_terminated_length": 672.2371215820312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6929133858267716, "grad_norm": 0.41383421421051025, "kl": 0.01192474365234375, "learning_rate": 8.293262337211722e-07, "loss": 0.0604, "num_tokens": 80939513.0, "reward": 0.6272814869880676, "reward_std": 0.896499514579773, "rewards/cosine_scaled_reward/mean": 0.6272814869880676, "rewards/cosine_scaled_reward/std": 1.4953655004501343, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0323660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 695.2210083007812, "completions/mean_terminated_length": 649.9722900390625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.6992125984251969, "grad_norm": 0.4339134097099304, "kl": 0.01230621337890625, "learning_rate": 8.251444038874685e-07, "loss": 0.0797, "num_tokens": 81699887.0, "reward": 0.6373085379600525, "reward_std": 0.944847583770752, "rewards/cosine_scaled_reward/mean": 0.6373085379600525, "rewards/cosine_scaled_reward/std": 1.494478702545166, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 699.7533569335938, "completions/mean_terminated_length": 623.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.705511811023622, "grad_norm": 0.5097730755805969, "kl": 0.01384735107421875, "learning_rate": 8.209227876284971e-07, "loss": 0.0861, "num_tokens": 82448626.0, "reward": 0.6977179646492004, "reward_std": 0.9958769083023071, "rewards/cosine_scaled_reward/mean": 0.6977178454399109, "rewards/cosine_scaled_reward/std": 1.4875280857086182, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029017857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 691.3236694335938, "completions/mean_terminated_length": 650.779296875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7118110236220473, "grad_norm": 0.4512317180633545, "kl": 0.01357269287109375, "learning_rate": 8.166619015240235e-07, "loss": 0.1306, "num_tokens": 83188772.0, "reward": 0.93192458152771, "reward_std": 1.0319198369979858, "rewards/cosine_scaled_reward/mean": 0.9319245219230652, "rewards/cosine_scaled_reward/std": 1.4371205568313599, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0502232142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 719.3203735351562, "completions/mean_terminated_length": 649.0610961914062, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.7181102362204724, "grad_norm": 0.3798118531703949, "kl": 0.013671875, "learning_rate": 8.12362266959083e-07, "loss": 0.0621, "num_tokens": 83956275.0, "reward": 0.6744152307510376, "reward_std": 0.8991620540618896, "rewards/cosine_scaled_reward/mean": 0.6744151711463928, "rewards/cosine_scaled_reward/std": 1.4903264045715332, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0479910714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 738.3370971679688, "completions/mean_terminated_length": 672.3165283203125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.7244094488188977, "grad_norm": 0.4109092056751251, "kl": 0.013885498046875, "learning_rate": 8.080244100601821e-07, "loss": 0.1301, "num_tokens": 84749041.0, "reward": 0.6942418217658997, "reward_std": 0.9705992341041565, "rewards/cosine_scaled_reward/mean": 0.6942418217658997, "rewards/cosine_scaled_reward/std": 1.488061785697937, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0647321428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 739.8873291015625, "completions/mean_terminated_length": 649.3496704101562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.7307086614173228, "grad_norm": 0.5201711058616638, "kl": 0.0142669677734375, "learning_rate": 8.036488616309183e-07, "loss": 0.1653, "num_tokens": 85548028.0, "reward": 0.7681640982627869, "reward_std": 1.0258592367172241, "rewards/cosine_scaled_reward/mean": 0.7681640982627869, "rewards/cosine_scaled_reward/std": 1.4761563539505005, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056919642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 743.7388916015625, "completions/mean_terminated_length": 665.0201416015625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.7370078740157481, "grad_norm": 0.31872037053108215, "kl": 0.0137939453125, "learning_rate": 7.992361570870287e-07, "loss": 0.0601, "num_tokens": 86343826.0, "reward": 0.6071450114250183, "reward_std": 0.8564595580101013, "rewards/cosine_scaled_reward/mean": 0.6071449518203735, "rewards/cosine_scaled_reward/std": 1.4969135522842407, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 694.1339721679688, "completions/mean_terminated_length": 617.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.7433070866141732, "grad_norm": 0.34207943081855774, "kl": 0.0148162841796875, "learning_rate": 7.947868363908728e-07, "loss": 0.1579, "num_tokens": 87078410.0, "reward": 0.8284109830856323, "reward_std": 0.9332945942878723, "rewards/cosine_scaled_reward/mean": 0.8284109830856323, "rewards/cosine_scaled_reward/std": 1.4639177322387695, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 796.0949096679688, "completions/mean_terminated_length": 699.7944946289062, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7496062992125985, "grad_norm": 0.34661048650741577, "kl": 0.0142364501953125, "learning_rate": 7.903014439853603e-07, "loss": 0.1492, "num_tokens": 87927503.0, "reward": 0.6473815441131592, "reward_std": 0.9624386429786682, "rewards/cosine_scaled_reward/mean": 0.6473814845085144, "rewards/cosine_scaled_reward/std": 1.4935013055801392, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 709.3906860351562, "completions/mean_terminated_length": 633.6203002929688, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.7559055118110236, "grad_norm": 0.3804778456687927, "kl": 0.01534271240234375, "learning_rate": 7.857805287273305e-07, "loss": 0.1421, "num_tokens": 88694285.0, "reward": 0.6875464916229248, "reward_std": 0.925369381904602, "rewards/cosine_scaled_reward/mean": 0.68754643201828, "rewards/cosine_scaled_reward/std": 1.4890048503875732, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0535714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 735.6484985351562, "completions/mean_terminated_length": 661.3643798828125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.7622047244094489, "grad_norm": 0.47416040301322937, "kl": 0.0153656005859375, "learning_rate": 7.812246438203903e-07, "loss": 0.1223, "num_tokens": 89475234.0, "reward": 0.634546160697937, "reward_std": 0.9282253980636597, "rewards/cosine_scaled_reward/mean": 0.634546160697937, "rewards/cosine_scaled_reward/std": 1.4941778182983398, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 775.114990234375, "completions/mean_terminated_length": 678.8463134765625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.768503937007874, "grad_norm": 0.41114383935928345, "kl": 0.016082763671875, "learning_rate": 7.766343467472218e-07, "loss": 0.0932, "num_tokens": 90299369.0, "reward": 0.48326361179351807, "reward_std": 0.9238112568855286, "rewards/cosine_scaled_reward/mean": 0.4832635819911957, "rewards/cosine_scaled_reward/std": 1.500690221786499, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 660.1171875, "completions/mean_terminated_length": 567.5916748046875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7748031496062993, "grad_norm": 0.3870268762111664, "kl": 0.01950836181640625, "learning_rate": 7.720101992013661e-07, "loss": 0.1427, "num_tokens": 91015906.0, "reward": 0.8052693009376526, "reward_std": 0.9232622385025024, "rewards/cosine_scaled_reward/mean": 0.8052692413330078, "rewards/cosine_scaled_reward/std": 1.4687471389770508, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0680803571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 805.6417846679688, "completions/mean_terminated_length": 714.8826293945312, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.7811023622047244, "grad_norm": 0.39262351393699646, "kl": 0.019805908203125, "learning_rate": 7.673527670184901e-07, "loss": 0.0868, "num_tokens": 91876785.0, "reward": 0.6843234896659851, "reward_std": 0.8282831907272339, "rewards/cosine_scaled_reward/mean": 0.6843234300613403, "rewards/cosine_scaled_reward/std": 1.4892817735671997, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0926339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 779.9799194335938, "completions/mean_terminated_length": 650.5264282226562, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7874015748031497, "grad_norm": 0.4867366850376129, "kl": 0.02044677734375, "learning_rate": 7.626626201071493e-07, "loss": 0.1882, "num_tokens": 92693375.0, "reward": 0.6739551424980164, "reward_std": 0.9228638410568237, "rewards/cosine_scaled_reward/mean": 0.6739550828933716, "rewards/cosine_scaled_reward/std": 1.4905028343200684, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0948660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 810.3303833007812, "completions/mean_terminated_length": 680.611572265625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.7937007874015748, "grad_norm": 116.46406555175781, "kl": 0.028717041015625, "learning_rate": 7.5794033237905e-07, "loss": 0.1429, "num_tokens": 93564199.0, "reward": 0.610503077507019, "reward_std": 0.9900833964347839, "rewards/cosine_scaled_reward/mean": 0.610503077507019, "rewards/cosine_scaled_reward/std": 1.4967402219772339, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1127232142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 809.3761596679688, "completions/mean_terminated_length": 652.016357421875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.8, "grad_norm": 0.6866064667701721, "kl": 0.0263214111328125, "learning_rate": 7.53186481678822e-07, "loss": 0.2378, "num_tokens": 94407080.0, "reward": 0.49012550711631775, "reward_std": 1.0525535345077515, "rewards/cosine_scaled_reward/mean": 0.49012547731399536, "rewards/cosine_scaled_reward/std": 1.5006331205368042, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 814.9810791015625, "completions/mean_terminated_length": 578.8709716796875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.8062992125984252, "grad_norm": 0.7681502103805542, "kl": 0.0304718017578125, "learning_rate": 7.484016497133111e-07, "loss": 0.2825, "num_tokens": 95265191.0, "reward": 0.5303881764411926, "reward_std": 1.094639778137207, "rewards/cosine_scaled_reward/mean": 0.5303881764411926, "rewards/cosine_scaled_reward/std": 1.5002450942993164, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1629464285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 891.5234985351562, "completions/mean_terminated_length": 666.39599609375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8125984251968504, "grad_norm": 0.987953245639801, "kl": 0.0319671630859375, "learning_rate": 7.435864219803982e-07, "loss": 0.264, "num_tokens": 96208188.0, "reward": 0.49702638387680054, "reward_std": 1.135278582572937, "rewards/cosine_scaled_reward/mean": 0.49702638387680054, "rewards/cosine_scaled_reward/std": 1.5005019903182983, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2645089285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1011.3906860351562, "completions/mean_terminated_length": 638.5887451171875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8188976377952756, "grad_norm": 0.9615009427070618, "kl": 0.042144775390625, "learning_rate": 7.387413876973543e-07, "loss": 0.3279, "num_tokens": 97255322.0, "reward": 0.3463992476463318, "reward_std": 1.0665947198867798, "rewards/cosine_scaled_reward/mean": 0.3463992476463318, "rewards/cosine_scaled_reward/std": 1.4925742149353027, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3381696428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1115.0926513671875, "completions/mean_terminated_length": 638.4131469726562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8251968503937008, "grad_norm": 1.6049638986587524, "kl": 0.05584716796875, "learning_rate": 7.338671397287408e-07, "loss": 0.3132, "num_tokens": 98388477.0, "reward": 0.15194740891456604, "reward_std": 1.066095232963562, "rewards/cosine_scaled_reward/mean": 0.15194739401340485, "rewards/cosine_scaled_reward/std": 1.4597177505493164, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4084821428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1218.71875, "completions/mean_terminated_length": 646.0452880859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.831496062992126, "grad_norm": 2.0752828121185303, "kl": 0.065032958984375, "learning_rate": 7.289642745138637e-07, "loss": 0.3363, "num_tokens": 99606193.0, "reward": 0.1383928656578064, "reward_std": 1.1820039749145508, "rewards/cosine_scaled_reward/mean": 0.1383928507566452, "rewards/cosine_scaled_reward/std": 1.4565742015838623, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5379464285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1383.8248291015625, "completions/mean_terminated_length": 610.5579833984375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8377952755905512, "grad_norm": 2.3217105865478516, "kl": 0.09649658203125, "learning_rate": 7.240333919937892e-07, "loss": 0.2741, "num_tokens": 100981460.0, "reward": -0.22238367795944214, "reward_std": 1.0377624034881592, "rewards/cosine_scaled_reward/mean": -0.22238366305828094, "rewards/cosine_scaled_reward/std": 1.314494013786316, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5658482142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1403.39404296875, "completions/mean_terminated_length": 563.251953125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8440944881889764, "grad_norm": 146.2183837890625, "kl": 1.4737548828125, "learning_rate": 7.19075095537933e-07, "loss": 0.3846, "num_tokens": 102352565.0, "reward": -0.1662946492433548, "reward_std": 1.1486141681671143, "rewards/cosine_scaled_reward/mean": -0.1662946492433548, "rewards/cosine_scaled_reward/std": 1.3446446657180786, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6975446428571428, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1610.3739013671875, "completions/mean_terminated_length": 601.0885620117188, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.8503937007874016, "grad_norm": 3.1824281215667725, "kl": 0.18603515625, "learning_rate": 7.140899918702275e-07, "loss": 0.2155, "num_tokens": 103938276.0, "reward": -0.5144848823547363, "reward_std": 0.8551457524299622, "rewards/cosine_scaled_reward/mean": -0.5144848227500916, "rewards/cosine_scaled_reward/std": 1.105492115020752, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7209821428571428, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1642.0860595703125, "completions/mean_terminated_length": 593.2040405273438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.8566929133858268, "grad_norm": 3.775615930557251, "kl": 0.2728271484375, "learning_rate": 7.090786909948809e-07, "loss": 0.214, "num_tokens": 105547761.0, "reward": -0.5837053656578064, "reward_std": 0.8053549528121948, "rewards/cosine_scaled_reward/mean": -0.5943182110786438, "rewards/cosine_scaled_reward/std": 1.0264818668365479, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8459821428571428, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1817.8974609375, "completions/mean_terminated_length": 554.0, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.862992125984252, "grad_norm": 3.16469407081604, "kl": 0.44287109375, "learning_rate": 7.040418061217324e-07, "loss": 0.1551, "num_tokens": 107305973.0, "reward": -0.7589171528816223, "reward_std": 0.5782804489135742, "rewards/cosine_scaled_reward/mean": -0.7589171528816223, "rewards/cosine_scaled_reward/std": 0.8159881234169006, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9040178571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1906.56591796875, "completions/mean_terminated_length": 574.4534912109375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.8692913385826772, "grad_norm": 5.566361427307129, "kl": 0.6162109375, "learning_rate": 6.989799535912181e-07, "loss": 0.087, "num_tokens": 109144544.0, "reward": -0.9062500596046448, "reward_std": 0.27266156673431396, "rewards/cosine_scaled_reward/mean": -0.90625, "rewards/cosine_scaled_reward/std": 0.5222694277763367, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8973214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1899.40966796875, "completions/mean_terminated_length": 600.8587036132812, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8755905511811024, "grad_norm": 11.27455997467041, "kl": 0.79248046875, "learning_rate": 6.93893752798951e-07, "loss": 0.0864, "num_tokens": 110986895.0, "reward": -0.9363839626312256, "reward_std": 0.22900153696537018, "rewards/cosine_scaled_reward/mean": -0.9363839030265808, "rewards/cosine_scaled_reward/std": 0.4324464797973633, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8303571428571428, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1786.923095703125, "completions/mean_terminated_length": 509.0197448730469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.8818897637795275, "grad_norm": 8.36593246459961, "kl": 0.88232421875, "learning_rate": 6.887838261199292e-07, "loss": 0.1019, "num_tokens": 112728170.0, "reward": -0.9129464626312256, "reward_std": 0.26700180768966675, "rewards/cosine_scaled_reward/mean": -0.9129464030265808, "rewards/cosine_scaled_reward/std": 0.5038508772850037, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1737.1707763671875, "completions/mean_terminated_length": 517.7637329101562, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.8881889763779528, "grad_norm": 8.047481536865234, "kl": 0.94189453125, "learning_rate": 6.836507988323784e-07, "loss": 0.1031, "num_tokens": 114425763.0, "reward": -0.9095982313156128, "reward_std": 0.28521886467933655, "rewards/cosine_scaled_reward/mean": -0.9095982313156128, "rewards/cosine_scaled_reward/std": 0.5131537914276123, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7176339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1619.587158203125, "completions/mean_terminated_length": 530.7747192382812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.8944881889763779, "grad_norm": 8.854314804077148, "kl": 0.978515625, "learning_rate": 6.784952990412393e-07, "loss": 0.1075, "num_tokens": 116005153.0, "reward": -0.8627232313156128, "reward_std": 0.3768954873085022, "rewards/cosine_scaled_reward/mean": -0.8627232313156128, "rewards/cosine_scaled_reward/std": 0.6272356510162354, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4508928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1176.7098388671875, "completions/mean_terminated_length": 461.2601318359375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9007874015748032, "grad_norm": 665.7935791015625, "kl": 0.65087890625, "learning_rate": 6.733179576013097e-07, "loss": 0.1599, "num_tokens": 117187645.0, "reward": -0.691964328289032, "reward_std": 0.697722852230072, "rewards/cosine_scaled_reward/mean": -0.6919642686843872, "rewards/cosine_scaled_reward/std": 0.9111244082450867, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1132.872802734375, "completions/mean_terminated_length": 518.2350463867188, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.9070866141732283, "grad_norm": 571.6249389648438, "kl": 0.51708984375, "learning_rate": 6.681194080400495e-07, "loss": 0.1361, "num_tokens": 118343115.0, "reward": -0.7488839626312256, "reward_std": 0.5547734498977661, "rewards/cosine_scaled_reward/mean": -0.7488839030265808, "rewards/cosine_scaled_reward/std": 0.8313003182411194, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4486607142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1147.0926513671875, "completions/mean_terminated_length": 413.9656066894531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9133858267716536, "grad_norm": 25.801456451416016, "kl": 0.84716796875, "learning_rate": 6.629002864800588e-07, "loss": 0.1159, "num_tokens": 119498350.0, "reward": -0.7589285969734192, "reward_std": 0.5432791113853455, "rewards/cosine_scaled_reward/mean": -0.7589285969734192, "rewards/cosine_scaled_reward/std": 0.8159914016723633, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3939732142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1025.8795166015625, "completions/mean_terminated_length": 361.4070129394531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9196850393700787, "grad_norm": 1.8065078258514404, "kl": 0.93115234375, "learning_rate": 6.576612315612386e-07, "loss": 0.1227, "num_tokens": 120553410.0, "reward": -0.8292410969734192, "reward_std": 0.4767807126045227, "rewards/cosine_scaled_reward/mean": -0.8292410969734192, "rewards/cosine_scaled_reward/std": 0.6954552531242371, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3102678571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 873.575927734375, "completions/mean_terminated_length": 345.27508544921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.925984251968504, "grad_norm": 2.2588658332824707, "kl": 0.89501953125, "learning_rate": 6.524028843626433e-07, "loss": 0.1051, "num_tokens": 121469110.0, "reward": -0.7756592035293579, "reward_std": 0.5666614770889282, "rewards/cosine_scaled_reward/mean": -0.7756591439247131, "rewards/cosine_scaled_reward/std": 0.7895299792289734, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2522321428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 786.1641235351562, "completions/mean_terminated_length": 360.52984619140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9322834645669291, "grad_norm": 21.380773544311523, "kl": 0.92578125, "learning_rate": 6.47125888324035e-07, "loss": -0.2447, "num_tokens": 122307657.0, "reward": -0.8401402831077576, "reward_std": 0.39588436484336853, "rewards/cosine_scaled_reward/mean": -0.8401403427124023, "rewards/cosine_scaled_reward/std": 0.6686471104621887, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1897321428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 673.6920166015625, "completions/mean_terminated_length": 351.8843078613281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9385826771653544, "grad_norm": 21.64946746826172, "kl": 0.84326171875, "learning_rate": 6.418308891671484e-07, "loss": -0.312, "num_tokens": 123060837.0, "reward": -0.6996389627456665, "reward_std": 0.5653135776519775, "rewards/cosine_scaled_reward/mean": -0.6996389031410217, "rewards/cosine_scaled_reward/std": 0.8859981298446655, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1361607142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 604.5279541015625, "completions/mean_terminated_length": 377.0038757324219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9448818897637795, "grad_norm": 15.67458438873291, "kl": 0.6279296875, "learning_rate": 6.365185348166768e-07, "loss": -0.2435, "num_tokens": 123715518.0, "reward": -0.6860414147377014, "reward_std": 0.6978532075881958, "rewards/cosine_scaled_reward/mean": -0.6860413551330566, "rewards/cosine_scaled_reward/std": 0.8987053036689758, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 675.2667846679688, "completions/mean_terminated_length": 390.3598327636719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9511811023622048, "grad_norm": 11.492012023925781, "kl": 0.377685546875, "learning_rate": 6.311894753209895e-07, "loss": -0.2775, "num_tokens": 124447101.0, "reward": -0.6155981421470642, "reward_std": 0.7278788685798645, "rewards/cosine_scaled_reward/mean": -0.6155981421470642, "rewards/cosine_scaled_reward/std": 0.9894371032714844, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 692.622802734375, "completions/mean_terminated_length": 411.31805419921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9574803149606299, "grad_norm": 3.143880605697632, "kl": 0.25732421875, "learning_rate": 6.258443627725867e-07, "loss": -0.0845, "num_tokens": 125219899.0, "reward": -0.4098784625530243, "reward_std": 0.891829788684845, "rewards/cosine_scaled_reward/mean": -0.4098784923553467, "rewards/cosine_scaled_reward/std": 1.1879875659942627, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1294642857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 643.4051513671875, "completions/mean_terminated_length": 434.5166931152344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9637795275590552, "grad_norm": 4.117553234100342, "kl": 0.201416015625, "learning_rate": 6.204838512283071e-07, "loss": -0.1204, "num_tokens": 125923670.0, "reward": -0.35601532459259033, "reward_std": 0.9412047863006592, "rewards/cosine_scaled_reward/mean": -0.35601529479026794, "rewards/cosine_scaled_reward/std": 1.2269173860549927, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0948660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 623.4207763671875, "completions/mean_terminated_length": 474.1121826171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9700787401574803, "grad_norm": 3.315114736557007, "kl": 0.15234375, "learning_rate": 6.151085966292941e-07, "loss": -0.1847, "num_tokens": 126628463.0, "reward": -0.34353500604629517, "reward_std": 0.8719759583473206, "rewards/cosine_scaled_reward/mean": -0.3435349762439728, "rewards/cosine_scaled_reward/std": 1.236801028251648, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0970982142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 621.3303833007812, "completions/mean_terminated_length": 467.9060363769531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9763779527559056, "grad_norm": 1.6421157121658325, "kl": 0.12652587890625, "learning_rate": 6.097192567207303e-07, "loss": -0.078, "num_tokens": 127332503.0, "reward": -0.1615428477525711, "reward_std": 1.0047760009765625, "rewards/cosine_scaled_reward/mean": -0.16154281795024872, "rewards/cosine_scaled_reward/std": 1.3453506231307983, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 650.7098388671875, "completions/mean_terminated_length": 519.3406982421875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9826771653543307, "grad_norm": 2.226269006729126, "kl": 0.10272216796875, "learning_rate": 6.043164909713532e-07, "loss": -0.0363, "num_tokens": 128044803.0, "reward": -0.22488074004650116, "reward_std": 0.929477870464325, "rewards/cosine_scaled_reward/mean": -0.22488072514533997, "rewards/cosine_scaled_reward/std": 1.3119038343429565, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0770089285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 648.5413208007812, "completions/mean_terminated_length": 531.7787475585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.988976377952756, "grad_norm": 2.474903106689453, "kl": 0.09051513671875, "learning_rate": 5.989009604927586e-07, "loss": -0.138, "num_tokens": 128762616.0, "reward": -0.07482035458087921, "reward_std": 0.9417048096656799, "rewards/cosine_scaled_reward/mean": -0.07482033967971802, "rewards/cosine_scaled_reward/std": 1.385107398033142, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.047330097087378675, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 564.5715942382812, "completions/mean_terminated_length": 490.87261962890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.9952755905511811, "grad_norm": 5.221060276031494, "kl": 0.131103515625, "learning_rate": 5.934733279585036e-07, "loss": -0.1088, "num_tokens": 129405493.0, "reward": -0.10780435800552368, "reward_std": 0.9690559506416321, "rewards/cosine_scaled_reward/mean": -0.10780435055494308, "rewards/cosine_scaled_reward/std": 1.3703655004501343, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 537.7467041015625, "completions/mean_terminated_length": 476.3542175292969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0062992125984251, "grad_norm": 4.053348541259766, "kl": 0.14080810546875, "learning_rate": 5.880342575230181e-07, "loss": -0.1351, "num_tokens": 130008786.0, "reward": 0.06913615763187408, "reward_std": 1.0828073024749756, "rewards/cosine_scaled_reward/mean": 0.06913615763187408, "rewards/cosine_scaled_reward/std": 1.4362900257110596, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 2048.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 499.8114013671875, "completions/mean_terminated_length": 464.464599609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0125984251968503, "grad_norm": 4.627394676208496, "kl": 0.1671142578125, "learning_rate": 5.825844147403352e-07, "loss": -0.1719, "num_tokens": 130603513.0, "reward": -0.08518640697002411, "reward_std": 1.049144983291626, "rewards/cosine_scaled_reward/mean": -0.0851864144206047, "rewards/cosine_scaled_reward/std": 1.3810899257659912, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 541.7254638671875, "completions/mean_terminated_length": 510.8451232910156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0188976377952756, "grad_norm": 5.731709957122803, "kl": 0.164306640625, "learning_rate": 5.771244664826511e-07, "loss": -0.0978, "num_tokens": 131220211.0, "reward": -0.02823840081691742, "reward_std": 1.049377202987671, "rewards/cosine_scaled_reward/mean": -0.02823840081691742, "rewards/cosine_scaled_reward/std": 1.4038633108139038, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0345982142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 541.1127319335938, "completions/mean_terminated_length": 487.1086730957031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0251968503937008, "grad_norm": 4.804685115814209, "kl": 0.2054443359375, "learning_rate": 5.71655080858722e-07, "loss": -0.0506, "num_tokens": 131831960.0, "reward": 0.05900329723954201, "reward_std": 1.0088309049606323, "rewards/cosine_scaled_reward/mean": 0.0590033121407032, "rewards/cosine_scaled_reward/std": 1.4333640336990356, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052455357142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 578.4163208007812, "completions/mean_terminated_length": 497.061279296875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.031496062992126, "grad_norm": 7.659826278686523, "kl": 0.22412109375, "learning_rate": 5.661769271321113e-07, "loss": -0.0245, "num_tokens": 132480317.0, "reward": 0.09290509670972824, "reward_std": 1.1230714321136475, "rewards/cosine_scaled_reward/mean": 0.09290510416030884, "rewards/cosine_scaled_reward/std": 1.4430676698684692, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.049107142857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 579.7467041015625, "completions/mean_terminated_length": 503.9213562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0377952755905513, "grad_norm": 13.899866104125977, "kl": 0.27783203125, "learning_rate": 5.606906756392949e-07, "loss": -0.0631, "num_tokens": 133122762.0, "reward": 0.048874642699956894, "reward_std": 1.049883484840393, "rewards/cosine_scaled_reward/mean": 0.04887465760111809, "rewards/cosine_scaled_reward/std": 1.4304143190383911, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060267857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 570.8694458007812, "completions/mean_terminated_length": 476.1365966796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0440944881889764, "grad_norm": 13.861762046813965, "kl": 0.345703125, "learning_rate": 5.551969977076349e-07, "loss": -0.013, "num_tokens": 133775221.0, "reward": 0.049550432711839676, "reward_std": 1.0978636741638184, "rewards/cosine_scaled_reward/mean": 0.049550436437129974, "rewards/cosine_scaled_reward/std": 1.4299269914627075, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 663.302490234375, "completions/mean_terminated_length": 538.64599609375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.0503937007874016, "grad_norm": 5.041938781738281, "kl": 0.44580078125, "learning_rate": 5.49696565573233e-07, "loss": -0.0428, "num_tokens": 134495524.0, "reward": -0.2152213752269745, "reward_std": 1.0333141088485718, "rewards/cosine_scaled_reward/mean": -0.21522139012813568, "rewards/cosine_scaled_reward/std": 1.3177170753479004, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 702.8906860351562, "completions/mean_terminated_length": 537.7017211914062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0566929133858267, "grad_norm": 10.903532981872559, "kl": 0.56884765625, "learning_rate": 5.441900522986712e-07, "loss": 0.0091, "num_tokens": 135258114.0, "reward": -0.14526759088039398, "reward_std": 1.0364834070205688, "rewards/cosine_scaled_reward/mean": -0.14526759088039398, "rewards/cosine_scaled_reward/std": 1.3537439107894897, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1261160714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 722.075927734375, "completions/mean_terminated_length": 530.7228393554688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0629921259842519, "grad_norm": 11.074158668518066, "kl": 0.7021484375, "learning_rate": 5.38678131690653e-07, "loss": 0.0007, "num_tokens": 136034406.0, "reward": -0.21496644616127014, "reward_std": 1.0454844236373901, "rewards/cosine_scaled_reward/mean": -0.21496644616127014, "rewards/cosine_scaled_reward/std": 1.317626714706421, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1618303571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 784.0301513671875, "completions/mean_terminated_length": 539.9879760742188, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.0692913385826772, "grad_norm": 18.679859161376953, "kl": 0.91455078125, "learning_rate": 5.33161478217552e-07, "loss": -0.0063, "num_tokens": 136861985.0, "reward": -0.43908149003982544, "reward_std": 0.8596888184547424, "rewards/cosine_scaled_reward/mean": -0.43908146023750305, "rewards/cosine_scaled_reward/std": 1.1680676937103271, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1908482142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 843.6439819335938, "completions/mean_terminated_length": 559.5820922851562, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 1.0755905511811024, "grad_norm": 8.620196342468262, "kl": 1.0087890625, "learning_rate": 5.27640766926881e-07, "loss": 0.0926, "num_tokens": 137754034.0, "reward": -0.37983787059783936, "reward_std": 0.88871169090271, "rewards/cosine_scaled_reward/mean": -0.37983784079551697, "rewards/cosine_scaled_reward/std": 1.2146024703979492, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2243303571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 887.3605346679688, "completions/mean_terminated_length": 551.6935424804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.0818897637795275, "grad_norm": 9.824555397033691, "kl": 1.1435546875, "learning_rate": 5.221166733626894e-07, "loss": 0.1494, "num_tokens": 138682053.0, "reward": -0.32001835107803345, "reward_std": 0.9776201248168945, "rewards/cosine_scaled_reward/mean": -0.32001832127571106, "rewards/cosine_scaled_reward/std": 1.2563639879226685, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2533482142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 933.0123291015625, "completions/mean_terminated_length": 554.68310546875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.0881889763779529, "grad_norm": 7.74678373336792, "kl": 1.2822265625, "learning_rate": 5.165898734828995e-07, "loss": 0.1787, "num_tokens": 139647792.0, "reward": -0.410632461309433, "reward_std": 0.9149812459945679, "rewards/cosine_scaled_reward/mean": -0.410632461309433, "rewards/cosine_scaled_reward/std": 1.192514181137085, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1042.896240234375, "completions/mean_terminated_length": 586.0308227539062, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.094488188976378, "grad_norm": 7.497556209564209, "kl": 1.419921875, "learning_rate": 5.110610435765934e-07, "loss": 0.1739, "num_tokens": 140712563.0, "reward": -0.5546740293502808, "reward_std": 0.7227898240089417, "rewards/cosine_scaled_reward/mean": -0.554673969745636, "rewards/cosine_scaled_reward/std": 1.0671894550323486, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3080357142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1014.2254638671875, "completions/mean_terminated_length": 554.029052734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1007874015748031, "grad_norm": 6.004925727844238, "kl": 1.466796875, "learning_rate": 5.055308601812578e-07, "loss": 0.1959, "num_tokens": 141756109.0, "reward": -0.5145089626312256, "reward_std": 0.8673759698867798, "rewards/cosine_scaled_reward/mean": -0.5145089030265808, "rewards/cosine_scaled_reward/std": 1.1055024862289429, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2533482142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 937.6529541015625, "completions/mean_terminated_length": 560.8983764648438, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 1.1070866141732283, "grad_norm": 16.494152069091797, "kl": 1.650390625, "learning_rate": 5e-07, "loss": 0.1548, "num_tokens": 142716854.0, "reward": -0.611550509929657, "reward_std": 0.7335640788078308, "rewards/cosine_scaled_reward/mean": -0.6115504503250122, "rewards/cosine_scaled_reward/std": 1.0076801776885986, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2566964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 931.55810546875, "completions/mean_terminated_length": 546.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1133858267716534, "grad_norm": 7.309991359710693, "kl": 1.529296875, "learning_rate": 4.944691398187422e-07, "loss": 0.1781, "num_tokens": 143694762.0, "reward": -0.6149553656578064, "reward_std": 0.7380089163780212, "rewards/cosine_scaled_reward/mean": -0.6149553656578064, "rewards/cosine_scaled_reward/std": 1.003991723060608, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2131696428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 856.0982666015625, "completions/mean_terminated_length": 533.185791015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1196850393700788, "grad_norm": 8.146985054016113, "kl": 1.611328125, "learning_rate": 4.889389564234066e-07, "loss": 0.1488, "num_tokens": 144588786.0, "reward": -0.6183035373687744, "reward_std": 0.7189446091651917, "rewards/cosine_scaled_reward/mean": -0.6183034777641296, "rewards/cosine_scaled_reward/std": 1.000256896018982, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1495535714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 721.6049194335938, "completions/mean_terminated_length": 488.3543395996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.125984251968504, "grad_norm": 11.923486709594727, "kl": 1.490234375, "learning_rate": 4.834101265171005e-07, "loss": 0.0083, "num_tokens": 145371456.0, "reward": -0.47316575050354004, "reward_std": 0.8562769293785095, "rewards/cosine_scaled_reward/mean": -0.47316572070121765, "rewards/cosine_scaled_reward/std": 1.1405795812606812, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 664.8192138671875, "completions/mean_terminated_length": 455.0308532714844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.132283464566929, "grad_norm": 9.277202606201172, "kl": 1.2861328125, "learning_rate": 4.778833266373106e-07, "loss": -0.0526, "num_tokens": 146098318.0, "reward": -0.5175777077674866, "reward_std": 0.8472175002098083, "rewards/cosine_scaled_reward/mean": -0.5175777077674866, "rewards/cosine_scaled_reward/std": 1.0977907180786133, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1551339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 724.489990234375, "completions/mean_terminated_length": 481.4676513671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1385826771653544, "grad_norm": 7.163623332977295, "kl": 1.0029296875, "learning_rate": 4.7235923307311906e-07, "loss": 0.0004, "num_tokens": 146870917.0, "reward": -0.34621867537498474, "reward_std": 0.9726521968841553, "rewards/cosine_scaled_reward/mean": -0.34621864557266235, "rewards/cosine_scaled_reward/std": 1.2381350994110107, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1049107142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 627.3973388671875, "completions/mean_terminated_length": 460.8927917480469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1448818897637796, "grad_norm": 10.082908630371094, "kl": 0.9375, "learning_rate": 4.6683852178244817e-07, "loss": 0.0122, "num_tokens": 147552937.0, "reward": -0.2516986131668091, "reward_std": 1.0554618835449219, "rewards/cosine_scaled_reward/mean": -0.2516985833644867, "rewards/cosine_scaled_reward/std": 1.2968847751617432, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1506696428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 723.8236694335938, "completions/mean_terminated_length": 488.9172058105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1511811023622047, "grad_norm": 8.064560890197754, "kl": 0.7548828125, "learning_rate": 4.613218683093471e-07, "loss": -0.0383, "num_tokens": 148331531.0, "reward": -0.4500025510787964, "reward_std": 0.8437191247940063, "rewards/cosine_scaled_reward/mean": -0.450002521276474, "rewards/cosine_scaled_reward/std": 1.1603206396102905, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 644.7142944335938, "completions/mean_terminated_length": 486.08197021484375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1574803149606299, "grad_norm": 6.472278594970703, "kl": 0.8740234375, "learning_rate": 4.558099477013288e-07, "loss": -0.0598, "num_tokens": 149031899.0, "reward": -0.27882441878318787, "reward_std": 0.894206166267395, "rewards/cosine_scaled_reward/mean": -0.2788243889808655, "rewards/cosine_scaled_reward/std": 1.2811487913131714, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1037946428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 630.2567138671875, "completions/mean_terminated_length": 466.05975341796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.163779527559055, "grad_norm": 7.136538982391357, "kl": 0.78369140625, "learning_rate": 4.5030343442676703e-07, "loss": 0.013, "num_tokens": 149728417.0, "reward": -0.219038724899292, "reward_std": 0.9884912371635437, "rewards/cosine_scaled_reward/mean": -0.219038724899292, "rewards/cosine_scaled_reward/std": 1.3161735534667969, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 634.265625, "completions/mean_terminated_length": 464.61749267578125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1700787401574804, "grad_norm": 6.526285171508789, "kl": 0.94677734375, "learning_rate": 4.4480300229236517e-07, "loss": -0.0554, "num_tokens": 150418991.0, "reward": -0.385952889919281, "reward_std": 0.8925806283950806, "rewards/cosine_scaled_reward/mean": -0.385952889919281, "rewards/cosine_scaled_reward/std": 1.2094550132751465, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 641.4152221679688, "completions/mean_terminated_length": 472.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1763779527559055, "grad_norm": 24.995378494262695, "kl": 0.736572265625, "learning_rate": 4.3930932436070534e-07, "loss": 0.0653, "num_tokens": 151132563.0, "reward": -0.1384265422821045, "reward_std": 1.111531138420105, "rewards/cosine_scaled_reward/mean": -0.1384265422821045, "rewards/cosine_scaled_reward/std": 1.3569211959838867, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1026785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 630.9263916015625, "completions/mean_terminated_length": 468.77362060546875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.1826771653543307, "grad_norm": 11.412405014038086, "kl": 0.91064453125, "learning_rate": 4.338230728678888e-07, "loss": -0.0276, "num_tokens": 151825393.0, "reward": -0.27033674716949463, "reward_std": 1.0509710311889648, "rewards/cosine_scaled_reward/mean": -0.27033671736717224, "rewards/cosine_scaled_reward/std": 1.284220814704895, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 600.7042846679688, "completions/mean_terminated_length": 478.0520935058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.188976377952756, "grad_norm": 14.40892219543457, "kl": 0.95556640625, "learning_rate": 4.283449191412779e-07, "loss": 0.0398, "num_tokens": 152486072.0, "reward": -0.23905393481254578, "reward_std": 1.0509700775146484, "rewards/cosine_scaled_reward/mean": -0.23905394971370697, "rewards/cosine_scaled_reward/std": 1.3049958944320679, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 638.9721069335938, "completions/mean_terminated_length": 493.2105712890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.1952755905511812, "grad_norm": 8.587828636169434, "kl": 0.794921875, "learning_rate": 4.228755335173487e-07, "loss": -0.045, "num_tokens": 153216063.0, "reward": -0.35925498604774475, "reward_std": 0.8481911420822144, "rewards/cosine_scaled_reward/mean": -0.35925498604774475, "rewards/cosine_scaled_reward/std": 1.2286874055862427, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1305803571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 704.0391235351562, "completions/mean_terminated_length": 502.1861572265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2015748031496063, "grad_norm": 8.152901649475098, "kl": 0.9990234375, "learning_rate": 4.174155852596646e-07, "loss": 0.0527, "num_tokens": 153987218.0, "reward": -0.2865358889102936, "reward_std": 1.0425032377243042, "rewards/cosine_scaled_reward/mean": -0.2865358889102936, "rewards/cosine_scaled_reward/std": 1.27761971950531, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 672.5480346679688, "completions/mean_terminated_length": 472.0345153808594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.2078740157480314, "grad_norm": 9.168877601623535, "kl": 1.3720703125, "learning_rate": 4.1196574247698184e-07, "loss": 0.1603, "num_tokens": 154706349.0, "reward": -0.1460474580526352, "reward_std": 1.021892786026001, "rewards/cosine_scaled_reward/mean": -0.1460474580526352, "rewards/cosine_scaled_reward/std": 1.354325294494629, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 733.3047485351562, "completions/mean_terminated_length": 518.1727294921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2141732283464566, "grad_norm": 8.236457824707031, "kl": 1.1708984375, "learning_rate": 4.0652667204149633e-07, "loss": 0.1351, "num_tokens": 155512830.0, "reward": -0.17286305129528046, "reward_std": 1.0575268268585205, "rewards/cosine_scaled_reward/mean": -0.17286303639411926, "rewards/cosine_scaled_reward/std": 1.3412247896194458, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1372767857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 697.7623291015625, "completions/mean_terminated_length": 482.91204833984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.220472440944882, "grad_norm": 13.081280708312988, "kl": 1.5234375, "learning_rate": 4.010990395072413e-07, "loss": 0.1504, "num_tokens": 156270681.0, "reward": -0.23280996084213257, "reward_std": 0.9587434530258179, "rewards/cosine_scaled_reward/mean": -0.23280994594097137, "rewards/cosine_scaled_reward/std": 1.3090548515319824, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 682.5670166015625, "completions/mean_terminated_length": 501.3148193359375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.226771653543307, "grad_norm": 9.536521911621094, "kl": 1.55126953125, "learning_rate": 3.956835090286468e-07, "loss": 0.0377, "num_tokens": 157020757.0, "reward": -0.35255441069602966, "reward_std": 0.8735190629959106, "rewards/cosine_scaled_reward/mean": -0.3525543808937073, "rewards/cosine_scaled_reward/std": 1.2333698272705078, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0948660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 624.279052734375, "completions/mean_terminated_length": 475.0603942871094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2330708661417322, "grad_norm": 31.1558780670166, "kl": 1.67919921875, "learning_rate": 3.9028074327926975e-07, "loss": 0.0171, "num_tokens": 157706335.0, "reward": -0.18870538473129272, "reward_std": 0.940399706363678, "rewards/cosine_scaled_reward/mean": -0.18870536983013153, "rewards/cosine_scaled_reward/std": 1.3321219682693481, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 578.078125, "completions/mean_terminated_length": 457.3598937988281, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.2393700787401576, "grad_norm": 11.43847370147705, "kl": 1.6435546875, "learning_rate": 3.8489140337070594e-07, "loss": 0.0715, "num_tokens": 158351605.0, "reward": -0.09771518409252167, "reward_std": 1.1073017120361328, "rewards/cosine_scaled_reward/mean": -0.09771519154310226, "rewards/cosine_scaled_reward/std": 1.374672532081604, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 634.375, "completions/mean_terminated_length": 501.4700927734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2456692913385827, "grad_norm": 12.567444801330566, "kl": 1.46484375, "learning_rate": 3.795161487716928e-07, "loss": 0.0189, "num_tokens": 159053941.0, "reward": -0.23867203295230865, "reward_std": 1.0107946395874023, "rewards/cosine_scaled_reward/mean": -0.23867204785346985, "rewards/cosine_scaled_reward/std": 1.3047585487365723, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0792410714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 582.1451416015625, "completions/mean_terminated_length": 455.99273681640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2519685039370079, "grad_norm": 41.585182189941406, "kl": 1.40478515625, "learning_rate": 3.741556372274133e-07, "loss": -0.0895, "num_tokens": 159702999.0, "reward": -0.24997612833976746, "reward_std": 0.9644049406051636, "rewards/cosine_scaled_reward/mean": -0.24997612833976746, "rewards/cosine_scaled_reward/std": 1.295945644378662, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0747767857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 556.560302734375, "completions/mean_terminated_length": 436.021728515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.258267716535433, "grad_norm": 27.830976486206055, "kl": 1.490234375, "learning_rate": 3.6881052467901054e-07, "loss": -0.1011, "num_tokens": 160330989.0, "reward": -0.23273612558841705, "reward_std": 0.9888551235198975, "rewards/cosine_scaled_reward/mean": -0.23273612558841705, "rewards/cosine_scaled_reward/std": 1.3051189184188843, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0613839285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 537.2288208007812, "completions/mean_terminated_length": 438.4268798828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2645669291338582, "grad_norm": 12.468254089355469, "kl": 1.302734375, "learning_rate": 3.634814651833231e-07, "loss": -0.0855, "num_tokens": 160936538.0, "reward": -0.24863407015800476, "reward_std": 1.0323801040649414, "rewards/cosine_scaled_reward/mean": -0.24863405525684357, "rewards/cosine_scaled_reward/std": 1.2952616214752197, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0926339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 608.7120971679688, "completions/mean_terminated_length": 461.773681640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2708661417322835, "grad_norm": 20.568553924560547, "kl": 1.21923828125, "learning_rate": 3.5816911083285164e-07, "loss": -0.1144, "num_tokens": 161611224.0, "reward": -0.48500585556030273, "reward_std": 0.8858336210250854, "rewards/cosine_scaled_reward/mean": -0.48500585556030273, "rewards/cosine_scaled_reward/std": 1.124108910560608, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0982142857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 620.3125, "completions/mean_terminated_length": 464.82177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2771653543307087, "grad_norm": 13.276288986206055, "kl": 1.01123046875, "learning_rate": 3.5287411167596505e-07, "loss": -0.1358, "num_tokens": 162310720.0, "reward": -0.41049110889434814, "reward_std": 0.8386304974555969, "rewards/cosine_scaled_reward/mean": -0.41049107909202576, "rewards/cosine_scaled_reward/std": 1.1842224597930908, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 604.693115234375, "completions/mean_terminated_length": 441.5366516113281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2834645669291338, "grad_norm": 10.093551635742188, "kl": 0.89501953125, "learning_rate": 3.475971156373567e-07, "loss": -0.0996, "num_tokens": 162971117.0, "reward": -0.3916541337966919, "reward_std": 0.9409958124160767, "rewards/cosine_scaled_reward/mean": -0.3916541039943695, "rewards/cosine_scaled_reward/std": 1.199949026107788, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1316964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 651.216552734375, "completions/mean_terminated_length": 439.36505126953125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2897637795275592, "grad_norm": 8.370497703552246, "kl": 0.668701171875, "learning_rate": 3.423387684387615e-07, "loss": -0.1193, "num_tokens": 163680943.0, "reward": -0.34324315190315247, "reward_std": 0.9392971992492676, "rewards/cosine_scaled_reward/mean": -0.3432431221008301, "rewards/cosine_scaled_reward/std": 1.2325228452682495, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1573660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 664.4074096679688, "completions/mean_terminated_length": 406.01458740234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.2960629921259843, "grad_norm": 28.92987060546875, "kl": 0.61181640625, "learning_rate": 3.3709971351994126e-07, "loss": 0.061, "num_tokens": 164400812.0, "reward": -0.2606620490550995, "reward_std": 1.1027953624725342, "rewards/cosine_scaled_reward/mean": -0.2606620788574219, "rewards/cosine_scaled_reward/std": 1.2864234447479248, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2098214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 746.3917846679688, "completions/mean_terminated_length": 400.7669372558594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3023622047244094, "grad_norm": 13.491728782653809, "kl": 0.46875, "learning_rate": 3.318805919599506e-07, "loss": -0.2178, "num_tokens": 165201771.0, "reward": -0.4426966905593872, "reward_std": 0.7850523591041565, "rewards/cosine_scaled_reward/mean": -0.4426967203617096, "rewards/cosine_scaled_reward/std": 1.14866304397583, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 803.7210083007812, "completions/mean_terminated_length": 436.910400390625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3086614173228346, "grad_norm": 9.700703620910645, "kl": 0.364013671875, "learning_rate": 3.266820423986904e-07, "loss": -0.1382, "num_tokens": 166057569.0, "reward": -0.4414040148258209, "reward_std": 0.8158534169197083, "rewards/cosine_scaled_reward/mean": -0.4414040148258209, "rewards/cosine_scaled_reward/std": 1.1522157192230225, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2377232142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 850.9308471679688, "completions/mean_terminated_length": 477.61346435546875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3149606299212597, "grad_norm": 15.584068298339844, "kl": 0.3076171875, "learning_rate": 3.215047009587608e-07, "loss": -0.0463, "num_tokens": 166948723.0, "reward": -0.4017762839794159, "reward_std": 0.9151403307914734, "rewards/cosine_scaled_reward/mean": -0.4017762839794159, "rewards/cosine_scaled_reward/std": 1.192360758781433, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2678571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 906.7891235351562, "completions/mean_terminated_length": 489.2728576660156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.321259842519685, "grad_norm": 8.948128700256348, "kl": 0.3023681640625, "learning_rate": 3.163492011676217e-07, "loss": -0.1143, "num_tokens": 167890006.0, "reward": -0.4780765175819397, "reward_std": 0.8043166399002075, "rewards/cosine_scaled_reward/mean": -0.4780765175819397, "rewards/cosine_scaled_reward/std": 1.1253687143325806, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2466517857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 859.8392944335938, "completions/mean_terminated_length": 470.82666015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3275590551181102, "grad_norm": 20.913394927978516, "kl": 0.2698974609375, "learning_rate": 3.112161738800708e-07, "loss": 0.0274, "num_tokens": 168804806.0, "reward": -0.25251245498657227, "reward_std": 1.0358350276947021, "rewards/cosine_scaled_reward/mean": -0.2525124251842499, "rewards/cosine_scaled_reward/std": 1.2934397459030151, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2243303571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 849.2835083007812, "completions/mean_terminated_length": 502.60430908203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3338582677165354, "grad_norm": 29.57670783996582, "kl": 0.3201904296875, "learning_rate": 3.0610624720104885e-07, "loss": 0.084, "num_tokens": 169694404.0, "reward": -0.0885855183005333, "reward_std": 1.1232795715332031, "rewards/cosine_scaled_reward/mean": -0.08858553320169449, "rewards/cosine_scaled_reward/std": 1.375993251800537, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2220982142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 842.013427734375, "completions/mean_terminated_length": 497.6929626464844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3401574803149607, "grad_norm": 37.739898681640625, "kl": 0.4039306640625, "learning_rate": 3.010200464087818e-07, "loss": 0.0637, "num_tokens": 170570640.0, "reward": -0.10702486336231232, "reward_std": 1.1242616176605225, "rewards/cosine_scaled_reward/mean": -0.10702486336231232, "rewards/cosine_scaled_reward/std": 1.3697468042373657, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 738.1596069335938, "completions/mean_terminated_length": 470.55780029296875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3464566929133859, "grad_norm": 25.782590866088867, "kl": 0.537353515625, "learning_rate": 2.9595819387826747e-07, "loss": 0.0198, "num_tokens": 171365391.0, "reward": -0.14487382769584656, "reward_std": 1.1085338592529297, "rewards/cosine_scaled_reward/mean": -0.14487382769584656, "rewards/cosine_scaled_reward/std": 1.3499091863632202, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1104910714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 665.2779541015625, "completions/mean_terminated_length": 493.5219421386719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.352755905511811, "grad_norm": 29.355329513549805, "kl": 0.72216796875, "learning_rate": 2.909213090051191e-07, "loss": 0.0077, "num_tokens": 172101320.0, "reward": -0.05070412904024124, "reward_std": 1.0974308252334595, "rewards/cosine_scaled_reward/mean": -0.050704121589660645, "rewards/cosine_scaled_reward/std": 1.3941594362258911, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0993303571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 655.5390625, "completions/mean_terminated_length": 501.97149658203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3590551181102362, "grad_norm": 15.642478942871094, "kl": 0.658447265625, "learning_rate": 2.859100081297724e-07, "loss": -0.0779, "num_tokens": 172841611.0, "reward": -0.1841743439435959, "reward_std": 0.9994798302650452, "rewards/cosine_scaled_reward/mean": -0.18417437374591827, "rewards/cosine_scaled_reward/std": 1.3331142663955688, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 653.40625, "completions/mean_terminated_length": 509.1379089355469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3653543307086613, "grad_norm": 29.55940055847168, "kl": 0.74853515625, "learning_rate": 2.8092490446206696e-07, "loss": 0.0411, "num_tokens": 173559431.0, "reward": 0.005901983939111233, "reward_std": 1.1592895984649658, "rewards/cosine_scaled_reward/mean": 0.005901975091546774, "rewards/cosine_scaled_reward/std": 1.4154858589172363, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 690.6953735351562, "completions/mean_terminated_length": 524.0087890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3716535433070867, "grad_norm": 21.52190399169922, "kl": 0.92822265625, "learning_rate": 2.7596660800621074e-07, "loss": 0.0483, "num_tokens": 174322294.0, "reward": -0.064623162150383, "reward_std": 1.1310359239578247, "rewards/cosine_scaled_reward/mean": -0.06462316960096359, "rewards/cosine_scaled_reward/std": 1.3891171216964722, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 620.1183471679688, "completions/mean_terminated_length": 487.7780456542969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3779527559055118, "grad_norm": 13.272688865661621, "kl": 1.056640625, "learning_rate": 2.710357254861364e-07, "loss": -0.0935, "num_tokens": 175016800.0, "reward": -0.15486857295036316, "reward_std": 0.9477849006652832, "rewards/cosine_scaled_reward/mean": -0.15486857295036316, "rewards/cosine_scaled_reward/std": 1.3487260341644287, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1026785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 636.3359375, "completions/mean_terminated_length": 474.8022155761719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.384251968503937, "grad_norm": 16.31165885925293, "kl": 1.234619140625, "learning_rate": 2.6613286027125914e-07, "loss": 0.0854, "num_tokens": 175731805.0, "reward": 0.0020337319001555443, "reward_std": 1.1841351985931396, "rewards/cosine_scaled_reward/mean": 0.002033740282058716, "rewards/cosine_scaled_reward/std": 1.4147533178329468, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1305803571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 687.7924194335938, "completions/mean_terminated_length": 483.4993591308594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3905511811023623, "grad_norm": 15.241900444030762, "kl": 1.38671875, "learning_rate": 2.6125861230264566e-07, "loss": 0.0791, "num_tokens": 176489315.0, "reward": -0.10125848650932312, "reward_std": 1.0963494777679443, "rewards/cosine_scaled_reward/mean": -0.10125849395990372, "rewards/cosine_scaled_reward/std": 1.3734389543533325, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 726.1942138671875, "completions/mean_terminated_length": 552.6237182617188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.3968503937007875, "grad_norm": 7.404560565948486, "kl": 1.3603515625, "learning_rate": 2.5641357801960184e-07, "loss": 0.1102, "num_tokens": 177269585.0, "reward": -0.1522817313671112, "reward_std": 1.0328502655029297, "rewards/cosine_scaled_reward/mean": -0.1522817462682724, "rewards/cosine_scaled_reward/std": 1.3508657217025757, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1272321428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 714.5703735351562, "completions/mean_terminated_length": 520.182861328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4031496062992126, "grad_norm": 8.131226539611816, "kl": 1.5, "learning_rate": 2.5159835028668894e-07, "loss": 0.0922, "num_tokens": 178060256.0, "reward": -0.212301105260849, "reward_std": 1.0029926300048828, "rewards/cosine_scaled_reward/mean": -0.212301105260849, "rewards/cosine_scaled_reward/std": 1.3198490142822266, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 709.6517944335938, "completions/mean_terminated_length": 518.4591674804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4094488188976377, "grad_norm": 14.496360778808594, "kl": 1.7626953125, "learning_rate": 2.4681351832117814e-07, "loss": 0.1399, "num_tokens": 178830648.0, "reward": -0.25911927223205566, "reward_std": 0.9717407822608948, "rewards/cosine_scaled_reward/mean": -0.25911927223205566, "rewards/cosine_scaled_reward/std": 1.293389081954956, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1908482142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 794.9766235351562, "completions/mean_terminated_length": 499.4358825683594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4157480314960629, "grad_norm": 16.000030517578125, "kl": 2.080078125, "learning_rate": 2.4205966762095016e-07, "loss": 0.2121, "num_tokens": 179677731.0, "reward": -0.23970577120780945, "reward_std": 1.1195229291915894, "rewards/cosine_scaled_reward/mean": -0.23970575630664825, "rewards/cosine_scaled_reward/std": 1.305370807647705, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1662946428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 754.9642944335938, "completions/mean_terminated_length": 497.0495300292969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4220472440944882, "grad_norm": 7.867815017700195, "kl": 2.3828125, "learning_rate": 2.3733737989285068e-07, "loss": 0.247, "num_tokens": 180468163.0, "reward": -0.19275090098381042, "reward_std": 1.1585314273834229, "rewards/cosine_scaled_reward/mean": -0.19275090098381042, "rewards/cosine_scaled_reward/std": 1.330829381942749, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 679.0089721679688, "completions/mean_terminated_length": 497.2844543457031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4283464566929134, "grad_norm": 9.984745979309082, "kl": 1.916015625, "learning_rate": 2.3264723298150996e-07, "loss": 0.1061, "num_tokens": 181237179.0, "reward": -0.21782931685447693, "reward_std": 1.0763907432556152, "rewards/cosine_scaled_reward/mean": -0.22178985178470612, "rewards/cosine_scaled_reward/std": 1.3146083354949951, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1350446428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 676.5926513671875, "completions/mean_terminated_length": 462.47613525390625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4346456692913385, "grad_norm": 11.298858642578125, "kl": 2.31640625, "learning_rate": 2.2798980079863384e-07, "loss": 0.1711, "num_tokens": 181973310.0, "reward": -0.2311823070049286, "reward_std": 1.0497584342956543, "rewards/cosine_scaled_reward/mean": -0.23538561165332794, "rewards/cosine_scaled_reward/std": 1.3069753646850586, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1138392857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 646.2299194335938, "completions/mean_terminated_length": 466.15362548828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4409448818897639, "grad_norm": 35.73098373413086, "kl": 2.1005859375, "learning_rate": 2.23365653252778e-07, "loss": 0.0011, "num_tokens": 182683452.0, "reward": -0.25549647212028503, "reward_std": 0.9556432962417603, "rewards/cosine_scaled_reward/mean": -0.25549647212028503, "rewards/cosine_scaled_reward/std": 1.2951745986938477, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1205357142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 664.2098388671875, "completions/mean_terminated_length": 474.55328369140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.447244094488189, "grad_norm": 29.469329833984375, "kl": 2.181640625, "learning_rate": 2.1877535617960968e-07, "loss": 0.0947, "num_tokens": 183418856.0, "reward": -0.2517828643321991, "reward_std": 1.0157699584960938, "rewards/cosine_scaled_reward/mean": -0.2517828643321991, "rewards/cosine_scaled_reward/std": 1.2969443798065186, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1026785714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 638.6763916015625, "completions/mean_terminated_length": 477.4104309082031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4535433070866142, "grad_norm": 48.24620056152344, "kl": 2.287109375, "learning_rate": 2.1421947127266947e-07, "loss": -0.0172, "num_tokens": 184123302.0, "reward": -0.34807711839675903, "reward_std": 0.8679892420768738, "rewards/cosine_scaled_reward/mean": -0.34807705879211426, "rewards/cosine_scaled_reward/std": 1.235032081604004, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 606.4140625, "completions/mean_terminated_length": 470.88037109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4598425196850393, "grad_norm": 19.941768646240234, "kl": 1.9345703125, "learning_rate": 2.0969855601463965e-07, "loss": 0.0507, "num_tokens": 184796985.0, "reward": -0.25824618339538574, "reward_std": 1.0666064023971558, "rewards/cosine_scaled_reward/mean": -0.25824615359306335, "rewards/cosine_scaled_reward/std": 1.2928942441940308, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 614.2232666015625, "completions/mean_terminated_length": 477.5061340332031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4661417322834644, "grad_norm": 36.38364791870117, "kl": 1.9814453125, "learning_rate": 2.0521316360912726e-07, "loss": -0.0086, "num_tokens": 185478049.0, "reward": -0.25805556774139404, "reward_std": 0.9266217350959778, "rewards/cosine_scaled_reward/mean": -0.25805553793907166, "rewards/cosine_scaled_reward/std": 1.2928153276443481, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 577.5011596679688, "completions/mean_terminated_length": 456.7355041503906, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.4724409448818898, "grad_norm": 26.197174072265625, "kl": 1.97509765625, "learning_rate": 2.0076384291297133e-07, "loss": 0.0475, "num_tokens": 186115810.0, "reward": -0.12041401863098145, "reward_std": 1.0578092336654663, "rewards/cosine_scaled_reward/mean": -0.12041400372982025, "rewards/cosine_scaled_reward/std": 1.3638639450073242, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 577.1819458007812, "completions/mean_terminated_length": 465.9435729980469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.478740157480315, "grad_norm": 27.090312957763672, "kl": 1.75390625, "learning_rate": 1.9635113836908167e-07, "loss": -0.0501, "num_tokens": 186768133.0, "reward": -0.22769944369792938, "reward_std": 0.9778302907943726, "rewards/cosine_scaled_reward/mean": -0.2276994287967682, "rewards/cosine_scaled_reward/std": 1.3098596334457397, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 602.958740234375, "completions/mean_terminated_length": 472.86981201171875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.48503937007874, "grad_norm": 29.3377628326416, "kl": 1.5732421875, "learning_rate": 1.9197558993981783e-07, "loss": -0.0195, "num_tokens": 187432064.0, "reward": -0.0907805934548378, "reward_std": 1.0175319910049438, "rewards/cosine_scaled_reward/mean": -0.0907806009054184, "rewards/cosine_scaled_reward/std": 1.3775309324264526, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0747767857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 599.8203125, "completions/mean_terminated_length": 482.7780456542969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4913385826771655, "grad_norm": 30.716096878051758, "kl": 1.5380859375, "learning_rate": 1.876377330409169e-07, "loss": 0.0355, "num_tokens": 188096079.0, "reward": 0.010077612474560738, "reward_std": 1.1685551404953003, "rewards/cosine_scaled_reward/mean": 0.010077609680593014, "rewards/cosine_scaled_reward/std": 1.4161264896392822, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 582.03125, "completions/mean_terminated_length": 457.796630859375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.4976377952755906, "grad_norm": 13.030389785766602, "kl": 1.29638671875, "learning_rate": 1.833380984759764e-07, "loss": -0.008, "num_tokens": 188745819.0, "reward": 0.11718709766864777, "reward_std": 1.0303460359573364, "rewards/cosine_scaled_reward/mean": 0.11718709021806717, "rewards/cosine_scaled_reward/std": 1.448854684829712, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 601.419677734375, "completions/mean_terminated_length": 471.19219970703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5039370078740157, "grad_norm": 36.063846588134766, "kl": 1.2216796875, "learning_rate": 1.790772123715028e-07, "loss": -0.0307, "num_tokens": 189409971.0, "reward": 0.08020710945129395, "reward_std": 1.0514938831329346, "rewards/cosine_scaled_reward/mean": 0.08020710200071335, "rewards/cosine_scaled_reward/std": 1.4387195110321045, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 587.857177734375, "completions/mean_terminated_length": 450.5787658691406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.510236220472441, "grad_norm": 30.80532455444336, "kl": 1.3818359375, "learning_rate": 1.7485559611253148e-07, "loss": 0.0485, "num_tokens": 190066787.0, "reward": 0.10385970026254654, "reward_std": 1.2190966606140137, "rewards/cosine_scaled_reward/mean": 0.10385970771312714, "rewards/cosine_scaled_reward/std": 1.4451911449432373, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 619.474365234375, "completions/mean_terminated_length": 485.16851806640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.516535433070866, "grad_norm": 39.22409439086914, "kl": 1.02783203125, "learning_rate": 1.706737662788277e-07, "loss": -0.058, "num_tokens": 190759772.0, "reward": -0.0937255322933197, "reward_std": 1.0495779514312744, "rewards/cosine_scaled_reward/mean": -0.09372551739215851, "rewards/cosine_scaled_reward/std": 1.3757864236831665, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0948660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 656.9453125, "completions/mean_terminated_length": 511.1504211425781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5228346456692914, "grad_norm": 23.179277420043945, "kl": 1.03369140625, "learning_rate": 1.665322345816746e-07, "loss": -0.0006, "num_tokens": 191516299.0, "reward": -0.07772157341241837, "reward_std": 1.0302178859710693, "rewards/cosine_scaled_reward/mean": -0.07772157341241837, "rewards/cosine_scaled_reward/std": 1.3834202289581299, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 626.7410888671875, "completions/mean_terminated_length": 487.4019775390625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5291338582677165, "grad_norm": 23.85130500793457, "kl": 1.12841796875, "learning_rate": 1.624315078012579e-07, "loss": -0.0371, "num_tokens": 192203363.0, "reward": -0.07777473330497742, "reward_std": 1.0998897552490234, "rewards/cosine_scaled_reward/mean": -0.07777471840381622, "rewards/cosine_scaled_reward/std": 1.383444905281067, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0725446428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 595.3616333007812, "completions/mean_terminated_length": 481.7376708984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5354330708661417, "grad_norm": 9.8560209274292, "kl": 1.13330078125, "learning_rate": 1.5837208772465326e-07, "loss": -0.0159, "num_tokens": 192875015.0, "reward": -0.14465674757957458, "reward_std": 1.1027064323425293, "rewards/cosine_scaled_reward/mean": -0.14465674757957458, "rewards/cosine_scaled_reward/std": 1.3533574342727661, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0747767857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 595.3549194335938, "completions/mean_terminated_length": 477.9517517089844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.541732283464567, "grad_norm": 20.040447235107422, "kl": 0.97607421875, "learning_rate": 1.5435447108442496e-07, "loss": -0.0603, "num_tokens": 193530709.0, "reward": -0.04615917429327965, "reward_std": 1.0335677862167358, "rewards/cosine_scaled_reward/mean": -0.04615917429327965, "rewards/cosine_scaled_reward/std": 1.3946411609649658, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1082589285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 653.046875, "completions/mean_terminated_length": 483.6971435546875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5480314960629922, "grad_norm": 15.755793571472168, "kl": 1.23974609375, "learning_rate": 1.5037914949784296e-07, "loss": 0.0051, "num_tokens": 194240943.0, "reward": -0.13129432499408722, "reward_std": 1.0558489561080933, "rewards/cosine_scaled_reward/mean": -0.13129432499408722, "rewards/cosine_scaled_reward/std": 1.3597780466079712, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 623.2824096679688, "completions/mean_terminated_length": 491.2353515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5543307086614173, "grad_norm": 25.45458221435547, "kl": 1.384521484375, "learning_rate": 1.4644660940672627e-07, "loss": 0.0341, "num_tokens": 194933308.0, "reward": -0.014127791859209538, "reward_std": 1.109133005142212, "rewards/cosine_scaled_reward/mean": -0.01412778440862894, "rewards/cosine_scaled_reward/std": 1.4083439111709595, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1004464285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 625.2767944335938, "completions/mean_terminated_length": 466.4118957519531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5606299212598427, "grad_norm": 34.377811431884766, "kl": 1.45751953125, "learning_rate": 1.425573320179188e-07, "loss": -0.0061, "num_tokens": 195623860.0, "reward": -0.03965873643755913, "reward_std": 1.0644091367721558, "rewards/cosine_scaled_reward/mean": -0.03965873643755913, "rewards/cosine_scaled_reward/std": 1.397312879562378, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0837053571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 625.8058471679688, "completions/mean_terminated_length": 495.8855285644531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5669291338582676, "grad_norm": 37.41438293457031, "kl": 1.186279296875, "learning_rate": 1.3871179324440675e-07, "loss": 0.0349, "num_tokens": 196315094.0, "reward": -0.05050932243466377, "reward_std": 1.1455857753753662, "rewards/cosine_scaled_reward/mean": -0.050509314984083176, "rewards/cosine_scaled_reward/std": 1.3940231800079346, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 604.8828125, "completions/mean_terminated_length": 482.58477783203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.573228346456693, "grad_norm": 10.482443809509277, "kl": 1.39111328125, "learning_rate": 1.3491046364708293e-07, "loss": -0.0028, "num_tokens": 196981293.0, "reward": 0.052977096289396286, "reward_std": 1.158719539642334, "rewards/cosine_scaled_reward/mean": 0.05297710373997688, "rewards/cosine_scaled_reward/std": 1.4308778047561646, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0770089285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 604.6283569335938, "completions/mean_terminated_length": 484.20196533203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.579527559055118, "grad_norm": 27.658212661743164, "kl": 1.5458984375, "learning_rate": 1.3115380837716683e-07, "loss": -0.0587, "num_tokens": 197652224.0, "reward": -0.140888974070549, "reward_std": 1.1005117893218994, "rewards/cosine_scaled_reward/mean": -0.140888974070549, "rewards/cosine_scaled_reward/std": 1.3547370433807373, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 577.8114013671875, "completions/mean_terminated_length": 437.6222839355469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 1.5858267716535432, "grad_norm": 18.967714309692383, "kl": 1.3759765625, "learning_rate": 1.2744228711928584e-07, "loss": 0.0063, "num_tokens": 198316631.0, "reward": -0.0707879513502121, "reward_std": 1.1439727544784546, "rewards/cosine_scaled_reward/mean": -0.0707879438996315, "rewards/cosine_scaled_reward/std": 1.386030912399292, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0948660714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 600.638427734375, "completions/mean_terminated_length": 448.9420471191406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5921259842519686, "grad_norm": 12.37597942352295, "kl": 2.0859375, "learning_rate": 1.2377635403522585e-07, "loss": 0.0865, "num_tokens": 198984099.0, "reward": -0.15372416377067566, "reward_std": 1.1067149639129639, "rewards/cosine_scaled_reward/mean": -0.15372416377067566, "rewards/cosine_scaled_reward/std": 1.3480288982391357, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 588.2232666015625, "completions/mean_terminated_length": 449.02691650390625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.5984251968503937, "grad_norm": 12.937677383422852, "kl": 1.830078125, "learning_rate": 1.2015645770835764e-07, "loss": 0.015, "num_tokens": 199644251.0, "reward": -0.06733787059783936, "reward_std": 1.0872905254364014, "rewards/cosine_scaled_reward/mean": -0.06733787059783936, "rewards/cosine_scaled_reward/std": 1.3873533010482788, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 603.3292846679688, "completions/mean_terminated_length": 473.2737121582031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.604724409448819, "grad_norm": 20.4193115234375, "kl": 1.68896484375, "learning_rate": 1.1658304108874573e-07, "loss": -0.0324, "num_tokens": 200326770.0, "reward": -0.2275594025850296, "reward_std": 0.9816082715988159, "rewards/cosine_scaled_reward/mean": -0.2275594025850296, "rewards/cosine_scaled_reward/std": 1.3098082542419434, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0926339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 618.0078125, "completions/mean_terminated_length": 472.0184326171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6110236220472443, "grad_norm": 53.69316864013672, "kl": 1.55517578125, "learning_rate": 1.1305654143894672e-07, "loss": -0.0259, "num_tokens": 201010777.0, "reward": -0.343801349401474, "reward_std": 0.8968811631202698, "rewards/cosine_scaled_reward/mean": -0.3438013195991516, "rewards/cosine_scaled_reward/std": 1.236782431602478, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0837053571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 577.9576416015625, "completions/mean_terminated_length": 443.6662902832031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6173228346456692, "grad_norm": 54.09641647338867, "kl": 1.740234375, "learning_rate": 1.0957739028050327e-07, "loss": 0.0163, "num_tokens": 201664147.0, "reward": -0.0305644441395998, "reward_std": 1.1066824197769165, "rewards/cosine_scaled_reward/mean": -0.030564431101083755, "rewards/cosine_scaled_reward/std": 1.4016599655151367, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 594.8928833007812, "completions/mean_terminated_length": 464.0778503417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6236220472440945, "grad_norm": 41.750972747802734, "kl": 1.7802734375, "learning_rate": 1.0614601334114098e-07, "loss": -0.0287, "num_tokens": 202334163.0, "reward": -0.15131886303424835, "reward_std": 1.011100172996521, "rewards/cosine_scaled_reward/mean": -0.15131886303424835, "rewards/cosine_scaled_reward/std": 1.3501999378204346, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 580.3381958007812, "completions/mean_terminated_length": 444.31097412109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6299212598425197, "grad_norm": 17.3424015045166, "kl": 2.0615234375, "learning_rate": 1.0276283050267392e-07, "loss": -0.0038, "num_tokens": 202981090.0, "reward": -0.2078736275434494, "reward_std": 1.0589529275894165, "rewards/cosine_scaled_reward/mean": -0.20787358283996582, "rewards/cosine_scaled_reward/std": 1.3210111856460571, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0881696428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 594.2299194335938, "completions/mean_terminated_length": 453.65728759765625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6362204724409448, "grad_norm": 13.328221321105957, "kl": 1.9267578125, "learning_rate": 9.942825574962594e-08, "loss": 0.0285, "num_tokens": 203637856.0, "reward": -0.10727537423372269, "reward_std": 1.1523621082305908, "rewards/cosine_scaled_reward/mean": -0.10727538168430328, "rewards/cosine_scaled_reward/std": 1.3700629472732544, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 538.950927734375, "completions/mean_terminated_length": 445.976318359375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6425196850393702, "grad_norm": 64.22296142578125, "kl": 1.751953125, "learning_rate": 9.614269711857281e-08, "loss": 0.0107, "num_tokens": 204248084.0, "reward": 0.05649043619632721, "reward_std": 1.1370935440063477, "rewards/cosine_scaled_reward/mean": 0.05649043247103691, "rewards/cosine_scaled_reward/std": 1.4317340850830078, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0725446428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 600.2154541015625, "completions/mean_terminated_length": 486.97113037109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6488188976377953, "grad_norm": 46.42252731323242, "kl": 1.7373046875, "learning_rate": 9.290655664821296e-08, "loss": -0.0064, "num_tokens": 204908949.0, "reward": -0.010218242183327675, "reward_std": 1.023814082145691, "rewards/cosine_scaled_reward/mean": -0.01021824311465025, "rewards/cosine_scaled_reward/std": 1.4091296195983887, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 604.7745971679688, "completions/mean_terminated_length": 469.0867004394531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6551181102362205, "grad_norm": 27.619144439697266, "kl": 1.8291015625, "learning_rate": 8.972023033017168e-08, "loss": 0.0308, "num_tokens": 205583179.0, "reward": -0.04715301841497421, "reward_std": 1.1445320844650269, "rewards/cosine_scaled_reward/mean": -0.04715301841497421, "rewards/cosine_scaled_reward/std": 1.395334005355835, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 594.4832763671875, "completions/mean_terminated_length": 467.4769592285156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6614173228346458, "grad_norm": 12.245925903320312, "kl": 1.54150390625, "learning_rate": 8.658410806054567e-08, "loss": 0.0044, "num_tokens": 206251260.0, "reward": -0.12348128110170364, "reward_std": 1.0466835498809814, "rewards/cosine_scaled_reward/mean": -0.12348127365112305, "rewards/cosine_scaled_reward/std": 1.3621493577957153, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 594.427490234375, "completions/mean_terminated_length": 463.570556640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6677165354330707, "grad_norm": 28.225969314575195, "kl": 1.6474609375, "learning_rate": 8.34985735921932e-08, "loss": -0.0019, "num_tokens": 206937723.0, "reward": -0.21088972687721252, "reward_std": 1.039965033531189, "rewards/cosine_scaled_reward/mean": -0.21088969707489014, "rewards/cosine_scaled_reward/std": 1.3190141916275024, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 580.4129638671875, "completions/mean_terminated_length": 475.0837097167969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.674015748031496, "grad_norm": 54.712013244628906, "kl": 1.8115234375, "learning_rate": 8.046400448777574e-08, "loss": 0.1046, "num_tokens": 207575053.0, "reward": 0.012515909038484097, "reward_std": 1.1547653675079346, "rewards/cosine_scaled_reward/mean": 0.012515915557742119, "rewards/cosine_scaled_reward/std": 1.4178693294525146, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0558035714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 575.0245971679688, "completions/mean_terminated_length": 487.9692687988281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6803149606299213, "grad_norm": 37.593994140625, "kl": 1.59375, "learning_rate": 7.748077207355764e-08, "loss": 0.0369, "num_tokens": 208211523.0, "reward": 0.09279867261648178, "reward_std": 1.2414970397949219, "rewards/cosine_scaled_reward/mean": 0.09279866516590118, "rewards/cosine_scaled_reward/std": 1.4429327249526978, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 584.6060791015625, "completions/mean_terminated_length": 479.5777282714844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6866141732283464, "grad_norm": 69.95362854003906, "kl": 1.4931640625, "learning_rate": 7.45492413939689e-08, "loss": -0.0334, "num_tokens": 208858722.0, "reward": -0.08139531314373016, "reward_std": 1.0697132349014282, "rewards/cosine_scaled_reward/mean": -0.08139531314373016, "rewards/cosine_scaled_reward/std": 1.3822332620620728, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 610.9029541015625, "completions/mean_terminated_length": 500.35699462890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.6929133858267718, "grad_norm": 156.42855834960938, "kl": 1.3740234375, "learning_rate": 7.166977116693567e-08, "loss": -0.1079, "num_tokens": 209551675.0, "reward": -0.2246764898300171, "reward_std": 0.9921932816505432, "rewards/cosine_scaled_reward/mean": -0.2246764600276947, "rewards/cosine_scaled_reward/std": 1.3119205236434937, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0613839285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 562.0892944335938, "completions/mean_terminated_length": 464.9131774902344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.699212598425197, "grad_norm": 32.215087890625, "kl": 1.58251953125, "learning_rate": 6.884271373998607e-08, "loss": 0.0249, "num_tokens": 210199083.0, "reward": 0.029325606301426888, "reward_std": 1.042394995689392, "rewards/cosine_scaled_reward/mean": 0.02932562120258808, "rewards/cosine_scaled_reward/std": 1.4233999252319336, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 652.8694458007812, "completions/mean_terminated_length": 508.5455627441406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.705511811023622, "grad_norm": 26.553613662719727, "kl": 1.3935546875, "learning_rate": 6.6068415047135e-08, "loss": -0.0025, "num_tokens": 210912934.0, "reward": -0.037757713347673416, "reward_std": 1.1614078283309937, "rewards/cosine_scaled_reward/mean": -0.037757713347673416, "rewards/cosine_scaled_reward/std": 1.3996750116348267, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 569.8683471679688, "completions/mean_terminated_length": 440.711181640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7118110236220474, "grad_norm": 37.41157531738281, "kl": 2.03759765625, "learning_rate": 6.334721456655362e-08, "loss": -0.0195, "num_tokens": 211553952.0, "reward": -0.2244766652584076, "reward_std": 1.026918888092041, "rewards/cosine_scaled_reward/mean": -0.2244766503572464, "rewards/cosine_scaled_reward/std": 1.311730146408081, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 584.3080444335938, "completions/mean_terminated_length": 473.608642578125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7181102362204723, "grad_norm": 41.6332893371582, "kl": 1.55810546875, "learning_rate": 6.067944527902918e-08, "loss": -0.0227, "num_tokens": 212207844.0, "reward": -0.08754130452871323, "reward_std": 1.0253074169158936, "rewards/cosine_scaled_reward/mean": -0.08754128962755203, "rewards/cosine_scaled_reward/std": 1.3790392875671387, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 635.3147583007812, "completions/mean_terminated_length": 508.138671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7244094488188977, "grad_norm": 186.7555389404297, "kl": 1.55322265625, "learning_rate": 5.806543362721944e-08, "loss": -0.0904, "num_tokens": 212912238.0, "reward": -0.22805923223495483, "reward_std": 0.8819826245307922, "rewards/cosine_scaled_reward/mean": -0.22805921733379364, "rewards/cosine_scaled_reward/std": 1.310096025466919, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 623.6171875, "completions/mean_terminated_length": 489.70086669921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7307086614173228, "grad_norm": 25.393768310546875, "kl": 1.5244140625, "learning_rate": 5.550549947570771e-08, "loss": -0.0251, "num_tokens": 213620199.0, "reward": -0.12161993980407715, "reward_std": 1.0506173372268677, "rewards/cosine_scaled_reward/mean": -0.12161993980407715, "rewards/cosine_scaled_reward/std": 1.3646435737609863, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 586.161865234375, "completions/mean_terminated_length": 466.10748291015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.737007874015748, "grad_norm": 40.13348388671875, "kl": 1.43701171875, "learning_rate": 5.299995607186219e-08, "loss": -0.117, "num_tokens": 214302392.0, "reward": -0.2332076132297516, "reward_std": 0.917121410369873, "rewards/cosine_scaled_reward/mean": -0.2332075983285904, "rewards/cosine_scaled_reward/std": 1.3055304288864136, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 597.0881958007812, "completions/mean_terminated_length": 462.6134033203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7433070866141733, "grad_norm": 22.70816993713379, "kl": 1.7373046875, "learning_rate": 5.0549110007505394e-08, "loss": 0.0543, "num_tokens": 214956023.0, "reward": -0.23842087388038635, "reward_std": 1.129518747329712, "rewards/cosine_scaled_reward/mean": -0.23842085897922516, "rewards/cosine_scaled_reward/std": 1.3046225309371948, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 557.9375, "completions/mean_terminated_length": 458.6000061035156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7496062992125985, "grad_norm": 66.7606201171875, "kl": 1.36669921875, "learning_rate": 4.815326118139812e-08, "loss": -0.0319, "num_tokens": 215573407.0, "reward": 0.08316393196582794, "reward_std": 1.129285454750061, "rewards/cosine_scaled_reward/mean": 0.08316391706466675, "rewards/cosine_scaled_reward/std": 1.4398179054260254, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0848214285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 567.6116333007812, "completions/mean_terminated_length": 430.4048767089844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7559055118110236, "grad_norm": 116.46288299560547, "kl": 1.400390625, "learning_rate": 4.581270276254195e-08, "loss": -0.1344, "num_tokens": 216237715.0, "reward": -0.1613207757472992, "reward_std": 0.9567738771438599, "rewards/cosine_scaled_reward/mean": -0.1613207757472992, "rewards/cosine_scaled_reward/std": 1.3453031778335571, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056919642857142905, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 532.4285888671875, "completions/mean_terminated_length": 440.9562072753906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.762204724409449, "grad_norm": 20.068740844726562, "kl": 1.5703125, "learning_rate": 4.35277211543057e-08, "loss": 0.009, "num_tokens": 216842355.0, "reward": 0.09989994019269943, "reward_std": 1.1928493976593018, "rewards/cosine_scaled_reward/mean": 0.09989994019269943, "rewards/cosine_scaled_reward/std": 1.4446313381195068, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0580357142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 536.8092041015625, "completions/mean_terminated_length": 443.70263671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.768503937007874, "grad_norm": 46.137939453125, "kl": 1.7451171875, "learning_rate": 4.129859595937946e-08, "loss": 0.0508, "num_tokens": 217447640.0, "reward": 0.04671396315097809, "reward_std": 1.1750531196594238, "rewards/cosine_scaled_reward/mean": 0.04671395570039749, "rewards/cosine_scaled_reward/std": 1.4285207986831665, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 606.0189819335938, "completions/mean_terminated_length": 476.2055969238281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7748031496062993, "grad_norm": 23.016414642333984, "kl": 1.33642578125, "learning_rate": 3.912559994556086e-08, "loss": -0.0769, "num_tokens": 218142409.0, "reward": -0.19041763246059418, "reward_std": 1.0285288095474243, "rewards/cosine_scaled_reward/mean": -0.19041761755943298, "rewards/cosine_scaled_reward/std": 1.329443335533142, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0736607142857143, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 582.8772583007812, "completions/mean_terminated_length": 466.3735046386719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7811023622047244, "grad_norm": 38.84382629394531, "kl": 1.611328125, "learning_rate": 3.7008999012377865e-08, "loss": 0.0061, "num_tokens": 218810795.0, "reward": -0.09773646295070648, "reward_std": 1.072763204574585, "rewards/cosine_scaled_reward/mean": -0.09773645550012589, "rewards/cosine_scaled_reward/std": 1.374637484550476, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 627.9420166015625, "completions/mean_terminated_length": 511.31884765625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.7874015748031495, "grad_norm": 37.48160171508789, "kl": 1.054931640625, "learning_rate": 3.494905215855187e-08, "loss": -0.0542, "num_tokens": 219529207.0, "reward": -0.27194881439208984, "reward_std": 0.925375759601593, "rewards/cosine_scaled_reward/mean": -0.27194881439208984, "rewards/cosine_scaled_reward/std": 1.285094976425171, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0636160714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 596.8605346679688, "completions/mean_terminated_length": 498.27294921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.793700787401575, "grad_norm": 37.41320037841797, "kl": 1.32421875, "learning_rate": 3.2946011450305065e-08, "loss": -0.0132, "num_tokens": 220194058.0, "reward": -0.09820376336574554, "reward_std": 1.0949289798736572, "rewards/cosine_scaled_reward/mean": -0.09820375591516495, "rewards/cosine_scaled_reward/std": 1.3751029968261719, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0725446428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 571.5625, "completions/mean_terminated_length": 456.0770263671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8, "grad_norm": 12.217202186584473, "kl": 1.6943359375, "learning_rate": 3.100012199051627e-08, "loss": -0.0625, "num_tokens": 220837490.0, "reward": -0.2304655760526657, "reward_std": 1.0655834674835205, "rewards/cosine_scaled_reward/mean": -0.2304655760526657, "rewards/cosine_scaled_reward/std": 1.3076075315475464, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0647321428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 559.1328125, "completions/mean_terminated_length": 456.0847473144531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8062992125984252, "grad_norm": 34.513824462890625, "kl": 1.451171875, "learning_rate": 2.9111621888728956e-08, "loss": 0.0098, "num_tokens": 221487257.0, "reward": -0.07044383883476257, "reward_std": 1.1145482063293457, "rewards/cosine_scaled_reward/mean": -0.07044383883476257, "rewards/cosine_scaled_reward/std": 1.3855737447738647, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 624.193115234375, "completions/mean_terminated_length": 490.3309020996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8125984251968505, "grad_norm": 24.26609992980957, "kl": 1.314453125, "learning_rate": 2.7280742232014876e-08, "loss": -0.0561, "num_tokens": 222189414.0, "reward": -0.15032429993152618, "reward_std": 1.0422052145004272, "rewards/cosine_scaled_reward/mean": -0.15032431483268738, "rewards/cosine_scaled_reward/std": 1.3495419025421143, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 632.7299194335938, "completions/mean_terminated_length": 472.74285888671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8188976377952755, "grad_norm": 28.193471908569336, "kl": 1.4814453125, "learning_rate": 2.5507707056696748e-08, "loss": -0.0471, "num_tokens": 222892132.0, "reward": -0.1942838728427887, "reward_std": 0.9096174836158752, "rewards/cosine_scaled_reward/mean": -0.1942838579416275, "rewards/cosine_scaled_reward/std": 1.327970266342163, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0613839285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 555.6998291015625, "completions/mean_terminated_length": 458.1058349609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8251968503937008, "grad_norm": 45.35464859008789, "kl": 1.57080078125, "learning_rate": 2.3792733320934343e-08, "loss": 0.0264, "num_tokens": 223515623.0, "reward": -0.08820369839668274, "reward_std": 1.0817683935165405, "rewards/cosine_scaled_reward/mean": -0.08820368349552155, "rewards/cosine_scaled_reward/std": 1.3794188499450684, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060267857142857095, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 537.7467041015625, "completions/mean_terminated_length": 440.8895568847656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.831496062992126, "grad_norm": 25.04986572265625, "kl": 1.8056640625, "learning_rate": 2.2136030878176003e-08, "loss": -0.0126, "num_tokens": 224132324.0, "reward": -0.0332709364593029, "reward_std": 1.1177502870559692, "rewards/cosine_scaled_reward/mean": -0.0332709439098835, "rewards/cosine_scaled_reward/std": 1.4000455141067505, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 578.9888916015625, "completions/mean_terminated_length": 465.9880065917969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8377952755905511, "grad_norm": 64.00562286376953, "kl": 1.52001953125, "learning_rate": 2.0537802451479958e-08, "loss": -0.0162, "num_tokens": 224779450.0, "reward": -0.1576380431652069, "reward_std": 1.0528969764709473, "rewards/cosine_scaled_reward/mean": -0.1576380431652069, "rewards/cosine_scaled_reward/std": 1.3467310667037964, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 604.2678833007812, "completions/mean_terminated_length": 468.5323791503906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8440944881889765, "grad_norm": 14.133583068847656, "kl": 1.51904296875, "learning_rate": 1.8998243608708108e-08, "loss": -0.0226, "num_tokens": 225448730.0, "reward": -0.1075335368514061, "reward_std": 1.0885225534439087, "rewards/cosine_scaled_reward/mean": -0.1075335294008255, "rewards/cosine_scaled_reward/std": 1.3702043294906616, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0613839285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 544.9464721679688, "completions/mean_terminated_length": 446.64923095703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8503937007874016, "grad_norm": 31.37044906616211, "kl": 1.7294921875, "learning_rate": 1.751754273859507e-08, "loss": -0.017, "num_tokens": 226069018.0, "reward": -0.012751868925988674, "reward_std": 1.0909769535064697, "rewards/cosine_scaled_reward/mean": -0.012751864269375801, "rewards/cosine_scaled_reward/std": 1.4074139595031738, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0837053571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 595.966552734375, "completions/mean_terminated_length": 463.32037353515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8566929133858268, "grad_norm": 96.38923645019531, "kl": 1.5947265625, "learning_rate": 1.6095881027696213e-08, "loss": -0.0047, "num_tokens": 226750364.0, "reward": -0.007183407433331013, "reward_std": 1.0385618209838867, "rewards/cosine_scaled_reward/mean": -0.007183406967669725, "rewards/cosine_scaled_reward/std": 1.4105796813964844, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 592.6160888671875, "completions/mean_terminated_length": 484.4220886230469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8629921259842521, "grad_norm": 44.77436065673828, "kl": 1.37353515625, "learning_rate": 1.4733432438216397e-08, "loss": 0.0028, "num_tokens": 227422516.0, "reward": -0.11443806439638138, "reward_std": 1.1223397254943848, "rewards/cosine_scaled_reward/mean": -0.11443805694580078, "rewards/cosine_scaled_reward/std": 1.3673371076583862, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0926339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 639.3359375, "completions/mean_terminated_length": 495.52398681640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.869291338582677, "grad_norm": 31.86522674560547, "kl": 1.75634765625, "learning_rate": 1.3430363686723234e-08, "loss": 0.0412, "num_tokens": 228121073.0, "reward": -0.15190958976745605, "reward_std": 1.0342063903808594, "rewards/cosine_scaled_reward/mean": -0.15190958976745605, "rewards/cosine_scaled_reward/std": 1.3505640029907227, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0837053571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 612.8170166015625, "completions/mean_terminated_length": 481.71014404296875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8755905511811024, "grad_norm": 42.08808517456055, "kl": 1.8037109375, "learning_rate": 1.2186834223746612e-08, "loss": 0.0692, "num_tokens": 228801581.0, "reward": -0.2318280190229416, "reward_std": 1.0947842597961426, "rewards/cosine_scaled_reward/mean": -0.2318280190229416, "rewards/cosine_scaled_reward/std": 1.3084616661071777, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 599.0045166015625, "completions/mean_terminated_length": 480.00482177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8818897637795275, "grad_norm": 44.561763763427734, "kl": 2.0576171875, "learning_rate": 1.100299621426759e-08, "loss": 0.0587, "num_tokens": 229459969.0, "reward": -0.168549582362175, "reward_std": 1.0784419775009155, "rewards/cosine_scaled_reward/mean": -0.1685495525598526, "rewards/cosine_scaled_reward/std": 1.3423055410385132, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0747767857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 599.2801513671875, "completions/mean_terminated_length": 482.1942138671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.8881889763779527, "grad_norm": 23.00943946838379, "kl": 1.57763671875, "learning_rate": 9.878994519098572e-09, "loss": -0.0356, "num_tokens": 230117756.0, "reward": -0.07141243666410446, "reward_std": 1.0951851606369019, "rewards/cosine_scaled_reward/mean": -0.07141242921352386, "rewards/cosine_scaled_reward/std": 1.3864585161209106, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 583.7154541015625, "completions/mean_terminated_length": 474.8597106933594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.894488188976378, "grad_norm": 21.019100189208984, "kl": 1.55859375, "learning_rate": 8.814966677157365e-09, "loss": 0.029, "num_tokens": 230794525.0, "reward": -0.10454464703798294, "reward_std": 1.1253935098648071, "rewards/cosine_scaled_reward/mean": -0.10454463958740234, "rewards/cosine_scaled_reward/std": 1.3719398975372314, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 568.1752319335938, "completions/mean_terminated_length": 458.1642761230469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9007874015748032, "grad_norm": 35.79774475097656, "kl": 1.35546875, "learning_rate": 7.811042888637209e-09, "loss": -0.0656, "num_tokens": 231431818.0, "reward": -0.0032371284905821085, "reward_std": 1.1527271270751953, "rewards/cosine_scaled_reward/mean": -0.003237124066799879, "rewards/cosine_scaled_reward/std": 1.4113599061965942, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 569.0725708007812, "completions/mean_terminated_length": 443.7397155761719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9070866141732283, "grad_norm": 19.340476989746094, "kl": 1.546875, "learning_rate": 6.867345999074736e-09, "loss": 0.0095, "num_tokens": 232076667.0, "reward": -0.17012757062911987, "reward_std": 1.0372531414031982, "rewards/cosine_scaled_reward/mean": -0.17012754082679749, "rewards/cosine_scaled_reward/std": 1.3394016027450562, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 592.1194458007812, "completions/mean_terminated_length": 480.1286315917969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9133858267716537, "grad_norm": 32.1158332824707, "kl": 1.28515625, "learning_rate": 5.983991484317996e-09, "loss": -0.045, "num_tokens": 232756214.0, "reward": -0.13372990489006042, "reward_std": 1.0666496753692627, "rewards/cosine_scaled_reward/mean": -0.13372988998889923, "rewards/cosine_scaled_reward/std": 1.3574635982513428, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0770089285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 599.7467041015625, "completions/mean_terminated_length": 478.9129638671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9196850393700786, "grad_norm": 24.803268432617188, "kl": 1.7548828125, "learning_rate": 5.161087436396095e-09, "loss": 0.0698, "num_tokens": 233415603.0, "reward": -0.09106288850307465, "reward_std": 1.1765754222869873, "rewards/cosine_scaled_reward/mean": -0.09106288105249405, "rewards/cosine_scaled_reward/std": 1.377711534500122, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0837053571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 616.6217041015625, "completions/mean_terminated_length": 485.8623962402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.925984251968504, "grad_norm": 54.1906852722168, "kl": 1.47607421875, "learning_rate": 4.398734550292715e-09, "loss": 0.0022, "num_tokens": 234111856.0, "reward": -0.19376374781131744, "reward_std": 1.0452275276184082, "rewards/cosine_scaled_reward/mean": -0.19376373291015625, "rewards/cosine_scaled_reward/std": 1.327578067779541, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0758928571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 543.5011596679688, "completions/mean_terminated_length": 419.9432373046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9322834645669291, "grad_norm": 17.24119758605957, "kl": 1.52978515625, "learning_rate": 3.697026111624091e-09, "loss": 0.0133, "num_tokens": 234738641.0, "reward": -0.1634216010570526, "reward_std": 1.0093762874603271, "rewards/cosine_scaled_reward/mean": -0.16342158615589142, "rewards/cosine_scaled_reward/std": 1.3429415225982666, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0691964285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 587.6752319335938, "completions/mean_terminated_length": 479.1139221191406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9385826771653543, "grad_norm": 21.988357543945312, "kl": 1.7470703125, "learning_rate": 3.05604798522463e-09, "loss": 0.0234, "num_tokens": 235389614.0, "reward": -0.09133020788431168, "reward_std": 1.1265548467636108, "rewards/cosine_scaled_reward/mean": -0.09133020788431168, "rewards/cosine_scaled_reward/std": 1.377872109413147, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0959821428571429, "completions/max_length": 2048.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 597.7980346679688, "completions/mean_terminated_length": 443.8259582519531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9448818897637796, "grad_norm": 16.0437068939209, "kl": 1.390625, "learning_rate": 2.4758786046395476e-09, "loss": -0.0116, "num_tokens": 236085449.0, "reward": -0.11006226390600204, "reward_std": 1.004935622215271, "rewards/cosine_scaled_reward/mean": -0.11006225645542145, "rewards/cosine_scaled_reward/std": 1.3682260513305664, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0915178571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 613.5636596679688, "completions/mean_terminated_length": 469.0626525878906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9511811023622048, "grad_norm": 20.78128433227539, "kl": 1.42236328125, "learning_rate": 1.9565889625275944e-09, "loss": 0.0618, "num_tokens": 236781810.0, "reward": 0.04371757060289383, "reward_std": 1.1667969226837158, "rewards/cosine_scaled_reward/mean": 0.043717559427022934, "rewards/cosine_scaled_reward/std": 1.4271618127822876, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0926339285714286, "completions/max_length": 2048.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 603.5390625, "completions/mean_terminated_length": 456.07257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.95748031496063, "grad_norm": 32.46472930908203, "kl": 1.677734375, "learning_rate": 1.4982426019738426e-09, "loss": -0.0579, "num_tokens": 237464693.0, "reward": -0.2270333468914032, "reward_std": 1.0241820812225342, "rewards/cosine_scaled_reward/mean": -0.2270333468914032, "rewards/cosine_scaled_reward/std": 1.3094993829727173, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0669642857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 570.4185791015625, "completions/mean_terminated_length": 464.3719787597656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9637795275590553, "grad_norm": 31.72895050048828, "kl": 1.27587890625, "learning_rate": 1.1008956087144582e-09, "loss": 0.0253, "num_tokens": 238104108.0, "reward": 0.02927999757230282, "reward_std": 1.155513048171997, "rewards/cosine_scaled_reward/mean": 0.02927999384701252, "rewards/cosine_scaled_reward/std": 1.4235868453979492, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0825892857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 588.1261596679688, "completions/mean_terminated_length": 456.7019348144531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9700787401574802, "grad_norm": 49.65065383911133, "kl": 1.6025390625, "learning_rate": 7.645966042734153e-10, "loss": 0.0015, "num_tokens": 238755581.0, "reward": -0.10733804106712341, "reward_std": 1.0519185066223145, "rewards/cosine_scaled_reward/mean": -0.10733802616596222, "rewards/cosine_scaled_reward/std": 1.3700921535491943, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0870535714285714, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 603.6796875, "completions/mean_terminated_length": 465.9572448730469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9763779527559056, "grad_norm": 27.285886764526367, "kl": 1.5439453125, "learning_rate": 4.893867400131979e-10, "loss": 0.0261, "num_tokens": 239423902.0, "reward": -0.10080787539482117, "reward_std": 1.1222314834594727, "rewards/cosine_scaled_reward/mean": -0.10080786794424057, "rewards/cosine_scaled_reward/std": 1.3731297254562378, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0747767857142857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 561.9408569335938, "completions/mean_terminated_length": 441.837158203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9826771653543307, "grad_norm": 36.39866256713867, "kl": 1.474609375, "learning_rate": 2.7529969209910686e-10, "loss": -0.0051, "num_tokens": 240067561.0, "reward": -0.1611294150352478, "reward_std": 0.9675538539886475, "rewards/cosine_scaled_reward/mean": -0.1611294001340866, "rewards/cosine_scaled_reward/std": 1.3451868295669556, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0680803571428571, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 570.872802734375, "completions/mean_terminated_length": 462.962890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9889763779527558, "grad_norm": 75.8726577758789, "kl": 1.7724609375, "learning_rate": 1.2236165737850024e-10, "loss": 0.0088, "num_tokens": 240702599.0, "reward": -0.14775627851486206, "reward_std": 1.0878593921661377, "rewards/cosine_scaled_reward/mean": -0.14775624871253967, "rewards/cosine_scaled_reward/std": 1.3513950109481812, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08859223300970875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 603.0764770507812, "completions/mean_terminated_length": 462.6244812011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 1.9952755905511812, "grad_norm": 29.035417556762695, "kl": 1.70947265625, "learning_rate": 3.059135017535741e-11, "loss": 0.0663, "num_tokens": 241380215.0, "reward": -0.12112902104854584, "reward_std": 1.117789387702942, "rewards/cosine_scaled_reward/mean": -0.12112902104854584, "rewards/cosine_scaled_reward/std": 1.364324688911438, "step": 316 }, { "epoch": 1.9952755905511812, "step": 316, "total_flos": 0.0, "train_loss": 0.0234055612822943, "train_runtime": 34749.8496, "train_samples_per_second": 0.512, "train_steps_per_second": 0.009 } ], "logging_steps": 1, "max_steps": 316, "num_input_tokens_seen": 241380215, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }