{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9952755905511812,
  "eval_steps": 100,
  "global_step": 316,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 514.859375,
      "completions/mean_terminated_length": 486.98406982421875,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.006299212598425197,
      "grad_norm": 0.45115670561790466,
      "kl": 9.506940841674805e-05,
      "learning_rate": 0.0,
      "loss": -0.0368,
      "num_tokens": 586082.0,
      "reward": -0.5526061058044434,
      "reward_std": 0.764463484287262,
      "rewards/cosine_scaled_reward/mean": -0.5526061058044434,
      "rewards/cosine_scaled_reward/std": 1.0664217472076416,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025669642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 535.5926513671875,
      "completions/mean_terminated_length": 495.74688720703125,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.012598425196850394,
      "grad_norm": 0.430998295545578,
      "kl": 9.66787338256836e-05,
      "learning_rate": 3.125e-08,
      "loss": -0.0952,
      "num_tokens": 1196373.0,
      "reward": -0.4628583788871765,
      "reward_std": 0.8271132707595825,
      "rewards/cosine_scaled_reward/mean": -0.4628583788871765,
      "rewards/cosine_scaled_reward/std": 1.1489886045455933,
      "step": 2
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 586.6060791015625,
      "completions/mean_terminated_length": 551.5325927734375,
      "completions/min_length": 10.0,
      "completions/min_terminated_length": 10.0,
      "epoch": 0.01889763779527559,
      "grad_norm": 0.4537774324417114,
      "kl": 9.518861770629883e-05,
      "learning_rate": 6.25e-08,
      "loss": -0.1136,
      "num_tokens": 1863732.0,
      "reward": -0.6168839931488037,
      "reward_std": 0.7393009066581726,
      "rewards/cosine_scaled_reward/mean": -0.6168839335441589,
      "rewards/cosine_scaled_reward/std": 0.9996686577796936,
      "step": 3
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 484.872802734375,
      "completions/mean_terminated_length": 452.826904296875,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.025196850393700787,
      "grad_norm": 0.5108857750892639,
      "kl": 0.0001074075698852539,
      "learning_rate": 9.375e-08,
      "loss": -0.0851,
      "num_tokens": 2449778.0,
      "reward": -0.5494452714920044,
      "reward_std": 0.7850840091705322,
      "rewards/cosine_scaled_reward/mean": -0.5494452118873596,
      "rewards/cosine_scaled_reward/std": 1.0697675943374634,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 530.9732666015625,
      "completions/mean_terminated_length": 496.337890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.031496062992125984,
      "grad_norm": 0.4788239300251007,
      "kl": 0.00010323524475097656,
      "learning_rate": 1.25e-07,
      "loss": -0.0865,
      "num_tokens": 3063498.0,
      "reward": -0.49671319127082825,
      "reward_std": 0.8431764841079712,
      "rewards/cosine_scaled_reward/mean": -0.49671316146850586,
      "rewards/cosine_scaled_reward/std": 1.1200929880142212,
      "step": 5
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.016741071428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 486.044677734375,
      "completions/mean_terminated_length": 459.4506530761719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.03779527559055118,
      "grad_norm": 0.4317552149295807,
      "kl": 0.00010275840759277344,
      "learning_rate": 1.5624999999999999e-07,
      "loss": -0.1117,
      "num_tokens": 3632018.0,
      "reward": -0.5126838684082031,
      "reward_std": 0.8028110265731812,
      "rewards/cosine_scaled_reward/mean": -0.5126838684082031,
      "rewards/cosine_scaled_reward/std": 1.1045653820037842,
      "step": 6
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 531.9910888671875,
      "completions/mean_terminated_length": 495.6068420410156,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.04409448818897638,
      "grad_norm": 0.4166410565376282,
      "kl": 9.614229202270508e-05,
      "learning_rate": 1.875e-07,
      "loss": -0.0529,
      "num_tokens": 4239450.0,
      "reward": -0.5730382800102234,
      "reward_std": 0.7830650210380554,
      "rewards/cosine_scaled_reward/mean": -0.5730382204055786,
      "rewards/cosine_scaled_reward/std": 1.0463130474090576,
      "step": 7
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0279017857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 558.1350708007812,
      "completions/mean_terminated_length": 515.3719482421875,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.050393700787401574,
      "grad_norm": 0.3572291433811188,
      "kl": 0.0001131296157836914,
      "learning_rate": 2.1875e-07,
      "loss": -0.0693,
      "num_tokens": 4878691.0,
      "reward": -0.5359418392181396,
      "reward_std": 0.8269107341766357,
      "rewards/cosine_scaled_reward/mean": -0.5359417796134949,
      "rewards/cosine_scaled_reward/std": 1.0827312469482422,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 532.286865234375,
      "completions/mean_terminated_length": 495.9096984863281,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.05669291338582677,
      "grad_norm": 0.44510260224342346,
      "kl": 0.00010901689529418945,
      "learning_rate": 2.5e-07,
      "loss": -0.0958,
      "num_tokens": 5486228.0,
      "reward": -0.5363856554031372,
      "reward_std": 0.765136182308197,
      "rewards/cosine_scaled_reward/mean": -0.5363856554031372,
      "rewards/cosine_scaled_reward/std": 1.0828957557678223,
      "step": 9
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1918.0,
      "completions/mean_length": 498.3594055175781,
      "completions/mean_terminated_length": 470.18408203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.06299212598425197,
      "grad_norm": 0.49168774485588074,
      "kl": 0.00010883808135986328,
      "learning_rate": 2.8125e-07,
      "loss": -0.1151,
      "num_tokens": 6062822.0,
      "reward": -0.5703827738761902,
      "reward_std": 0.7068774700164795,
      "rewards/cosine_scaled_reward/mean": -0.5703827142715454,
      "rewards/cosine_scaled_reward/std": 1.0497106313705444,
      "step": 10
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.024553571428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 562.6730346679688,
      "completions/mean_terminated_length": 525.2848510742188,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.06929133858267716,
      "grad_norm": 0.41304177045822144,
      "kl": 0.00014287233352661133,
      "learning_rate": 3.1249999999999997e-07,
      "loss": -0.1104,
      "num_tokens": 6701873.0,
      "reward": -0.6360986828804016,
      "reward_std": 0.6729649305343628,
      "rewards/cosine_scaled_reward/mean": -0.6360986828804016,
      "rewards/cosine_scaled_reward/std": 0.9765380620956421,
      "step": 11
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025669642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 578.0670166015625,
      "completions/mean_terminated_length": 539.3402099609375,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.07559055118110236,
      "grad_norm": 0.31683093309402466,
      "kl": 0.0001493692398071289,
      "learning_rate": 3.4375e-07,
      "loss": -0.0419,
      "num_tokens": 7368189.0,
      "reward": -0.5699460506439209,
      "reward_std": 0.7894014716148376,
      "rewards/cosine_scaled_reward/mean": -0.5699459910392761,
      "rewards/cosine_scaled_reward/std": 1.0498303174972534,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 514.8705444335938,
      "completions/mean_terminated_length": 472.6742858886719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.08188976377952756,
      "grad_norm": 0.4763113856315613,
      "kl": 0.00022745132446289062,
      "learning_rate": 3.75e-07,
      "loss": -0.1156,
      "num_tokens": 7978633.0,
      "reward": -0.5255608558654785,
      "reward_std": 0.8026651740074158,
      "rewards/cosine_scaled_reward/mean": -0.5255607962608337,
      "rewards/cosine_scaled_reward/std": 1.0921350717544556,
      "step": 13
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 508.7734680175781,
      "completions/mean_terminated_length": 471.8320007324219,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.08818897637795275,
      "grad_norm": 0.671411395072937,
      "kl": 0.0003235340118408203,
      "learning_rate": 4.0625e-07,
      "loss": -0.0157,
      "num_tokens": 8570062.0,
      "reward": -0.31583818793296814,
      "reward_std": 1.0193272829055786,
      "rewards/cosine_scaled_reward/mean": -0.31583812832832336,
      "rewards/cosine_scaled_reward/std": 1.2581169605255127,
      "step": 14
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.030133928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 559.8739013671875,
      "completions/mean_terminated_length": 513.6375122070312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.09448818897637795,
      "grad_norm": 0.35042640566825867,
      "kl": 0.0003719329833984375,
      "learning_rate": 4.375e-07,
      "loss": -0.0833,
      "num_tokens": 9216429.0,
      "reward": -0.45582443475723267,
      "reward_std": 0.8441293239593506,
      "rewards/cosine_scaled_reward/mean": -0.4558244049549103,
      "rewards/cosine_scaled_reward/std": 1.1542800664901733,
      "step": 15
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.030133928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 564.8192138671875,
      "completions/mean_terminated_length": 518.7365112304688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.10078740157480315,
      "grad_norm": 0.5082417726516724,
      "kl": 0.0004266500473022461,
      "learning_rate": 4.6874999999999996e-07,
      "loss": -0.0501,
      "num_tokens": 9835243.0,
      "reward": -0.44934120774269104,
      "reward_std": 0.8662951588630676,
      "rewards/cosine_scaled_reward/mean": -0.4493412375450134,
      "rewards/cosine_scaled_reward/std": 1.1598572731018066,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.036830357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1878.0,
      "completions/mean_length": 600.6685791015625,
      "completions/mean_terminated_length": 545.324462890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.10708661417322834,
      "grad_norm": 0.41793763637542725,
      "kl": 0.0005254745483398438,
      "learning_rate": 5e-07,
      "loss": -0.0947,
      "num_tokens": 10517874.0,
      "reward": -0.3498678505420685,
      "reward_std": 0.8900845646858215,
      "rewards/cosine_scaled_reward/mean": -0.3498677909374237,
      "rewards/cosine_scaled_reward/std": 1.23600435256958,
      "step": 17
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025669642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 532.1439819335938,
      "completions/mean_terminated_length": 492.2073669433594,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.11338582677165354,
      "grad_norm": 0.5198326706886292,
      "kl": 0.0006380081176757812,
      "learning_rate": 5.3125e-07,
      "loss": -0.0733,
      "num_tokens": 11121267.0,
      "reward": -0.44612544775009155,
      "reward_std": 0.8814043998718262,
      "rewards/cosine_scaled_reward/mean": -0.44612544775009155,
      "rewards/cosine_scaled_reward/std": 1.162795901298523,
      "step": 18
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.036830357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 609.1004638671875,
      "completions/mean_terminated_length": 554.0787963867188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.11968503937007874,
      "grad_norm": 0.4639606475830078,
      "kl": 0.0006771087646484375,
      "learning_rate": 5.625e-07,
      "loss": -0.0639,
      "num_tokens": 11791677.0,
      "reward": -0.2794720232486725,
      "reward_std": 0.9824929237365723,
      "rewards/cosine_scaled_reward/mean": -0.2794720232486725,
      "rewards/cosine_scaled_reward/std": 1.281421184539795,
      "step": 19
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 575.9866333007812,
      "completions/mean_terminated_length": 542.3789672851562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.12598425196850394,
      "grad_norm": 0.4229482412338257,
      "kl": 0.0009136199951171875,
      "learning_rate": 5.937499999999999e-07,
      "loss": -0.059,
      "num_tokens": 12453601.0,
      "reward": -0.43330615758895874,
      "reward_std": 0.8788204789161682,
      "rewards/cosine_scaled_reward/mean": -0.43330615758895874,
      "rewards/cosine_scaled_reward/std": 1.1737666130065918,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0279017857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 595.1663208007812,
      "completions/mean_terminated_length": 553.4661254882812,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.13228346456692913,
      "grad_norm": 0.35286158323287964,
      "kl": 0.0011720657348632812,
      "learning_rate": 6.249999999999999e-07,
      "loss": -0.0754,
      "num_tokens": 13124934.0,
      "reward": -0.16925115883350372,
      "reward_std": 1.009746789932251,
      "rewards/cosine_scaled_reward/mean": -0.16925112903118134,
      "rewards/cosine_scaled_reward/std": 1.3426754474639893,
      "step": 21
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0345982142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 607.3404541015625,
      "completions/mean_terminated_length": 555.7098388671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.13858267716535433,
      "grad_norm": 0.37835246324539185,
      "kl": 0.0016326904296875,
      "learning_rate": 6.5625e-07,
      "loss": -0.0973,
      "num_tokens": 13804535.0,
      "reward": -0.21277326345443726,
      "reward_std": 0.9656177163124084,
      "rewards/cosine_scaled_reward/mean": -0.21277324855327606,
      "rewards/cosine_scaled_reward/std": 1.319942593574524,
      "step": 22
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0424107142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 676.90625,
      "completions/mean_terminated_length": 616.1818237304688,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.14488188976377953,
      "grad_norm": 0.30840951204299927,
      "kl": 0.0016546249389648438,
      "learning_rate": 6.875e-07,
      "loss": -0.0202,
      "num_tokens": 14550627.0,
      "reward": -0.038872163742780685,
      "reward_std": 1.0010526180267334,
      "rewards/cosine_scaled_reward/mean": -0.038872163742780685,
      "rewards/cosine_scaled_reward/std": 1.4004237651824951,
      "step": 23
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 614.700927734375,
      "completions/mean_terminated_length": 568.4654541015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.15118110236220472,
      "grad_norm": 1.1502492427825928,
      "kl": 0.0022287368774414062,
      "learning_rate": 7.1875e-07,
      "loss": -0.034,
      "num_tokens": 15218759.0,
      "reward": -0.03518987447023392,
      "reward_std": 1.0530657768249512,
      "rewards/cosine_scaled_reward/mean": -0.03518987074494362,
      "rewards/cosine_scaled_reward/std": 1.4014222621917725,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.024553571428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 588.9777221679688,
      "completions/mean_terminated_length": 552.251708984375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.15748031496062992,
      "grad_norm": 0.32569238543510437,
      "kl": 0.00287628173828125,
      "learning_rate": 7.5e-07,
      "loss": -0.0104,
      "num_tokens": 15853619.0,
      "reward": 0.004624083638191223,
      "reward_std": 1.038998007774353,
      "rewards/cosine_scaled_reward/mean": 0.004624092020094395,
      "rewards/cosine_scaled_reward/std": 1.4162551164627075,
      "step": 25
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0502232142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 756.7924194335938,
      "completions/mean_terminated_length": 688.5146484375,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.16377952755905512,
      "grad_norm": 0.3726414144039154,
      "kl": 0.00331878662109375,
      "learning_rate": 7.812499999999999e-07,
      "loss": 0.0287,
      "num_tokens": 16648345.0,
      "reward": -0.18634870648384094,
      "reward_std": 0.8561702966690063,
      "rewards/cosine_scaled_reward/mean": -0.18634869158267975,
      "rewards/cosine_scaled_reward/std": 1.3344650268554688,
      "step": 26
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0357142857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 682.4074096679688,
      "completions/mean_terminated_length": 631.8298950195312,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.1700787401574803,
      "grad_norm": 0.27346354722976685,
      "kl": 0.00455474853515625,
      "learning_rate": 8.125e-07,
      "loss": 0.0598,
      "num_tokens": 17386646.0,
      "reward": 0.3427794575691223,
      "reward_std": 1.1637052297592163,
      "rewards/cosine_scaled_reward/mean": 0.3427794277667999,
      "rewards/cosine_scaled_reward/std": 1.4923925399780273,
      "step": 27
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 707.4174194335938,
      "completions/mean_terminated_length": 651.2999877929688,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.1763779527559055,
      "grad_norm": 0.25242871046066284,
      "kl": 0.0053730010986328125,
      "learning_rate": 8.4375e-07,
      "loss": 0.0511,
      "num_tokens": 18169068.0,
      "reward": 0.07486817985773087,
      "reward_std": 1.024709701538086,
      "rewards/cosine_scaled_reward/mean": 0.07486817985773087,
      "rewards/cosine_scaled_reward/std": 1.4391454458236694,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 734.8471069335938,
      "completions/mean_terminated_length": 681.4668579101562,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.1826771653543307,
      "grad_norm": 0.393325537443161,
      "kl": 0.006855010986328125,
      "learning_rate": 8.75e-07,
      "loss": 0.0531,
      "num_tokens": 18953107.0,
      "reward": 0.42649638652801514,
      "reward_std": 1.0319130420684814,
      "rewards/cosine_scaled_reward/mean": 0.42649635672569275,
      "rewards/cosine_scaled_reward/std": 1.498863935470581,
      "step": 29
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 709.2064819335938,
      "completions/mean_terminated_length": 653.1639404296875,
      "completions/min_length": 23.0,
      "completions/min_terminated_length": 23.0,
      "epoch": 0.1889763779527559,
      "grad_norm": 0.32169538736343384,
      "kl": 0.007747650146484375,
      "learning_rate": 9.0625e-07,
      "loss": 0.0762,
      "num_tokens": 19721804.0,
      "reward": 0.5034300088882446,
      "reward_std": 1.0488739013671875,
      "rewards/cosine_scaled_reward/mean": 0.5034299492835999,
      "rewards/cosine_scaled_reward/std": 1.5007421970367432,
      "step": 30
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0502232142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 800.1741333007812,
      "completions/mean_terminated_length": 734.1903686523438,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.1952755905511811,
      "grad_norm": 0.22457779943943024,
      "kl": 0.007617950439453125,
      "learning_rate": 9.374999999999999e-07,
      "loss": 0.0583,
      "num_tokens": 20562328.0,
      "reward": 0.6339239478111267,
      "reward_std": 1.1418627500534058,
      "rewards/cosine_scaled_reward/mean": 0.6339239478111267,
      "rewards/cosine_scaled_reward/std": 1.494827151298523,
      "step": 31
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.030133928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 672.171875,
      "completions/mean_terminated_length": 629.4246215820312,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.2015748031496063,
      "grad_norm": 0.2857438027858734,
      "kl": 0.009197235107421875,
      "learning_rate": 9.6875e-07,
      "loss": 0.0746,
      "num_tokens": 21281746.0,
      "reward": 0.7780371308326721,
      "reward_std": 1.0006595849990845,
      "rewards/cosine_scaled_reward/mean": 0.7780370712280273,
      "rewards/cosine_scaled_reward/std": 1.4746843576431274,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0580357142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 820.8381958007812,
      "completions/mean_terminated_length": 745.2310791015625,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.2078740157480315,
      "grad_norm": 0.353129118680954,
      "kl": 0.008514404296875,
      "learning_rate": 1e-06,
      "loss": 0.0844,
      "num_tokens": 22153649.0,
      "reward": 0.3895089328289032,
      "reward_std": 0.9308316111564636,
      "rewards/cosine_scaled_reward/mean": 0.3895089328289032,
      "rewards/cosine_scaled_reward/std": 1.4967604875564575,
      "step": 33
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0558035714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 765.2767944335938,
      "completions/mean_terminated_length": 689.4656982421875,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.2141732283464567,
      "grad_norm": 0.3190731406211853,
      "kl": 0.0099639892578125,
      "learning_rate": 9.999694086498248e-07,
      "loss": 0.0808,
      "num_tokens": 22977081.0,
      "reward": 0.7411263585090637,
      "reward_std": 1.0240404605865479,
      "rewards/cosine_scaled_reward/mean": 0.7411263585090637,
      "rewards/cosine_scaled_reward/std": 1.4812393188476562,
      "step": 34
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.036830357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 716.021240234375,
      "completions/mean_terminated_length": 665.0880737304688,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.2204724409448819,
      "grad_norm": 0.27622923254966736,
      "kl": 0.010406494140625,
      "learning_rate": 9.998776383426216e-07,
      "loss": 0.0507,
      "num_tokens": 23758268.0,
      "reward": 0.6139054298400879,
      "reward_std": 0.8962022662162781,
      "rewards/cosine_scaled_reward/mean": 0.6139054298400879,
      "rewards/cosine_scaled_reward/std": 1.4964288473129272,
      "step": 35
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 759.5558471679688,
      "completions/mean_terminated_length": 699.3480834960938,
      "completions/min_length": 42.0,
      "completions/min_terminated_length": 42.0,
      "epoch": 0.22677165354330708,
      "grad_norm": 0.31751304864883423,
      "kl": 0.01029205322265625,
      "learning_rate": 9.997247003079009e-07,
      "loss": 0.0489,
      "num_tokens": 24579358.0,
      "reward": 0.47326162457466125,
      "reward_std": 0.9961800575256348,
      "rewards/cosine_scaled_reward/mean": 0.47326159477233887,
      "rewards/cosine_scaled_reward/std": 1.5005344152450562,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.030133928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 690.1194458007812,
      "completions/mean_terminated_length": 647.9298095703125,
      "completions/min_length": 113.0,
      "completions/min_terminated_length": 113.0,
      "epoch": 0.23307086614173228,
      "grad_norm": 0.3456534147262573,
      "kl": 0.011474609375,
      "learning_rate": 9.995106132599868e-07,
      "loss": 0.0696,
      "num_tokens": 25341609.0,
      "reward": 0.8717383146286011,
      "reward_std": 0.797817587852478,
      "rewards/cosine_scaled_reward/mean": 0.8717382550239563,
      "rewards/cosine_scaled_reward/std": 1.4538819789886475,
      "step": 37
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1881.0,
      "completions/mean_length": 678.0201416015625,
      "completions/mean_terminated_length": 633.8272094726562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.23937007874015748,
      "grad_norm": 0.4764210283756256,
      "kl": 0.01166534423828125,
      "learning_rate": 9.992354033957265e-07,
      "loss": 0.0301,
      "num_tokens": 26068747.0,
      "reward": 0.757907509803772,
      "reward_std": 0.8109303712844849,
      "rewards/cosine_scaled_reward/mean": 0.7579074501991272,
      "rewards/cosine_scaled_reward/std": 1.4783906936645508,
      "step": 38
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 694.2020263671875,
      "completions/mean_terminated_length": 661.7108764648438,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.24566929133858267,
      "grad_norm": 6.102512359619141,
      "kl": 0.010040283203125,
      "learning_rate": 9.988991043912856e-07,
      "loss": 0.0905,
      "num_tokens": 26821280.0,
      "reward": 0.697593629360199,
      "reward_std": 0.9140774011611938,
      "rewards/cosine_scaled_reward/mean": 0.697593629360199,
      "rewards/cosine_scaled_reward/std": 1.4877097606658936,
      "step": 39
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.041294642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 763.3047485351562,
      "completions/mean_terminated_length": 707.9685668945312,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.25196850393700787,
      "grad_norm": 1.1193389892578125,
      "kl": 0.01035308837890625,
      "learning_rate": 9.98501757398026e-07,
      "loss": 0.068,
      "num_tokens": 27626545.0,
      "reward": 0.7076338529586792,
      "reward_std": 0.9347448348999023,
      "rewards/cosine_scaled_reward/mean": 0.7076338529586792,
      "rewards/cosine_scaled_reward/std": 1.486343502998352,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1955.0,
      "completions/mean_length": 721.7701416015625,
      "completions/mean_terminated_length": 685.268310546875,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.25826771653543307,
      "grad_norm": 0.28604307770729065,
      "kl": 0.011211395263671875,
      "learning_rate": 9.980434110374724e-07,
      "loss": 0.0462,
      "num_tokens": 28407091.0,
      "reward": 0.6906989216804504,
      "reward_std": 0.9639256596565247,
      "rewards/cosine_scaled_reward/mean": 0.6906989216804504,
      "rewards/cosine_scaled_reward/std": 1.488455891609192,
      "step": 41
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0345982142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 795.6417846679688,
      "completions/mean_terminated_length": 750.759521484375,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.26456692913385826,
      "grad_norm": 0.23319000005722046,
      "kl": 0.0089263916015625,
      "learning_rate": 9.975241213953604e-07,
      "loss": 0.0443,
      "num_tokens": 29252562.0,
      "reward": 0.5669840574264526,
      "reward_std": 0.8921679258346558,
      "rewards/cosine_scaled_reward/mean": 0.5669840574264526,
      "rewards/cosine_scaled_reward/std": 1.4992791414260864,
      "step": 42
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.033482142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 729.4832763671875,
      "completions/mean_terminated_length": 683.80712890625,
      "completions/min_length": 43.0,
      "completions/min_terminated_length": 43.0,
      "epoch": 0.27086614173228346,
      "grad_norm": 0.3877314627170563,
      "kl": 0.011043548583984375,
      "learning_rate": 9.969439520147753e-07,
      "loss": 0.0633,
      "num_tokens": 30032467.0,
      "reward": 0.6205415725708008,
      "reward_std": 0.9956651329994202,
      "rewards/cosine_scaled_reward/mean": 0.6205415725708008,
      "rewards/cosine_scaled_reward/std": 1.4959126710891724,
      "step": 43
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1678.0,
      "completions/mean_length": 708.482177734375,
      "completions/mean_terminated_length": 671.6146240234375,
      "completions/min_length": 133.0,
      "completions/min_terminated_length": 133.0,
      "epoch": 0.27716535433070866,
      "grad_norm": 0.3526562750339508,
      "kl": 0.010223388671875,
      "learning_rate": 9.96302973888376e-07,
      "loss": 0.0391,
      "num_tokens": 30792435.0,
      "reward": 0.8092110753059387,
      "reward_std": 0.8714210391044617,
      "rewards/cosine_scaled_reward/mean": 0.8239240050315857,
      "rewards/cosine_scaled_reward/std": 1.465368628501892,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029017857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 747.982177734375,
      "completions/mean_terminated_length": 709.1310424804688,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.28346456692913385,
      "grad_norm": 0.2751273512840271,
      "kl": 0.0096893310546875,
      "learning_rate": 9.956012654497072e-07,
      "loss": 0.0289,
      "num_tokens": 31581763.0,
      "reward": 0.5770221948623657,
      "reward_std": 1.057499647140503,
      "rewards/cosine_scaled_reward/mean": 0.577022135257721,
      "rewards/cosine_scaled_reward/std": 1.4988446235656738,
      "step": 45
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029017857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1995.0,
      "completions/mean_length": 680.700927734375,
      "completions/mean_terminated_length": 639.8390502929688,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.28976377952755905,
      "grad_norm": 0.835785984992981,
      "kl": 0.01076507568359375,
      "learning_rate": 9.948389125636038e-07,
      "loss": 0.0516,
      "num_tokens": 32316247.0,
      "reward": 0.6774722933769226,
      "reward_std": 0.7520989179611206,
      "rewards/cosine_scaled_reward/mean": 0.6774722337722778,
      "rewards/cosine_scaled_reward/std": 1.4901094436645508,
      "step": 46
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0345982142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 743.0714721679688,
      "completions/mean_terminated_length": 696.30517578125,
      "completions/min_length": 76.0,
      "completions/min_terminated_length": 76.0,
      "epoch": 0.29606299212598425,
      "grad_norm": 0.4167815148830414,
      "kl": 0.0100250244140625,
      "learning_rate": 9.940160085156819e-07,
      "loss": 0.0426,
      "num_tokens": 33120535.0,
      "reward": 0.5268194079399109,
      "reward_std": 0.9155340790748596,
      "rewards/cosine_scaled_reward/mean": 0.5268194079399109,
      "rewards/cosine_scaled_reward/std": 1.5005640983581543,
      "step": 47
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.03125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 695.7154541015625,
      "completions/mean_terminated_length": 652.0933227539062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.30236220472440944,
      "grad_norm": 0.8342124819755554,
      "kl": 0.0113067626953125,
      "learning_rate": 9.931326540009253e-07,
      "loss": 0.0437,
      "num_tokens": 33879336.0,
      "reward": 0.6574720740318298,
      "reward_std": 0.9341281056404114,
      "rewards/cosine_scaled_reward/mean": 0.6574720740318298,
      "rewards/cosine_scaled_reward/std": 1.4924215078353882,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025669642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 720.7813110351562,
      "completions/mean_terminated_length": 685.814453125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.30866141732283464,
      "grad_norm": 0.5337599515914917,
      "kl": 0.010944366455078125,
      "learning_rate": 9.921889571113627e-07,
      "loss": 0.0304,
      "num_tokens": 34674036.0,
      "reward": 0.6473553776741028,
      "reward_std": 0.7692996859550476,
      "rewards/cosine_scaled_reward/mean": 0.6473553776741028,
      "rewards/cosine_scaled_reward/std": 1.4935177564620972,
      "step": 49
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1923.0,
      "completions/mean_length": 701.8359985351562,
      "completions/mean_terminated_length": 671.1015625,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 0.31496062992125984,
      "grad_norm": 0.6662896871566772,
      "kl": 0.01065826416015625,
      "learning_rate": 9.911850333228427e-07,
      "loss": 0.0542,
      "num_tokens": 35430705.0,
      "reward": 0.821460485458374,
      "reward_std": 0.9691373705863953,
      "rewards/cosine_scaled_reward/mean": 0.8214603662490845,
      "rewards/cosine_scaled_reward/std": 1.4658869504928589,
      "step": 50
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0189732142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 697.9620971679688,
      "completions/mean_terminated_length": 671.8521118164062,
      "completions/min_length": 81.0,
      "completions/min_terminated_length": 81.0,
      "epoch": 0.32125984251968503,
      "grad_norm": 0.9534047842025757,
      "kl": 0.01047515869140625,
      "learning_rate": 9.901210054809014e-07,
      "loss": 0.0367,
      "num_tokens": 36186687.0,
      "reward": 0.6305912137031555,
      "reward_std": 0.8517847657203674,
      "rewards/cosine_scaled_reward/mean": 0.6305912137031555,
      "rewards/cosine_scaled_reward/std": 1.4951274394989014,
      "step": 51
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0189732142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 707.9364013671875,
      "completions/mean_terminated_length": 682.0193481445312,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.32755905511811023,
      "grad_norm": 0.3993193507194519,
      "kl": 0.010395050048828125,
      "learning_rate": 9.889970037857323e-07,
      "loss": 0.0396,
      "num_tokens": 36967334.0,
      "reward": 0.5871036052703857,
      "reward_std": 0.9707032442092896,
      "rewards/cosine_scaled_reward/mean": 0.587103545665741,
      "rewards/cosine_scaled_reward/std": 1.4982444047927856,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.014508928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 690.3069458007812,
      "completions/mean_terminated_length": 670.3182373046875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.33385826771653543,
      "grad_norm": 0.4851699769496918,
      "kl": 0.010189056396484375,
      "learning_rate": 9.878131657762535e-07,
      "loss": 0.0056,
      "num_tokens": 37729289.0,
      "reward": 0.623904824256897,
      "reward_std": 0.9286167621612549,
      "rewards/cosine_scaled_reward/mean": 0.6239047646522522,
      "rewards/cosine_scaled_reward/std": 1.4956852197647095,
      "step": 53
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 700.4330444335938,
      "completions/mean_terminated_length": 675.9318237304688,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.3401574803149606,
      "grad_norm": 1.0800269842147827,
      "kl": 0.010822296142578125,
      "learning_rate": 9.865696363132768e-07,
      "loss": 0.0377,
      "num_tokens": 38493613.0,
      "reward": 0.5335128307342529,
      "reward_std": 0.8787680268287659,
      "rewards/cosine_scaled_reward/mean": 0.5335127711296082,
      "rewards/cosine_scaled_reward/std": 1.5004278421401978,
      "step": 54
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021205357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 752.1897583007812,
      "completions/mean_terminated_length": 724.1162719726562,
      "completions/min_length": 21.0,
      "completions/min_terminated_length": 21.0,
      "epoch": 0.3464566929133858,
      "grad_norm": 0.2947627902030945,
      "kl": 0.00998687744140625,
      "learning_rate": 9.852665675617837e-07,
      "loss": 0.0258,
      "num_tokens": 39296823.0,
      "reward": 0.8013721108436584,
      "reward_std": 0.88521409034729,
      "rewards/cosine_scaled_reward/mean": 0.8013721108436584,
      "rewards/cosine_scaled_reward/std": 1.4701893329620361,
      "step": 55
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 730.2120971679688,
      "completions/mean_terminated_length": 703.1959228515625,
      "completions/min_length": 35.0,
      "completions/min_terminated_length": 35.0,
      "epoch": 0.352755905511811,
      "grad_norm": 0.29348793625831604,
      "kl": 0.01019287109375,
      "learning_rate": 9.83904118972304e-07,
      "loss": 0.0265,
      "num_tokens": 40101221.0,
      "reward": 0.5870950818061829,
      "reward_std": 0.9105805158615112,
      "rewards/cosine_scaled_reward/mean": 0.5870950818061829,
      "rewards/cosine_scaled_reward/std": 1.49825918674469,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 676.7489013671875,
      "completions/mean_terminated_length": 651.8170166015625,
      "completions/min_length": 87.0,
      "completions/min_terminated_length": 87.0,
      "epoch": 0.3590551181102362,
      "grad_norm": 1.3304539918899536,
      "kl": 0.01004791259765625,
      "learning_rate": 9.82482457261405e-07,
      "loss": 0.0235,
      "num_tokens": 40832692.0,
      "reward": 0.7879303097724915,
      "reward_std": 0.986963152885437,
      "rewards/cosine_scaled_reward/mean": 0.7879303097724915,
      "rewards/cosine_scaled_reward/std": 1.4728718996047974,
      "step": 57
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021205357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 700.677490234375,
      "completions/mean_terminated_length": 671.488037109375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.3653543307086614,
      "grad_norm": 0.2850182354450226,
      "kl": 0.00983428955078125,
      "learning_rate": 9.81001756391292e-07,
      "loss": 0.0345,
      "num_tokens": 41588563.0,
      "reward": 0.6406442523002625,
      "reward_std": 0.868269145488739,
      "rewards/cosine_scaled_reward/mean": 0.6406441926956177,
      "rewards/cosine_scaled_reward/std": 1.494200348854065,
      "step": 58
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.014508928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2011.0,
      "completions/mean_length": 613.5480346679688,
      "completions/mean_terminated_length": 592.42919921875,
      "completions/min_length": 29.0,
      "completions/min_terminated_length": 29.0,
      "epoch": 0.3716535433070866,
      "grad_norm": 0.4392086863517761,
      "kl": 0.011688232421875,
      "learning_rate": 9.7946219754852e-07,
      "loss": 0.0472,
      "num_tokens": 42260142.0,
      "reward": 0.8915125727653503,
      "reward_std": 0.8179810643196106,
      "rewards/cosine_scaled_reward/mean": 0.8915125131607056,
      "rewards/cosine_scaled_reward/std": 1.4483691453933716,
      "step": 59
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 688.6529541015625,
      "completions/mean_terminated_length": 660.7847900390625,
      "completions/min_length": 24.0,
      "completions/min_terminated_length": 24.0,
      "epoch": 0.3779527559055118,
      "grad_norm": 0.3848814070224762,
      "kl": 0.009918212890625,
      "learning_rate": 9.77863969121824e-07,
      "loss": 0.0307,
      "num_tokens": 43001975.0,
      "reward": 0.744490385055542,
      "reward_std": 0.9112673997879028,
      "rewards/cosine_scaled_reward/mean": 0.7444903254508972,
      "rewards/cosine_scaled_reward/std": 1.4806958436965942,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 677.6473388671875,
      "completions/mean_terminated_length": 649.5535278320312,
      "completions/min_length": 139.0,
      "completions/min_terminated_length": 139.0,
      "epoch": 0.384251968503937,
      "grad_norm": 0.35118257999420166,
      "kl": 0.0096893310546875,
      "learning_rate": 9.762072666790656e-07,
      "loss": -0.0035,
      "num_tokens": 43752059.0,
      "reward": 0.5971543192863464,
      "reward_std": 0.843102216720581,
      "rewards/cosine_scaled_reward/mean": 0.5971542596817017,
      "rewards/cosine_scaled_reward/std": 1.4975931644439697,
      "step": 61
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 692.1629638671875,
      "completions/mean_terminated_length": 664.3667602539062,
      "completions/min_length": 120.0,
      "completions/min_terminated_length": 120.0,
      "epoch": 0.3905511811023622,
      "grad_norm": 0.3855595886707306,
      "kl": 0.00948333740234375,
      "learning_rate": 9.744922929433033e-07,
      "loss": 0.0581,
      "num_tokens": 44502845.0,
      "reward": 0.5435569882392883,
      "reward_std": 0.8974959850311279,
      "rewards/cosine_scaled_reward/mean": 0.5435569882392883,
      "rewards/cosine_scaled_reward/std": 1.5001307725906372,
      "step": 62
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 677.5748291015625,
      "completions/mean_terminated_length": 649.4795532226562,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.3968503937007874,
      "grad_norm": 0.5436669588088989,
      "kl": 0.00952911376953125,
      "learning_rate": 9.72719257767985e-07,
      "loss": 0.0235,
      "num_tokens": 45237840.0,
      "reward": 0.6974921822547913,
      "reward_std": 0.985985279083252,
      "rewards/cosine_scaled_reward/mean": 0.6974921226501465,
      "rewards/cosine_scaled_reward/std": 1.4876641035079956,
      "step": 63
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.030133928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 769.7176513671875,
      "completions/mean_terminated_length": 730.0011596679688,
      "completions/min_length": 130.0,
      "completions/min_terminated_length": 130.0,
      "epoch": 0.4031496062992126,
      "grad_norm": 0.590516984462738,
      "kl": 0.008392333984375,
      "learning_rate": 9.70888378111271e-07,
      "loss": 0.0478,
      "num_tokens": 46067475.0,
      "reward": 0.33929741382598877,
      "reward_std": 0.9476777911186218,
      "rewards/cosine_scaled_reward/mean": 0.339297354221344,
      "rewards/cosine_scaled_reward/std": 1.4921878576278687,
      "step": 64
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.014508928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 628.677490234375,
      "completions/mean_terminated_length": 607.7814331054688,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.4094488188976378,
      "grad_norm": 0.34324246644973755,
      "kl": 0.011089324951171875,
      "learning_rate": 9.689998780094837e-07,
      "loss": 0.0601,
      "num_tokens": 46761986.0,
      "reward": 0.925286591053009,
      "reward_std": 1.0149413347244263,
      "rewards/cosine_scaled_reward/mean": 0.925286591053009,
      "rewards/cosine_scaled_reward/std": 1.439137578010559,
      "step": 65
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 699.3839721679688,
      "completions/mean_terminated_length": 668.5935668945312,
      "completions/min_length": 168.0,
      "completions/min_terminated_length": 168.0,
      "epoch": 0.415748031496063,
      "grad_norm": 0.39818695187568665,
      "kl": 0.009059906005859375,
      "learning_rate": 9.67053988549695e-07,
      "loss": 0.0456,
      "num_tokens": 47514154.0,
      "reward": 0.7109607458114624,
      "reward_std": 1.0165566205978394,
      "rewards/cosine_scaled_reward/mean": 0.7109607458114624,
      "rewards/cosine_scaled_reward/std": 1.4858678579330444,
      "step": 66
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 710.5413208007812,
      "completions/mean_terminated_length": 680.0056762695312,
      "completions/min_length": 156.0,
      "completions/min_terminated_length": 156.0,
      "epoch": 0.4220472440944882,
      "grad_norm": 0.4072217643260956,
      "kl": 0.009124755859375,
      "learning_rate": 9.650509478414482e-07,
      "loss": 0.0476,
      "num_tokens": 48275359.0,
      "reward": 0.7946111559867859,
      "reward_std": 0.9021484851837158,
      "rewards/cosine_scaled_reward/mean": 0.7946110963821411,
      "rewards/cosine_scaled_reward/std": 1.4715018272399902,
      "step": 67
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 695.6283569335938,
      "completions/mean_terminated_length": 663.1714477539062,
      "completions/min_length": 100.0,
      "completions/min_terminated_length": 100.0,
      "epoch": 0.4283464566929134,
      "grad_norm": 0.32094287872314453,
      "kl": 0.00888824462890625,
      "learning_rate": 9.62991000987622e-07,
      "loss": 0.0311,
      "num_tokens": 49047650.0,
      "reward": 0.6975319981575012,
      "reward_std": 0.9495226740837097,
      "rewards/cosine_scaled_reward/mean": 0.6975319981575012,
      "rewards/cosine_scaled_reward/std": 1.487709879875183,
      "step": 68
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029017857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1912.0,
      "completions/mean_length": 668.6741333007812,
      "completions/mean_terminated_length": 627.452880859375,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.4346456692913386,
      "grad_norm": 0.35820212960243225,
      "kl": 0.01102447509765625,
      "learning_rate": 9.60874400054439e-07,
      "loss": 0.0277,
      "num_tokens": 49783358.0,
      "reward": 0.7410476207733154,
      "reward_std": 0.9227878451347351,
      "rewards/cosine_scaled_reward/mean": 0.7410475015640259,
      "rewards/cosine_scaled_reward/std": 1.481183409690857,
      "step": 69
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2002.0,
      "completions/mean_length": 695.1785888671875,
      "completions/mean_terminated_length": 662.7108764648438,
      "completions/min_length": 155.0,
      "completions/min_terminated_length": 155.0,
      "epoch": 0.4409448818897638,
      "grad_norm": 0.2676938772201538,
      "kl": 0.008640289306640625,
      "learning_rate": 9.587014040406206e-07,
      "loss": 0.0365,
      "num_tokens": 50540622.0,
      "reward": 0.5602699518203735,
      "reward_std": 0.8083215951919556,
      "rewards/cosine_scaled_reward/mean": 0.5602698922157288,
      "rewards/cosine_scaled_reward/std": 1.4995226860046387,
      "step": 70
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 766.4799194335938,
      "completions/mean_terminated_length": 735.7234497070312,
      "completions/min_length": 158.0,
      "completions/min_terminated_length": 158.0,
      "epoch": 0.44724409448818897,
      "grad_norm": 0.25637087225914,
      "kl": 0.008380889892578125,
      "learning_rate": 9.564722788456943e-07,
      "loss": 0.0424,
      "num_tokens": 51390028.0,
      "reward": 0.3091750741004944,
      "reward_std": 0.9855142831802368,
      "rewards/cosine_scaled_reward/mean": 0.309175044298172,
      "rewards/cosine_scaled_reward/std": 1.4886199235916138,
      "step": 71
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 664.40625,
      "completions/mean_terminated_length": 631.2000122070312,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.45354330708661417,
      "grad_norm": 0.3117295205593109,
      "kl": 0.010120391845703125,
      "learning_rate": 9.541872972374582e-07,
      "loss": 0.0579,
      "num_tokens": 52123992.0,
      "reward": 0.8448864817619324,
      "reward_std": 0.961277186870575,
      "rewards/cosine_scaled_reward/mean": 0.8448864221572876,
      "rewards/cosine_scaled_reward/std": 1.4605894088745117,
      "step": 72
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.016741071428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 659.1674194335938,
      "completions/mean_terminated_length": 635.5210571289062,
      "completions/min_length": 104.0,
      "completions/min_terminated_length": 104.0,
      "epoch": 0.45984251968503936,
      "grad_norm": 0.34628939628601074,
      "kl": 0.008678436279296875,
      "learning_rate": 9.518467388186019e-07,
      "loss": 0.0462,
      "num_tokens": 52841694.0,
      "reward": 0.7478297352790833,
      "reward_std": 0.8671966791152954,
      "rewards/cosine_scaled_reward/mean": 0.7478297352790833,
      "rewards/cosine_scaled_reward/std": 1.4801048040390015,
      "step": 73
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1980.0,
      "completions/mean_length": 769.2879638671875,
      "completions/mean_terminated_length": 734.093994140625,
      "completions/min_length": 140.0,
      "completions/min_terminated_length": 140.0,
      "epoch": 0.46614173228346456,
      "grad_norm": 0.28708598017692566,
      "kl": 0.00865936279296875,
      "learning_rate": 9.494508899924947e-07,
      "loss": 0.0203,
      "num_tokens": 53685440.0,
      "reward": 0.41296184062957764,
      "reward_std": 1.0075267553329468,
      "rewards/cosine_scaled_reward/mean": 0.41296181082725525,
      "rewards/cosine_scaled_reward/std": 1.4982719421386719,
      "step": 74
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0279017857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 734.6864013671875,
      "completions/mean_terminated_length": 696.9907836914062,
      "completions/min_length": 116.0,
      "completions/min_terminated_length": 116.0,
      "epoch": 0.47244094488188976,
      "grad_norm": 0.22522306442260742,
      "kl": 0.0083770751953125,
      "learning_rate": 9.470000439281378e-07,
      "loss": 0.0213,
      "num_tokens": 54483543.0,
      "reward": 0.43300533294677734,
      "reward_std": 0.853945791721344,
      "rewards/cosine_scaled_reward/mean": 0.43300530314445496,
      "rewards/cosine_scaled_reward/std": 1.4992822408676147,
      "step": 75
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1907.0,
      "completions/mean_length": 721.5346069335938,
      "completions/mean_terminated_length": 694.340576171875,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.47874015748031495,
      "grad_norm": 0.40116333961486816,
      "kl": 0.0081329345703125,
      "learning_rate": 9.444945005242923e-07,
      "loss": 0.0563,
      "num_tokens": 55285750.0,
      "reward": 0.3962365686893463,
      "reward_std": 0.8454629778862,
      "rewards/cosine_scaled_reward/mean": 0.3962365686893463,
      "rewards/cosine_scaled_reward/std": 1.4971959590911865,
      "step": 76
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.016741071428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 661.125,
      "completions/mean_terminated_length": 637.511962890625,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.48503937007874015,
      "grad_norm": 0.24201875925064087,
      "kl": 0.009799957275390625,
      "learning_rate": 9.419345663727804e-07,
      "loss": 0.0013,
      "num_tokens": 56000966.0,
      "reward": 0.6039526462554932,
      "reward_std": 0.9591172933578491,
      "rewards/cosine_scaled_reward/mean": 0.6039525866508484,
      "rewards/cosine_scaled_reward/std": 1.497071623802185,
      "step": 77
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021205357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 667.388427734375,
      "completions/mean_terminated_length": 637.4777221679688,
      "completions/min_length": 84.0,
      "completions/min_terminated_length": 84.0,
      "epoch": 0.49133858267716535,
      "grad_norm": 0.4878818392753601,
      "kl": 0.00946044921875,
      "learning_rate": 9.393205547209708e-07,
      "loss": 0.0725,
      "num_tokens": 56714002.0,
      "reward": 0.7847359776496887,
      "reward_std": 0.8636373281478882,
      "rewards/cosine_scaled_reward/mean": 0.7847359776496887,
      "rewards/cosine_scaled_reward/std": 1.473402976989746,
      "step": 78
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021205357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1943.0,
      "completions/mean_length": 629.6796875,
      "completions/mean_terminated_length": 598.9520874023438,
      "completions/min_length": 118.0,
      "completions/min_terminated_length": 118.0,
      "epoch": 0.49763779527559054,
      "grad_norm": 0.509618878364563,
      "kl": 0.00946044921875,
      "learning_rate": 9.366527854334462e-07,
      "loss": 0.0226,
      "num_tokens": 57416579.0,
      "reward": 0.6808841228485107,
      "reward_std": 0.9489257335662842,
      "rewards/cosine_scaled_reward/mean": 0.680884063243866,
      "rewards/cosine_scaled_reward/std": 1.4896831512451172,
      "step": 79
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.011160714285714302,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 645.3270263671875,
      "completions/mean_terminated_length": 629.4954833984375,
      "completions/min_length": 78.0,
      "completions/min_terminated_length": 78.0,
      "epoch": 0.5039370078740157,
      "grad_norm": 0.25900280475616455,
      "kl": 0.009029388427734375,
      "learning_rate": 9.339315849528649e-07,
      "loss": 0.0104,
      "num_tokens": 58139288.0,
      "reward": 0.7511017322540283,
      "reward_std": 0.8574750423431396,
      "rewards/cosine_scaled_reward/mean": 0.7511016726493835,
      "rewards/cosine_scaled_reward/std": 1.4796228408813477,
      "step": 80
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1972.0,
      "completions/mean_length": 636.7020263671875,
      "completions/mean_terminated_length": 614.3004760742188,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.510236220472441,
      "grad_norm": 0.2857658267021179,
      "kl": 0.009235382080078125,
      "learning_rate": 9.311572862600138e-07,
      "loss": 0.0175,
      "num_tokens": 58831197.0,
      "reward": 0.9521499872207642,
      "reward_std": 0.9827561378479004,
      "rewards/cosine_scaled_reward/mean": 0.9521499276161194,
      "rewards/cosine_scaled_reward/std": 1.4308584928512573,
      "step": 81
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.012276785714285698,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 664.6350708007812,
      "completions/mean_terminated_length": 647.440673828125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "epoch": 0.5165354330708661,
      "grad_norm": 0.19333045184612274,
      "kl": 0.008487701416015625,
      "learning_rate": 9.283302288330643e-07,
      "loss": 0.0009,
      "num_tokens": 59560566.0,
      "reward": 0.7243615984916687,
      "reward_std": 0.8989306688308716,
      "rewards/cosine_scaled_reward/mean": 0.7243615388870239,
      "rewards/cosine_scaled_reward/std": 1.4838638305664062,
      "step": 82
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0234375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 717.2957763671875,
      "completions/mean_terminated_length": 685.3588256835938,
      "completions/min_length": 80.0,
      "completions/min_terminated_length": 80.0,
      "epoch": 0.5228346456692914,
      "grad_norm": 0.7998607754707336,
      "kl": 0.0081939697265625,
      "learning_rate": 9.25450758606031e-07,
      "loss": -0.0049,
      "num_tokens": 60337039.0,
      "reward": 0.38290420174598694,
      "reward_std": 0.8364748358726501,
      "rewards/cosine_scaled_reward/mean": 0.38290414214134216,
      "rewards/cosine_scaled_reward/std": 1.496158242225647,
      "step": 83
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.017857142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 657.6328125,
      "completions/mean_terminated_length": 632.3533935546875,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "epoch": 0.5291338582677165,
      "grad_norm": 0.2160445600748062,
      "kl": 0.008991241455078125,
      "learning_rate": 9.225192279264422e-07,
      "loss": 0.0412,
      "num_tokens": 61047318.0,
      "reward": 0.7544872164726257,
      "reward_std": 0.7835718393325806,
      "rewards/cosine_scaled_reward/mean": 0.754487156867981,
      "rewards/cosine_scaled_reward/std": 1.4789953231811523,
      "step": 84
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1884.0,
      "completions/mean_length": 613.2232666015625,
      "completions/mean_terminated_length": 590.448974609375,
      "completions/min_length": 39.0,
      "completions/min_terminated_length": 39.0,
      "epoch": 0.5354330708661418,
      "grad_norm": 0.3396480083465576,
      "kl": 0.009510040283203125,
      "learning_rate": 9.195359955122243e-07,
      "loss": 0.0483,
      "num_tokens": 61716782.0,
      "reward": 0.9754770398139954,
      "reward_std": 0.7877938747406006,
      "rewards/cosine_scaled_reward/mean": 0.9754770994186401,
      "rewards/cosine_scaled_reward/std": 1.4231003522872925,
      "step": 85
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.016741071428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 678.0457763671875,
      "completions/mean_terminated_length": 654.7208251953125,
      "completions/min_length": 125.0,
      "completions/min_terminated_length": 125.0,
      "epoch": 0.5417322834645669,
      "grad_norm": 0.3173322379589081,
      "kl": 0.009063720703125,
      "learning_rate": 9.165014264078068e-07,
      "loss": 0.0434,
      "num_tokens": 62462775.0,
      "reward": 0.7376777529716492,
      "reward_std": 0.8068321943283081,
      "rewards/cosine_scaled_reward/mean": 0.7376776933670044,
      "rewards/cosine_scaled_reward/std": 1.4817646741867065,
      "step": 86
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0189732142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 698.255615234375,
      "completions/mean_terminated_length": 672.1513061523438,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "epoch": 0.5480314960629922,
      "grad_norm": 0.381440669298172,
      "kl": 0.00875091552734375,
      "learning_rate": 9.134158919394544e-07,
      "loss": -0.0019,
      "num_tokens": 63246604.0,
      "reward": 0.6875295042991638,
      "reward_std": 0.8792651891708374,
      "rewards/cosine_scaled_reward/mean": 0.6875295042991638,
      "rewards/cosine_scaled_reward/std": 1.4889642000198364,
      "step": 87
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1945.0,
      "completions/mean_length": 677.9765625,
      "completions/mean_terminated_length": 640.2694702148438,
      "completions/min_length": 12.0,
      "completions/min_terminated_length": 12.0,
      "epoch": 0.5543307086614173,
      "grad_norm": 0.31262969970703125,
      "kl": 0.009006500244140625,
      "learning_rate": 9.102797696698283e-07,
      "loss": 0.0392,
      "num_tokens": 63992103.0,
      "reward": 0.6004459261894226,
      "reward_std": 0.7837154269218445,
      "rewards/cosine_scaled_reward/mean": 0.6004458665847778,
      "rewards/cosine_scaled_reward/std": 1.4973416328430176,
      "step": 88
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.021205357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 699.6752319335938,
      "completions/mean_terminated_length": 670.4640502929688,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.5606299212598426,
      "grad_norm": 0.2140110731124878,
      "kl": 0.008007049560546875,
      "learning_rate": 9.070934433517872e-07,
      "loss": -0.0011,
      "num_tokens": 64751652.0,
      "reward": 0.6506852507591248,
      "reward_std": 0.9374973773956299,
      "rewards/cosine_scaled_reward/mean": 0.6506852507591248,
      "rewards/cosine_scaled_reward/std": 1.4932117462158203,
      "step": 89
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 648.125,
      "completions/mean_terminated_length": 616.1643676757812,
      "completions/min_length": 108.0,
      "completions/min_terminated_length": 108.0,
      "epoch": 0.5669291338582677,
      "grad_norm": 0.2540406286716461,
      "kl": 0.009563446044921875,
      "learning_rate": 9.038573028814271e-07,
      "loss": 0.0493,
      "num_tokens": 65504420.0,
      "reward": 0.8448294401168823,
      "reward_std": 0.9804660081863403,
      "rewards/cosine_scaled_reward/mean": 0.8448294401168823,
      "rewards/cosine_scaled_reward/std": 1.4603689908981323,
      "step": 90
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0279017857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2003.0,
      "completions/mean_length": 672.9933471679688,
      "completions/mean_terminated_length": 633.5269775390625,
      "completions/min_length": 62.0,
      "completions/min_terminated_length": 62.0,
      "epoch": 0.573228346456693,
      "grad_norm": 0.4330436885356903,
      "kl": 0.008884429931640625,
      "learning_rate": 9.005717442503739e-07,
      "loss": 0.05,
      "num_tokens": 66254270.0,
      "reward": 0.7009679675102234,
      "reward_std": 0.9250093698501587,
      "rewards/cosine_scaled_reward/mean": 0.7009679079055786,
      "rewards/cosine_scaled_reward/std": 1.4871753454208374,
      "step": 91
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.024553571428571397,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 653.8158569335938,
      "completions/mean_terminated_length": 618.721923828125,
      "completions/min_length": 89.0,
      "completions/min_terminated_length": 89.0,
      "epoch": 0.5795275590551181,
      "grad_norm": 0.32978150248527527,
      "kl": 0.009395599365234375,
      "learning_rate": 8.972371694973261e-07,
      "loss": 0.0342,
      "num_tokens": 66965097.0,
      "reward": 0.7579172849655151,
      "reward_std": 0.791741132736206,
      "rewards/cosine_scaled_reward/mean": 0.7579172849655151,
      "rewards/cosine_scaled_reward/std": 1.4783599376678467,
      "step": 92
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0323660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 685.1964721679688,
      "completions/mean_terminated_length": 639.6124267578125,
      "completions/min_length": 96.0,
      "completions/min_terminated_length": 96.0,
      "epoch": 0.5858267716535434,
      "grad_norm": 0.3639669120311737,
      "kl": 0.0100860595703125,
      "learning_rate": 8.938539866588592e-07,
      "loss": 0.048,
      "num_tokens": 67714169.0,
      "reward": 0.7544926404953003,
      "reward_std": 0.9491753578186035,
      "rewards/cosine_scaled_reward/mean": 0.7544926404953003,
      "rewards/cosine_scaled_reward/std": 1.4788836240768433,
      "step": 93
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.036830357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 747.8225708007812,
      "completions/mean_terminated_length": 698.1054077148438,
      "completions/min_length": 16.0,
      "completions/min_terminated_length": 16.0,
      "epoch": 0.5921259842519685,
      "grad_norm": 0.29422667622566223,
      "kl": 0.008636474609375,
      "learning_rate": 8.904226097194969e-07,
      "loss": 0.028,
      "num_tokens": 68528394.0,
      "reward": 0.446431040763855,
      "reward_std": 0.7203037142753601,
      "rewards/cosine_scaled_reward/mean": 0.4464310109615326,
      "rewards/cosine_scaled_reward/std": 1.4998756647109985,
      "step": 94
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0379464285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 733.1272583007812,
      "completions/mean_terminated_length": 681.2644653320312,
      "completions/min_length": 27.0,
      "completions/min_terminated_length": 27.0,
      "epoch": 0.5984251968503937,
      "grad_norm": 0.3252141773700714,
      "kl": 0.009124755859375,
      "learning_rate": 8.869434585610534e-07,
      "loss": 0.048,
      "num_tokens": 69309772.0,
      "reward": 0.6976136565208435,
      "reward_std": 0.9922178983688354,
      "rewards/cosine_scaled_reward/mean": 0.6976136565208435,
      "rewards/cosine_scaled_reward/std": 1.4876292943954468,
      "step": 95
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0279017857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 674.083740234375,
      "completions/mean_terminated_length": 634.648681640625,
      "completions/min_length": 149.0,
      "completions/min_terminated_length": 149.0,
      "epoch": 0.6047244094488189,
      "grad_norm": 0.28459686040878296,
      "kl": 0.009387969970703125,
      "learning_rate": 8.834169589112543e-07,
      "loss": 0.058,
      "num_tokens": 70038327.0,
      "reward": 0.8014413714408875,
      "reward_std": 0.9954638481140137,
      "rewards/cosine_scaled_reward/mean": 0.8014413118362427,
      "rewards/cosine_scaled_reward/std": 1.4701112508773804,
      "step": 96
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0379464285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 734.513427734375,
      "completions/mean_terminated_length": 682.705322265625,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.6110236220472441,
      "grad_norm": 0.40386196970939636,
      "kl": 0.009552001953125,
      "learning_rate": 8.798435422916423e-07,
      "loss": 0.0509,
      "num_tokens": 70831219.0,
      "reward": 0.6741692423820496,
      "reward_std": 0.8956859111785889,
      "rewards/cosine_scaled_reward/mean": 0.6741691827774048,
      "rewards/cosine_scaled_reward/std": 1.490605354309082,
      "step": 97
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.030133928571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 729.880615234375,
      "completions/mean_terminated_length": 688.9263305664062,
      "completions/min_length": 25.0,
      "completions/min_terminated_length": 25.0,
      "epoch": 0.6173228346456693,
      "grad_norm": 0.3040317893028259,
      "kl": 0.00994873046875,
      "learning_rate": 8.762236459647743e-07,
      "loss": 0.0398,
      "num_tokens": 71620504.0,
      "reward": 0.39625483751296997,
      "reward_std": 0.7587441802024841,
      "rewards/cosine_scaled_reward/mean": 0.3962548077106476,
      "rewards/cosine_scaled_reward/std": 1.4971867799758911,
      "step": 98
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0267857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 720.888427734375,
      "completions/mean_terminated_length": 684.3623657226562,
      "completions/min_length": 122.0,
      "completions/min_terminated_length": 122.0,
      "epoch": 0.6236220472440945,
      "grad_norm": 0.34343069791793823,
      "kl": 0.009426116943359375,
      "learning_rate": 8.725577128807142e-07,
      "loss": 0.0646,
      "num_tokens": 72394724.0,
      "reward": 0.570304811000824,
      "reward_std": 0.9204005002975464,
      "rewards/cosine_scaled_reward/mean": 0.5703047513961792,
      "rewards/cosine_scaled_reward/std": 1.4990601539611816,
      "step": 99
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.033482142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 699.3717041015625,
      "completions/mean_terminated_length": 652.6524047851562,
      "completions/min_length": 36.0,
      "completions/min_terminated_length": 36.0,
      "epoch": 0.6299212598425197,
      "grad_norm": 0.45011547207832336,
      "kl": 0.01010894775390625,
      "learning_rate": 8.688461916228332e-07,
      "loss": 0.0548,
      "num_tokens": 73170145.0,
      "reward": 0.7544993758201599,
      "reward_std": 0.8879099488258362,
      "rewards/cosine_scaled_reward/mean": 0.7544994354248047,
      "rewards/cosine_scaled_reward/std": 1.4789304733276367,
      "step": 100
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.025669642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1950.0,
      "completions/mean_length": 646.4163208007812,
      "completions/mean_terminated_length": 609.4902954101562,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.6362204724409449,
      "grad_norm": 0.5061929821968079,
      "kl": 0.0099029541015625,
      "learning_rate": 8.650895363529172e-07,
      "loss": 0.0783,
      "num_tokens": 73878262.0,
      "reward": 0.8282220363616943,
      "reward_std": 1.0407150983810425,
      "rewards/cosine_scaled_reward/mean": 0.8282219767570496,
      "rewards/cosine_scaled_reward/std": 1.4642788171768188,
      "step": 101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 719.0000610351562,
      "completions/mean_terminated_length": 664.9755859375,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.6425196850393701,
      "grad_norm": 0.49533170461654663,
      "kl": 0.0106964111328125,
      "learning_rate": 8.612882067555933e-07,
      "loss": 0.0511,
      "num_tokens": 74669942.0,
      "reward": 0.5335639119148254,
      "reward_std": 0.7676834464073181,
      "rewards/cosine_scaled_reward/mean": 0.5335639119148254,
      "rewards/cosine_scaled_reward/std": 1.5003803968429565,
      "step": 102
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 770.5859985351562,
      "completions/mean_terminated_length": 698.2794799804688,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "epoch": 0.6488188976377953,
      "grad_norm": 0.4521535038948059,
      "kl": 0.0103912353515625,
      "learning_rate": 8.574426679820813e-07,
      "loss": 0.1075,
      "num_tokens": 75488659.0,
      "reward": 0.4699151813983917,
      "reward_std": 0.9292130470275879,
      "rewards/cosine_scaled_reward/mean": 0.46991515159606934,
      "rewards/cosine_scaled_reward/std": 1.5004866123199463,
      "step": 103
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0479910714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 752.2734985351562,
      "completions/mean_terminated_length": 686.9554443359375,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 0.6551181102362205,
      "grad_norm": 0.7238117456436157,
      "kl": 0.01027679443359375,
      "learning_rate": 8.535533905932737e-07,
      "loss": 0.0409,
      "num_tokens": 76295112.0,
      "reward": 0.6177164316177368,
      "reward_std": 0.9820640683174133,
      "rewards/cosine_scaled_reward/mean": 0.617716372013092,
      "rewards/cosine_scaled_reward/std": 1.495753526687622,
      "step": 104
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0323660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 663.9910888671875,
      "completions/mean_terminated_length": 617.6978149414062,
      "completions/min_length": 34.0,
      "completions/min_terminated_length": 34.0,
      "epoch": 0.6614173228346457,
      "grad_norm": 0.44373902678489685,
      "kl": 0.01088714599609375,
      "learning_rate": 8.49620850502157e-07,
      "loss": 0.1424,
      "num_tokens": 77017296.0,
      "reward": 0.7043907642364502,
      "reward_std": 0.8915003538131714,
      "rewards/cosine_scaled_reward/mean": 0.7043907046318054,
      "rewards/cosine_scaled_reward/std": 1.4866788387298584,
      "step": 105
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 702.9989013671875,
      "completions/mean_terminated_length": 648.3240356445312,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.6677165354330709,
      "grad_norm": 0.43054643273353577,
      "kl": 0.0108642578125,
      "learning_rate": 8.45645528915575e-07,
      "loss": 0.0957,
      "num_tokens": 77775551.0,
      "reward": 0.6707912087440491,
      "reward_std": 0.8435035943984985,
      "rewards/cosine_scaled_reward/mean": 0.6707910895347595,
      "rewards/cosine_scaled_reward/std": 1.491044044494629,
      "step": 106
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0401785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1967.0,
      "completions/mean_length": 692.9453735351562,
      "completions/mean_terminated_length": 636.2221069335938,
      "completions/min_length": 56.0,
      "completions/min_terminated_length": 56.0,
      "epoch": 0.6740157480314961,
      "grad_norm": 0.4474175274372101,
      "kl": 0.011871337890625,
      "learning_rate": 8.416279122753466e-07,
      "loss": 0.055,
      "num_tokens": 78514990.0,
      "reward": 0.8312375545501709,
      "reward_std": 0.986255407333374,
      "rewards/cosine_scaled_reward/mean": 0.8312374949455261,
      "rewards/cosine_scaled_reward/std": 1.4634435176849365,
      "step": 107
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.041294642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 754.786865234375,
      "completions/mean_terminated_length": 699.0838012695312,
      "completions/min_length": 32.0,
      "completions/min_terminated_length": 32.0,
      "epoch": 0.6803149606299213,
      "grad_norm": 0.5195061564445496,
      "kl": 0.010135650634765625,
      "learning_rate": 8.375684921987421e-07,
      "loss": 0.0418,
      "num_tokens": 79327503.0,
      "reward": 0.6640704274177551,
      "reward_std": 0.9568787217140198,
      "rewards/cosine_scaled_reward/mean": 0.6640704274177551,
      "rewards/cosine_scaled_reward/std": 1.4918031692504883,
      "step": 108
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.052455357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 766.427490234375,
      "completions/mean_terminated_length": 695.4805908203125,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 0.6866141732283465,
      "grad_norm": 2.433952569961548,
      "kl": 0.01102447509765625,
      "learning_rate": 8.334677654183253e-07,
      "loss": 0.1123,
      "num_tokens": 80162590.0,
      "reward": 0.6239715814590454,
      "reward_std": 0.9540754556655884,
      "rewards/cosine_scaled_reward/mean": 0.6239715218544006,
      "rewards/cosine_scaled_reward/std": 1.4956040382385254,
      "step": 109
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.044642857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 733.6551513671875,
      "completions/mean_terminated_length": 672.2371215820312,
      "completions/min_length": 138.0,
      "completions/min_terminated_length": 138.0,
      "epoch": 0.6929133858267716,
      "grad_norm": 0.41383421421051025,
      "kl": 0.01192474365234375,
      "learning_rate": 8.293262337211722e-07,
      "loss": 0.0604,
      "num_tokens": 80939513.0,
      "reward": 0.6272814869880676,
      "reward_std": 0.896499514579773,
      "rewards/cosine_scaled_reward/mean": 0.6272814869880676,
      "rewards/cosine_scaled_reward/std": 1.4953655004501343,
      "step": 110
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0323660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1942.0,
      "completions/mean_length": 695.2210083007812,
      "completions/mean_terminated_length": 649.9722900390625,
      "completions/min_length": 85.0,
      "completions/min_terminated_length": 85.0,
      "epoch": 0.6992125984251969,
      "grad_norm": 0.4339134097099304,
      "kl": 0.01230621337890625,
      "learning_rate": 8.251444038874685e-07,
      "loss": 0.0797,
      "num_tokens": 81699887.0,
      "reward": 0.6373085379600525,
      "reward_std": 0.944847583770752,
      "rewards/cosine_scaled_reward/mean": 0.6373085379600525,
      "rewards/cosine_scaled_reward/std": 1.494478702545166,
      "step": 111
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 699.7533569335938,
      "completions/mean_terminated_length": 623.4375,
      "completions/min_length": 82.0,
      "completions/min_terminated_length": 82.0,
      "epoch": 0.705511811023622,
      "grad_norm": 0.5097730755805969,
      "kl": 0.01384735107421875,
      "learning_rate": 8.209227876284971e-07,
      "loss": 0.0861,
      "num_tokens": 82448626.0,
      "reward": 0.6977179646492004,
      "reward_std": 0.9958769083023071,
      "rewards/cosine_scaled_reward/mean": 0.6977178454399109,
      "rewards/cosine_scaled_reward/std": 1.4875280857086182,
      "step": 112
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.029017857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 691.3236694335938,
      "completions/mean_terminated_length": 650.779296875,
      "completions/min_length": 86.0,
      "completions/min_terminated_length": 86.0,
      "epoch": 0.7118110236220473,
      "grad_norm": 0.4512317180633545,
      "kl": 0.01357269287109375,
      "learning_rate": 8.166619015240235e-07,
      "loss": 0.1306,
      "num_tokens": 83188772.0,
      "reward": 0.93192458152771,
      "reward_std": 1.0319198369979858,
      "rewards/cosine_scaled_reward/mean": 0.9319245219230652,
      "rewards/cosine_scaled_reward/std": 1.4371205568313599,
      "step": 113
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0502232142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 719.3203735351562,
      "completions/mean_terminated_length": 649.0610961914062,
      "completions/min_length": 51.0,
      "completions/min_terminated_length": 51.0,
      "epoch": 0.7181102362204724,
      "grad_norm": 0.3798118531703949,
      "kl": 0.013671875,
      "learning_rate": 8.12362266959083e-07,
      "loss": 0.0621,
      "num_tokens": 83956275.0,
      "reward": 0.6744152307510376,
      "reward_std": 0.8991620540618896,
      "rewards/cosine_scaled_reward/mean": 0.6744151711463928,
      "rewards/cosine_scaled_reward/std": 1.4903264045715332,
      "step": 114
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0479910714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 738.3370971679688,
      "completions/mean_terminated_length": 672.3165283203125,
      "completions/min_length": 54.0,
      "completions/min_terminated_length": 54.0,
      "epoch": 0.7244094488188977,
      "grad_norm": 0.4109092056751251,
      "kl": 0.013885498046875,
      "learning_rate": 8.080244100601821e-07,
      "loss": 0.1301,
      "num_tokens": 84749041.0,
      "reward": 0.6942418217658997,
      "reward_std": 0.9705992341041565,
      "rewards/cosine_scaled_reward/mean": 0.6942418217658997,
      "rewards/cosine_scaled_reward/std": 1.488061785697937,
      "step": 115
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0647321428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 739.8873291015625,
      "completions/mean_terminated_length": 649.3496704101562,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.7307086614173228,
      "grad_norm": 0.5201711058616638,
      "kl": 0.0142669677734375,
      "learning_rate": 8.036488616309183e-07,
      "loss": 0.1653,
      "num_tokens": 85548028.0,
      "reward": 0.7681640982627869,
      "reward_std": 1.0258592367172241,
      "rewards/cosine_scaled_reward/mean": 0.7681640982627869,
      "rewards/cosine_scaled_reward/std": 1.4761563539505005,
      "step": 116
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.056919642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 743.7388916015625,
      "completions/mean_terminated_length": 665.0201416015625,
      "completions/min_length": 40.0,
      "completions/min_terminated_length": 40.0,
      "epoch": 0.7370078740157481,
      "grad_norm": 0.31872037053108215,
      "kl": 0.0137939453125,
      "learning_rate": 7.992361570870287e-07,
      "loss": 0.0601,
      "num_tokens": 86343826.0,
      "reward": 0.6071450114250183,
      "reward_std": 0.8564595580101013,
      "rewards/cosine_scaled_reward/mean": 0.6071449518203735,
      "rewards/cosine_scaled_reward/std": 1.4969135522842407,
      "step": 117
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 694.1339721679688,
      "completions/mean_terminated_length": 617.5,
      "completions/min_length": 30.0,
      "completions/min_terminated_length": 30.0,
      "epoch": 0.7433070866141732,
      "grad_norm": 0.34207943081855774,
      "kl": 0.0148162841796875,
      "learning_rate": 7.947868363908728e-07,
      "loss": 0.1579,
      "num_tokens": 87078410.0,
      "reward": 0.8284109830856323,
      "reward_std": 0.9332945942878723,
      "rewards/cosine_scaled_reward/mean": 0.8284109830856323,
      "rewards/cosine_scaled_reward/std": 1.4639177322387695,
      "step": 118
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0714285714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 796.0949096679688,
      "completions/mean_terminated_length": 699.7944946289062,
      "completions/min_length": 129.0,
      "completions/min_terminated_length": 129.0,
      "epoch": 0.7496062992125985,
      "grad_norm": 0.34661048650741577,
      "kl": 0.0142364501953125,
      "learning_rate": 7.903014439853603e-07,
      "loss": 0.1492,
      "num_tokens": 87927503.0,
      "reward": 0.6473815441131592,
      "reward_std": 0.9624386429786682,
      "rewards/cosine_scaled_reward/mean": 0.6473814845085144,
      "rewards/cosine_scaled_reward/std": 1.4935013055801392,
      "step": 119
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 709.3906860351562,
      "completions/mean_terminated_length": 633.6203002929688,
      "completions/min_length": 68.0,
      "completions/min_terminated_length": 68.0,
      "epoch": 0.7559055118110236,
      "grad_norm": 0.3804778456687927,
      "kl": 0.01534271240234375,
      "learning_rate": 7.857805287273305e-07,
      "loss": 0.1421,
      "num_tokens": 88694285.0,
      "reward": 0.6875464916229248,
      "reward_std": 0.925369381904602,
      "rewards/cosine_scaled_reward/mean": 0.68754643201828,
      "rewards/cosine_scaled_reward/std": 1.4890048503875732,
      "step": 120
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0535714285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 735.6484985351562,
      "completions/mean_terminated_length": 661.3643798828125,
      "completions/min_length": 92.0,
      "completions/min_terminated_length": 92.0,
      "epoch": 0.7622047244094489,
      "grad_norm": 0.47416040301322937,
      "kl": 0.0153656005859375,
      "learning_rate": 7.812246438203903e-07,
      "loss": 0.1223,
      "num_tokens": 89475234.0,
      "reward": 0.634546160697937,
      "reward_std": 0.9282253980636597,
      "rewards/cosine_scaled_reward/mean": 0.634546160697937,
      "rewards/cosine_scaled_reward/std": 1.4941778182983398,
      "step": 121
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 775.114990234375,
      "completions/mean_terminated_length": 678.8463134765625,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.768503937007874,
      "grad_norm": 0.41114383935928345,
      "kl": 0.016082763671875,
      "learning_rate": 7.766343467472218e-07,
      "loss": 0.0932,
      "num_tokens": 90299369.0,
      "reward": 0.48326361179351807,
      "reward_std": 0.9238112568855286,
      "rewards/cosine_scaled_reward/mean": 0.4832635819911957,
      "rewards/cosine_scaled_reward/std": 1.500690221786499,
      "step": 122
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 660.1171875,
      "completions/mean_terminated_length": 567.5916748046875,
      "completions/min_length": 159.0,
      "completions/min_terminated_length": 159.0,
      "epoch": 0.7748031496062993,
      "grad_norm": 0.3870268762111664,
      "kl": 0.01950836181640625,
      "learning_rate": 7.720101992013661e-07,
      "loss": 0.1427,
      "num_tokens": 91015906.0,
      "reward": 0.8052693009376526,
      "reward_std": 0.9232622385025024,
      "rewards/cosine_scaled_reward/mean": 0.8052692413330078,
      "rewards/cosine_scaled_reward/std": 1.4687471389770508,
      "step": 123
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0680803571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2006.0,
      "completions/mean_length": 805.6417846679688,
      "completions/mean_terminated_length": 714.8826293945312,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.7811023622047244,
      "grad_norm": 0.39262351393699646,
      "kl": 0.019805908203125,
      "learning_rate": 7.673527670184901e-07,
      "loss": 0.0868,
      "num_tokens": 91876785.0,
      "reward": 0.6843234896659851,
      "reward_std": 0.8282831907272339,
      "rewards/cosine_scaled_reward/mean": 0.6843234300613403,
      "rewards/cosine_scaled_reward/std": 1.4892817735671997,
      "step": 124
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0926339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 779.9799194335938,
      "completions/mean_terminated_length": 650.5264282226562,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.7874015748031497,
      "grad_norm": 0.4867366850376129,
      "kl": 0.02044677734375,
      "learning_rate": 7.626626201071493e-07,
      "loss": 0.1882,
      "num_tokens": 92693375.0,
      "reward": 0.6739551424980164,
      "reward_std": 0.9228638410568237,
      "rewards/cosine_scaled_reward/mean": 0.6739550828933716,
      "rewards/cosine_scaled_reward/std": 1.4905028343200684,
      "step": 125
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0948660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2009.0,
      "completions/mean_length": 810.3303833007812,
      "completions/mean_terminated_length": 680.611572265625,
      "completions/min_length": 132.0,
      "completions/min_terminated_length": 132.0,
      "epoch": 0.7937007874015748,
      "grad_norm": 116.46406555175781,
      "kl": 0.028717041015625,
      "learning_rate": 7.5794033237905e-07,
      "loss": 0.1429,
      "num_tokens": 93564199.0,
      "reward": 0.610503077507019,
      "reward_std": 0.9900833964347839,
      "rewards/cosine_scaled_reward/mean": 0.610503077507019,
      "rewards/cosine_scaled_reward/std": 1.4967402219772339,
      "step": 126
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1127232142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 809.3761596679688,
      "completions/mean_terminated_length": 652.016357421875,
      "completions/min_length": 90.0,
      "completions/min_terminated_length": 90.0,
      "epoch": 0.8,
      "grad_norm": 0.6866064667701721,
      "kl": 0.0263214111328125,
      "learning_rate": 7.53186481678822e-07,
      "loss": 0.2378,
      "num_tokens": 94407080.0,
      "reward": 0.49012550711631775,
      "reward_std": 1.0525535345077515,
      "rewards/cosine_scaled_reward/mean": 0.49012547731399536,
      "rewards/cosine_scaled_reward/std": 1.5006331205368042,
      "step": 127
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1607142857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 814.9810791015625,
      "completions/mean_terminated_length": 578.8709716796875,
      "completions/min_length": 83.0,
      "completions/min_terminated_length": 83.0,
      "epoch": 0.8062992125984252,
      "grad_norm": 0.7681502103805542,
      "kl": 0.0304718017578125,
      "learning_rate": 7.484016497133111e-07,
      "loss": 0.2825,
      "num_tokens": 95265191.0,
      "reward": 0.5303881764411926,
      "reward_std": 1.094639778137207,
      "rewards/cosine_scaled_reward/mean": 0.5303881764411926,
      "rewards/cosine_scaled_reward/std": 1.5002450942993164,
      "step": 128
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1629464285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 891.5234985351562,
      "completions/mean_terminated_length": 666.39599609375,
      "completions/min_length": 136.0,
      "completions/min_terminated_length": 136.0,
      "epoch": 0.8125984251968504,
      "grad_norm": 0.987953245639801,
      "kl": 0.0319671630859375,
      "learning_rate": 7.435864219803982e-07,
      "loss": 0.264,
      "num_tokens": 96208188.0,
      "reward": 0.49702638387680054,
      "reward_std": 1.135278582572937,
      "rewards/cosine_scaled_reward/mean": 0.49702638387680054,
      "rewards/cosine_scaled_reward/std": 1.5005019903182983,
      "step": 129
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2645089285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1011.3906860351562,
      "completions/mean_terminated_length": 638.5887451171875,
      "completions/min_length": 121.0,
      "completions/min_terminated_length": 121.0,
      "epoch": 0.8188976377952756,
      "grad_norm": 0.9615009427070618,
      "kl": 0.042144775390625,
      "learning_rate": 7.387413876973543e-07,
      "loss": 0.3279,
      "num_tokens": 97255322.0,
      "reward": 0.3463992476463318,
      "reward_std": 1.0665947198867798,
      "rewards/cosine_scaled_reward/mean": 0.3463992476463318,
      "rewards/cosine_scaled_reward/std": 1.4925742149353027,
      "step": 130
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3381696428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 1115.0926513671875,
      "completions/mean_terminated_length": 638.4131469726562,
      "completions/min_length": 20.0,
      "completions/min_terminated_length": 20.0,
      "epoch": 0.8251968503937008,
      "grad_norm": 1.6049638986587524,
      "kl": 0.05584716796875,
      "learning_rate": 7.338671397287408e-07,
      "loss": 0.3132,
      "num_tokens": 98388477.0,
      "reward": 0.15194740891456604,
      "reward_std": 1.066095232963562,
      "rewards/cosine_scaled_reward/mean": 0.15194739401340485,
      "rewards/cosine_scaled_reward/std": 1.4597177505493164,
      "step": 131
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4084821428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 1218.71875,
      "completions/mean_terminated_length": 646.0452880859375,
      "completions/min_length": 11.0,
      "completions/min_terminated_length": 11.0,
      "epoch": 0.831496062992126,
      "grad_norm": 2.0752828121185303,
      "kl": 0.065032958984375,
      "learning_rate": 7.289642745138637e-07,
      "loss": 0.3363,
      "num_tokens": 99606193.0,
      "reward": 0.1383928656578064,
      "reward_std": 1.1820039749145508,
      "rewards/cosine_scaled_reward/mean": 0.1383928507566452,
      "rewards/cosine_scaled_reward/std": 1.4565742015838623,
      "step": 132
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5379464285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1885.0,
      "completions/mean_length": 1383.8248291015625,
      "completions/mean_terminated_length": 610.5579833984375,
      "completions/min_length": 44.0,
      "completions/min_terminated_length": 44.0,
      "epoch": 0.8377952755905512,
      "grad_norm": 2.3217105865478516,
      "kl": 0.09649658203125,
      "learning_rate": 7.240333919937892e-07,
      "loss": 0.2741,
      "num_tokens": 100981460.0,
      "reward": -0.22238367795944214,
      "reward_std": 1.0377624034881592,
      "rewards/cosine_scaled_reward/mean": -0.22238366305828094,
      "rewards/cosine_scaled_reward/std": 1.314494013786316,
      "step": 133
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.5658482142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1962.0,
      "completions/mean_length": 1403.39404296875,
      "completions/mean_terminated_length": 563.251953125,
      "completions/min_length": 14.0,
      "completions/min_terminated_length": 14.0,
      "epoch": 0.8440944881889764,
      "grad_norm": 146.2183837890625,
      "kl": 1.4737548828125,
      "learning_rate": 7.19075095537933e-07,
      "loss": 0.3846,
      "num_tokens": 102352565.0,
      "reward": -0.1662946492433548,
      "reward_std": 1.1486141681671143,
      "rewards/cosine_scaled_reward/mean": -0.1662946492433548,
      "rewards/cosine_scaled_reward/std": 1.3446446657180786,
      "step": 134
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.6975446428571428,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1969.0,
      "completions/mean_length": 1610.3739013671875,
      "completions/mean_terminated_length": 601.0885620117188,
      "completions/min_length": 49.0,
      "completions/min_terminated_length": 49.0,
      "epoch": 0.8503937007874016,
      "grad_norm": 3.1824281215667725,
      "kl": 0.18603515625,
      "learning_rate": 7.140899918702275e-07,
      "loss": 0.2155,
      "num_tokens": 103938276.0,
      "reward": -0.5144848823547363,
      "reward_std": 0.8551457524299622,
      "rewards/cosine_scaled_reward/mean": -0.5144848227500916,
      "rewards/cosine_scaled_reward/std": 1.105492115020752,
      "step": 135
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7209821428571428,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1642.0860595703125,
      "completions/mean_terminated_length": 593.2040405273438,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8566929133858268,
      "grad_norm": 3.775615930557251,
      "kl": 0.2728271484375,
      "learning_rate": 7.090786909948809e-07,
      "loss": 0.214,
      "num_tokens": 105547761.0,
      "reward": -0.5837053656578064,
      "reward_std": 0.8053549528121948,
      "rewards/cosine_scaled_reward/mean": -0.5943182110786438,
      "rewards/cosine_scaled_reward/std": 1.0264818668365479,
      "step": 136
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8459821428571428,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 1817.8974609375,
      "completions/mean_terminated_length": 554.0,
      "completions/min_length": 8.0,
      "completions/min_terminated_length": 8.0,
      "epoch": 0.862992125984252,
      "grad_norm": 3.16469407081604,
      "kl": 0.44287109375,
      "learning_rate": 7.040418061217324e-07,
      "loss": 0.1551,
      "num_tokens": 107305973.0,
      "reward": -0.7589171528816223,
      "reward_std": 0.5782804489135742,
      "rewards/cosine_scaled_reward/mean": -0.7589171528816223,
      "rewards/cosine_scaled_reward/std": 0.8159881234169006,
      "step": 137
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.9040178571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 1906.56591796875,
      "completions/mean_terminated_length": 574.4534912109375,
      "completions/min_length": 13.0,
      "completions/min_terminated_length": 13.0,
      "epoch": 0.8692913385826772,
      "grad_norm": 5.566361427307129,
      "kl": 0.6162109375,
      "learning_rate": 6.989799535912181e-07,
      "loss": 0.087,
      "num_tokens": 109144544.0,
      "reward": -0.9062500596046448,
      "reward_std": 0.27266156673431396,
      "rewards/cosine_scaled_reward/mean": -0.90625,
      "rewards/cosine_scaled_reward/std": 0.5222694277763367,
      "step": 138
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8973214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2034.0,
      "completions/mean_length": 1899.40966796875,
      "completions/mean_terminated_length": 600.8587036132812,
      "completions/min_length": 19.0,
      "completions/min_terminated_length": 19.0,
      "epoch": 0.8755905511811024,
      "grad_norm": 11.27455997467041,
      "kl": 0.79248046875,
      "learning_rate": 6.93893752798951e-07,
      "loss": 0.0864,
      "num_tokens": 110986895.0,
      "reward": -0.9363839626312256,
      "reward_std": 0.22900153696537018,
      "rewards/cosine_scaled_reward/mean": -0.9363839030265808,
      "rewards/cosine_scaled_reward/std": 0.4324464797973633,
      "step": 139
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.8303571428571428,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 1786.923095703125,
      "completions/mean_terminated_length": 509.0197448730469,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.8818897637795275,
      "grad_norm": 8.36593246459961,
      "kl": 0.88232421875,
      "learning_rate": 6.887838261199292e-07,
      "loss": 0.1019,
      "num_tokens": 112728170.0,
      "reward": -0.9129464626312256,
      "reward_std": 0.26700180768966675,
      "rewards/cosine_scaled_reward/mean": -0.9129464030265808,
      "rewards/cosine_scaled_reward/std": 0.5038508772850037,
      "step": 140
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.796875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2010.0,
      "completions/mean_length": 1737.1707763671875,
      "completions/mean_terminated_length": 517.7637329101562,
      "completions/min_length": 5.0,
      "completions/min_terminated_length": 5.0,
      "epoch": 0.8881889763779528,
      "grad_norm": 8.047481536865234,
      "kl": 0.94189453125,
      "learning_rate": 6.836507988323784e-07,
      "loss": 0.1031,
      "num_tokens": 114425763.0,
      "reward": -0.9095982313156128,
      "reward_std": 0.28521886467933655,
      "rewards/cosine_scaled_reward/mean": -0.9095982313156128,
      "rewards/cosine_scaled_reward/std": 0.5131537914276123,
      "step": 141
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.7176339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 1619.587158203125,
      "completions/mean_terminated_length": 530.7747192382812,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.8944881889763779,
      "grad_norm": 8.854314804077148,
      "kl": 0.978515625,
      "learning_rate": 6.784952990412393e-07,
      "loss": 0.1075,
      "num_tokens": 116005153.0,
      "reward": -0.8627232313156128,
      "reward_std": 0.3768954873085022,
      "rewards/cosine_scaled_reward/mean": -0.8627232313156128,
      "rewards/cosine_scaled_reward/std": 0.6272356510162354,
      "step": 142
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4508928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1176.7098388671875,
      "completions/mean_terminated_length": 461.2601318359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9007874015748032,
      "grad_norm": 665.7935791015625,
      "kl": 0.65087890625,
      "learning_rate": 6.733179576013097e-07,
      "loss": 0.1599,
      "num_tokens": 117187645.0,
      "reward": -0.691964328289032,
      "reward_std": 0.697722852230072,
      "rewards/cosine_scaled_reward/mean": -0.6919642686843872,
      "rewards/cosine_scaled_reward/std": 0.9111244082450867,
      "step": 143
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4017857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 1132.872802734375,
      "completions/mean_terminated_length": 518.2350463867188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 0.9070866141732283,
      "grad_norm": 571.6249389648438,
      "kl": 0.51708984375,
      "learning_rate": 6.681194080400495e-07,
      "loss": 0.1361,
      "num_tokens": 118343115.0,
      "reward": -0.7488839626312256,
      "reward_std": 0.5547734498977661,
      "rewards/cosine_scaled_reward/mean": -0.7488839030265808,
      "rewards/cosine_scaled_reward/std": 0.8313003182411194,
      "step": 144
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.4486607142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 1147.0926513671875,
      "completions/mean_terminated_length": 413.9656066894531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9133858267716536,
      "grad_norm": 25.801456451416016,
      "kl": 0.84716796875,
      "learning_rate": 6.629002864800588e-07,
      "loss": 0.1159,
      "num_tokens": 119498350.0,
      "reward": -0.7589285969734192,
      "reward_std": 0.5432791113853455,
      "rewards/cosine_scaled_reward/mean": -0.7589285969734192,
      "rewards/cosine_scaled_reward/std": 0.8159914016723633,
      "step": 145
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3939732142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1863.0,
      "completions/mean_length": 1025.8795166015625,
      "completions/mean_terminated_length": 361.4070129394531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9196850393700787,
      "grad_norm": 1.8065078258514404,
      "kl": 0.93115234375,
      "learning_rate": 6.576612315612386e-07,
      "loss": 0.1227,
      "num_tokens": 120553410.0,
      "reward": -0.8292410969734192,
      "reward_std": 0.4767807126045227,
      "rewards/cosine_scaled_reward/mean": -0.8292410969734192,
      "rewards/cosine_scaled_reward/std": 0.6954552531242371,
      "step": 146
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3102678571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 873.575927734375,
      "completions/mean_terminated_length": 345.27508544921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.925984251968504,
      "grad_norm": 2.2588658332824707,
      "kl": 0.89501953125,
      "learning_rate": 6.524028843626433e-07,
      "loss": 0.1051,
      "num_tokens": 121469110.0,
      "reward": -0.7756592035293579,
      "reward_std": 0.5666614770889282,
      "rewards/cosine_scaled_reward/mean": -0.7756591439247131,
      "rewards/cosine_scaled_reward/std": 0.7895299792289734,
      "step": 147
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2522321428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 786.1641235351562,
      "completions/mean_terminated_length": 360.52984619140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9322834645669291,
      "grad_norm": 21.380773544311523,
      "kl": 0.92578125,
      "learning_rate": 6.47125888324035e-07,
      "loss": -0.2447,
      "num_tokens": 122307657.0,
      "reward": -0.8401402831077576,
      "reward_std": 0.39588436484336853,
      "rewards/cosine_scaled_reward/mean": -0.8401403427124023,
      "rewards/cosine_scaled_reward/std": 0.6686471104621887,
      "step": 148
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1897321428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 673.6920166015625,
      "completions/mean_terminated_length": 351.8843078613281,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9385826771653544,
      "grad_norm": 21.64946746826172,
      "kl": 0.84326171875,
      "learning_rate": 6.418308891671484e-07,
      "loss": -0.312,
      "num_tokens": 123060837.0,
      "reward": -0.6996389627456665,
      "reward_std": 0.5653135776519775,
      "rewards/cosine_scaled_reward/mean": -0.6996389031410217,
      "rewards/cosine_scaled_reward/std": 0.8859981298446655,
      "step": 149
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1361607142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 604.5279541015625,
      "completions/mean_terminated_length": 377.0038757324219,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9448818897637795,
      "grad_norm": 15.67458438873291,
      "kl": 0.6279296875,
      "learning_rate": 6.365185348166768e-07,
      "loss": -0.2435,
      "num_tokens": 123715518.0,
      "reward": -0.6860414147377014,
      "reward_std": 0.6978532075881958,
      "rewards/cosine_scaled_reward/mean": -0.6860413551330566,
      "rewards/cosine_scaled_reward/std": 0.8987053036689758,
      "step": 150
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2047.0,
      "completions/mean_length": 675.2667846679688,
      "completions/mean_terminated_length": 390.3598327636719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9511811023622048,
      "grad_norm": 11.492012023925781,
      "kl": 0.377685546875,
      "learning_rate": 6.311894753209895e-07,
      "loss": -0.2775,
      "num_tokens": 124447101.0,
      "reward": -0.6155981421470642,
      "reward_std": 0.7278788685798645,
      "rewards/cosine_scaled_reward/mean": -0.6155981421470642,
      "rewards/cosine_scaled_reward/std": 0.9894371032714844,
      "step": 151
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1969.0,
      "completions/mean_length": 692.622802734375,
      "completions/mean_terminated_length": 411.31805419921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9574803149606299,
      "grad_norm": 3.143880605697632,
      "kl": 0.25732421875,
      "learning_rate": 6.258443627725867e-07,
      "loss": -0.0845,
      "num_tokens": 125219899.0,
      "reward": -0.4098784625530243,
      "reward_std": 0.891829788684845,
      "rewards/cosine_scaled_reward/mean": -0.4098784923553467,
      "rewards/cosine_scaled_reward/std": 1.1879875659942627,
      "step": 152
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1294642857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 643.4051513671875,
      "completions/mean_terminated_length": 434.5166931152344,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9637795275590552,
      "grad_norm": 4.117553234100342,
      "kl": 0.201416015625,
      "learning_rate": 6.204838512283071e-07,
      "loss": -0.1204,
      "num_tokens": 125923670.0,
      "reward": -0.35601532459259033,
      "reward_std": 0.9412047863006592,
      "rewards/cosine_scaled_reward/mean": -0.35601529479026794,
      "rewards/cosine_scaled_reward/std": 1.2269173860549927,
      "step": 153
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0948660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 623.4207763671875,
      "completions/mean_terminated_length": 474.1121826171875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9700787401574803,
      "grad_norm": 3.315114736557007,
      "kl": 0.15234375,
      "learning_rate": 6.151085966292941e-07,
      "loss": -0.1847,
      "num_tokens": 126628463.0,
      "reward": -0.34353500604629517,
      "reward_std": 0.8719759583473206,
      "rewards/cosine_scaled_reward/mean": -0.3435349762439728,
      "rewards/cosine_scaled_reward/std": 1.236801028251648,
      "step": 154
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0970982142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 621.3303833007812,
      "completions/mean_terminated_length": 467.9060363769531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9763779527559056,
      "grad_norm": 1.6421157121658325,
      "kl": 0.12652587890625,
      "learning_rate": 6.097192567207303e-07,
      "loss": -0.078,
      "num_tokens": 127332503.0,
      "reward": -0.1615428477525711,
      "reward_std": 1.0047760009765625,
      "rewards/cosine_scaled_reward/mean": -0.16154281795024872,
      "rewards/cosine_scaled_reward/std": 1.3453506231307983,
      "step": 155
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 650.7098388671875,
      "completions/mean_terminated_length": 519.3406982421875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9826771653543307,
      "grad_norm": 2.226269006729126,
      "kl": 0.10272216796875,
      "learning_rate": 6.043164909713532e-07,
      "loss": -0.0363,
      "num_tokens": 128044803.0,
      "reward": -0.22488074004650116,
      "reward_std": 0.929477870464325,
      "rewards/cosine_scaled_reward/mean": -0.22488072514533997,
      "rewards/cosine_scaled_reward/std": 1.3119038343429565,
      "step": 156
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0770089285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1941.0,
      "completions/mean_length": 648.5413208007812,
      "completions/mean_terminated_length": 531.7787475585938,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.988976377952756,
      "grad_norm": 2.474903106689453,
      "kl": 0.09051513671875,
      "learning_rate": 5.989009604927586e-07,
      "loss": -0.138,
      "num_tokens": 128762616.0,
      "reward": -0.07482035458087921,
      "reward_std": 0.9417048096656799,
      "rewards/cosine_scaled_reward/mean": -0.07482033967971802,
      "rewards/cosine_scaled_reward/std": 1.385107398033142,
      "step": 157
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.047330097087378675,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1903.0,
      "completions/mean_length": 564.5715942382812,
      "completions/mean_terminated_length": 490.87261962890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 0.9952755905511811,
      "grad_norm": 5.221060276031494,
      "kl": 0.131103515625,
      "learning_rate": 5.934733279585036e-07,
      "loss": -0.1088,
      "num_tokens": 129405493.0,
      "reward": -0.10780435800552368,
      "reward_std": 0.9690559506416321,
      "rewards/cosine_scaled_reward/mean": -0.10780435055494308,
      "rewards/cosine_scaled_reward/std": 1.3703655004501343,
      "step": 158
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0390625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 537.7467041015625,
      "completions/mean_terminated_length": 476.3542175292969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0062992125984251,
      "grad_norm": 4.053348541259766,
      "kl": 0.14080810546875,
      "learning_rate": 5.880342575230181e-07,
      "loss": -0.1351,
      "num_tokens": 130008786.0,
      "reward": 0.06913615763187408,
      "reward_std": 1.0828073024749756,
      "rewards/cosine_scaled_reward/mean": 0.06913615763187408,
      "rewards/cosine_scaled_reward/std": 1.4362900257110596,
      "step": 159
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.022321428571428603,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1820.0,
      "completions/mean_length": 499.8114013671875,
      "completions/mean_terminated_length": 464.464599609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0125984251968503,
      "grad_norm": 4.627394676208496,
      "kl": 0.1671142578125,
      "learning_rate": 5.825844147403352e-07,
      "loss": -0.1719,
      "num_tokens": 130603513.0,
      "reward": -0.08518640697002411,
      "reward_std": 1.049144983291626,
      "rewards/cosine_scaled_reward/mean": -0.0851864144206047,
      "rewards/cosine_scaled_reward/std": 1.3810899257659912,
      "step": 160
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0200892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 541.7254638671875,
      "completions/mean_terminated_length": 510.8451232910156,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0188976377952756,
      "grad_norm": 5.731709957122803,
      "kl": 0.164306640625,
      "learning_rate": 5.771244664826511e-07,
      "loss": -0.0978,
      "num_tokens": 131220211.0,
      "reward": -0.02823840081691742,
      "reward_std": 1.049377202987671,
      "rewards/cosine_scaled_reward/mean": -0.02823840081691742,
      "rewards/cosine_scaled_reward/std": 1.4038633108139038,
      "step": 161
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0345982142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1900.0,
      "completions/mean_length": 541.1127319335938,
      "completions/mean_terminated_length": 487.1086730957031,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0251968503937008,
      "grad_norm": 4.804685115814209,
      "kl": 0.2054443359375,
      "learning_rate": 5.71655080858722e-07,
      "loss": -0.0506,
      "num_tokens": 131831960.0,
      "reward": 0.05900329723954201,
      "reward_std": 1.0088309049606323,
      "rewards/cosine_scaled_reward/mean": 0.0590033121407032,
      "rewards/cosine_scaled_reward/std": 1.4333640336990356,
      "step": 162
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.052455357142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 578.4163208007812,
      "completions/mean_terminated_length": 497.061279296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.031496062992126,
      "grad_norm": 7.659826278686523,
      "kl": 0.22412109375,
      "learning_rate": 5.661769271321113e-07,
      "loss": -0.0245,
      "num_tokens": 132480317.0,
      "reward": 0.09290509670972824,
      "reward_std": 1.1230714321136475,
      "rewards/cosine_scaled_reward/mean": 0.09290510416030884,
      "rewards/cosine_scaled_reward/std": 1.4430676698684692,
      "step": 163
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.049107142857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 579.7467041015625,
      "completions/mean_terminated_length": 503.9213562011719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0377952755905513,
      "grad_norm": 13.899866104125977,
      "kl": 0.27783203125,
      "learning_rate": 5.606906756392949e-07,
      "loss": -0.0631,
      "num_tokens": 133122762.0,
      "reward": 0.048874642699956894,
      "reward_std": 1.049883484840393,
      "rewards/cosine_scaled_reward/mean": 0.04887465760111809,
      "rewards/cosine_scaled_reward/std": 1.4304143190383911,
      "step": 164
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.060267857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 570.8694458007812,
      "completions/mean_terminated_length": 476.1365966796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0440944881889764,
      "grad_norm": 13.861762046813965,
      "kl": 0.345703125,
      "learning_rate": 5.551969977076349e-07,
      "loss": -0.013,
      "num_tokens": 133775221.0,
      "reward": 0.049550432711839676,
      "reward_std": 1.0978636741638184,
      "rewards/cosine_scaled_reward/mean": 0.049550436437129974,
      "rewards/cosine_scaled_reward/std": 1.4299269914627075,
      "step": 165
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1908.0,
      "completions/mean_length": 663.302490234375,
      "completions/mean_terminated_length": 538.64599609375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.0503937007874016,
      "grad_norm": 5.041938781738281,
      "kl": 0.44580078125,
      "learning_rate": 5.49696565573233e-07,
      "loss": -0.0428,
      "num_tokens": 134495524.0,
      "reward": -0.2152213752269745,
      "reward_std": 1.0333141088485718,
      "rewards/cosine_scaled_reward/mean": -0.21522139012813568,
      "rewards/cosine_scaled_reward/std": 1.3177170753479004,
      "step": 166
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1966.0,
      "completions/mean_length": 702.8906860351562,
      "completions/mean_terminated_length": 537.7017211914062,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0566929133858267,
      "grad_norm": 10.903532981872559,
      "kl": 0.56884765625,
      "learning_rate": 5.441900522986712e-07,
      "loss": 0.0091,
      "num_tokens": 135258114.0,
      "reward": -0.14526759088039398,
      "reward_std": 1.0364834070205688,
      "rewards/cosine_scaled_reward/mean": -0.14526759088039398,
      "rewards/cosine_scaled_reward/std": 1.3537439107894897,
      "step": 167
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1261160714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 722.075927734375,
      "completions/mean_terminated_length": 530.7228393554688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0629921259842519,
      "grad_norm": 11.074158668518066,
      "kl": 0.7021484375,
      "learning_rate": 5.38678131690653e-07,
      "loss": 0.0007,
      "num_tokens": 136034406.0,
      "reward": -0.21496644616127014,
      "reward_std": 1.0454844236373901,
      "rewards/cosine_scaled_reward/mean": -0.21496644616127014,
      "rewards/cosine_scaled_reward/std": 1.317626714706421,
      "step": 168
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1618303571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 784.0301513671875,
      "completions/mean_terminated_length": 539.9879760742188,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.0692913385826772,
      "grad_norm": 18.679859161376953,
      "kl": 0.91455078125,
      "learning_rate": 5.33161478217552e-07,
      "loss": -0.0063,
      "num_tokens": 136861985.0,
      "reward": -0.43908149003982544,
      "reward_std": 0.8596888184547424,
      "rewards/cosine_scaled_reward/mean": -0.43908146023750305,
      "rewards/cosine_scaled_reward/std": 1.1680676937103271,
      "step": 169
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1908482142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 843.6439819335938,
      "completions/mean_terminated_length": 559.5820922851562,
      "completions/min_length": 6.0,
      "completions/min_terminated_length": 6.0,
      "epoch": 1.0755905511811024,
      "grad_norm": 8.620196342468262,
      "kl": 1.0087890625,
      "learning_rate": 5.27640766926881e-07,
      "loss": 0.0926,
      "num_tokens": 137754034.0,
      "reward": -0.37983787059783936,
      "reward_std": 0.88871169090271,
      "rewards/cosine_scaled_reward/mean": -0.37983784079551697,
      "rewards/cosine_scaled_reward/std": 1.2146024703979492,
      "step": 170
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2243303571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2044.0,
      "completions/mean_length": 887.3605346679688,
      "completions/mean_terminated_length": 551.6935424804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.0818897637795275,
      "grad_norm": 9.824555397033691,
      "kl": 1.1435546875,
      "learning_rate": 5.221166733626894e-07,
      "loss": 0.1494,
      "num_tokens": 138682053.0,
      "reward": -0.32001835107803345,
      "reward_std": 0.9776201248168945,
      "rewards/cosine_scaled_reward/mean": -0.32001832127571106,
      "rewards/cosine_scaled_reward/std": 1.2563639879226685,
      "step": 171
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2533482142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 933.0123291015625,
      "completions/mean_terminated_length": 554.68310546875,
      "completions/min_length": 4.0,
      "completions/min_terminated_length": 4.0,
      "epoch": 1.0881889763779529,
      "grad_norm": 7.74678373336792,
      "kl": 1.2822265625,
      "learning_rate": 5.165898734828995e-07,
      "loss": 0.1787,
      "num_tokens": 139647792.0,
      "reward": -0.410632461309433,
      "reward_std": 0.9149812459945679,
      "rewards/cosine_scaled_reward/mean": -0.410632461309433,
      "rewards/cosine_scaled_reward/std": 1.192514181137085,
      "step": 172
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1988.0,
      "completions/mean_length": 1042.896240234375,
      "completions/mean_terminated_length": 586.0308227539062,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.094488188976378,
      "grad_norm": 7.497556209564209,
      "kl": 1.419921875,
      "learning_rate": 5.110610435765934e-07,
      "loss": 0.1739,
      "num_tokens": 140712563.0,
      "reward": -0.5546740293502808,
      "reward_std": 0.7227898240089417,
      "rewards/cosine_scaled_reward/mean": -0.554673969745636,
      "rewards/cosine_scaled_reward/std": 1.0671894550323486,
      "step": 173
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.3080357142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 1014.2254638671875,
      "completions/mean_terminated_length": 554.029052734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1007874015748031,
      "grad_norm": 6.004925727844238,
      "kl": 1.466796875,
      "learning_rate": 5.055308601812578e-07,
      "loss": 0.1959,
      "num_tokens": 141756109.0,
      "reward": -0.5145089626312256,
      "reward_std": 0.8673759698867798,
      "rewards/cosine_scaled_reward/mean": -0.5145089030265808,
      "rewards/cosine_scaled_reward/std": 1.1055024862289429,
      "step": 174
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2533482142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 937.6529541015625,
      "completions/mean_terminated_length": 560.8983764648438,
      "completions/min_length": 7.0,
      "completions/min_terminated_length": 7.0,
      "epoch": 1.1070866141732283,
      "grad_norm": 16.494152069091797,
      "kl": 1.650390625,
      "learning_rate": 5e-07,
      "loss": 0.1548,
      "num_tokens": 142716854.0,
      "reward": -0.611550509929657,
      "reward_std": 0.7335640788078308,
      "rewards/cosine_scaled_reward/mean": -0.6115504503250122,
      "rewards/cosine_scaled_reward/std": 1.0076801776885986,
      "step": 175
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2566964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 931.55810546875,
      "completions/mean_terminated_length": 546.0,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1133858267716534,
      "grad_norm": 7.309991359710693,
      "kl": 1.529296875,
      "learning_rate": 4.944691398187422e-07,
      "loss": 0.1781,
      "num_tokens": 143694762.0,
      "reward": -0.6149553656578064,
      "reward_std": 0.7380089163780212,
      "rewards/cosine_scaled_reward/mean": -0.6149553656578064,
      "rewards/cosine_scaled_reward/std": 1.003991723060608,
      "step": 176
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2131696428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 856.0982666015625,
      "completions/mean_terminated_length": 533.185791015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1196850393700788,
      "grad_norm": 8.146985054016113,
      "kl": 1.611328125,
      "learning_rate": 4.889389564234066e-07,
      "loss": 0.1488,
      "num_tokens": 144588786.0,
      "reward": -0.6183035373687744,
      "reward_std": 0.7189446091651917,
      "rewards/cosine_scaled_reward/mean": -0.6183034777641296,
      "rewards/cosine_scaled_reward/std": 1.000256896018982,
      "step": 177
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1495535714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 721.6049194335938,
      "completions/mean_terminated_length": 488.3543395996094,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.125984251968504,
      "grad_norm": 11.923486709594727,
      "kl": 1.490234375,
      "learning_rate": 4.834101265171005e-07,
      "loss": 0.0083,
      "num_tokens": 145371456.0,
      "reward": -0.47316575050354004,
      "reward_std": 0.8562769293785095,
      "rewards/cosine_scaled_reward/mean": -0.47316572070121765,
      "rewards/cosine_scaled_reward/std": 1.1405795812606812,
      "step": 178
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1316964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2020.0,
      "completions/mean_length": 664.8192138671875,
      "completions/mean_terminated_length": 455.0308532714844,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.132283464566929,
      "grad_norm": 9.277202606201172,
      "kl": 1.2861328125,
      "learning_rate": 4.778833266373106e-07,
      "loss": -0.0526,
      "num_tokens": 146098318.0,
      "reward": -0.5175777077674866,
      "reward_std": 0.8472175002098083,
      "rewards/cosine_scaled_reward/mean": -0.5175777077674866,
      "rewards/cosine_scaled_reward/std": 1.0977907180786133,
      "step": 179
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1551339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2024.0,
      "completions/mean_length": 724.489990234375,
      "completions/mean_terminated_length": 481.4676513671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1385826771653544,
      "grad_norm": 7.163623332977295,
      "kl": 1.0029296875,
      "learning_rate": 4.7235923307311906e-07,
      "loss": 0.0004,
      "num_tokens": 146870917.0,
      "reward": -0.34621867537498474,
      "reward_std": 0.9726521968841553,
      "rewards/cosine_scaled_reward/mean": -0.34621864557266235,
      "rewards/cosine_scaled_reward/std": 1.2381350994110107,
      "step": 180
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1049107142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 627.3973388671875,
      "completions/mean_terminated_length": 460.8927917480469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1448818897637796,
      "grad_norm": 10.082908630371094,
      "kl": 0.9375,
      "learning_rate": 4.6683852178244817e-07,
      "loss": 0.0122,
      "num_tokens": 147552937.0,
      "reward": -0.2516986131668091,
      "reward_std": 1.0554618835449219,
      "rewards/cosine_scaled_reward/mean": -0.2516985833644867,
      "rewards/cosine_scaled_reward/std": 1.2968847751617432,
      "step": 181
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1506696428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 723.8236694335938,
      "completions/mean_terminated_length": 488.9172058105469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1511811023622047,
      "grad_norm": 8.064560890197754,
      "kl": 0.7548828125,
      "learning_rate": 4.613218683093471e-07,
      "loss": -0.0383,
      "num_tokens": 148331531.0,
      "reward": -0.4500025510787964,
      "reward_std": 0.8437191247940063,
      "rewards/cosine_scaled_reward/mean": -0.450002521276474,
      "rewards/cosine_scaled_reward/std": 1.1603206396102905,
      "step": 182
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 644.7142944335938,
      "completions/mean_terminated_length": 486.08197021484375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1574803149606299,
      "grad_norm": 6.472278594970703,
      "kl": 0.8740234375,
      "learning_rate": 4.558099477013288e-07,
      "loss": -0.0598,
      "num_tokens": 149031899.0,
      "reward": -0.27882441878318787,
      "reward_std": 0.894206166267395,
      "rewards/cosine_scaled_reward/mean": -0.2788243889808655,
      "rewards/cosine_scaled_reward/std": 1.2811487913131714,
      "step": 183
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1037946428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2021.0,
      "completions/mean_length": 630.2567138671875,
      "completions/mean_terminated_length": 466.05975341796875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.163779527559055,
      "grad_norm": 7.136538982391357,
      "kl": 0.78369140625,
      "learning_rate": 4.5030343442676703e-07,
      "loss": 0.013,
      "num_tokens": 149728417.0,
      "reward": -0.219038724899292,
      "reward_std": 0.9884912371635437,
      "rewards/cosine_scaled_reward/mean": -0.219038724899292,
      "rewards/cosine_scaled_reward/std": 1.3161735534667969,
      "step": 184
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1071428571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 634.265625,
      "completions/mean_terminated_length": 464.61749267578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1700787401574804,
      "grad_norm": 6.526285171508789,
      "kl": 0.94677734375,
      "learning_rate": 4.4480300229236517e-07,
      "loss": -0.0554,
      "num_tokens": 150418991.0,
      "reward": -0.385952889919281,
      "reward_std": 0.8925806283950806,
      "rewards/cosine_scaled_reward/mean": -0.385952889919281,
      "rewards/cosine_scaled_reward/std": 1.2094550132751465,
      "step": 185
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1071428571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1817.0,
      "completions/mean_length": 641.4152221679688,
      "completions/mean_terminated_length": 472.625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1763779527559055,
      "grad_norm": 24.995378494262695,
      "kl": 0.736572265625,
      "learning_rate": 4.3930932436070534e-07,
      "loss": 0.0653,
      "num_tokens": 151132563.0,
      "reward": -0.1384265422821045,
      "reward_std": 1.111531138420105,
      "rewards/cosine_scaled_reward/mean": -0.1384265422821045,
      "rewards/cosine_scaled_reward/std": 1.3569211959838867,
      "step": 186
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1026785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 630.9263916015625,
      "completions/mean_terminated_length": 468.77362060546875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.1826771653543307,
      "grad_norm": 11.412405014038086,
      "kl": 0.91064453125,
      "learning_rate": 4.338230728678888e-07,
      "loss": -0.0276,
      "num_tokens": 151825393.0,
      "reward": -0.27033674716949463,
      "reward_std": 1.0509710311889648,
      "rewards/cosine_scaled_reward/mean": -0.27033671736717224,
      "rewards/cosine_scaled_reward/std": 1.284220814704895,
      "step": 187
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 600.7042846679688,
      "completions/mean_terminated_length": 478.0520935058594,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.188976377952756,
      "grad_norm": 14.40892219543457,
      "kl": 0.95556640625,
      "learning_rate": 4.283449191412779e-07,
      "loss": 0.0398,
      "num_tokens": 152486072.0,
      "reward": -0.23905393481254578,
      "reward_std": 1.0509700775146484,
      "rewards/cosine_scaled_reward/mean": -0.23905394971370697,
      "rewards/cosine_scaled_reward/std": 1.3049958944320679,
      "step": 188
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 638.9721069335938,
      "completions/mean_terminated_length": 493.2105712890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.1952755905511812,
      "grad_norm": 8.587828636169434,
      "kl": 0.794921875,
      "learning_rate": 4.228755335173487e-07,
      "loss": -0.045,
      "num_tokens": 153216063.0,
      "reward": -0.35925498604774475,
      "reward_std": 0.8481911420822144,
      "rewards/cosine_scaled_reward/mean": -0.35925498604774475,
      "rewards/cosine_scaled_reward/std": 1.2286874055862427,
      "step": 189
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1305803571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1944.0,
      "completions/mean_length": 704.0391235351562,
      "completions/mean_terminated_length": 502.1861572265625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2015748031496063,
      "grad_norm": 8.152901649475098,
      "kl": 0.9990234375,
      "learning_rate": 4.174155852596646e-07,
      "loss": 0.0527,
      "num_tokens": 153987218.0,
      "reward": -0.2865358889102936,
      "reward_std": 1.0425032377243042,
      "rewards/cosine_scaled_reward/mean": -0.2865358889102936,
      "rewards/cosine_scaled_reward/std": 1.27761971950531,
      "step": 190
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1272321428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 672.5480346679688,
      "completions/mean_terminated_length": 472.0345153808594,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.2078740157480314,
      "grad_norm": 9.168877601623535,
      "kl": 1.3720703125,
      "learning_rate": 4.1196574247698184e-07,
      "loss": 0.1603,
      "num_tokens": 154706349.0,
      "reward": -0.1460474580526352,
      "reward_std": 1.021892786026001,
      "rewards/cosine_scaled_reward/mean": -0.1460474580526352,
      "rewards/cosine_scaled_reward/std": 1.354325294494629,
      "step": 191
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.140625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1977.0,
      "completions/mean_length": 733.3047485351562,
      "completions/mean_terminated_length": 518.1727294921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2141732283464566,
      "grad_norm": 8.236457824707031,
      "kl": 1.1708984375,
      "learning_rate": 4.0652667204149633e-07,
      "loss": 0.1351,
      "num_tokens": 155512830.0,
      "reward": -0.17286305129528046,
      "reward_std": 1.0575268268585205,
      "rewards/cosine_scaled_reward/mean": -0.17286303639411926,
      "rewards/cosine_scaled_reward/std": 1.3412247896194458,
      "step": 192
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1372767857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1964.0,
      "completions/mean_length": 697.7623291015625,
      "completions/mean_terminated_length": 482.91204833984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.220472440944882,
      "grad_norm": 13.081280708312988,
      "kl": 1.5234375,
      "learning_rate": 4.010990395072413e-07,
      "loss": 0.1504,
      "num_tokens": 156270681.0,
      "reward": -0.23280996084213257,
      "reward_std": 0.9587434530258179,
      "rewards/cosine_scaled_reward/mean": -0.23280994594097137,
      "rewards/cosine_scaled_reward/std": 1.3090548515319824,
      "step": 193
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 682.5670166015625,
      "completions/mean_terminated_length": 501.3148193359375,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.226771653543307,
      "grad_norm": 9.536521911621094,
      "kl": 1.55126953125,
      "learning_rate": 3.956835090286468e-07,
      "loss": 0.0377,
      "num_tokens": 157020757.0,
      "reward": -0.35255441069602966,
      "reward_std": 0.8735190629959106,
      "rewards/cosine_scaled_reward/mean": -0.3525543808937073,
      "rewards/cosine_scaled_reward/std": 1.2333698272705078,
      "step": 194
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0948660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 624.279052734375,
      "completions/mean_terminated_length": 475.0603942871094,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2330708661417322,
      "grad_norm": 31.1558780670166,
      "kl": 1.67919921875,
      "learning_rate": 3.9028074327926975e-07,
      "loss": 0.0171,
      "num_tokens": 157706335.0,
      "reward": -0.18870538473129272,
      "reward_std": 0.940399706363678,
      "rewards/cosine_scaled_reward/mean": -0.18870536983013153,
      "rewards/cosine_scaled_reward/std": 1.3321219682693481,
      "step": 195
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0758928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 578.078125,
      "completions/mean_terminated_length": 457.3598937988281,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.2393700787401576,
      "grad_norm": 11.43847370147705,
      "kl": 1.6435546875,
      "learning_rate": 3.8489140337070594e-07,
      "loss": 0.0715,
      "num_tokens": 158351605.0,
      "reward": -0.09771518409252167,
      "reward_std": 1.1073017120361328,
      "rewards/cosine_scaled_reward/mean": -0.09771519154310226,
      "rewards/cosine_scaled_reward/std": 1.374672532081604,
      "step": 196
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 634.375,
      "completions/mean_terminated_length": 501.4700927734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2456692913385827,
      "grad_norm": 12.567444801330566,
      "kl": 1.46484375,
      "learning_rate": 3.795161487716928e-07,
      "loss": 0.0189,
      "num_tokens": 159053941.0,
      "reward": -0.23867203295230865,
      "reward_std": 1.0107946395874023,
      "rewards/cosine_scaled_reward/mean": -0.23867204785346985,
      "rewards/cosine_scaled_reward/std": 1.3047585487365723,
      "step": 197
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0792410714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 582.1451416015625,
      "completions/mean_terminated_length": 455.99273681640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2519685039370079,
      "grad_norm": 41.585182189941406,
      "kl": 1.40478515625,
      "learning_rate": 3.741556372274133e-07,
      "loss": -0.0895,
      "num_tokens": 159702999.0,
      "reward": -0.24997612833976746,
      "reward_std": 0.9644049406051636,
      "rewards/cosine_scaled_reward/mean": -0.24997612833976746,
      "rewards/cosine_scaled_reward/std": 1.295945644378662,
      "step": 198
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0747767857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 556.560302734375,
      "completions/mean_terminated_length": 436.021728515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.258267716535433,
      "grad_norm": 27.830976486206055,
      "kl": 1.490234375,
      "learning_rate": 3.6881052467901054e-07,
      "loss": -0.1011,
      "num_tokens": 160330989.0,
      "reward": -0.23273612558841705,
      "reward_std": 0.9888551235198975,
      "rewards/cosine_scaled_reward/mean": -0.23273612558841705,
      "rewards/cosine_scaled_reward/std": 1.3051189184188843,
      "step": 199
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0613839285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1976.0,
      "completions/mean_length": 537.2288208007812,
      "completions/mean_terminated_length": 438.4268798828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2645669291338582,
      "grad_norm": 12.468254089355469,
      "kl": 1.302734375,
      "learning_rate": 3.634814651833231e-07,
      "loss": -0.0855,
      "num_tokens": 160936538.0,
      "reward": -0.24863407015800476,
      "reward_std": 1.0323801040649414,
      "rewards/cosine_scaled_reward/mean": -0.24863405525684357,
      "rewards/cosine_scaled_reward/std": 1.2952616214752197,
      "step": 200
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0926339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 608.7120971679688,
      "completions/mean_terminated_length": 461.773681640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2708661417322835,
      "grad_norm": 20.568553924560547,
      "kl": 1.21923828125,
      "learning_rate": 3.5816911083285164e-07,
      "loss": -0.1144,
      "num_tokens": 161611224.0,
      "reward": -0.48500585556030273,
      "reward_std": 0.8858336210250854,
      "rewards/cosine_scaled_reward/mean": -0.48500585556030273,
      "rewards/cosine_scaled_reward/std": 1.124108910560608,
      "step": 201
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0982142857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2018.0,
      "completions/mean_length": 620.3125,
      "completions/mean_terminated_length": 464.82177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2771653543307087,
      "grad_norm": 13.276288986206055,
      "kl": 1.01123046875,
      "learning_rate": 3.5287411167596505e-07,
      "loss": -0.1358,
      "num_tokens": 162310720.0,
      "reward": -0.41049110889434814,
      "reward_std": 0.8386304974555969,
      "rewards/cosine_scaled_reward/mean": -0.41049107909202576,
      "rewards/cosine_scaled_reward/std": 1.1842224597930908,
      "step": 202
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 604.693115234375,
      "completions/mean_terminated_length": 441.5366516113281,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2834645669291338,
      "grad_norm": 10.093551635742188,
      "kl": 0.89501953125,
      "learning_rate": 3.475971156373567e-07,
      "loss": -0.0996,
      "num_tokens": 162971117.0,
      "reward": -0.3916541337966919,
      "reward_std": 0.9409958124160767,
      "rewards/cosine_scaled_reward/mean": -0.3916541039943695,
      "rewards/cosine_scaled_reward/std": 1.199949026107788,
      "step": 203
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1316964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1961.0,
      "completions/mean_length": 651.216552734375,
      "completions/mean_terminated_length": 439.36505126953125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2897637795275592,
      "grad_norm": 8.370497703552246,
      "kl": 0.668701171875,
      "learning_rate": 3.423387684387615e-07,
      "loss": -0.1193,
      "num_tokens": 163680943.0,
      "reward": -0.34324315190315247,
      "reward_std": 0.9392971992492676,
      "rewards/cosine_scaled_reward/mean": -0.3432431221008301,
      "rewards/cosine_scaled_reward/std": 1.2325228452682495,
      "step": 204
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1573660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 664.4074096679688,
      "completions/mean_terminated_length": 406.01458740234375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.2960629921259843,
      "grad_norm": 28.92987060546875,
      "kl": 0.61181640625,
      "learning_rate": 3.3709971351994126e-07,
      "loss": 0.061,
      "num_tokens": 164400812.0,
      "reward": -0.2606620490550995,
      "reward_std": 1.1027953624725342,
      "rewards/cosine_scaled_reward/mean": -0.2606620788574219,
      "rewards/cosine_scaled_reward/std": 1.2864234447479248,
      "step": 205
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2098214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 746.3917846679688,
      "completions/mean_terminated_length": 400.7669372558594,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3023622047244094,
      "grad_norm": 13.491728782653809,
      "kl": 0.46875,
      "learning_rate": 3.318805919599506e-07,
      "loss": -0.2178,
      "num_tokens": 165201771.0,
      "reward": -0.4426966905593872,
      "reward_std": 0.7850523591041565,
      "rewards/cosine_scaled_reward/mean": -0.4426967203617096,
      "rewards/cosine_scaled_reward/std": 1.14866304397583,
      "step": 206
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2276785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 803.7210083007812,
      "completions/mean_terminated_length": 436.910400390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3086614173228346,
      "grad_norm": 9.700703620910645,
      "kl": 0.364013671875,
      "learning_rate": 3.266820423986904e-07,
      "loss": -0.1382,
      "num_tokens": 166057569.0,
      "reward": -0.4414040148258209,
      "reward_std": 0.8158534169197083,
      "rewards/cosine_scaled_reward/mean": -0.4414040148258209,
      "rewards/cosine_scaled_reward/std": 1.1522157192230225,
      "step": 207
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2377232142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 850.9308471679688,
      "completions/mean_terminated_length": 477.61346435546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3149606299212597,
      "grad_norm": 15.584068298339844,
      "kl": 0.3076171875,
      "learning_rate": 3.215047009587608e-07,
      "loss": -0.0463,
      "num_tokens": 166948723.0,
      "reward": -0.4017762839794159,
      "reward_std": 0.9151403307914734,
      "rewards/cosine_scaled_reward/mean": -0.4017762839794159,
      "rewards/cosine_scaled_reward/std": 1.192360758781433,
      "step": 208
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2678571428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2036.0,
      "completions/mean_length": 906.7891235351562,
      "completions/mean_terminated_length": 489.2728576660156,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.321259842519685,
      "grad_norm": 8.948128700256348,
      "kl": 0.3023681640625,
      "learning_rate": 3.163492011676217e-07,
      "loss": -0.1143,
      "num_tokens": 167890006.0,
      "reward": -0.4780765175819397,
      "reward_std": 0.8043166399002075,
      "rewards/cosine_scaled_reward/mean": -0.4780765175819397,
      "rewards/cosine_scaled_reward/std": 1.1253687143325806,
      "step": 209
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2466517857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 859.8392944335938,
      "completions/mean_terminated_length": 470.82666015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3275590551181102,
      "grad_norm": 20.913394927978516,
      "kl": 0.2698974609375,
      "learning_rate": 3.112161738800708e-07,
      "loss": 0.0274,
      "num_tokens": 168804806.0,
      "reward": -0.25251245498657227,
      "reward_std": 1.0358350276947021,
      "rewards/cosine_scaled_reward/mean": -0.2525124251842499,
      "rewards/cosine_scaled_reward/std": 1.2934397459030151,
      "step": 210
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2243303571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2023.0,
      "completions/mean_length": 849.2835083007812,
      "completions/mean_terminated_length": 502.60430908203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3338582677165354,
      "grad_norm": 29.57670783996582,
      "kl": 0.3201904296875,
      "learning_rate": 3.0610624720104885e-07,
      "loss": 0.084,
      "num_tokens": 169694404.0,
      "reward": -0.0885855183005333,
      "reward_std": 1.1232795715332031,
      "rewards/cosine_scaled_reward/mean": -0.08858553320169449,
      "rewards/cosine_scaled_reward/std": 1.375993251800537,
      "step": 211
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.2220982142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2001.0,
      "completions/mean_length": 842.013427734375,
      "completions/mean_terminated_length": 497.6929626464844,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3401574803149607,
      "grad_norm": 37.739898681640625,
      "kl": 0.4039306640625,
      "learning_rate": 3.010200464087818e-07,
      "loss": 0.0637,
      "num_tokens": 170570640.0,
      "reward": -0.10702486336231232,
      "reward_std": 1.1242616176605225,
      "rewards/cosine_scaled_reward/mean": -0.10702486336231232,
      "rewards/cosine_scaled_reward/std": 1.3697468042373657,
      "step": 212
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1696428571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 738.1596069335938,
      "completions/mean_terminated_length": 470.55780029296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3464566929133859,
      "grad_norm": 25.782590866088867,
      "kl": 0.537353515625,
      "learning_rate": 2.9595819387826747e-07,
      "loss": 0.0198,
      "num_tokens": 171365391.0,
      "reward": -0.14487382769584656,
      "reward_std": 1.1085338592529297,
      "rewards/cosine_scaled_reward/mean": -0.14487382769584656,
      "rewards/cosine_scaled_reward/std": 1.3499091863632202,
      "step": 213
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1104910714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1938.0,
      "completions/mean_length": 665.2779541015625,
      "completions/mean_terminated_length": 493.5219421386719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.352755905511811,
      "grad_norm": 29.355329513549805,
      "kl": 0.72216796875,
      "learning_rate": 2.909213090051191e-07,
      "loss": 0.0077,
      "num_tokens": 172101320.0,
      "reward": -0.05070412904024124,
      "reward_std": 1.0974308252334595,
      "rewards/cosine_scaled_reward/mean": -0.050704121589660645,
      "rewards/cosine_scaled_reward/std": 1.3941594362258911,
      "step": 214
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0993303571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2038.0,
      "completions/mean_length": 655.5390625,
      "completions/mean_terminated_length": 501.97149658203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3590551181102362,
      "grad_norm": 15.642478942871094,
      "kl": 0.658447265625,
      "learning_rate": 2.859100081297724e-07,
      "loss": -0.0779,
      "num_tokens": 172841611.0,
      "reward": -0.1841743439435959,
      "reward_std": 0.9994798302650452,
      "rewards/cosine_scaled_reward/mean": -0.18417437374591827,
      "rewards/cosine_scaled_reward/std": 1.3331142663955688,
      "step": 215
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 653.40625,
      "completions/mean_terminated_length": 509.1379089355469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3653543307086613,
      "grad_norm": 29.55940055847168,
      "kl": 0.74853515625,
      "learning_rate": 2.8092490446206696e-07,
      "loss": 0.0411,
      "num_tokens": 173559431.0,
      "reward": 0.005901983939111233,
      "reward_std": 1.1592895984649658,
      "rewards/cosine_scaled_reward/mean": 0.005901975091546774,
      "rewards/cosine_scaled_reward/std": 1.4154858589172363,
      "step": 216
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.109375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 690.6953735351562,
      "completions/mean_terminated_length": 524.0087890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3716535433070867,
      "grad_norm": 21.52190399169922,
      "kl": 0.92822265625,
      "learning_rate": 2.7596660800621074e-07,
      "loss": 0.0483,
      "num_tokens": 174322294.0,
      "reward": -0.064623162150383,
      "reward_std": 1.1310359239578247,
      "rewards/cosine_scaled_reward/mean": -0.06462316960096359,
      "rewards/cosine_scaled_reward/std": 1.3891171216964722,
      "step": 217
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0848214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1901.0,
      "completions/mean_length": 620.1183471679688,
      "completions/mean_terminated_length": 487.7780456542969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3779527559055118,
      "grad_norm": 13.272688865661621,
      "kl": 1.056640625,
      "learning_rate": 2.710357254861364e-07,
      "loss": -0.0935,
      "num_tokens": 175016800.0,
      "reward": -0.15486857295036316,
      "reward_std": 0.9477849006652832,
      "rewards/cosine_scaled_reward/mean": -0.15486857295036316,
      "rewards/cosine_scaled_reward/std": 1.3487260341644287,
      "step": 218
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1026785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1804.0,
      "completions/mean_length": 636.3359375,
      "completions/mean_terminated_length": 474.8022155761719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.384251968503937,
      "grad_norm": 16.31165885925293,
      "kl": 1.234619140625,
      "learning_rate": 2.6613286027125914e-07,
      "loss": 0.0854,
      "num_tokens": 175731805.0,
      "reward": 0.0020337319001555443,
      "reward_std": 1.1841351985931396,
      "rewards/cosine_scaled_reward/mean": 0.002033740282058716,
      "rewards/cosine_scaled_reward/std": 1.4147533178329468,
      "step": 219
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1305803571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 687.7924194335938,
      "completions/mean_terminated_length": 483.4993591308594,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3905511811023623,
      "grad_norm": 15.241900444030762,
      "kl": 1.38671875,
      "learning_rate": 2.6125861230264566e-07,
      "loss": 0.0791,
      "num_tokens": 176489315.0,
      "reward": -0.10125848650932312,
      "reward_std": 1.0963494777679443,
      "rewards/cosine_scaled_reward/mean": -0.10125849395990372,
      "rewards/cosine_scaled_reward/std": 1.3734389543533325,
      "step": 220
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1160714285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 726.1942138671875,
      "completions/mean_terminated_length": 552.6237182617188,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.3968503937007875,
      "grad_norm": 7.404560565948486,
      "kl": 1.3603515625,
      "learning_rate": 2.5641357801960184e-07,
      "loss": 0.1102,
      "num_tokens": 177269585.0,
      "reward": -0.1522817313671112,
      "reward_std": 1.0328502655029297,
      "rewards/cosine_scaled_reward/mean": -0.1522817462682724,
      "rewards/cosine_scaled_reward/std": 1.3508657217025757,
      "step": 221
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1272321428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 714.5703735351562,
      "completions/mean_terminated_length": 520.182861328125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4031496062992126,
      "grad_norm": 8.131226539611816,
      "kl": 1.5,
      "learning_rate": 2.5159835028668894e-07,
      "loss": 0.0922,
      "num_tokens": 178060256.0,
      "reward": -0.212301105260849,
      "reward_std": 1.0029926300048828,
      "rewards/cosine_scaled_reward/mean": -0.212301105260849,
      "rewards/cosine_scaled_reward/std": 1.3198490142822266,
      "step": 222
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 709.6517944335938,
      "completions/mean_terminated_length": 518.4591674804688,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4094488188976377,
      "grad_norm": 14.496360778808594,
      "kl": 1.7626953125,
      "learning_rate": 2.4681351832117814e-07,
      "loss": 0.1399,
      "num_tokens": 178830648.0,
      "reward": -0.25911927223205566,
      "reward_std": 0.9717407822608948,
      "rewards/cosine_scaled_reward/mean": -0.25911927223205566,
      "rewards/cosine_scaled_reward/std": 1.293389081954956,
      "step": 223
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1908482142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 794.9766235351562,
      "completions/mean_terminated_length": 499.4358825683594,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4157480314960629,
      "grad_norm": 16.000030517578125,
      "kl": 2.080078125,
      "learning_rate": 2.4205966762095016e-07,
      "loss": 0.2121,
      "num_tokens": 179677731.0,
      "reward": -0.23970577120780945,
      "reward_std": 1.1195229291915894,
      "rewards/cosine_scaled_reward/mean": -0.23970575630664825,
      "rewards/cosine_scaled_reward/std": 1.305370807647705,
      "step": 224
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1662946428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1801.0,
      "completions/mean_length": 754.9642944335938,
      "completions/mean_terminated_length": 497.0495300292969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4220472440944882,
      "grad_norm": 7.867815017700195,
      "kl": 2.3828125,
      "learning_rate": 2.3733737989285068e-07,
      "loss": 0.247,
      "num_tokens": 180468163.0,
      "reward": -0.19275090098381042,
      "reward_std": 1.1585314273834229,
      "rewards/cosine_scaled_reward/mean": -0.19275090098381042,
      "rewards/cosine_scaled_reward/std": 1.330829381942749,
      "step": 225
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1171875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2045.0,
      "completions/mean_length": 679.0089721679688,
      "completions/mean_terminated_length": 497.2844543457031,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4283464566929134,
      "grad_norm": 9.984745979309082,
      "kl": 1.916015625,
      "learning_rate": 2.3264723298150996e-07,
      "loss": 0.1061,
      "num_tokens": 181237179.0,
      "reward": -0.21782931685447693,
      "reward_std": 1.0763907432556152,
      "rewards/cosine_scaled_reward/mean": -0.22178985178470612,
      "rewards/cosine_scaled_reward/std": 1.3146083354949951,
      "step": 226
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1350446428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 676.5926513671875,
      "completions/mean_terminated_length": 462.47613525390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4346456692913385,
      "grad_norm": 11.298858642578125,
      "kl": 2.31640625,
      "learning_rate": 2.2798980079863384e-07,
      "loss": 0.1711,
      "num_tokens": 181973310.0,
      "reward": -0.2311823070049286,
      "reward_std": 1.0497584342956543,
      "rewards/cosine_scaled_reward/mean": -0.23538561165332794,
      "rewards/cosine_scaled_reward/std": 1.3069753646850586,
      "step": 227
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1138392857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1826.0,
      "completions/mean_length": 646.2299194335938,
      "completions/mean_terminated_length": 466.15362548828125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4409448818897639,
      "grad_norm": 35.73098373413086,
      "kl": 2.1005859375,
      "learning_rate": 2.23365653252778e-07,
      "loss": 0.0011,
      "num_tokens": 182683452.0,
      "reward": -0.25549647212028503,
      "reward_std": 0.9556432962417603,
      "rewards/cosine_scaled_reward/mean": -0.25549647212028503,
      "rewards/cosine_scaled_reward/std": 1.2951745986938477,
      "step": 228
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1205357142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 664.2098388671875,
      "completions/mean_terminated_length": 474.55328369140625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.447244094488189,
      "grad_norm": 29.469329833984375,
      "kl": 2.181640625,
      "learning_rate": 2.1877535617960968e-07,
      "loss": 0.0947,
      "num_tokens": 183418856.0,
      "reward": -0.2517828643321991,
      "reward_std": 1.0157699584960938,
      "rewards/cosine_scaled_reward/mean": -0.2517828643321991,
      "rewards/cosine_scaled_reward/std": 1.2969443798065186,
      "step": 229
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1026785714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1985.0,
      "completions/mean_length": 638.6763916015625,
      "completions/mean_terminated_length": 477.4104309082031,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4535433070866142,
      "grad_norm": 48.24620056152344,
      "kl": 2.287109375,
      "learning_rate": 2.1421947127266947e-07,
      "loss": -0.0172,
      "num_tokens": 184123302.0,
      "reward": -0.34807711839675903,
      "reward_std": 0.8679892420768738,
      "rewards/cosine_scaled_reward/mean": -0.34807705879211426,
      "rewards/cosine_scaled_reward/std": 1.235032081604004,
      "step": 230
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1978.0,
      "completions/mean_length": 606.4140625,
      "completions/mean_terminated_length": 470.88037109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4598425196850393,
      "grad_norm": 19.941768646240234,
      "kl": 1.9345703125,
      "learning_rate": 2.0969855601463965e-07,
      "loss": 0.0507,
      "num_tokens": 184796985.0,
      "reward": -0.25824618339538574,
      "reward_std": 1.0666064023971558,
      "rewards/cosine_scaled_reward/mean": -0.25824615359306335,
      "rewards/cosine_scaled_reward/std": 1.2928942441940308,
      "step": 231
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0870535714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1993.0,
      "completions/mean_length": 614.2232666015625,
      "completions/mean_terminated_length": 477.5061340332031,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4661417322834644,
      "grad_norm": 36.38364791870117,
      "kl": 1.9814453125,
      "learning_rate": 2.0521316360912726e-07,
      "loss": -0.0086,
      "num_tokens": 185478049.0,
      "reward": -0.25805556774139404,
      "reward_std": 0.9266217350959778,
      "rewards/cosine_scaled_reward/mean": -0.25805553793907166,
      "rewards/cosine_scaled_reward/std": 1.2928153276443481,
      "step": 232
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0758928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1997.0,
      "completions/mean_length": 577.5011596679688,
      "completions/mean_terminated_length": 456.7355041503906,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.4724409448818898,
      "grad_norm": 26.197174072265625,
      "kl": 1.97509765625,
      "learning_rate": 2.0076384291297133e-07,
      "loss": 0.0475,
      "num_tokens": 186115810.0,
      "reward": -0.12041401863098145,
      "reward_std": 1.0578092336654663,
      "rewards/cosine_scaled_reward/mean": -0.12041400372982025,
      "rewards/cosine_scaled_reward/std": 1.3638639450073242,
      "step": 233
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 577.1819458007812,
      "completions/mean_terminated_length": 465.9435729980469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.478740157480315,
      "grad_norm": 27.090312957763672,
      "kl": 1.75390625,
      "learning_rate": 1.9635113836908167e-07,
      "loss": -0.0501,
      "num_tokens": 186768133.0,
      "reward": -0.22769944369792938,
      "reward_std": 0.9778302907943726,
      "rewards/cosine_scaled_reward/mean": -0.2276994287967682,
      "rewards/cosine_scaled_reward/std": 1.3098596334457397,
      "step": 234
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 602.958740234375,
      "completions/mean_terminated_length": 472.86981201171875,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.48503937007874,
      "grad_norm": 29.3377628326416,
      "kl": 1.5732421875,
      "learning_rate": 1.9197558993981783e-07,
      "loss": -0.0195,
      "num_tokens": 187432064.0,
      "reward": -0.0907805934548378,
      "reward_std": 1.0175319910049438,
      "rewards/cosine_scaled_reward/mean": -0.0907806009054184,
      "rewards/cosine_scaled_reward/std": 1.3775309324264526,
      "step": 235
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0747767857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1991.0,
      "completions/mean_length": 599.8203125,
      "completions/mean_terminated_length": 482.7780456542969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4913385826771655,
      "grad_norm": 30.716096878051758,
      "kl": 1.5380859375,
      "learning_rate": 1.876377330409169e-07,
      "loss": 0.0355,
      "num_tokens": 188096079.0,
      "reward": 0.010077612474560738,
      "reward_std": 1.1685551404953003,
      "rewards/cosine_scaled_reward/mean": 0.010077609680593014,
      "rewards/cosine_scaled_reward/std": 1.4161264896392822,
      "step": 236
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1808.0,
      "completions/mean_length": 582.03125,
      "completions/mean_terminated_length": 457.796630859375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.4976377952755906,
      "grad_norm": 13.030389785766602,
      "kl": 1.29638671875,
      "learning_rate": 1.833380984759764e-07,
      "loss": -0.008,
      "num_tokens": 188745819.0,
      "reward": 0.11718709766864777,
      "reward_std": 1.0303460359573364,
      "rewards/cosine_scaled_reward/mean": 0.11718709021806717,
      "rewards/cosine_scaled_reward/std": 1.448854684829712,
      "step": 237
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1951.0,
      "completions/mean_length": 601.419677734375,
      "completions/mean_terminated_length": 471.19219970703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5039370078740157,
      "grad_norm": 36.063846588134766,
      "kl": 1.2216796875,
      "learning_rate": 1.790772123715028e-07,
      "loss": -0.0307,
      "num_tokens": 189409971.0,
      "reward": 0.08020710945129395,
      "reward_std": 1.0514938831329346,
      "rewards/cosine_scaled_reward/mean": 0.08020710200071335,
      "rewards/cosine_scaled_reward/std": 1.4387195110321045,
      "step": 238
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1968.0,
      "completions/mean_length": 587.857177734375,
      "completions/mean_terminated_length": 450.5787658691406,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.510236220472441,
      "grad_norm": 30.80532455444336,
      "kl": 1.3818359375,
      "learning_rate": 1.7485559611253148e-07,
      "loss": 0.0485,
      "num_tokens": 190066787.0,
      "reward": 0.10385970026254654,
      "reward_std": 1.2190966606140137,
      "rewards/cosine_scaled_reward/mean": 0.10385970771312714,
      "rewards/cosine_scaled_reward/std": 1.4451911449432373,
      "step": 239
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2033.0,
      "completions/mean_length": 619.474365234375,
      "completions/mean_terminated_length": 485.16851806640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.516535433070866,
      "grad_norm": 39.22409439086914,
      "kl": 1.02783203125,
      "learning_rate": 1.706737662788277e-07,
      "loss": -0.058,
      "num_tokens": 190759772.0,
      "reward": -0.0937255322933197,
      "reward_std": 1.0495779514312744,
      "rewards/cosine_scaled_reward/mean": -0.09372551739215851,
      "rewards/cosine_scaled_reward/std": 1.3757864236831665,
      "step": 240
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0948660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2005.0,
      "completions/mean_length": 656.9453125,
      "completions/mean_terminated_length": 511.1504211425781,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5228346456692914,
      "grad_norm": 23.179277420043945,
      "kl": 1.03369140625,
      "learning_rate": 1.665322345816746e-07,
      "loss": -0.0006,
      "num_tokens": 191516299.0,
      "reward": -0.07772157341241837,
      "reward_std": 1.0302178859710693,
      "rewards/cosine_scaled_reward/mean": -0.07772157341241837,
      "rewards/cosine_scaled_reward/std": 1.3834202289581299,
      "step": 241
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0892857142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 626.7410888671875,
      "completions/mean_terminated_length": 487.4019775390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5291338582677165,
      "grad_norm": 23.85130500793457,
      "kl": 1.12841796875,
      "learning_rate": 1.624315078012579e-07,
      "loss": -0.0371,
      "num_tokens": 192203363.0,
      "reward": -0.07777473330497742,
      "reward_std": 1.0998897552490234,
      "rewards/cosine_scaled_reward/mean": -0.07777471840381622,
      "rewards/cosine_scaled_reward/std": 1.383444905281067,
      "step": 242
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0725446428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2026.0,
      "completions/mean_length": 595.3616333007812,
      "completions/mean_terminated_length": 481.7376708984375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5354330708661417,
      "grad_norm": 9.8560209274292,
      "kl": 1.13330078125,
      "learning_rate": 1.5837208772465326e-07,
      "loss": -0.0159,
      "num_tokens": 192875015.0,
      "reward": -0.14465674757957458,
      "reward_std": 1.1027064323425293,
      "rewards/cosine_scaled_reward/mean": -0.14465674757957458,
      "rewards/cosine_scaled_reward/std": 1.3533574342727661,
      "step": 243
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0747767857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2016.0,
      "completions/mean_length": 595.3549194335938,
      "completions/mean_terminated_length": 477.9517517089844,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.541732283464567,
      "grad_norm": 20.040447235107422,
      "kl": 0.97607421875,
      "learning_rate": 1.5435447108442496e-07,
      "loss": -0.0603,
      "num_tokens": 193530709.0,
      "reward": -0.04615917429327965,
      "reward_std": 1.0335677862167358,
      "rewards/cosine_scaled_reward/mean": -0.04615917429327965,
      "rewards/cosine_scaled_reward/std": 1.3946411609649658,
      "step": 244
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1082589285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1999.0,
      "completions/mean_length": 653.046875,
      "completions/mean_terminated_length": 483.6971435546875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5480314960629922,
      "grad_norm": 15.755793571472168,
      "kl": 1.23974609375,
      "learning_rate": 1.5037914949784296e-07,
      "loss": 0.0051,
      "num_tokens": 194240943.0,
      "reward": -0.13129432499408722,
      "reward_std": 1.0558489561080933,
      "rewards/cosine_scaled_reward/mean": -0.13129432499408722,
      "rewards/cosine_scaled_reward/std": 1.3597780466079712,
      "step": 245
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0848214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2007.0,
      "completions/mean_length": 623.2824096679688,
      "completions/mean_terminated_length": 491.2353515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5543307086614173,
      "grad_norm": 25.45458221435547,
      "kl": 1.384521484375,
      "learning_rate": 1.4644660940672627e-07,
      "loss": 0.0341,
      "num_tokens": 194933308.0,
      "reward": -0.014127791859209538,
      "reward_std": 1.109133005142212,
      "rewards/cosine_scaled_reward/mean": -0.01412778440862894,
      "rewards/cosine_scaled_reward/std": 1.4083439111709595,
      "step": 246
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1004464285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2040.0,
      "completions/mean_length": 625.2767944335938,
      "completions/mean_terminated_length": 466.4118957519531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5606299212598427,
      "grad_norm": 34.377811431884766,
      "kl": 1.45751953125,
      "learning_rate": 1.425573320179188e-07,
      "loss": -0.0061,
      "num_tokens": 195623860.0,
      "reward": -0.03965873643755913,
      "reward_std": 1.0644091367721558,
      "rewards/cosine_scaled_reward/mean": -0.03965873643755913,
      "rewards/cosine_scaled_reward/std": 1.397312879562378,
      "step": 247
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0837053571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1990.0,
      "completions/mean_length": 625.8058471679688,
      "completions/mean_terminated_length": 495.8855285644531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5669291338582676,
      "grad_norm": 37.41438293457031,
      "kl": 1.186279296875,
      "learning_rate": 1.3871179324440675e-07,
      "loss": 0.0349,
      "num_tokens": 196315094.0,
      "reward": -0.05050932243466377,
      "reward_std": 1.1455857753753662,
      "rewards/cosine_scaled_reward/mean": -0.050509314984083176,
      "rewards/cosine_scaled_reward/std": 1.3940231800079346,
      "step": 248
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 604.8828125,
      "completions/mean_terminated_length": 482.58477783203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.573228346456693,
      "grad_norm": 10.482443809509277,
      "kl": 1.39111328125,
      "learning_rate": 1.3491046364708293e-07,
      "loss": -0.0028,
      "num_tokens": 196981293.0,
      "reward": 0.052977096289396286,
      "reward_std": 1.158719539642334,
      "rewards/cosine_scaled_reward/mean": 0.05297710373997688,
      "rewards/cosine_scaled_reward/std": 1.4308778047561646,
      "step": 249
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0770089285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2035.0,
      "completions/mean_length": 604.6283569335938,
      "completions/mean_terminated_length": 484.20196533203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.579527559055118,
      "grad_norm": 27.658212661743164,
      "kl": 1.5458984375,
      "learning_rate": 1.3115380837716683e-07,
      "loss": -0.0587,
      "num_tokens": 197652224.0,
      "reward": -0.140888974070549,
      "reward_std": 1.1005117893218994,
      "rewards/cosine_scaled_reward/mean": -0.140888974070549,
      "rewards/cosine_scaled_reward/std": 1.3547370433807373,
      "step": 250
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0870535714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1868.0,
      "completions/mean_length": 577.8114013671875,
      "completions/mean_terminated_length": 437.6222839355469,
      "completions/min_length": 3.0,
      "completions/min_terminated_length": 3.0,
      "epoch": 1.5858267716535432,
      "grad_norm": 18.967714309692383,
      "kl": 1.3759765625,
      "learning_rate": 1.2744228711928584e-07,
      "loss": 0.0063,
      "num_tokens": 198316631.0,
      "reward": -0.0707879513502121,
      "reward_std": 1.1439727544784546,
      "rewards/cosine_scaled_reward/mean": -0.0707879438996315,
      "rewards/cosine_scaled_reward/std": 1.386030912399292,
      "step": 251
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0948660714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2019.0,
      "completions/mean_length": 600.638427734375,
      "completions/mean_terminated_length": 448.9420471191406,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5921259842519686,
      "grad_norm": 12.37597942352295,
      "kl": 2.0859375,
      "learning_rate": 1.2377635403522585e-07,
      "loss": 0.0865,
      "num_tokens": 198984099.0,
      "reward": -0.15372416377067566,
      "reward_std": 1.1067149639129639,
      "rewards/cosine_scaled_reward/mean": -0.15372416377067566,
      "rewards/cosine_scaled_reward/std": 1.3480288982391357,
      "step": 252
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0870535714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1861.0,
      "completions/mean_length": 588.2232666015625,
      "completions/mean_terminated_length": 449.02691650390625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.5984251968503937,
      "grad_norm": 12.937677383422852,
      "kl": 1.830078125,
      "learning_rate": 1.2015645770835764e-07,
      "loss": 0.015,
      "num_tokens": 199644251.0,
      "reward": -0.06733787059783936,
      "reward_std": 1.0872905254364014,
      "rewards/cosine_scaled_reward/mean": -0.06733787059783936,
      "rewards/cosine_scaled_reward/std": 1.3873533010482788,
      "step": 253
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1771.0,
      "completions/mean_length": 603.3292846679688,
      "completions/mean_terminated_length": 473.2737121582031,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.604724409448819,
      "grad_norm": 20.4193115234375,
      "kl": 1.68896484375,
      "learning_rate": 1.1658304108874573e-07,
      "loss": -0.0324,
      "num_tokens": 200326770.0,
      "reward": -0.2275594025850296,
      "reward_std": 0.9816082715988159,
      "rewards/cosine_scaled_reward/mean": -0.2275594025850296,
      "rewards/cosine_scaled_reward/std": 1.3098082542419434,
      "step": 254
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0926339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2031.0,
      "completions/mean_length": 618.0078125,
      "completions/mean_terminated_length": 472.0184326171875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6110236220472443,
      "grad_norm": 53.69316864013672,
      "kl": 1.55517578125,
      "learning_rate": 1.1305654143894672e-07,
      "loss": -0.0259,
      "num_tokens": 201010777.0,
      "reward": -0.343801349401474,
      "reward_std": 0.8968811631202698,
      "rewards/cosine_scaled_reward/mean": -0.3438013195991516,
      "rewards/cosine_scaled_reward/std": 1.236782431602478,
      "step": 255
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0837053571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2046.0,
      "completions/mean_length": 577.9576416015625,
      "completions/mean_terminated_length": 443.6662902832031,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6173228346456692,
      "grad_norm": 54.09641647338867,
      "kl": 1.740234375,
      "learning_rate": 1.0957739028050327e-07,
      "loss": 0.0163,
      "num_tokens": 201664147.0,
      "reward": -0.0305644441395998,
      "reward_std": 1.1066824197769165,
      "rewards/cosine_scaled_reward/mean": -0.030564431101083755,
      "rewards/cosine_scaled_reward/std": 1.4016599655151367,
      "step": 256
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1936.0,
      "completions/mean_length": 594.8928833007812,
      "completions/mean_terminated_length": 464.0778503417969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6236220472440945,
      "grad_norm": 41.750972747802734,
      "kl": 1.7802734375,
      "learning_rate": 1.0614601334114098e-07,
      "loss": -0.0287,
      "num_tokens": 202334163.0,
      "reward": -0.15131886303424835,
      "reward_std": 1.011100172996521,
      "rewards/cosine_scaled_reward/mean": -0.15131886303424835,
      "rewards/cosine_scaled_reward/std": 1.3501999378204346,
      "step": 257
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0848214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1873.0,
      "completions/mean_length": 580.3381958007812,
      "completions/mean_terminated_length": 444.31097412109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6299212598425197,
      "grad_norm": 17.3424015045166,
      "kl": 2.0615234375,
      "learning_rate": 1.0276283050267392e-07,
      "loss": -0.0038,
      "num_tokens": 202981090.0,
      "reward": -0.2078736275434494,
      "reward_std": 1.0589529275894165,
      "rewards/cosine_scaled_reward/mean": -0.20787358283996582,
      "rewards/cosine_scaled_reward/std": 1.3210111856460571,
      "step": 258
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0881696428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2025.0,
      "completions/mean_length": 594.2299194335938,
      "completions/mean_terminated_length": 453.65728759765625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6362204724409448,
      "grad_norm": 13.328221321105957,
      "kl": 1.9267578125,
      "learning_rate": 9.942825574962594e-08,
      "loss": 0.0285,
      "num_tokens": 203637856.0,
      "reward": -0.10727537423372269,
      "reward_std": 1.1523621082305908,
      "rewards/cosine_scaled_reward/mean": -0.10727538168430328,
      "rewards/cosine_scaled_reward/std": 1.3700629472732544,
      "step": 259
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0580357142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1974.0,
      "completions/mean_length": 538.950927734375,
      "completions/mean_terminated_length": 445.976318359375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6425196850393702,
      "grad_norm": 64.22296142578125,
      "kl": 1.751953125,
      "learning_rate": 9.614269711857281e-08,
      "loss": 0.0107,
      "num_tokens": 204248084.0,
      "reward": 0.05649043619632721,
      "reward_std": 1.1370935440063477,
      "rewards/cosine_scaled_reward/mean": 0.05649043247103691,
      "rewards/cosine_scaled_reward/std": 1.4317340850830078,
      "step": 260
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0725446428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 600.2154541015625,
      "completions/mean_terminated_length": 486.97113037109375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6488188976377953,
      "grad_norm": 46.42252731323242,
      "kl": 1.7373046875,
      "learning_rate": 9.290655664821296e-08,
      "loss": -0.0064,
      "num_tokens": 204908949.0,
      "reward": -0.010218242183327675,
      "reward_std": 1.023814082145691,
      "rewards/cosine_scaled_reward/mean": -0.01021824311465025,
      "rewards/cosine_scaled_reward/std": 1.4091296195983887,
      "step": 261
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1960.0,
      "completions/mean_length": 604.7745971679688,
      "completions/mean_terminated_length": 469.0867004394531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6551181102362205,
      "grad_norm": 27.619144439697266,
      "kl": 1.8291015625,
      "learning_rate": 8.972023033017168e-08,
      "loss": 0.0308,
      "num_tokens": 205583179.0,
      "reward": -0.04715301841497421,
      "reward_std": 1.1445320844650269,
      "rewards/cosine_scaled_reward/mean": -0.04715301841497421,
      "rewards/cosine_scaled_reward/std": 1.395334005355835,
      "step": 262
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0803571428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2039.0,
      "completions/mean_length": 594.4832763671875,
      "completions/mean_terminated_length": 467.4769592285156,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6614173228346458,
      "grad_norm": 12.245925903320312,
      "kl": 1.54150390625,
      "learning_rate": 8.658410806054567e-08,
      "loss": 0.0044,
      "num_tokens": 206251260.0,
      "reward": -0.12348128110170364,
      "reward_std": 1.0466835498809814,
      "rewards/cosine_scaled_reward/mean": -0.12348127365112305,
      "rewards/cosine_scaled_reward/std": 1.3621493577957153,
      "step": 263
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1947.0,
      "completions/mean_length": 594.427490234375,
      "completions/mean_terminated_length": 463.570556640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6677165354330707,
      "grad_norm": 28.225969314575195,
      "kl": 1.6474609375,
      "learning_rate": 8.34985735921932e-08,
      "loss": -0.0019,
      "num_tokens": 206937723.0,
      "reward": -0.21088972687721252,
      "reward_std": 1.039965033531189,
      "rewards/cosine_scaled_reward/mean": -0.21088969707489014,
      "rewards/cosine_scaled_reward/std": 1.3190141916275024,
      "step": 264
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0669642857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 580.4129638671875,
      "completions/mean_terminated_length": 475.0837097167969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.674015748031496,
      "grad_norm": 54.712013244628906,
      "kl": 1.8115234375,
      "learning_rate": 8.046400448777574e-08,
      "loss": 0.1046,
      "num_tokens": 207575053.0,
      "reward": 0.012515909038484097,
      "reward_std": 1.1547653675079346,
      "rewards/cosine_scaled_reward/mean": 0.012515915557742119,
      "rewards/cosine_scaled_reward/std": 1.4178693294525146,
      "step": 265
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0558035714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1959.0,
      "completions/mean_length": 575.0245971679688,
      "completions/mean_terminated_length": 487.9692687988281,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6803149606299213,
      "grad_norm": 37.593994140625,
      "kl": 1.59375,
      "learning_rate": 7.748077207355764e-08,
      "loss": 0.0369,
      "num_tokens": 208211523.0,
      "reward": 0.09279867261648178,
      "reward_std": 1.2414970397949219,
      "rewards/cosine_scaled_reward/mean": 0.09279866516590118,
      "rewards/cosine_scaled_reward/std": 1.4429327249526978,
      "step": 266
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0669642857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2000.0,
      "completions/mean_length": 584.6060791015625,
      "completions/mean_terminated_length": 479.5777282714844,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6866141732283464,
      "grad_norm": 69.95362854003906,
      "kl": 1.4931640625,
      "learning_rate": 7.45492413939689e-08,
      "loss": -0.0334,
      "num_tokens": 208858722.0,
      "reward": -0.08139531314373016,
      "reward_std": 1.0697132349014282,
      "rewards/cosine_scaled_reward/mean": -0.08139531314373016,
      "rewards/cosine_scaled_reward/std": 1.3822332620620728,
      "step": 267
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0714285714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1953.0,
      "completions/mean_length": 610.9029541015625,
      "completions/mean_terminated_length": 500.35699462890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.6929133858267718,
      "grad_norm": 156.42855834960938,
      "kl": 1.3740234375,
      "learning_rate": 7.166977116693567e-08,
      "loss": -0.1079,
      "num_tokens": 209551675.0,
      "reward": -0.2246764898300171,
      "reward_std": 0.9921932816505432,
      "rewards/cosine_scaled_reward/mean": -0.2246764600276947,
      "rewards/cosine_scaled_reward/std": 1.3119205236434937,
      "step": 268
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0613839285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2041.0,
      "completions/mean_length": 562.0892944335938,
      "completions/mean_terminated_length": 464.9131774902344,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.699212598425197,
      "grad_norm": 32.215087890625,
      "kl": 1.58251953125,
      "learning_rate": 6.884271373998607e-08,
      "loss": 0.0249,
      "num_tokens": 210199083.0,
      "reward": 0.029325606301426888,
      "reward_std": 1.042394995689392,
      "rewards/cosine_scaled_reward/mean": 0.02932562120258808,
      "rewards/cosine_scaled_reward/std": 1.4233999252319336,
      "step": 269
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.09375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 652.8694458007812,
      "completions/mean_terminated_length": 508.5455627441406,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.705511811023622,
      "grad_norm": 26.553613662719727,
      "kl": 1.3935546875,
      "learning_rate": 6.6068415047135e-08,
      "loss": -0.0025,
      "num_tokens": 210912934.0,
      "reward": -0.037757713347673416,
      "reward_std": 1.1614078283309937,
      "rewards/cosine_scaled_reward/mean": -0.037757713347673416,
      "rewards/cosine_scaled_reward/std": 1.3996750116348267,
      "step": 270
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0803571428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 569.8683471679688,
      "completions/mean_terminated_length": 440.711181640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7118110236220474,
      "grad_norm": 37.41157531738281,
      "kl": 2.03759765625,
      "learning_rate": 6.334721456655362e-08,
      "loss": -0.0195,
      "num_tokens": 211553952.0,
      "reward": -0.2244766652584076,
      "reward_std": 1.026918888092041,
      "rewards/cosine_scaled_reward/mean": -0.2244766503572464,
      "rewards/cosine_scaled_reward/std": 1.311730146408081,
      "step": 271
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0703125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1838.0,
      "completions/mean_length": 584.3080444335938,
      "completions/mean_terminated_length": 473.608642578125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7181102362204723,
      "grad_norm": 41.6332893371582,
      "kl": 1.55810546875,
      "learning_rate": 6.067944527902918e-08,
      "loss": -0.0227,
      "num_tokens": 212207844.0,
      "reward": -0.08754130452871323,
      "reward_std": 1.0253074169158936,
      "rewards/cosine_scaled_reward/mean": -0.08754128962755203,
      "rewards/cosine_scaled_reward/std": 1.3790392875671387,
      "step": 272
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1846.0,
      "completions/mean_length": 635.3147583007812,
      "completions/mean_terminated_length": 508.138671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7244094488188977,
      "grad_norm": 186.7555389404297,
      "kl": 1.55322265625,
      "learning_rate": 5.806543362721944e-08,
      "loss": -0.0904,
      "num_tokens": 212912238.0,
      "reward": -0.22805923223495483,
      "reward_std": 0.8819826245307922,
      "rewards/cosine_scaled_reward/mean": -0.22805921733379364,
      "rewards/cosine_scaled_reward/std": 1.310096025466919,
      "step": 273
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2014.0,
      "completions/mean_length": 623.6171875,
      "completions/mean_terminated_length": 489.70086669921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7307086614173228,
      "grad_norm": 25.393768310546875,
      "kl": 1.5244140625,
      "learning_rate": 5.550549947570771e-08,
      "loss": -0.0251,
      "num_tokens": 213620199.0,
      "reward": -0.12161993980407715,
      "reward_std": 1.0506173372268677,
      "rewards/cosine_scaled_reward/mean": -0.12161993980407715,
      "rewards/cosine_scaled_reward/std": 1.3646435737609863,
      "step": 274
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0758928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1970.0,
      "completions/mean_length": 586.161865234375,
      "completions/mean_terminated_length": 466.10748291015625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.737007874015748,
      "grad_norm": 40.13348388671875,
      "kl": 1.43701171875,
      "learning_rate": 5.299995607186219e-08,
      "loss": -0.117,
      "num_tokens": 214302392.0,
      "reward": -0.2332076132297516,
      "reward_std": 0.917121410369873,
      "rewards/cosine_scaled_reward/mean": -0.2332075983285904,
      "rewards/cosine_scaled_reward/std": 1.3055304288864136,
      "step": 275
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0848214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2042.0,
      "completions/mean_length": 597.0881958007812,
      "completions/mean_terminated_length": 462.6134033203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7433070866141733,
      "grad_norm": 22.70816993713379,
      "kl": 1.7373046875,
      "learning_rate": 5.0549110007505394e-08,
      "loss": 0.0543,
      "num_tokens": 214956023.0,
      "reward": -0.23842087388038635,
      "reward_std": 1.129518747329712,
      "rewards/cosine_scaled_reward/mean": -0.23842085897922516,
      "rewards/cosine_scaled_reward/std": 1.3046225309371948,
      "step": 276
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2017.0,
      "completions/mean_length": 557.9375,
      "completions/mean_terminated_length": 458.6000061035156,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7496062992125985,
      "grad_norm": 66.7606201171875,
      "kl": 1.36669921875,
      "learning_rate": 4.815326118139812e-08,
      "loss": -0.0319,
      "num_tokens": 215573407.0,
      "reward": 0.08316393196582794,
      "reward_std": 1.129285454750061,
      "rewards/cosine_scaled_reward/mean": 0.08316391706466675,
      "rewards/cosine_scaled_reward/std": 1.4398179054260254,
      "step": 277
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0848214285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1855.0,
      "completions/mean_length": 567.6116333007812,
      "completions/mean_terminated_length": 430.4048767089844,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7559055118110236,
      "grad_norm": 116.46288299560547,
      "kl": 1.400390625,
      "learning_rate": 4.581270276254195e-08,
      "loss": -0.1344,
      "num_tokens": 216237715.0,
      "reward": -0.1613207757472992,
      "reward_std": 0.9567738771438599,
      "rewards/cosine_scaled_reward/mean": -0.1613207757472992,
      "rewards/cosine_scaled_reward/std": 1.3453031778335571,
      "step": 278
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.056919642857142905,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2032.0,
      "completions/mean_length": 532.4285888671875,
      "completions/mean_terminated_length": 440.9562072753906,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.762204724409449,
      "grad_norm": 20.068740844726562,
      "kl": 1.5703125,
      "learning_rate": 4.35277211543057e-08,
      "loss": 0.009,
      "num_tokens": 216842355.0,
      "reward": 0.09989994019269943,
      "reward_std": 1.1928493976593018,
      "rewards/cosine_scaled_reward/mean": 0.09989994019269943,
      "rewards/cosine_scaled_reward/std": 1.4446313381195068,
      "step": 279
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0580357142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2013.0,
      "completions/mean_length": 536.8092041015625,
      "completions/mean_terminated_length": 443.70263671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.768503937007874,
      "grad_norm": 46.137939453125,
      "kl": 1.7451171875,
      "learning_rate": 4.129859595937946e-08,
      "loss": 0.0508,
      "num_tokens": 217447640.0,
      "reward": 0.04671396315097809,
      "reward_std": 1.1750531196594238,
      "rewards/cosine_scaled_reward/mean": 0.04671395570039749,
      "rewards/cosine_scaled_reward/std": 1.4285207986831665,
      "step": 280
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1812.0,
      "completions/mean_length": 606.0189819335938,
      "completions/mean_terminated_length": 476.2055969238281,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7748031496062993,
      "grad_norm": 23.016414642333984,
      "kl": 1.33642578125,
      "learning_rate": 3.912559994556086e-08,
      "loss": -0.0769,
      "num_tokens": 218142409.0,
      "reward": -0.19041763246059418,
      "reward_std": 1.0285288095474243,
      "rewards/cosine_scaled_reward/mean": -0.19041761755943298,
      "rewards/cosine_scaled_reward/std": 1.329443335533142,
      "step": 281
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0736607142857143,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2027.0,
      "completions/mean_length": 582.8772583007812,
      "completions/mean_terminated_length": 466.3735046386719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7811023622047244,
      "grad_norm": 38.84382629394531,
      "kl": 1.611328125,
      "learning_rate": 3.7008999012377865e-08,
      "loss": 0.0061,
      "num_tokens": 218810795.0,
      "reward": -0.09773646295070648,
      "reward_std": 1.072763204574585,
      "rewards/cosine_scaled_reward/mean": -0.09773645550012589,
      "rewards/cosine_scaled_reward/std": 1.374637484550476,
      "step": 282
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0758928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1971.0,
      "completions/mean_length": 627.9420166015625,
      "completions/mean_terminated_length": 511.31884765625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.7874015748031495,
      "grad_norm": 37.48160171508789,
      "kl": 1.054931640625,
      "learning_rate": 3.494905215855187e-08,
      "loss": -0.0542,
      "num_tokens": 219529207.0,
      "reward": -0.27194881439208984,
      "reward_std": 0.925375759601593,
      "rewards/cosine_scaled_reward/mean": -0.27194881439208984,
      "rewards/cosine_scaled_reward/std": 1.285094976425171,
      "step": 283
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0636160714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2015.0,
      "completions/mean_length": 596.8605346679688,
      "completions/mean_terminated_length": 498.27294921875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.793700787401575,
      "grad_norm": 37.41320037841797,
      "kl": 1.32421875,
      "learning_rate": 3.2946011450305065e-08,
      "loss": -0.0132,
      "num_tokens": 220194058.0,
      "reward": -0.09820376336574554,
      "reward_std": 1.0949289798736572,
      "rewards/cosine_scaled_reward/mean": -0.09820375591516495,
      "rewards/cosine_scaled_reward/std": 1.3751029968261719,
      "step": 284
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0725446428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1958.0,
      "completions/mean_length": 571.5625,
      "completions/mean_terminated_length": 456.0770263671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8,
      "grad_norm": 12.217202186584473,
      "kl": 1.6943359375,
      "learning_rate": 3.100012199051627e-08,
      "loss": -0.0625,
      "num_tokens": 220837490.0,
      "reward": -0.2304655760526657,
      "reward_std": 1.0655834674835205,
      "rewards/cosine_scaled_reward/mean": -0.2304655760526657,
      "rewards/cosine_scaled_reward/std": 1.3076075315475464,
      "step": 285
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0647321428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2037.0,
      "completions/mean_length": 559.1328125,
      "completions/mean_terminated_length": 456.0847473144531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8062992125984252,
      "grad_norm": 34.513824462890625,
      "kl": 1.451171875,
      "learning_rate": 2.9111621888728956e-08,
      "loss": 0.0098,
      "num_tokens": 221487257.0,
      "reward": -0.07044383883476257,
      "reward_std": 1.1145482063293457,
      "rewards/cosine_scaled_reward/mean": -0.07044383883476257,
      "rewards/cosine_scaled_reward/std": 1.3855737447738647,
      "step": 286
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 624.193115234375,
      "completions/mean_terminated_length": 490.3309020996094,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8125984251968505,
      "grad_norm": 24.26609992980957,
      "kl": 1.314453125,
      "learning_rate": 2.7280742232014876e-08,
      "loss": -0.0561,
      "num_tokens": 222189414.0,
      "reward": -0.15032429993152618,
      "reward_std": 1.0422052145004272,
      "rewards/cosine_scaled_reward/mean": -0.15032431483268738,
      "rewards/cosine_scaled_reward/std": 1.3495419025421143,
      "step": 287
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.1015625,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1996.0,
      "completions/mean_length": 632.7299194335938,
      "completions/mean_terminated_length": 472.74285888671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8188976377952755,
      "grad_norm": 28.193471908569336,
      "kl": 1.4814453125,
      "learning_rate": 2.5507707056696748e-08,
      "loss": -0.0471,
      "num_tokens": 222892132.0,
      "reward": -0.1942838728427887,
      "reward_std": 0.9096174836158752,
      "rewards/cosine_scaled_reward/mean": -0.1942838579416275,
      "rewards/cosine_scaled_reward/std": 1.327970266342163,
      "step": 288
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0613839285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1859.0,
      "completions/mean_length": 555.6998291015625,
      "completions/mean_terminated_length": 458.1058349609375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8251968503937008,
      "grad_norm": 45.35464859008789,
      "kl": 1.57080078125,
      "learning_rate": 2.3792733320934343e-08,
      "loss": 0.0264,
      "num_tokens": 223515623.0,
      "reward": -0.08820369839668274,
      "reward_std": 1.0817683935165405,
      "rewards/cosine_scaled_reward/mean": -0.08820368349552155,
      "rewards/cosine_scaled_reward/std": 1.3794188499450684,
      "step": 289
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.060267857142857095,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 537.7467041015625,
      "completions/mean_terminated_length": 440.8895568847656,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.831496062992126,
      "grad_norm": 25.04986572265625,
      "kl": 1.8056640625,
      "learning_rate": 2.2136030878176003e-08,
      "loss": -0.0126,
      "num_tokens": 224132324.0,
      "reward": -0.0332709364593029,
      "reward_std": 1.1177502870559692,
      "rewards/cosine_scaled_reward/mean": -0.0332709439098835,
      "rewards/cosine_scaled_reward/std": 1.4000455141067505,
      "step": 290
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0714285714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1871.0,
      "completions/mean_length": 578.9888916015625,
      "completions/mean_terminated_length": 465.9880065917969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8377952755905511,
      "grad_norm": 64.00562286376953,
      "kl": 1.52001953125,
      "learning_rate": 2.0537802451479958e-08,
      "loss": -0.0162,
      "num_tokens": 224779450.0,
      "reward": -0.1576380431652069,
      "reward_std": 1.0528969764709473,
      "rewards/cosine_scaled_reward/mean": -0.1576380431652069,
      "rewards/cosine_scaled_reward/std": 1.3467310667037964,
      "step": 291
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0859375,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2030.0,
      "completions/mean_length": 604.2678833007812,
      "completions/mean_terminated_length": 468.5323791503906,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8440944881889765,
      "grad_norm": 14.133583068847656,
      "kl": 1.51904296875,
      "learning_rate": 1.8998243608708108e-08,
      "loss": -0.0226,
      "num_tokens": 225448730.0,
      "reward": -0.1075335368514061,
      "reward_std": 1.0885225534439087,
      "rewards/cosine_scaled_reward/mean": -0.1075335294008255,
      "rewards/cosine_scaled_reward/std": 1.3702043294906616,
      "step": 292
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0613839285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2004.0,
      "completions/mean_length": 544.9464721679688,
      "completions/mean_terminated_length": 446.64923095703125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8503937007874016,
      "grad_norm": 31.37044906616211,
      "kl": 1.7294921875,
      "learning_rate": 1.751754273859507e-08,
      "loss": -0.017,
      "num_tokens": 226069018.0,
      "reward": -0.012751868925988674,
      "reward_std": 1.0909769535064697,
      "rewards/cosine_scaled_reward/mean": -0.012751864269375801,
      "rewards/cosine_scaled_reward/std": 1.4074139595031738,
      "step": 293
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0837053571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2028.0,
      "completions/mean_length": 595.966552734375,
      "completions/mean_terminated_length": 463.32037353515625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8566929133858268,
      "grad_norm": 96.38923645019531,
      "kl": 1.5947265625,
      "learning_rate": 1.6095881027696213e-08,
      "loss": -0.0047,
      "num_tokens": 226750364.0,
      "reward": -0.007183407433331013,
      "reward_std": 1.0385618209838867,
      "rewards/cosine_scaled_reward/mean": -0.007183406967669725,
      "rewards/cosine_scaled_reward/std": 1.4105796813964844,
      "step": 294
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0691964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1965.0,
      "completions/mean_length": 592.6160888671875,
      "completions/mean_terminated_length": 484.4220886230469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8629921259842521,
      "grad_norm": 44.77436065673828,
      "kl": 1.37353515625,
      "learning_rate": 1.4733432438216397e-08,
      "loss": 0.0028,
      "num_tokens": 227422516.0,
      "reward": -0.11443806439638138,
      "reward_std": 1.1223397254943848,
      "rewards/cosine_scaled_reward/mean": -0.11443805694580078,
      "rewards/cosine_scaled_reward/std": 1.3673371076583862,
      "step": 295
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0926339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2029.0,
      "completions/mean_length": 639.3359375,
      "completions/mean_terminated_length": 495.52398681640625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.869291338582677,
      "grad_norm": 31.86522674560547,
      "kl": 1.75634765625,
      "learning_rate": 1.3430363686723234e-08,
      "loss": 0.0412,
      "num_tokens": 228121073.0,
      "reward": -0.15190958976745605,
      "reward_std": 1.0342063903808594,
      "rewards/cosine_scaled_reward/mean": -0.15190958976745605,
      "rewards/cosine_scaled_reward/std": 1.3505640029907227,
      "step": 296
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0837053571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1931.0,
      "completions/mean_length": 612.8170166015625,
      "completions/mean_terminated_length": 481.71014404296875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8755905511811024,
      "grad_norm": 42.08808517456055,
      "kl": 1.8037109375,
      "learning_rate": 1.2186834223746612e-08,
      "loss": 0.0692,
      "num_tokens": 228801581.0,
      "reward": -0.2318280190229416,
      "reward_std": 1.0947842597961426,
      "rewards/cosine_scaled_reward/mean": -0.2318280190229416,
      "rewards/cosine_scaled_reward/std": 1.3084616661071777,
      "step": 297
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0758928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2022.0,
      "completions/mean_length": 599.0045166015625,
      "completions/mean_terminated_length": 480.00482177734375,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8818897637795275,
      "grad_norm": 44.561763763427734,
      "kl": 2.0576171875,
      "learning_rate": 1.100299621426759e-08,
      "loss": 0.0587,
      "num_tokens": 229459969.0,
      "reward": -0.168549582362175,
      "reward_std": 1.0784419775009155,
      "rewards/cosine_scaled_reward/mean": -0.1685495525598526,
      "rewards/cosine_scaled_reward/std": 1.3423055410385132,
      "step": 298
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0747767857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2012.0,
      "completions/mean_length": 599.2801513671875,
      "completions/mean_terminated_length": 482.1942138671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.8881889763779527,
      "grad_norm": 23.00943946838379,
      "kl": 1.57763671875,
      "learning_rate": 9.878994519098572e-09,
      "loss": -0.0356,
      "num_tokens": 230117756.0,
      "reward": -0.07141243666410446,
      "reward_std": 1.0951851606369019,
      "rewards/cosine_scaled_reward/mean": -0.07141242921352386,
      "rewards/cosine_scaled_reward/std": 1.3864585161209106,
      "step": 299
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0691964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1846.0,
      "completions/mean_length": 583.7154541015625,
      "completions/mean_terminated_length": 474.8597106933594,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.894488188976378,
      "grad_norm": 21.019100189208984,
      "kl": 1.55859375,
      "learning_rate": 8.814966677157365e-09,
      "loss": 0.029,
      "num_tokens": 230794525.0,
      "reward": -0.10454464703798294,
      "reward_std": 1.1253935098648071,
      "rewards/cosine_scaled_reward/mean": -0.10454463958740234,
      "rewards/cosine_scaled_reward/std": 1.3719398975372314,
      "step": 300
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0691964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2043.0,
      "completions/mean_length": 568.1752319335938,
      "completions/mean_terminated_length": 458.1642761230469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9007874015748032,
      "grad_norm": 35.79774475097656,
      "kl": 1.35546875,
      "learning_rate": 7.811042888637209e-09,
      "loss": -0.0656,
      "num_tokens": 231431818.0,
      "reward": -0.0032371284905821085,
      "reward_std": 1.1527271270751953,
      "rewards/cosine_scaled_reward/mean": -0.003237124066799879,
      "rewards/cosine_scaled_reward/std": 1.4113599061965942,
      "step": 301
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.078125,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 569.0725708007812,
      "completions/mean_terminated_length": 443.7397155761719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9070866141732283,
      "grad_norm": 19.340476989746094,
      "kl": 1.546875,
      "learning_rate": 6.867345999074736e-09,
      "loss": 0.0095,
      "num_tokens": 232076667.0,
      "reward": -0.17012757062911987,
      "reward_std": 1.0372531414031982,
      "rewards/cosine_scaled_reward/mean": -0.17012754082679749,
      "rewards/cosine_scaled_reward/std": 1.3394016027450562,
      "step": 302
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0714285714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1933.0,
      "completions/mean_length": 592.1194458007812,
      "completions/mean_terminated_length": 480.1286315917969,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9133858267716537,
      "grad_norm": 32.1158332824707,
      "kl": 1.28515625,
      "learning_rate": 5.983991484317996e-09,
      "loss": -0.045,
      "num_tokens": 232756214.0,
      "reward": -0.13372990489006042,
      "reward_std": 1.0666496753692627,
      "rewards/cosine_scaled_reward/mean": -0.13372988998889923,
      "rewards/cosine_scaled_reward/std": 1.3574635982513428,
      "step": 303
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0770089285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1954.0,
      "completions/mean_length": 599.7467041015625,
      "completions/mean_terminated_length": 478.9129638671875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9196850393700786,
      "grad_norm": 24.803268432617188,
      "kl": 1.7548828125,
      "learning_rate": 5.161087436396095e-09,
      "loss": 0.0698,
      "num_tokens": 233415603.0,
      "reward": -0.09106288850307465,
      "reward_std": 1.1765754222869873,
      "rewards/cosine_scaled_reward/mean": -0.09106288105249405,
      "rewards/cosine_scaled_reward/std": 1.377711534500122,
      "step": 304
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0837053571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2048.0,
      "completions/mean_length": 616.6217041015625,
      "completions/mean_terminated_length": 485.8623962402344,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.925984251968504,
      "grad_norm": 54.1906852722168,
      "kl": 1.47607421875,
      "learning_rate": 4.398734550292715e-09,
      "loss": 0.0022,
      "num_tokens": 234111856.0,
      "reward": -0.19376374781131744,
      "reward_std": 1.0452275276184082,
      "rewards/cosine_scaled_reward/mean": -0.19376373291015625,
      "rewards/cosine_scaled_reward/std": 1.327578067779541,
      "step": 305
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0758928571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1728.0,
      "completions/mean_length": 543.5011596679688,
      "completions/mean_terminated_length": 419.9432373046875,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9322834645669291,
      "grad_norm": 17.24119758605957,
      "kl": 1.52978515625,
      "learning_rate": 3.697026111624091e-09,
      "loss": 0.0133,
      "num_tokens": 234738641.0,
      "reward": -0.1634216010570526,
      "reward_std": 1.0093762874603271,
      "rewards/cosine_scaled_reward/mean": -0.16342158615589142,
      "rewards/cosine_scaled_reward/std": 1.3429415225982666,
      "step": 306
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0691964285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1922.0,
      "completions/mean_length": 587.6752319335938,
      "completions/mean_terminated_length": 479.1139221191406,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9385826771653543,
      "grad_norm": 21.988357543945312,
      "kl": 1.7470703125,
      "learning_rate": 3.05604798522463e-09,
      "loss": 0.0234,
      "num_tokens": 235389614.0,
      "reward": -0.09133020788431168,
      "reward_std": 1.1265548467636108,
      "rewards/cosine_scaled_reward/mean": -0.09133020788431168,
      "rewards/cosine_scaled_reward/std": 1.377872109413147,
      "step": 307
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0959821428571429,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1987.0,
      "completions/mean_length": 597.7980346679688,
      "completions/mean_terminated_length": 443.8259582519531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9448818897637796,
      "grad_norm": 16.0437068939209,
      "kl": 1.390625,
      "learning_rate": 2.4758786046395476e-09,
      "loss": -0.0116,
      "num_tokens": 236085449.0,
      "reward": -0.11006226390600204,
      "reward_std": 1.004935622215271,
      "rewards/cosine_scaled_reward/mean": -0.11006225645542145,
      "rewards/cosine_scaled_reward/std": 1.3682260513305664,
      "step": 308
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0915178571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1992.0,
      "completions/mean_length": 613.5636596679688,
      "completions/mean_terminated_length": 469.0626525878906,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9511811023622048,
      "grad_norm": 20.78128433227539,
      "kl": 1.42236328125,
      "learning_rate": 1.9565889625275944e-09,
      "loss": 0.0618,
      "num_tokens": 236781810.0,
      "reward": 0.04371757060289383,
      "reward_std": 1.1667969226837158,
      "rewards/cosine_scaled_reward/mean": 0.043717559427022934,
      "rewards/cosine_scaled_reward/std": 1.4271618127822876,
      "step": 309
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0926339285714286,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1875.0,
      "completions/mean_length": 603.5390625,
      "completions/mean_terminated_length": 456.07257080078125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.95748031496063,
      "grad_norm": 32.46472930908203,
      "kl": 1.677734375,
      "learning_rate": 1.4982426019738426e-09,
      "loss": -0.0579,
      "num_tokens": 237464693.0,
      "reward": -0.2270333468914032,
      "reward_std": 1.0241820812225342,
      "rewards/cosine_scaled_reward/mean": -0.2270333468914032,
      "rewards/cosine_scaled_reward/std": 1.3094993829727173,
      "step": 310
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0669642857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1874.0,
      "completions/mean_length": 570.4185791015625,
      "completions/mean_terminated_length": 464.3719787597656,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9637795275590553,
      "grad_norm": 31.72895050048828,
      "kl": 1.27587890625,
      "learning_rate": 1.1008956087144582e-09,
      "loss": 0.0253,
      "num_tokens": 238104108.0,
      "reward": 0.02927999757230282,
      "reward_std": 1.155513048171997,
      "rewards/cosine_scaled_reward/mean": 0.02927999384701252,
      "rewards/cosine_scaled_reward/std": 1.4235868453979492,
      "step": 311
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0825892857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1949.0,
      "completions/mean_length": 588.1261596679688,
      "completions/mean_terminated_length": 456.7019348144531,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9700787401574802,
      "grad_norm": 49.65065383911133,
      "kl": 1.6025390625,
      "learning_rate": 7.645966042734153e-10,
      "loss": 0.0015,
      "num_tokens": 238755581.0,
      "reward": -0.10733804106712341,
      "reward_std": 1.0519185066223145,
      "rewards/cosine_scaled_reward/mean": -0.10733802616596222,
      "rewards/cosine_scaled_reward/std": 1.3700921535491943,
      "step": 312
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0870535714285714,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1935.0,
      "completions/mean_length": 603.6796875,
      "completions/mean_terminated_length": 465.9572448730469,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9763779527559056,
      "grad_norm": 27.285886764526367,
      "kl": 1.5439453125,
      "learning_rate": 4.893867400131979e-10,
      "loss": 0.0261,
      "num_tokens": 239423902.0,
      "reward": -0.10080787539482117,
      "reward_std": 1.1222314834594727,
      "rewards/cosine_scaled_reward/mean": -0.10080786794424057,
      "rewards/cosine_scaled_reward/std": 1.3731297254562378,
      "step": 313
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0747767857142857,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1831.0,
      "completions/mean_length": 561.9408569335938,
      "completions/mean_terminated_length": 441.837158203125,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9826771653543307,
      "grad_norm": 36.39866256713867,
      "kl": 1.474609375,
      "learning_rate": 2.7529969209910686e-10,
      "loss": -0.0051,
      "num_tokens": 240067561.0,
      "reward": -0.1611294150352478,
      "reward_std": 0.9675538539886475,
      "rewards/cosine_scaled_reward/mean": -0.1611294001340866,
      "rewards/cosine_scaled_reward/std": 1.3451868295669556,
      "step": 314
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.0680803571428571,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 1883.0,
      "completions/mean_length": 570.872802734375,
      "completions/mean_terminated_length": 462.962890625,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9889763779527558,
      "grad_norm": 75.8726577758789,
      "kl": 1.7724609375,
      "learning_rate": 1.2236165737850024e-10,
      "loss": 0.0088,
      "num_tokens": 240702599.0,
      "reward": -0.14775627851486206,
      "reward_std": 1.0878593921661377,
      "rewards/cosine_scaled_reward/mean": -0.14775624871253967,
      "rewards/cosine_scaled_reward/std": 1.3513950109481812,
      "step": 315
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.08859223300970875,
      "completions/max_length": 2048.0,
      "completions/max_terminated_length": 2008.0,
      "completions/mean_length": 603.0764770507812,
      "completions/mean_terminated_length": 462.6244812011719,
      "completions/min_length": 2.0,
      "completions/min_terminated_length": 2.0,
      "epoch": 1.9952755905511812,
      "grad_norm": 29.035417556762695,
      "kl": 1.70947265625,
      "learning_rate": 3.059135017535741e-11,
      "loss": 0.0663,
      "num_tokens": 241380215.0,
      "reward": -0.12112902104854584,
      "reward_std": 1.117789387702942,
      "rewards/cosine_scaled_reward/mean": -0.12112902104854584,
      "rewards/cosine_scaled_reward/std": 1.364324688911438,
      "step": 316
    },
    {
      "epoch": 1.9952755905511812,
      "step": 316,
      "total_flos": 0.0,
      "train_loss": 0.0234055612822943,
      "train_runtime": 34749.8496,
      "train_samples_per_second": 0.512,
      "train_steps_per_second": 0.009
    }
  ],
  "logging_steps": 1,
  "max_steps": 316,
  "num_input_tokens_seen": 241380215,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}