Model save

b15c6b3 verified 3 months ago

95.8 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.029870808752146965,
	"eval_steps": 500,
	"global_step": 100,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2946428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2953.0,
	"completions/mean_length": 1557.727783203125,
	"completions/mean_terminated_length": 925.18359375,
	"completions/min_length": 46.0,
	"completions/min_terminated_length": 46.0,
	"epoch": 0.00029870808752146963,
	"grad_norm": 0.08854348212480545,
	"kl": 0.0002200603485107422,
	"learning_rate": 0.0,
	"loss": 0.1118,
	"num_tokens": 384987.0,
	"reward": 0.1071428656578064,
	"reward_std": 0.15226024389266968,
	"rewards/accuracy_reward/mean": 0.1071428582072258,
	"rewards/accuracy_reward/std": 0.30998751521110535,
	"step": 1
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3839285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2925.0,
	"completions/mean_length": 1786.2501220703125,
	"completions/mean_terminated_length": 984.9855346679688,
	"completions/min_length": 57.0,
	"completions/min_terminated_length": 57.0,
	"epoch": 0.0005974161750429393,
	"grad_norm": 0.0577642060816288,
	"kl": 0.00018739700317382812,
	"learning_rate": 1e-07,
	"loss": 0.026,
	"num_tokens": 819379.0,
	"reward": 0.0535714328289032,
	"reward_std": 0.056364625692367554,
	"rewards/accuracy_reward/mean": 0.0535714291036129,
	"rewards/accuracy_reward/std": 0.2256743162870407,
	"step": 2
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3705357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3003.0,
	"completions/mean_length": 1748.2188720703125,
	"completions/mean_terminated_length": 968.9716186523438,
	"completions/min_length": 36.0,
	"completions/min_terminated_length": 36.0,
	"epoch": 0.0008961242625644089,
	"grad_norm": 0.0502021387219429,
	"kl": 0.0002162456512451172,
	"learning_rate": 2e-07,
	"loss": 0.0674,
	"num_tokens": 1249588.0,
	"reward": 0.0714285746216774,
	"reward_std": 0.10370119661092758,
	"rewards/accuracy_reward/mean": 0.0714285746216774,
	"rewards/accuracy_reward/std": 0.2581161558628082,
	"step": 3
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2633928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2901.0,
	"completions/mean_length": 1507.15185546875,
	"completions/mean_terminated_length": 947.5999755859375,
	"completions/min_length": 29.0,
	"completions/min_terminated_length": 29.0,
	"epoch": 0.0011948323500858785,
	"grad_norm": 0.05528466776013374,
	"kl": 0.0002148151397705078,
	"learning_rate": 3e-07,
	"loss": 0.0171,
	"num_tokens": 1627094.0,
	"reward": 0.0491071455180645,
	"reward_std": 0.07936029881238937,
	"rewards/accuracy_reward/mean": 0.0491071417927742,
	"rewards/accuracy_reward/std": 0.21657568216323853,
	"step": 4
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.34375,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3027.0,
	"completions/mean_length": 1706.196533203125,
	"completions/mean_terminated_length": 990.7755126953125,
	"completions/min_length": 30.0,
	"completions/min_terminated_length": 30.0,
	"epoch": 0.0014935404376073482,
	"grad_norm": 0.05157456174492836,
	"kl": 0.00016391277313232422,
	"learning_rate": 4e-07,
	"loss": 0.0591,
	"num_tokens": 2044514.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.11693429946899414,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 5
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3125,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3036.0,
	"completions/mean_length": 1646.27685546875,
	"completions/mean_terminated_length": 998.2207641601562,
	"completions/min_length": 65.0,
	"completions/min_terminated_length": 65.0,
	"epoch": 0.0017922485251288178,
	"grad_norm": 0.12111014127731323,
	"kl": 0.00024390220642089844,
	"learning_rate": 5e-07,
	"loss": 0.1069,
	"num_tokens": 2451360.0,
	"reward": 0.125,
	"reward_std": 0.14548751711845398,
	"rewards/accuracy_reward/mean": 0.12962962687015533,
	"rewards/accuracy_reward/std": 0.336675763130188,
	"step": 6
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2857142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2999.0,
	"completions/mean_length": 1562.5938720703125,
	"completions/mean_terminated_length": 958.8312377929688,
	"completions/min_length": 8.0,
	"completions/min_terminated_length": 8.0,
	"epoch": 0.0020909566126502874,
	"grad_norm": 0.05596686899662018,
	"kl": 0.00021219253540039062,
	"learning_rate": 6e-07,
	"loss": 0.0483,
	"num_tokens": 2843053.0,
	"reward": 0.066964291036129,
	"reward_std": 0.11663484573364258,
	"rewards/accuracy_reward/mean": 0.0669642835855484,
	"rewards/accuracy_reward/std": 0.2505199611186981,
	"step": 7
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3392857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3026.0,
	"completions/mean_length": 1740.3126220703125,
	"completions/mean_terminated_length": 1056.4730224609375,
	"completions/min_length": 46.0,
	"completions/min_terminated_length": 46.0,
	"epoch": 0.002389664700171757,
	"grad_norm": 0.07892504334449768,
	"kl": 0.00020313262939453125,
	"learning_rate": 7e-07,
	"loss": 0.013,
	"num_tokens": 3267483.0,
	"reward": 0.0580357164144516,
	"reward_std": 0.07350330799818039,
	"rewards/accuracy_reward/mean": 0.06018518656492233,
	"rewards/accuracy_reward/std": 0.23838205635547638,
	"step": 8
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2723214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2990.0,
	"completions/mean_length": 1496.6295166015625,
	"completions/mean_terminated_length": 907.0736083984375,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"epoch": 0.0026883727876932267,
	"grad_norm": 0.07782948017120361,
	"kl": 0.0001881122589111328,
	"learning_rate": 8e-07,
	"loss": 0.0718,
	"num_tokens": 3639680.0,
	"reward": 0.1116071492433548,
	"reward_std": 0.14939311146736145,
	"rewards/accuracy_reward/mean": 0.1116071417927742,
	"rewards/accuracy_reward/std": 0.31558772921562195,
	"step": 9
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3348214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3015.0,
	"completions/mean_length": 1665.102783203125,
	"completions/mean_terminated_length": 956.932861328125,
	"completions/min_length": 29.0,
	"completions/min_terminated_length": 29.0,
	"epoch": 0.0029870808752146963,
	"grad_norm": 0.0631859079003334,
	"kl": 0.00020241737365722656,
	"learning_rate": 9e-07,
	"loss": 0.0873,
	"num_tokens": 4046735.0,
	"reward": 0.0937500074505806,
	"reward_std": 0.14353612065315247,
	"rewards/accuracy_reward/mean": 0.09375,
	"rewards/accuracy_reward/std": 0.2921334207057953,
	"step": 10
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3571428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2965.0,
	"completions/mean_length": 1728.6785888671875,
	"completions/mean_terminated_length": 982.388916015625,
	"completions/min_length": 34.0,
	"completions/min_terminated_length": 34.0,
	"epoch": 0.003285788962736166,
	"grad_norm": 0.053209614008665085,
	"kl": 0.00017750263214111328,
	"learning_rate": 1e-06,
	"loss": 0.0142,
	"num_tokens": 4469191.0,
	"reward": 0.0401785746216774,
	"reward_std": 0.07350330799818039,
	"rewards/accuracy_reward/mean": 0.0401785708963871,
	"rewards/accuracy_reward/std": 0.19681765139102936,
	"step": 11
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3392857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3024.0,
	"completions/mean_length": 1664.825927734375,
	"completions/mean_terminated_length": 942.2230224609375,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.0035844970502576356,
	"grad_norm": 0.04564949870109558,
	"kl": 0.00018143653869628906,
	"learning_rate": 9.996954135095478e-07,
	"loss": 0.0166,
	"num_tokens": 4875240.0,
	"reward": 0.0357142873108387,
	"reward_std": 0.07289712876081467,
	"rewards/accuracy_reward/mean": 0.0357142873108387,
	"rewards/accuracy_reward/std": 0.18599249422550201,
	"step": 12
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2723214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3041.0,
	"completions/mean_length": 1547.21435546875,
	"completions/mean_terminated_length": 976.5889282226562,
	"completions/min_length": 9.0,
	"completions/min_terminated_length": 9.0,
	"epoch": 0.003883205137779105,
	"grad_norm": 0.04013432562351227,
	"kl": 0.00025653839111328125,
	"learning_rate": 9.98782025129912e-07,
	"loss": 0.0329,
	"num_tokens": 5256416.0,
	"reward": 0.0848214328289032,
	"reward_std": 0.07936029881238937,
	"rewards/accuracy_reward/mean": 0.0848214253783226,
	"rewards/accuracy_reward/std": 0.2792397737503052,
	"step": 13
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3883928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2960.0,
	"completions/mean_length": 1803.071533203125,
	"completions/mean_terminated_length": 997.2554931640625,
	"completions/min_length": 27.0,
	"completions/min_terminated_length": 27.0,
	"epoch": 0.004181913225300575,
	"grad_norm": 85.07632446289062,
	"kl": 0.2520885467529297,
	"learning_rate": 9.972609476841365e-07,
	"loss": 0.0295,
	"num_tokens": 5695616.0,
	"reward": 0.0446428582072258,
	"reward_std": 0.0417863167822361,
	"rewards/accuracy_reward/mean": 0.0446428582072258,
	"rewards/accuracy_reward/std": 0.20698098838329315,
	"step": 14
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3348214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3021.0,
	"completions/mean_length": 1632.0223388671875,
	"completions/mean_terminated_length": 907.2013549804688,
	"completions/min_length": 90.0,
	"completions/min_terminated_length": 90.0,
	"epoch": 0.004480621312822045,
	"grad_norm": 0.0776248499751091,
	"kl": 0.0002193450927734375,
	"learning_rate": 9.95134034370785e-07,
	"loss": 0.0752,
	"num_tokens": 6096357.0,
	"reward": 0.1205357164144516,
	"reward_std": 0.14683552086353302,
	"rewards/accuracy_reward/mean": 0.1205357164144516,
	"rewards/accuracy_reward/std": 0.32631614804267883,
	"step": 15
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2857142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3062.0,
	"completions/mean_length": 1543.3482666015625,
	"completions/mean_terminated_length": 931.8875122070312,
	"completions/min_length": 32.0,
	"completions/min_terminated_length": 32.0,
	"epoch": 0.004779329400343514,
	"grad_norm": 0.05424968898296356,
	"kl": 0.00020742416381835938,
	"learning_rate": 9.92403876506104e-07,
	"loss": 0.0186,
	"num_tokens": 6478547.0,
	"reward": 0.0491071455180645,
	"reward_std": 0.07936030626296997,
	"rewards/accuracy_reward/mean": 0.0491071417927742,
	"rewards/accuracy_reward/std": 0.21657569706439972,
	"step": 16
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3035714285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2991.0,
	"completions/mean_length": 1667.509033203125,
	"completions/mean_terminated_length": 1055.294921875,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.005078037487864984,
	"grad_norm": 0.07909969985485077,
	"kl": 0.00016796588897705078,
	"learning_rate": 9.890738003669027e-07,
	"loss": 0.1047,
	"num_tokens": 6889301.0,
	"reward": 0.15625,
	"reward_std": 0.15616022050380707,
	"rewards/accuracy_reward/mean": 0.15625,
	"rewards/accuracy_reward/std": 0.3639053702354431,
	"step": 17
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3169642857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3032.0,
	"completions/mean_length": 1650.3751220703125,
	"completions/mean_terminated_length": 990.6666870117188,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"epoch": 0.005376745575386453,
	"grad_norm": 0.09321057796478271,
	"kl": 0.00021791458129882812,
	"learning_rate": 9.851478631379982e-07,
	"loss": 0.1088,
	"num_tokens": 7294537.0,
	"reward": 0.1607142984867096,
	"reward_std": 0.1844095140695572,
	"rewards/accuracy_reward/mean": 0.1607142835855484,
	"rewards/accuracy_reward/std": 0.3680897653102875,
	"step": 18
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4776785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3022.0,
	"completions/mean_length": 2020.1876220703125,
	"completions/mean_terminated_length": 1058.2735595703125,
	"completions/min_length": 62.0,
	"completions/min_terminated_length": 62.0,
	"epoch": 0.0056754536629079234,
	"grad_norm": 0.06517499685287476,
	"kl": 0.0001583099365234375,
	"learning_rate": 9.806308479691594e-07,
	"loss": 0.0896,
	"num_tokens": 7782355.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.122494637966156,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 19
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2321428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3035.0,
	"completions/mean_length": 1522.8973388671875,
	"completions/mean_terminated_length": 1054.56396484375,
	"completions/min_length": 7.0,
	"completions/min_terminated_length": 7.0,
	"epoch": 0.005974161750429393,
	"grad_norm": 0.16769619286060333,
	"kl": 0.0007884502410888672,
	"learning_rate": 9.755282581475767e-07,
	"loss": 0.064,
	"num_tokens": 8161596.0,
	"reward": 0.1473214328289032,
	"reward_std": 0.163971409201622,
	"rewards/accuracy_reward/mean": 0.1473214328289032,
	"rewards/accuracy_reward/std": 0.35521984100341797,
	"step": 20
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3392857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3044.0,
	"completions/mean_length": 1634.37060546875,
	"completions/mean_terminated_length": 896.12841796875,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.006272869837950863,
	"grad_norm": 0.09035351872444153,
	"kl": 0.00022339820861816406,
	"learning_rate": 9.698463103929541e-07,
	"loss": 0.0312,
	"num_tokens": 8564495.0,
	"reward": 0.0535714328289032,
	"reward_std": 0.05050762742757797,
	"rewards/accuracy_reward/mean": 0.0535714291036129,
	"rewards/accuracy_reward/std": 0.2256743162870407,
	"step": 21
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3080357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3040.0,
	"completions/mean_length": 1619.3660888671875,
	"completions/mean_terminated_length": 972.7096557617188,
	"completions/min_length": 19.0,
	"completions/min_terminated_length": 19.0,
	"epoch": 0.006571577925472332,
	"grad_norm": 0.044720232486724854,
	"kl": 0.00024008750915527344,
	"learning_rate": 9.635919272833937e-07,
	"loss": 0.0381,
	"num_tokens": 8963833.0,
	"reward": 0.0401785746216774,
	"reward_std": 0.08222462981939316,
	"rewards/accuracy_reward/mean": 0.0401785708963871,
	"rewards/accuracy_reward/std": 0.19681765139102936,
	"step": 22
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3348214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2990.0,
	"completions/mean_length": 1700.1473388671875,
	"completions/mean_terminated_length": 1009.617431640625,
	"completions/min_length": 16.0,
	"completions/min_terminated_length": 16.0,
	"epoch": 0.006870286012993802,
	"grad_norm": 0.03696313127875328,
	"kl": 0.0001952648162841797,
	"learning_rate": 9.567727288213004e-07,
	"loss": 0.0385,
	"num_tokens": 9378978.0,
	"reward": 0.0223214291036129,
	"reward_std": 0.06313453614711761,
	"rewards/accuracy_reward/mean": 0.0223214291036129,
	"rewards/accuracy_reward/std": 0.14805756509304047,
	"step": 23
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2946428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3046.0,
	"completions/mean_length": 1579.0001220703125,
	"completions/mean_terminated_length": 955.341796875,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.007168994100515271,
	"grad_norm": 0.04915790259838104,
	"kl": 0.0002124309539794922,
	"learning_rate": 9.493970231495834e-07,
	"loss": 0.0689,
	"num_tokens": 9767946.0,
	"reward": 0.0937500074505806,
	"reward_std": 0.08747823536396027,
	"rewards/accuracy_reward/mean": 0.09375,
	"rewards/accuracy_reward/std": 0.2921334207057953,
	"step": 24
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3258928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2958.0,
	"completions/mean_length": 1655.321533203125,
	"completions/mean_terminated_length": 970.4370727539062,
	"completions/min_length": 69.0,
	"completions/min_terminated_length": 69.0,
	"epoch": 0.007467702188036741,
	"grad_norm": 0.08162926882505417,
	"kl": 0.00021219253540039062,
	"learning_rate": 9.414737964294634e-07,
	"loss": 0.106,
	"num_tokens": 10176490.0,
	"reward": 0.165178582072258,
	"reward_std": 0.2200292944908142,
	"rewards/accuracy_reward/mean": 0.1651785671710968,
	"rewards/accuracy_reward/std": 0.37217333912849426,
	"step": 25
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3571428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3072.0,
	"completions/mean_length": 1757.1474609375,
	"completions/mean_terminated_length": 1026.673583984375,
	"completions/min_length": 37.0,
	"completions/min_terminated_length": 37.0,
	"epoch": 0.00776641027555821,
	"grad_norm": 0.05597545579075813,
	"kl": 0.00016999244689941406,
	"learning_rate": 9.330127018922193e-07,
	"loss": 0.0569,
	"num_tokens": 10604371.0,
	"reward": 0.1205357164144516,
	"reward_std": 0.1474389135837555,
	"rewards/accuracy_reward/mean": 0.1205357164144516,
	"rewards/accuracy_reward/std": 0.32631614804267883,
	"step": 26
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3080357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2834.0,
	"completions/mean_length": 1574.700927734375,
	"completions/mean_terminated_length": 908.1612548828125,
	"completions/min_length": 15.0,
	"completions/min_terminated_length": 15.0,
	"epoch": 0.00806511836307968,
	"grad_norm": 0.09308373928070068,
	"kl": 0.00026679039001464844,
	"learning_rate": 9.240240480782129e-07,
	"loss": -0.0172,
	"num_tokens": 10994768.0,
	"reward": 0.0714285746216774,
	"reward_std": 0.08161844313144684,
	"rewards/accuracy_reward/mean": 0.0714285746216774,
	"rewards/accuracy_reward/std": 0.2581161558628082,
	"step": 27
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2991071428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2953.0,
	"completions/mean_length": 1551.4241943359375,
	"completions/mean_terminated_length": 902.5159301757812,
	"completions/min_length": 13.0,
	"completions/min_terminated_length": 13.0,
	"epoch": 0.00836382645060115,
	"grad_norm": 0.03204449638724327,
	"kl": 0.00022554397583007812,
	"learning_rate": 9.145187862775208e-07,
	"loss": 0.0183,
	"num_tokens": 11379607.0,
	"reward": 0.0223214291036129,
	"reward_std": 0.04569191485643387,
	"rewards/accuracy_reward/mean": 0.0223214291036129,
	"rewards/accuracy_reward/std": 0.14805756509304047,
	"step": 28
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3035714285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3051.0,
	"completions/mean_length": 1574.294677734375,
	"completions/mean_terminated_length": 921.44873046875,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.008662534538122619,
	"grad_norm": 0.14834517240524292,
	"kl": 0.0002460479736328125,
	"learning_rate": 9.045084971874737e-07,
	"loss": 0.0947,
	"num_tokens": 11767841.0,
	"reward": 0.1160714328289032,
	"reward_std": 0.139630526304245,
	"rewards/accuracy_reward/mean": 0.1160714253783226,
	"rewards/accuracy_reward/std": 0.321027934551239,
	"step": 29
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3348214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2951.0,
	"completions/mean_length": 1691.6563720703125,
	"completions/mean_terminated_length": 996.8523559570312,
	"completions/min_length": 55.0,
	"completions/min_terminated_length": 55.0,
	"epoch": 0.00896124262564409,
	"grad_norm": 0.06935401260852814,
	"kl": 0.00021028518676757812,
	"learning_rate": 8.940053768033608e-07,
	"loss": 0.077,
	"num_tokens": 12182348.0,
	"reward": 0.1428571492433548,
	"reward_std": 0.17555983364582062,
	"rewards/accuracy_reward/mean": 0.14814814925193787,
	"rewards/accuracy_reward/std": 0.35607197880744934,
	"step": 30
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2901785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3022.0,
	"completions/mean_length": 1568.509033203125,
	"completions/mean_terminated_length": 953.8742065429688,
	"completions/min_length": 14.0,
	"completions/min_terminated_length": 14.0,
	"epoch": 0.009259950713165559,
	"grad_norm": 0.06759945303201675,
	"kl": 0.0002391338348388672,
	"learning_rate": 8.83022221559489e-07,
	"loss": 0.0138,
	"num_tokens": 12567686.0,
	"reward": 0.1116071492433548,
	"reward_std": 0.07350331544876099,
	"rewards/accuracy_reward/mean": 0.1116071417927742,
	"rewards/accuracy_reward/std": 0.31558772921562195,
	"step": 31
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2723214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2847.0,
	"completions/mean_length": 1535.107177734375,
	"completions/mean_terminated_length": 959.950927734375,
	"completions/min_length": 7.0,
	"completions/min_terminated_length": 7.0,
	"epoch": 0.009558658800687028,
	"grad_norm": 0.07123875617980957,
	"kl": 0.00024700164794921875,
	"learning_rate": 8.71572412738697e-07,
	"loss": 0.0526,
	"num_tokens": 12953174.0,
	"reward": 0.0892857164144516,
	"reward_std": 0.11394162476062775,
	"rewards/accuracy_reward/mean": 0.09259258955717087,
	"rewards/accuracy_reward/std": 0.29053398966789246,
	"step": 32
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3035714285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3050.0,
	"completions/mean_length": 1552.263427734375,
	"completions/mean_terminated_length": 889.8140869140625,
	"completions/min_length": 7.0,
	"completions/min_terminated_length": 7.0,
	"epoch": 0.009857366888208497,
	"grad_norm": 0.05994417518377304,
	"kl": 0.00028204917907714844,
	"learning_rate": 8.596699001693255e-07,
	"loss": 0.0441,
	"num_tokens": 13335825.0,
	"reward": 0.0892857164144516,
	"reward_std": 0.10504640638828278,
	"rewards/accuracy_reward/mean": 0.09259258955717087,
	"rewards/accuracy_reward/std": 0.29053395986557007,
	"step": 33
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2767857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2990.0,
	"completions/mean_length": 1505.9910888671875,
	"completions/mean_terminated_length": 906.6543579101562,
	"completions/min_length": 20.0,
	"completions/min_terminated_length": 20.0,
	"epoch": 0.010156074975729968,
	"grad_norm": 0.06331104040145874,
	"kl": 0.00026798248291015625,
	"learning_rate": 8.473291852294986e-07,
	"loss": 0.1067,
	"num_tokens": 13708551.0,
	"reward": 0.125,
	"reward_std": 0.1584138572216034,
	"rewards/accuracy_reward/mean": 0.125,
	"rewards/accuracy_reward/std": 0.3314596116542816,
	"step": 34
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2946428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3050.0,
	"completions/mean_length": 1632.8795166015625,
	"completions/mean_terminated_length": 1031.7279052734375,
	"completions/min_length": 77.0,
	"completions/min_terminated_length": 77.0,
	"epoch": 0.010454783063251438,
	"grad_norm": 0.03785333409905434,
	"kl": 0.0002741813659667969,
	"learning_rate": 8.34565303179429e-07,
	"loss": 0.0503,
	"num_tokens": 14113332.0,
	"reward": 0.0491071455180645,
	"reward_std": 0.10010232776403427,
	"rewards/accuracy_reward/mean": 0.0491071417927742,
	"rewards/accuracy_reward/std": 0.21657568216323853,
	"step": 35
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3125,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3060.0,
	"completions/mean_length": 1601.3929443359375,
	"completions/mean_terminated_length": 932.93505859375,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.010753491150772907,
	"grad_norm": 0.07039965689182281,
	"kl": 0.0002465248107910156,
	"learning_rate": 8.213938048432696e-07,
	"loss": 0.0907,
	"num_tokens": 14508548.0,
	"reward": 0.133928582072258,
	"reward_std": 0.15811721980571747,
	"rewards/accuracy_reward/mean": 0.1339285671710968,
	"rewards/accuracy_reward/std": 0.3413383364677429,
	"step": 36
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2053571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2849.0,
	"completions/mean_length": 1399.2188720703125,
	"completions/mean_terminated_length": 966.927001953125,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"epoch": 0.011052199238294378,
	"grad_norm": 0.06250524520874023,
	"kl": 0.00029087066650390625,
	"learning_rate": 8.07830737662829e-07,
	"loss": 0.0706,
	"num_tokens": 14859805.0,
	"reward": 0.125,
	"reward_std": 0.133773535490036,
	"rewards/accuracy_reward/mean": 0.125,
	"rewards/accuracy_reward/std": 0.3314596116542816,
	"step": 37
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2857142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3013.0,
	"completions/mean_length": 1635.884033203125,
	"completions/mean_terminated_length": 1061.4375,
	"completions/min_length": 23.0,
	"completions/min_terminated_length": 23.0,
	"epoch": 0.011350907325815847,
	"grad_norm": 0.028273796662688255,
	"kl": 0.0002434253692626953,
	"learning_rate": 7.938926261462365e-07,
	"loss": 0.0175,
	"num_tokens": 15263779.0,
	"reward": 0.0357142873108387,
	"reward_std": 0.04764331132173538,
	"rewards/accuracy_reward/mean": 0.0357142873108387,
	"rewards/accuracy_reward/std": 0.18599249422550201,
	"step": 38
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2767857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2456.0,
	"completions/mean_length": 1450.90185546875,
	"completions/mean_terminated_length": 830.4815063476562,
	"completions/min_length": 25.0,
	"completions/min_terminated_length": 25.0,
	"epoch": 0.011649615413337316,
	"grad_norm": 0.057446062564849854,
	"kl": 0.0003275871276855469,
	"learning_rate": 7.795964517353733e-07,
	"loss": 0.0511,
	"num_tokens": 15624173.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.10851971060037613,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 39
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2589285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3027.0,
	"completions/mean_length": 1514.7723388671875,
	"completions/mean_terminated_length": 970.6806640625,
	"completions/min_length": 31.0,
	"completions/min_terminated_length": 31.0,
	"epoch": 0.011948323500858785,
	"grad_norm": 0.06986912339925766,
	"kl": 0.0003490447998046875,
	"learning_rate": 7.649596321166024e-07,
	"loss": 0.0786,
	"num_tokens": 16002314.0,
	"reward": 0.133928582072258,
	"reward_std": 0.17373399436473846,
	"rewards/accuracy_reward/mean": 0.1339285671710968,
	"rewards/accuracy_reward/std": 0.3413383364677429,
	"step": 40
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2544642857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3011.0,
	"completions/mean_length": 1548.4732666015625,
	"completions/mean_terminated_length": 1028.4671630859375,
	"completions/min_length": 55.0,
	"completions/min_terminated_length": 55.0,
	"epoch": 0.012247031588380256,
	"grad_norm": 0.40365350246429443,
	"kl": 0.0009794235229492188,
	"learning_rate": 7.5e-07,
	"loss": 0.0019,
	"num_tokens": 16389036.0,
	"reward": 0.0401785746216774,
	"reward_std": 0.0689915269613266,
	"rewards/accuracy_reward/mean": 0.0401785708963871,
	"rewards/accuracy_reward/std": 0.19681765139102936,
	"step": 41
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1741071428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2763.0,
	"completions/mean_length": 1306.65185546875,
	"completions/mean_terminated_length": 934.497314453125,
	"completions/min_length": 34.0,
	"completions/min_terminated_length": 34.0,
	"epoch": 0.012545739675901725,
	"grad_norm": 0.04599951580166817,
	"kl": 0.0003376007080078125,
	"learning_rate": 7.347357813929454e-07,
	"loss": 0.0255,
	"num_tokens": 16717726.0,
	"reward": 0.0580357164144516,
	"reward_std": 0.0787569135427475,
	"rewards/accuracy_reward/mean": 0.0580357126891613,
	"rewards/accuracy_reward/std": 0.23433461785316467,
	"step": 42
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2946428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3070.0,
	"completions/mean_length": 1608.08935546875,
	"completions/mean_terminated_length": 996.582275390625,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"epoch": 0.012844447763423195,
	"grad_norm": 0.042162321507930756,
	"kl": 0.0003190040588378906,
	"learning_rate": 7.191855733945386e-07,
	"loss": 0.0291,
	"num_tokens": 17115178.0,
	"reward": 0.0357142873108387,
	"reward_std": 0.060876406729221344,
	"rewards/accuracy_reward/mean": 0.0357142873108387,
	"rewards/accuracy_reward/std": 0.18599249422550201,
	"step": 43
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2098214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2964.0,
	"completions/mean_length": 1279.0848388671875,
	"completions/mean_terminated_length": 803.0,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.013143155850944664,
	"grad_norm": 0.08639585971832275,
	"kl": 0.00032329559326171875,
	"learning_rate": 7.033683215379002e-07,
	"loss": 0.0171,
	"num_tokens": 17437797.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.14158472418785095,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 44
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2857142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3066.0,
	"completions/mean_length": 1557.5804443359375,
	"completions/mean_terminated_length": 951.8125,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.013441863938466135,
	"grad_norm": 0.0711236298084259,
	"kl": 0.00033283233642578125,
	"learning_rate": 6.87303296707956e-07,
	"loss": 0.0433,
	"num_tokens": 17823671.0,
	"reward": 0.1428571492433548,
	"reward_std": 0.14066898822784424,
	"rewards/accuracy_reward/mean": 0.1428571492433548,
	"rewards/accuracy_reward/std": 0.35071080923080444,
	"step": 45
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2767857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3038.0,
	"completions/mean_length": 1589.357177734375,
	"completions/mean_terminated_length": 1021.9259033203125,
	"completions/min_length": 6.0,
	"completions/min_terminated_length": 6.0,
	"epoch": 0.013740572025987604,
	"grad_norm": 0.055200062692165375,
	"kl": 0.0003032684326171875,
	"learning_rate": 6.710100716628344e-07,
	"loss": 0.0797,
	"num_tokens": 18216415.0,
	"reward": 0.098214291036129,
	"reward_std": 0.13225442171096802,
	"rewards/accuracy_reward/mean": 0.0982142835855484,
	"rewards/accuracy_reward/std": 0.29827070236206055,
	"step": 46
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2678571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2984.0,
	"completions/mean_length": 1515.5535888671875,
	"completions/mean_terminated_length": 946.1218872070312,
	"completions/min_length": 33.0,
	"completions/min_terminated_length": 33.0,
	"epoch": 0.014039280113509073,
	"grad_norm": 0.07808107137680054,
	"kl": 0.00030994415283203125,
	"learning_rate": 6.545084971874736e-07,
	"loss": 0.0337,
	"num_tokens": 18590435.0,
	"reward": 0.0848214328289032,
	"reward_std": 0.10205654054880142,
	"rewards/accuracy_reward/mean": 0.0848214253783226,
	"rewards/accuracy_reward/std": 0.2792397737503052,
	"step": 47
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2633928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2920.0,
	"completions/mean_length": 1490.2501220703125,
	"completions/mean_terminated_length": 924.654541015625,
	"completions/min_length": 8.0,
	"completions/min_terminated_length": 8.0,
	"epoch": 0.014337988201030542,
	"grad_norm": 0.06969325244426727,
	"kl": 0.0003323554992675781,
	"learning_rate": 6.378186779084995e-07,
	"loss": 0.0863,
	"num_tokens": 18963555.0,
	"reward": 0.1383928656578064,
	"reward_std": 0.16006861627101898,
	"rewards/accuracy_reward/mean": 0.1383928507566452,
	"rewards/accuracy_reward/std": 0.34608522057533264,
	"step": 48
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2276785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3034.0,
	"completions/mean_length": 1513.2232666015625,
	"completions/mean_terminated_length": 1053.6993408203125,
	"completions/min_length": 39.0,
	"completions/min_terminated_length": 39.0,
	"epoch": 0.014636696288552013,
	"grad_norm": 0.0502396859228611,
	"kl": 0.00036144256591796875,
	"learning_rate": 6.209609477998338e-07,
	"loss": 0.0331,
	"num_tokens": 19336581.0,
	"reward": 0.1116071492433548,
	"reward_std": 0.10174980014562607,
	"rewards/accuracy_reward/mean": 0.1116071417927742,
	"rewards/accuracy_reward/std": 0.31558772921562195,
	"step": 49
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2142857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3013.0,
	"completions/mean_length": 1356.0491943359375,
	"completions/mean_terminated_length": 888.0625,
	"completions/min_length": 14.0,
	"completions/min_terminated_length": 14.0,
	"epoch": 0.014935404376073482,
	"grad_norm": 0.07083041965961456,
	"kl": 0.0003476142883300781,
	"learning_rate": 6.039558454088795e-07,
	"loss": 0.0898,
	"num_tokens": 19674280.0,
	"reward": 0.1428571492433548,
	"reward_std": 0.17659832537174225,
	"rewards/accuracy_reward/mean": 0.1428571492433548,
	"rewards/accuracy_reward/std": 0.35071080923080444,
	"step": 50
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2678571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3060.0,
	"completions/mean_length": 1581.5045166015625,
	"completions/mean_terminated_length": 1036.201171875,
	"completions/min_length": 24.0,
	"completions/min_terminated_length": 24.0,
	"epoch": 0.015234112463594952,
	"grad_norm": 0.07027444243431091,
	"kl": 0.00034427642822265625,
	"learning_rate": 5.868240888334652e-07,
	"loss": 0.0775,
	"num_tokens": 20068041.0,
	"reward": 0.165178582072258,
	"reward_std": 0.163971409201622,
	"rewards/accuracy_reward/mean": 0.1651785671710968,
	"rewards/accuracy_reward/std": 0.37217333912849426,
	"step": 51
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2053571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2985.0,
	"completions/mean_length": 1486.7366943359375,
	"completions/mean_terminated_length": 1077.061767578125,
	"completions/min_length": 27.0,
	"completions/min_terminated_length": 27.0,
	"epoch": 0.01553282055111642,
	"grad_norm": 0.04818105697631836,
	"kl": 0.00038242340087890625,
	"learning_rate": 5.695865504800327e-07,
	"loss": 0.0411,
	"num_tokens": 20435622.0,
	"reward": 0.0535714328289032,
	"reward_std": 0.06673339754343033,
	"rewards/accuracy_reward/mean": 0.0535714291036129,
	"rewards/accuracy_reward/std": 0.2256743162870407,
	"step": 52
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2633928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3045.0,
	"completions/mean_length": 1494.6429443359375,
	"completions/mean_terminated_length": 930.6181640625,
	"completions/min_length": 55.0,
	"completions/min_terminated_length": 55.0,
	"epoch": 0.01583152863863789,
	"grad_norm": 0.046202462166547775,
	"kl": 0.00034809112548828125,
	"learning_rate": 5.522642316338268e-07,
	"loss": 0.0624,
	"num_tokens": 20804750.0,
	"reward": 0.0758928582072258,
	"reward_std": 0.10100797563791275,
	"rewards/accuracy_reward/mean": 0.0758928582072258,
	"rewards/accuracy_reward/std": 0.26541972160339355,
	"step": 53
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1830357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3039.0,
	"completions/mean_length": 1359.74560546875,
	"completions/mean_terminated_length": 976.1256713867188,
	"completions/min_length": 9.0,
	"completions/min_terminated_length": 9.0,
	"epoch": 0.01613023672615936,
	"grad_norm": 0.07204340398311615,
	"kl": 0.0004181861877441406,
	"learning_rate": 5.348782368720625e-07,
	"loss": 0.0153,
	"num_tokens": 21144645.0,
	"reward": 0.0625,
	"reward_std": 0.0978442057967186,
	"rewards/accuracy_reward/mean": 0.0625,
	"rewards/accuracy_reward/std": 0.2426035851240158,
	"step": 54
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3303571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2961.0,
	"completions/mean_length": 1705.763427734375,
	"completions/mean_terminated_length": 1031.75341796875,
	"completions/min_length": 14.0,
	"completions/min_terminated_length": 14.0,
	"epoch": 0.016428944813680832,
	"grad_norm": 0.029594114050269127,
	"kl": 0.0003113746643066406,
	"learning_rate": 5.174497483512505e-07,
	"loss": 0.0516,
	"num_tokens": 21564768.0,
	"reward": 0.0446428582072258,
	"reward_std": 0.06222161650657654,
	"rewards/accuracy_reward/mean": 0.0446428582072258,
	"rewards/accuracy_reward/std": 0.20698098838329315,
	"step": 55
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2276785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3035.0,
	"completions/mean_length": 1459.009033203125,
	"completions/mean_terminated_length": 983.5028686523438,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"epoch": 0.0167276529012023,
	"grad_norm": 0.04346403852105141,
	"kl": 0.0004267692565917969,
	"learning_rate": 5e-07,
	"loss": 0.0202,
	"num_tokens": 21926530.0,
	"reward": 0.098214291036129,
	"reward_std": 0.0417863167822361,
	"rewards/accuracy_reward/mean": 0.0982142835855484,
	"rewards/accuracy_reward/std": 0.29827070236206055,
	"step": 56
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2589285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3001.0,
	"completions/mean_length": 1461.7991943359375,
	"completions/mean_terminated_length": 899.1987915039062,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 0.01702636098872377,
	"grad_norm": 0.04805866628885269,
	"kl": 0.0003714561462402344,
	"learning_rate": 4.825502516487496e-07,
	"loss": 0.0097,
	"num_tokens": 22286485.0,
	"reward": 0.0446428582072258,
	"reward_std": 0.07740890979766846,
	"rewards/accuracy_reward/mean": 0.0446428582072258,
	"rewards/accuracy_reward/std": 0.20698098838329315,
	"step": 57
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2232142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3008.0,
	"completions/mean_length": 1363.43310546875,
	"completions/mean_terminated_length": 872.4655151367188,
	"completions/min_length": 6.0,
	"completions/min_terminated_length": 6.0,
	"epoch": 0.017325069076245238,
	"grad_norm": 0.0761556327342987,
	"kl": 0.00038814544677734375,
	"learning_rate": 4.6512176312793735e-07,
	"loss": 0.0167,
	"num_tokens": 22629270.0,
	"reward": 0.1160714328289032,
	"reward_std": 0.13225442171096802,
	"rewards/accuracy_reward/mean": 0.1160714253783226,
	"rewards/accuracy_reward/std": 0.321027934551239,
	"step": 58
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2098214285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3065.0,
	"completions/mean_length": 1447.0179443359375,
	"completions/mean_terminated_length": 1015.5254516601562,
	"completions/min_length": 41.0,
	"completions/min_terminated_length": 41.0,
	"epoch": 0.01762377716376671,
	"grad_norm": 0.08180122822523117,
	"kl": 0.0004076957702636719,
	"learning_rate": 4.477357683661733e-07,
	"loss": 0.1042,
	"num_tokens": 22988322.0,
	"reward": 0.1473214328289032,
	"reward_std": 0.2089243084192276,
	"rewards/accuracy_reward/mean": 0.1473214328289032,
	"rewards/accuracy_reward/std": 0.35521984100341797,
	"step": 59
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2142857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3057.0,
	"completions/mean_length": 1429.5491943359375,
	"completions/mean_terminated_length": 981.6079711914062,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"epoch": 0.01792248525128818,
	"grad_norm": 0.08614456653594971,
	"kl": 0.0004153251647949219,
	"learning_rate": 4.304134495199674e-07,
	"loss": 0.1011,
	"num_tokens": 23345885.0,
	"reward": 0.1116071492433548,
	"reward_std": 0.1331673562526703,
	"rewards/accuracy_reward/mean": 0.1116071417927742,
	"rewards/accuracy_reward/std": 0.31558772921562195,
	"step": 60
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.21875,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2965.0,
	"completions/mean_length": 1379.759033203125,
	"completions/mean_terminated_length": 905.931396484375,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"epoch": 0.018221193338809647,
	"grad_norm": 0.07557892054319382,
	"kl": 0.0004553794860839844,
	"learning_rate": 4.131759111665348e-07,
	"loss": 0.0796,
	"num_tokens": 23690391.0,
	"reward": 0.1294642984867096,
	"reward_std": 0.15225742757320404,
	"rewards/accuracy_reward/mean": 0.1294642835855484,
	"rewards/accuracy_reward/std": 0.3364649713039398,
	"step": 61
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.25,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2885.0,
	"completions/mean_length": 1448.169677734375,
	"completions/mean_terminated_length": 906.8928833007812,
	"completions/min_length": 33.0,
	"completions/min_terminated_length": 33.0,
	"epoch": 0.018519901426331118,
	"grad_norm": 0.051801372319459915,
	"kl": 0.00043392181396484375,
	"learning_rate": 3.960441545911204e-07,
	"loss": 0.046,
	"num_tokens": 24052941.0,
	"reward": 0.0758928582072258,
	"reward_std": 0.09333522617816925,
	"rewards/accuracy_reward/mean": 0.0758928582072258,
	"rewards/accuracy_reward/std": 0.26541972160339355,
	"step": 62
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2321428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3043.0,
	"completions/mean_length": 1465.83935546875,
	"completions/mean_terminated_length": 980.2557983398438,
	"completions/min_length": 27.0,
	"completions/min_terminated_length": 27.0,
	"epoch": 0.01881860951385259,
	"grad_norm": 0.11123108118772507,
	"kl": 0.0003695487976074219,
	"learning_rate": 3.790390522001662e-07,
	"loss": 0.0854,
	"num_tokens": 24414641.0,
	"reward": 0.1696428656578064,
	"reward_std": 0.15706866979599,
	"rewards/accuracy_reward/mean": 0.1696428507566452,
	"rewards/accuracy_reward/std": 0.37615931034088135,
	"step": 63
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3258928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2902.0,
	"completions/mean_length": 1662.165283203125,
	"completions/mean_terminated_length": 980.5894165039062,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 0.019117317601374056,
	"grad_norm": 0.04303564503788948,
	"kl": 0.0003743171691894531,
	"learning_rate": 3.621813220915004e-07,
	"loss": 0.0592,
	"num_tokens": 24823998.0,
	"reward": 0.0892857164144516,
	"reward_std": 0.11181911826133728,
	"rewards/accuracy_reward/mean": 0.0892857164144516,
	"rewards/accuracy_reward/std": 0.28579434752464294,
	"step": 64
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2455357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2906.0,
	"completions/mean_length": 1489.8482666015625,
	"completions/mean_terminated_length": 974.94677734375,
	"completions/min_length": 36.0,
	"completions/min_terminated_length": 36.0,
	"epoch": 0.019416025688895527,
	"grad_norm": 0.06886183470487595,
	"kl": 0.000438690185546875,
	"learning_rate": 3.454915028125263e-07,
	"loss": 0.0292,
	"num_tokens": 25193332.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.11572191119194031,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 65
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2053571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3026.0,
	"completions/mean_length": 1461.888427734375,
	"completions/mean_terminated_length": 1045.7921142578125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"epoch": 0.019714733776416995,
	"grad_norm": 0.05718955025076866,
	"kl": 0.00038433074951171875,
	"learning_rate": 3.2898992833716563e-07,
	"loss": 0.0252,
	"num_tokens": 25563507.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.11437670141458511,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 66
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2008928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2723.0,
	"completions/mean_length": 1381.7098388671875,
	"completions/mean_terminated_length": 956.7764892578125,
	"completions/min_length": 34.0,
	"completions/min_terminated_length": 34.0,
	"epoch": 0.020013441863938466,
	"grad_norm": 0.07317039370536804,
	"kl": 0.0004258155822753906,
	"learning_rate": 3.1269670329204393e-07,
	"loss": 0.0637,
	"num_tokens": 25908066.0,
	"reward": 0.1383928656578064,
	"reward_std": 0.148354634642601,
	"rewards/accuracy_reward/mean": 0.1383928507566452,
	"rewards/accuracy_reward/std": 0.34608522057533264,
	"step": 67
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2767857142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2957.0,
	"completions/mean_length": 1621.2723388671875,
	"completions/mean_terminated_length": 1066.0555419921875,
	"completions/min_length": 46.0,
	"completions/min_terminated_length": 46.0,
	"epoch": 0.020312149951459937,
	"grad_norm": 0.05922617018222809,
	"kl": 0.0003914833068847656,
	"learning_rate": 2.9663167846209996e-07,
	"loss": 0.062,
	"num_tokens": 26307151.0,
	"reward": 0.1160714328289032,
	"reward_std": 0.1546439230442047,
	"rewards/accuracy_reward/mean": 0.1160714253783226,
	"rewards/accuracy_reward/std": 0.321027934551239,
	"step": 68
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2276785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3031.0,
	"completions/mean_length": 1374.5179443359375,
	"completions/mean_terminated_length": 874.10400390625,
	"completions/min_length": 16.0,
	"completions/min_terminated_length": 16.0,
	"epoch": 0.020610858038981404,
	"grad_norm": 0.04980211332440376,
	"kl": 0.0004363059997558594,
	"learning_rate": 2.808144266054612e-07,
	"loss": 0.0071,
	"num_tokens": 26653419.0,
	"reward": 0.0401785746216774,
	"reward_std": 0.06808140873908997,
	"rewards/accuracy_reward/mean": 0.0401785708963871,
	"rewards/accuracy_reward/std": 0.19681765139102936,
	"step": 69
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2053571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2942.0,
	"completions/mean_length": 1441.7410888671875,
	"completions/mean_terminated_length": 1020.438232421875,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"epoch": 0.020909566126502875,
	"grad_norm": 12926.7119140625,
	"kl": 17.625338554382324,
	"learning_rate": 2.6526421860705473e-07,
	"loss": 0.8373,
	"num_tokens": 27010817.0,
	"reward": 0.196428582072258,
	"reward_std": 0.20935659110546112,
	"rewards/accuracy_reward/mean": 0.1964285671710968,
	"rewards/accuracy_reward/std": 0.3981861472129822,
	"step": 70
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1964285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2713.0,
	"completions/mean_length": 1265.977783203125,
	"completions/mean_terminated_length": 824.5055541992188,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.021208274214024346,
	"grad_norm": 0.03522089868783951,
	"kl": 0.0004029273986816406,
	"learning_rate": 2.500000000000001e-07,
	"loss": 0.0525,
	"num_tokens": 27329436.0,
	"reward": 0.15625,
	"reward_std": 0.06612721085548401,
	"rewards/accuracy_reward/mean": 0.15625,
	"rewards/accuracy_reward/std": 0.3639053702354431,
	"step": 71
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2868.0,
	"completions/mean_length": 1227.24560546875,
	"completions/mean_terminated_length": 801.532958984375,
	"completions/min_length": 71.0,
	"completions/min_terminated_length": 71.0,
	"epoch": 0.021506982301545814,
	"grad_norm": 0.059741340577602386,
	"kl": 0.00044918060302734375,
	"learning_rate": 2.350403678833976e-07,
	"loss": 0.0907,
	"num_tokens": 27643499.0,
	"reward": 0.125,
	"reward_std": 0.14744171500205994,
	"rewards/accuracy_reward/mean": 0.125,
	"rewards/accuracy_reward/std": 0.3314596116542816,
	"step": 72
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2321428571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3000.0,
	"completions/mean_length": 1325.4107666015625,
	"completions/mean_terminated_length": 797.3720703125,
	"completions/min_length": 8.0,
	"completions/min_terminated_length": 8.0,
	"epoch": 0.021805690389067284,
	"grad_norm": 0.15673233568668365,
	"kl": 0.0005125999450683594,
	"learning_rate": 2.2040354826462664e-07,
	"loss": 0.138,
	"num_tokens": 27977223.0,
	"reward": 0.28125,
	"reward_std": 0.19404374063014984,
	"rewards/accuracy_reward/mean": 0.28125,
	"rewards/accuracy_reward/std": 0.45061618089675903,
	"step": 73
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2410714285714286,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3061.0,
	"completions/mean_length": 1447.68310546875,
	"completions/mean_terminated_length": 931.7235717773438,
	"completions/min_length": 32.0,
	"completions/min_terminated_length": 32.0,
	"epoch": 0.022104398476588755,
	"grad_norm": 0.0585806779563427,
	"kl": 0.000385284423828125,
	"learning_rate": 2.0610737385376348e-07,
	"loss": 0.0809,
	"num_tokens": 28339184.0,
	"reward": 0.1116071492433548,
	"reward_std": 0.14127518236637115,
	"rewards/accuracy_reward/mean": 0.1116071417927742,
	"rewards/accuracy_reward/std": 0.31558772921562195,
	"step": 74
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1964285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2937.0,
	"completions/mean_length": 1326.0045166015625,
	"completions/mean_terminated_length": 899.20556640625,
	"completions/min_length": 84.0,
	"completions/min_terminated_length": 84.0,
	"epoch": 0.022403106564110223,
	"grad_norm": 0.042158063501119614,
	"kl": 0.0004096031188964844,
	"learning_rate": 1.9216926233717084e-07,
	"loss": 0.0529,
	"num_tokens": 28673249.0,
	"reward": 0.1026785746216774,
	"reward_std": 0.10309500992298126,
	"rewards/accuracy_reward/mean": 0.1026785746216774,
	"rewards/accuracy_reward/std": 0.3042183816432953,
	"step": 75
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.25,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3025.0,
	"completions/mean_length": 1532.821533203125,
	"completions/mean_terminated_length": 1019.7619018554688,
	"completions/min_length": 38.0,
	"completions/min_terminated_length": 38.0,
	"epoch": 0.022701814651631694,
	"grad_norm": 0.05144224688410759,
	"kl": 0.0004096031188964844,
	"learning_rate": 1.7860619515673032e-07,
	"loss": 0.0708,
	"num_tokens": 29049481.0,
	"reward": 0.0580357164144516,
	"reward_std": 0.1079135313630104,
	"rewards/accuracy_reward/mean": 0.0580357126891613,
	"rewards/accuracy_reward/std": 0.23433461785316467,
	"step": 76
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2857142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2889.0,
	"completions/mean_length": 1556.80810546875,
	"completions/mean_terminated_length": 950.7312622070312,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"epoch": 0.02300052273915316,
	"grad_norm": 0.08849991112947464,
	"kl": 0.0004596710205078125,
	"learning_rate": 1.6543469682057104e-07,
	"loss": 0.09,
	"num_tokens": 29432390.0,
	"reward": 0.1071428656578064,
	"reward_std": 0.133773535490036,
	"rewards/accuracy_reward/mean": 0.1071428582072258,
	"rewards/accuracy_reward/std": 0.30998748540878296,
	"step": 77
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2455357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2941.0,
	"completions/mean_length": 1509.65185546875,
	"completions/mean_terminated_length": 1001.1952514648438,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 0.023299230826674632,
	"grad_norm": 0.07996930927038193,
	"kl": 0.00043010711669921875,
	"learning_rate": 1.5267081477050131e-07,
	"loss": 0.0554,
	"num_tokens": 29806576.0,
	"reward": 0.1473214328289032,
	"reward_std": 0.15225742757320404,
	"rewards/accuracy_reward/mean": 0.1527777761220932,
	"rewards/accuracy_reward/std": 0.36060887575149536,
	"step": 78
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1964285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3066.0,
	"completions/mean_length": 1378.2679443359375,
	"completions/mean_terminated_length": 964.2444458007812,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"epoch": 0.023597938914196103,
	"grad_norm": 0.058245718479156494,
	"kl": 0.0004696846008300781,
	"learning_rate": 1.4033009983067452e-07,
	"loss": 0.0367,
	"num_tokens": 30154956.0,
	"reward": 0.0848214328289032,
	"reward_std": 0.0913810282945633,
	"rewards/accuracy_reward/mean": 0.0848214253783226,
	"rewards/accuracy_reward/std": 0.27923980355262756,
	"step": 79
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.25,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3032.0,
	"completions/mean_length": 1506.0982666015625,
	"completions/mean_terminated_length": 984.1309814453125,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"epoch": 0.02389664700171757,
	"grad_norm": 0.047782279551029205,
	"kl": 0.0004444122314453125,
	"learning_rate": 1.284275872613028e-07,
	"loss": 0.0613,
	"num_tokens": 30530354.0,
	"reward": 0.1116071492433548,
	"reward_std": 0.10174980014562607,
	"rewards/accuracy_reward/mean": 0.1116071417927742,
	"rewards/accuracy_reward/std": 0.31558772921562195,
	"step": 80
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2366071428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3046.0,
	"completions/mean_length": 1465.12060546875,
	"completions/mean_terminated_length": 967.0818481445312,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 0.02419535508923904,
	"grad_norm": 0.04319946467876434,
	"kl": 0.00041961669921875,
	"learning_rate": 1.1697777844051104e-07,
	"loss": 0.035,
	"num_tokens": 30893933.0,
	"reward": 0.0714285746216774,
	"reward_std": 0.08070831745862961,
	"rewards/accuracy_reward/mean": 0.0714285746216774,
	"rewards/accuracy_reward/std": 0.2581161558628082,
	"step": 81
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2633928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3035.0,
	"completions/mean_length": 1520.5535888671875,
	"completions/mean_terminated_length": 965.7938842773438,
	"completions/min_length": 26.0,
	"completions/min_terminated_length": 26.0,
	"epoch": 0.024494063176760512,
	"grad_norm": 0.0543820746243,
	"kl": 0.00045108795166015625,
	"learning_rate": 1.0599462319663904e-07,
	"loss": 0.0783,
	"num_tokens": 31275169.0,
	"reward": 0.1071428656578064,
	"reward_std": 0.10040179640054703,
	"rewards/accuracy_reward/mean": 0.1071428582072258,
	"rewards/accuracy_reward/std": 0.30998748540878296,
	"step": 82
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1741071428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2949.0,
	"completions/mean_length": 1234.84375,
	"completions/mean_terminated_length": 847.5513916015625,
	"completions/min_length": 31.0,
	"completions/min_terminated_length": 31.0,
	"epoch": 0.02479277126428198,
	"grad_norm": 0.07529015839099884,
	"kl": 0.0005130767822265625,
	"learning_rate": 9.549150281252632e-08,
	"loss": -0.0128,
	"num_tokens": 31593070.0,
	"reward": 0.098214291036129,
	"reward_std": 0.09528662264347076,
	"rewards/accuracy_reward/mean": 0.0982142835855484,
	"rewards/accuracy_reward/std": 0.29827070236206055,
	"step": 83
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1428571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3036.0,
	"completions/mean_length": 1257.71875,
	"completions/mean_terminated_length": 955.3385620117188,
	"completions/min_length": 6.0,
	"completions/min_terminated_length": 6.0,
	"epoch": 0.02509147935180345,
	"grad_norm": 0.07526399195194244,
	"kl": 0.0004706382751464844,
	"learning_rate": 8.548121372247919e-08,
	"loss": 0.021,
	"num_tokens": 31910783.0,
	"reward": 0.125,
	"reward_std": 0.1480451077222824,
	"rewards/accuracy_reward/mean": 0.125,
	"rewards/accuracy_reward/std": 0.3314596116542816,
	"step": 84
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.21875,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2802.0,
	"completions/mean_length": 1385.544677734375,
	"completions/mean_terminated_length": 913.337158203125,
	"completions/min_length": 35.0,
	"completions/min_terminated_length": 35.0,
	"epoch": 0.02539018743932492,
	"grad_norm": 0.08289183676242828,
	"kl": 0.00047397613525390625,
	"learning_rate": 7.597595192178702e-08,
	"loss": 0.0353,
	"num_tokens": 32255185.0,
	"reward": 0.0758928582072258,
	"reward_std": 0.09333522617816925,
	"rewards/accuracy_reward/mean": 0.0758928582072258,
	"rewards/accuracy_reward/std": 0.26541972160339355,
	"step": 85
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1428571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3038.0,
	"completions/mean_length": 1268.4910888671875,
	"completions/mean_terminated_length": 967.90625,
	"completions/min_length": 20.0,
	"completions/min_terminated_length": 20.0,
	"epoch": 0.02568889552684639,
	"grad_norm": 0.10912561416625977,
	"kl": 0.0005125999450683594,
	"learning_rate": 6.698729810778064e-08,
	"loss": 0.0096,
	"num_tokens": 32578655.0,
	"reward": 0.0937500074505806,
	"reward_std": 0.1361600160598755,
	"rewards/accuracy_reward/mean": 0.09375,
	"rewards/accuracy_reward/std": 0.2921334207057953,
	"step": 86
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2857142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2813.0,
	"completions/mean_length": 1568.419677734375,
	"completions/mean_terminated_length": 966.9874877929688,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"epoch": 0.02598760361436786,
	"grad_norm": 0.05263072997331619,
	"kl": 0.00043487548828125,
	"learning_rate": 5.8526203570536504e-08,
	"loss": 0.041,
	"num_tokens": 32962037.0,
	"reward": 0.1071428656578064,
	"reward_std": 0.1477484405040741,
	"rewards/accuracy_reward/mean": 0.1071428582072258,
	"rewards/accuracy_reward/std": 0.30998748540878296,
	"step": 87
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2232142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2898.0,
	"completions/mean_length": 1397.7188720703125,
	"completions/mean_terminated_length": 916.6034545898438,
	"completions/min_length": 13.0,
	"completions/min_terminated_length": 13.0,
	"epoch": 0.026286311701889328,
	"grad_norm": 0.0686383992433548,
	"kl": 0.0004343986511230469,
	"learning_rate": 5.060297685041659e-08,
	"loss": 0.0171,
	"num_tokens": 33310942.0,
	"reward": 0.0848214328289032,
	"reward_std": 0.11498290300369263,
	"rewards/accuracy_reward/mean": 0.0848214253783226,
	"rewards/accuracy_reward/std": 0.2792397737503052,
	"step": 88
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1383928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2961.0,
	"completions/mean_length": 1205.0179443359375,
	"completions/mean_terminated_length": 905.139892578125,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 0.0265850197894108,
	"grad_norm": 0.06757447123527527,
	"kl": 0.0004744529724121094,
	"learning_rate": 4.322727117869951e-08,
	"loss": 0.0952,
	"num_tokens": 33623122.0,
	"reward": 0.1517857164144516,
	"reward_std": 0.14383558928966522,
	"rewards/accuracy_reward/mean": 0.15740740299224854,
	"rewards/accuracy_reward/std": 0.365030437707901,
	"step": 89
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2053571428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3054.0,
	"completions/mean_length": 1399.7813720703125,
	"completions/mean_terminated_length": 967.6348266601562,
	"completions/min_length": 48.0,
	"completions/min_terminated_length": 48.0,
	"epoch": 0.02688372787693227,
	"grad_norm": 0.06952133029699326,
	"kl": 0.0004596710205078125,
	"learning_rate": 3.6408072716606345e-08,
	"loss": 0.077,
	"num_tokens": 33975369.0,
	"reward": 0.1026785746216774,
	"reward_std": 0.1513473093509674,
	"rewards/accuracy_reward/mean": 0.1026785746216774,
	"rewards/accuracy_reward/std": 0.3042184114456177,
	"step": 90
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2232142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2935.0,
	"completions/mean_length": 1382.477783203125,
	"completions/mean_terminated_length": 896.9827270507812,
	"completions/min_length": 37.0,
	"completions/min_terminated_length": 37.0,
	"epoch": 0.027182435964453737,
	"grad_norm": 0.04722387343645096,
	"kl": 0.00042724609375,
	"learning_rate": 3.015368960704584e-08,
	"loss": 0.0283,
	"num_tokens": 34320844.0,
	"reward": 0.0625,
	"reward_std": 0.08942963182926178,
	"rewards/accuracy_reward/mean": 0.0625,
	"rewards/accuracy_reward/std": 0.2426035851240158,
	"step": 91
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3080357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3052.0,
	"completions/mean_length": 1598.888427734375,
	"completions/mean_terminated_length": 943.1160888671875,
	"completions/min_length": 37.0,
	"completions/min_terminated_length": 37.0,
	"epoch": 0.027481144051975208,
	"grad_norm": 0.0545211099088192,
	"kl": 0.0004267692565917969,
	"learning_rate": 2.4471741852423233e-08,
	"loss": 0.0618,
	"num_tokens": 34720275.0,
	"reward": 0.165178582072258,
	"reward_std": 0.10461412370204926,
	"rewards/accuracy_reward/mean": 0.1651785671710968,
	"rewards/accuracy_reward/std": 0.37217333912849426,
	"step": 92
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2008928571428571,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3048.0,
	"completions/mean_length": 1342.759033203125,
	"completions/mean_terminated_length": 908.0335083007812,
	"completions/min_length": 10.0,
	"completions/min_terminated_length": 10.0,
	"epoch": 0.027779852139496675,
	"grad_norm": 0.08833102881908417,
	"kl": 0.0004458427429199219,
	"learning_rate": 1.936915203084055e-08,
	"loss": 0.1027,
	"num_tokens": 35055293.0,
	"reward": 0.2008928656578064,
	"reward_std": 0.20320014655590057,
	"rewards/accuracy_reward/mean": 0.2008928507566452,
	"rewards/accuracy_reward/std": 0.40156546235084534,
	"step": 93
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2276785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2886.0,
	"completions/mean_length": 1383.7232666015625,
	"completions/mean_terminated_length": 886.0230712890625,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"epoch": 0.028078560227018146,
	"grad_norm": 0.05874941125512123,
	"kl": 0.00044918060302734375,
	"learning_rate": 1.4852136862001763e-08,
	"loss": 0.0177,
	"num_tokens": 35402255.0,
	"reward": 0.1160714328289032,
	"reward_std": 0.07485132664442062,
	"rewards/accuracy_reward/mean": 0.1160714253783226,
	"rewards/accuracy_reward/std": 0.321027934551239,
	"step": 94
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2544642857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2787.0,
	"completions/mean_length": 1439.9866943359375,
	"completions/mean_terminated_length": 882.9521484375,
	"completions/min_length": 6.0,
	"completions/min_terminated_length": 6.0,
	"epoch": 0.028377268314539617,
	"grad_norm": 0.06310974061489105,
	"kl": 0.0004143714904785156,
	"learning_rate": 1.0926199633097154e-08,
	"loss": 0.0354,
	"num_tokens": 35758540.0,
	"reward": 0.0714285746216774,
	"reward_std": 0.08131170272827148,
	"rewards/accuracy_reward/mean": 0.0714285746216774,
	"rewards/accuracy_reward/std": 0.2581161558628082,
	"step": 95
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2366071428571429,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2926.0,
	"completions/mean_length": 1451.446533203125,
	"completions/mean_terminated_length": 949.1696166992188,
	"completions/min_length": 56.0,
	"completions/min_terminated_length": 56.0,
	"epoch": 0.028675976402061085,
	"grad_norm": 0.5383374094963074,
	"kl": 0.0004506111145019531,
	"learning_rate": 7.59612349389599e-09,
	"loss": 0.0656,
	"num_tokens": 36119232.0,
	"reward": 0.0803571492433548,
	"reward_std": 0.12054044008255005,
	"rewards/accuracy_reward/mean": 0.0803571417927742,
	"rewards/accuracy_reward/std": 0.2724541425704956,
	"step": 96
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1964285714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3056.0,
	"completions/mean_length": 1336.75,
	"completions/mean_terminated_length": 912.5778198242188,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"epoch": 0.028974684489582556,
	"grad_norm": 0.05063558369874954,
	"kl": 0.0004563331604003906,
	"learning_rate": 4.865965629214819e-09,
	"loss": 0.0335,
	"num_tokens": 36454472.0,
	"reward": 0.0580357164144516,
	"reward_std": 0.06478200107812881,
	"rewards/accuracy_reward/mean": 0.0580357126891613,
	"rewards/accuracy_reward/std": 0.23433460295200348,
	"step": 97
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2232142857142857,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3010.0,
	"completions/mean_length": 1458.107177734375,
	"completions/mean_terminated_length": 994.3448486328125,
	"completions/min_length": 17.0,
	"completions/min_terminated_length": 17.0,
	"epoch": 0.029273392577104027,
	"grad_norm": 0.08015123009681702,
	"kl": 0.00043582916259765625,
	"learning_rate": 2.739052315863355e-09,
	"loss": 0.0717,
	"num_tokens": 36820608.0,
	"reward": 0.1383928656578064,
	"reward_std": 0.11632810533046722,
	"rewards/accuracy_reward/mean": 0.1383928507566452,
	"rewards/accuracy_reward/std": 0.34608522057533264,
	"step": 98
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2276785714285714,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 3033.0,
	"completions/mean_length": 1310.1741943359375,
	"completions/mean_terminated_length": 790.7918701171875,
	"completions/min_length": 53.0,
	"completions/min_terminated_length": 53.0,
	"epoch": 0.029572100664625494,
	"grad_norm": 0.052152227610349655,
	"kl": 0.0004668235778808594,
	"learning_rate": 1.217974870087901e-09,
	"loss": 0.0223,
	"num_tokens": 37151047.0,
	"reward": 0.1205357164144516,
	"reward_std": 0.11047111451625824,
	"rewards/accuracy_reward/mean": 0.1205357164144516,
	"rewards/accuracy_reward/std": 0.32631614804267883,
	"step": 99
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2455357142857143,
	"completions/max_length": 3072.0,
	"completions/max_terminated_length": 2885.0,
	"completions/mean_length": 1361.1473388671875,
	"completions/mean_terminated_length": 804.3609619140625,
	"completions/min_length": 13.0,
	"completions/min_terminated_length": 13.0,
	"epoch": 0.029870808752146965,
	"grad_norm": 0.08615954965353012,
	"kl": 0.0004239082336425781,
	"learning_rate": 3.0458649045211894e-10,
	"loss": 0.1212,
	"num_tokens": 37502112.0,
	"reward": 0.165178582072258,
	"reward_std": 0.19959399104118347,
	"rewards/accuracy_reward/mean": 0.1651785671710968,
	"rewards/accuracy_reward/std": 0.37217333912849426,
	"step": 100
	},
	{
	"epoch": 0.029870808752146965,
	"step": 100,
	"total_flos": 0.0,
	"train_loss": 0.06171137083787471,
	"train_runtime": 3149.8828,
	"train_samples_per_second": 7.111,
	"train_steps_per_second": 0.032
	}
	],
	"logging_steps": 1,
	"max_steps": 100,
	"num_input_tokens_seen": 37502112,
	"num_train_epochs": 1,
	"save_steps": 50,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}