hdong0's picture
Model save
b15c6b3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.029870808752146965,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2946428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 1557.727783203125,
"completions/mean_terminated_length": 925.18359375,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.00029870808752146963,
"grad_norm": 0.08854348212480545,
"kl": 0.0002200603485107422,
"learning_rate": 0.0,
"loss": 0.1118,
"num_tokens": 384987.0,
"reward": 0.1071428656578064,
"reward_std": 0.15226024389266968,
"rewards/accuracy_reward/mean": 0.1071428582072258,
"rewards/accuracy_reward/std": 0.30998751521110535,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3839285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2925.0,
"completions/mean_length": 1786.2501220703125,
"completions/mean_terminated_length": 984.9855346679688,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.0005974161750429393,
"grad_norm": 0.0577642060816288,
"kl": 0.00018739700317382812,
"learning_rate": 1e-07,
"loss": 0.026,
"num_tokens": 819379.0,
"reward": 0.0535714328289032,
"reward_std": 0.056364625692367554,
"rewards/accuracy_reward/mean": 0.0535714291036129,
"rewards/accuracy_reward/std": 0.2256743162870407,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3705357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3003.0,
"completions/mean_length": 1748.2188720703125,
"completions/mean_terminated_length": 968.9716186523438,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.0008961242625644089,
"grad_norm": 0.0502021387219429,
"kl": 0.0002162456512451172,
"learning_rate": 2e-07,
"loss": 0.0674,
"num_tokens": 1249588.0,
"reward": 0.0714285746216774,
"reward_std": 0.10370119661092758,
"rewards/accuracy_reward/mean": 0.0714285746216774,
"rewards/accuracy_reward/std": 0.2581161558628082,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2633928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2901.0,
"completions/mean_length": 1507.15185546875,
"completions/mean_terminated_length": 947.5999755859375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.0011948323500858785,
"grad_norm": 0.05528466776013374,
"kl": 0.0002148151397705078,
"learning_rate": 3e-07,
"loss": 0.0171,
"num_tokens": 1627094.0,
"reward": 0.0491071455180645,
"reward_std": 0.07936029881238937,
"rewards/accuracy_reward/mean": 0.0491071417927742,
"rewards/accuracy_reward/std": 0.21657568216323853,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 1706.196533203125,
"completions/mean_terminated_length": 990.7755126953125,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.0014935404376073482,
"grad_norm": 0.05157456174492836,
"kl": 0.00016391277313232422,
"learning_rate": 4e-07,
"loss": 0.0591,
"num_tokens": 2044514.0,
"reward": 0.0803571492433548,
"reward_std": 0.11693429946899414,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3036.0,
"completions/mean_length": 1646.27685546875,
"completions/mean_terminated_length": 998.2207641601562,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.0017922485251288178,
"grad_norm": 0.12111014127731323,
"kl": 0.00024390220642089844,
"learning_rate": 5e-07,
"loss": 0.1069,
"num_tokens": 2451360.0,
"reward": 0.125,
"reward_std": 0.14548751711845398,
"rewards/accuracy_reward/mean": 0.12962962687015533,
"rewards/accuracy_reward/std": 0.336675763130188,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2999.0,
"completions/mean_length": 1562.5938720703125,
"completions/mean_terminated_length": 958.8312377929688,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.0020909566126502874,
"grad_norm": 0.05596686899662018,
"kl": 0.00021219253540039062,
"learning_rate": 6e-07,
"loss": 0.0483,
"num_tokens": 2843053.0,
"reward": 0.066964291036129,
"reward_std": 0.11663484573364258,
"rewards/accuracy_reward/mean": 0.0669642835855484,
"rewards/accuracy_reward/std": 0.2505199611186981,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 1740.3126220703125,
"completions/mean_terminated_length": 1056.4730224609375,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.002389664700171757,
"grad_norm": 0.07892504334449768,
"kl": 0.00020313262939453125,
"learning_rate": 7e-07,
"loss": 0.013,
"num_tokens": 3267483.0,
"reward": 0.0580357164144516,
"reward_std": 0.07350330799818039,
"rewards/accuracy_reward/mean": 0.06018518656492233,
"rewards/accuracy_reward/std": 0.23838205635547638,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2723214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 1496.6295166015625,
"completions/mean_terminated_length": 907.0736083984375,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0026883727876932267,
"grad_norm": 0.07782948017120361,
"kl": 0.0001881122589111328,
"learning_rate": 8e-07,
"loss": 0.0718,
"num_tokens": 3639680.0,
"reward": 0.1116071492433548,
"reward_std": 0.14939311146736145,
"rewards/accuracy_reward/mean": 0.1116071417927742,
"rewards/accuracy_reward/std": 0.31558772921562195,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3348214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 1665.102783203125,
"completions/mean_terminated_length": 956.932861328125,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.0029870808752146963,
"grad_norm": 0.0631859079003334,
"kl": 0.00020241737365722656,
"learning_rate": 9e-07,
"loss": 0.0873,
"num_tokens": 4046735.0,
"reward": 0.0937500074505806,
"reward_std": 0.14353612065315247,
"rewards/accuracy_reward/mean": 0.09375,
"rewards/accuracy_reward/std": 0.2921334207057953,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3571428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2965.0,
"completions/mean_length": 1728.6785888671875,
"completions/mean_terminated_length": 982.388916015625,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.003285788962736166,
"grad_norm": 0.053209614008665085,
"kl": 0.00017750263214111328,
"learning_rate": 1e-06,
"loss": 0.0142,
"num_tokens": 4469191.0,
"reward": 0.0401785746216774,
"reward_std": 0.07350330799818039,
"rewards/accuracy_reward/mean": 0.0401785708963871,
"rewards/accuracy_reward/std": 0.19681765139102936,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3024.0,
"completions/mean_length": 1664.825927734375,
"completions/mean_terminated_length": 942.2230224609375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.0035844970502576356,
"grad_norm": 0.04564949870109558,
"kl": 0.00018143653869628906,
"learning_rate": 9.996954135095478e-07,
"loss": 0.0166,
"num_tokens": 4875240.0,
"reward": 0.0357142873108387,
"reward_std": 0.07289712876081467,
"rewards/accuracy_reward/mean": 0.0357142873108387,
"rewards/accuracy_reward/std": 0.18599249422550201,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2723214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3041.0,
"completions/mean_length": 1547.21435546875,
"completions/mean_terminated_length": 976.5889282226562,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.003883205137779105,
"grad_norm": 0.04013432562351227,
"kl": 0.00025653839111328125,
"learning_rate": 9.98782025129912e-07,
"loss": 0.0329,
"num_tokens": 5256416.0,
"reward": 0.0848214328289032,
"reward_std": 0.07936029881238937,
"rewards/accuracy_reward/mean": 0.0848214253783226,
"rewards/accuracy_reward/std": 0.2792397737503052,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3883928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2960.0,
"completions/mean_length": 1803.071533203125,
"completions/mean_terminated_length": 997.2554931640625,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.004181913225300575,
"grad_norm": 85.07632446289062,
"kl": 0.2520885467529297,
"learning_rate": 9.972609476841365e-07,
"loss": 0.0295,
"num_tokens": 5695616.0,
"reward": 0.0446428582072258,
"reward_std": 0.0417863167822361,
"rewards/accuracy_reward/mean": 0.0446428582072258,
"rewards/accuracy_reward/std": 0.20698098838329315,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3348214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3021.0,
"completions/mean_length": 1632.0223388671875,
"completions/mean_terminated_length": 907.2013549804688,
"completions/min_length": 90.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.004480621312822045,
"grad_norm": 0.0776248499751091,
"kl": 0.0002193450927734375,
"learning_rate": 9.95134034370785e-07,
"loss": 0.0752,
"num_tokens": 6096357.0,
"reward": 0.1205357164144516,
"reward_std": 0.14683552086353302,
"rewards/accuracy_reward/mean": 0.1205357164144516,
"rewards/accuracy_reward/std": 0.32631614804267883,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3062.0,
"completions/mean_length": 1543.3482666015625,
"completions/mean_terminated_length": 931.8875122070312,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.004779329400343514,
"grad_norm": 0.05424968898296356,
"kl": 0.00020742416381835938,
"learning_rate": 9.92403876506104e-07,
"loss": 0.0186,
"num_tokens": 6478547.0,
"reward": 0.0491071455180645,
"reward_std": 0.07936030626296997,
"rewards/accuracy_reward/mean": 0.0491071417927742,
"rewards/accuracy_reward/std": 0.21657569706439972,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3035714285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2991.0,
"completions/mean_length": 1667.509033203125,
"completions/mean_terminated_length": 1055.294921875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.005078037487864984,
"grad_norm": 0.07909969985485077,
"kl": 0.00016796588897705078,
"learning_rate": 9.890738003669027e-07,
"loss": 0.1047,
"num_tokens": 6889301.0,
"reward": 0.15625,
"reward_std": 0.15616022050380707,
"rewards/accuracy_reward/mean": 0.15625,
"rewards/accuracy_reward/std": 0.3639053702354431,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3169642857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3032.0,
"completions/mean_length": 1650.3751220703125,
"completions/mean_terminated_length": 990.6666870117188,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.005376745575386453,
"grad_norm": 0.09321057796478271,
"kl": 0.00021791458129882812,
"learning_rate": 9.851478631379982e-07,
"loss": 0.1088,
"num_tokens": 7294537.0,
"reward": 0.1607142984867096,
"reward_std": 0.1844095140695572,
"rewards/accuracy_reward/mean": 0.1607142835855484,
"rewards/accuracy_reward/std": 0.3680897653102875,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4776785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 2020.1876220703125,
"completions/mean_terminated_length": 1058.2735595703125,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.0056754536629079234,
"grad_norm": 0.06517499685287476,
"kl": 0.0001583099365234375,
"learning_rate": 9.806308479691594e-07,
"loss": 0.0896,
"num_tokens": 7782355.0,
"reward": 0.0803571492433548,
"reward_std": 0.122494637966156,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2321428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 1522.8973388671875,
"completions/mean_terminated_length": 1054.56396484375,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.005974161750429393,
"grad_norm": 0.16769619286060333,
"kl": 0.0007884502410888672,
"learning_rate": 9.755282581475767e-07,
"loss": 0.064,
"num_tokens": 8161596.0,
"reward": 0.1473214328289032,
"reward_std": 0.163971409201622,
"rewards/accuracy_reward/mean": 0.1473214328289032,
"rewards/accuracy_reward/std": 0.35521984100341797,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3392857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3044.0,
"completions/mean_length": 1634.37060546875,
"completions/mean_terminated_length": 896.12841796875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.006272869837950863,
"grad_norm": 0.09035351872444153,
"kl": 0.00022339820861816406,
"learning_rate": 9.698463103929541e-07,
"loss": 0.0312,
"num_tokens": 8564495.0,
"reward": 0.0535714328289032,
"reward_std": 0.05050762742757797,
"rewards/accuracy_reward/mean": 0.0535714291036129,
"rewards/accuracy_reward/std": 0.2256743162870407,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3080357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3040.0,
"completions/mean_length": 1619.3660888671875,
"completions/mean_terminated_length": 972.7096557617188,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.006571577925472332,
"grad_norm": 0.044720232486724854,
"kl": 0.00024008750915527344,
"learning_rate": 9.635919272833937e-07,
"loss": 0.0381,
"num_tokens": 8963833.0,
"reward": 0.0401785746216774,
"reward_std": 0.08222462981939316,
"rewards/accuracy_reward/mean": 0.0401785708963871,
"rewards/accuracy_reward/std": 0.19681765139102936,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3348214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 1700.1473388671875,
"completions/mean_terminated_length": 1009.617431640625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.006870286012993802,
"grad_norm": 0.03696313127875328,
"kl": 0.0001952648162841797,
"learning_rate": 9.567727288213004e-07,
"loss": 0.0385,
"num_tokens": 9378978.0,
"reward": 0.0223214291036129,
"reward_std": 0.06313453614711761,
"rewards/accuracy_reward/mean": 0.0223214291036129,
"rewards/accuracy_reward/std": 0.14805756509304047,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2946428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 1579.0001220703125,
"completions/mean_terminated_length": 955.341796875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.007168994100515271,
"grad_norm": 0.04915790259838104,
"kl": 0.0002124309539794922,
"learning_rate": 9.493970231495834e-07,
"loss": 0.0689,
"num_tokens": 9767946.0,
"reward": 0.0937500074505806,
"reward_std": 0.08747823536396027,
"rewards/accuracy_reward/mean": 0.09375,
"rewards/accuracy_reward/std": 0.2921334207057953,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3258928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2958.0,
"completions/mean_length": 1655.321533203125,
"completions/mean_terminated_length": 970.4370727539062,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.007467702188036741,
"grad_norm": 0.08162926882505417,
"kl": 0.00021219253540039062,
"learning_rate": 9.414737964294634e-07,
"loss": 0.106,
"num_tokens": 10176490.0,
"reward": 0.165178582072258,
"reward_std": 0.2200292944908142,
"rewards/accuracy_reward/mean": 0.1651785671710968,
"rewards/accuracy_reward/std": 0.37217333912849426,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3571428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3072.0,
"completions/mean_length": 1757.1474609375,
"completions/mean_terminated_length": 1026.673583984375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.00776641027555821,
"grad_norm": 0.05597545579075813,
"kl": 0.00016999244689941406,
"learning_rate": 9.330127018922193e-07,
"loss": 0.0569,
"num_tokens": 10604371.0,
"reward": 0.1205357164144516,
"reward_std": 0.1474389135837555,
"rewards/accuracy_reward/mean": 0.1205357164144516,
"rewards/accuracy_reward/std": 0.32631614804267883,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3080357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2834.0,
"completions/mean_length": 1574.700927734375,
"completions/mean_terminated_length": 908.1612548828125,
"completions/min_length": 15.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.00806511836307968,
"grad_norm": 0.09308373928070068,
"kl": 0.00026679039001464844,
"learning_rate": 9.240240480782129e-07,
"loss": -0.0172,
"num_tokens": 10994768.0,
"reward": 0.0714285746216774,
"reward_std": 0.08161844313144684,
"rewards/accuracy_reward/mean": 0.0714285746216774,
"rewards/accuracy_reward/std": 0.2581161558628082,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2991071428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2953.0,
"completions/mean_length": 1551.4241943359375,
"completions/mean_terminated_length": 902.5159301757812,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.00836382645060115,
"grad_norm": 0.03204449638724327,
"kl": 0.00022554397583007812,
"learning_rate": 9.145187862775208e-07,
"loss": 0.0183,
"num_tokens": 11379607.0,
"reward": 0.0223214291036129,
"reward_std": 0.04569191485643387,
"rewards/accuracy_reward/mean": 0.0223214291036129,
"rewards/accuracy_reward/std": 0.14805756509304047,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3035714285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3051.0,
"completions/mean_length": 1574.294677734375,
"completions/mean_terminated_length": 921.44873046875,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.008662534538122619,
"grad_norm": 0.14834517240524292,
"kl": 0.0002460479736328125,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0947,
"num_tokens": 11767841.0,
"reward": 0.1160714328289032,
"reward_std": 0.139630526304245,
"rewards/accuracy_reward/mean": 0.1160714253783226,
"rewards/accuracy_reward/std": 0.321027934551239,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3348214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2951.0,
"completions/mean_length": 1691.6563720703125,
"completions/mean_terminated_length": 996.8523559570312,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.00896124262564409,
"grad_norm": 0.06935401260852814,
"kl": 0.00021028518676757812,
"learning_rate": 8.940053768033608e-07,
"loss": 0.077,
"num_tokens": 12182348.0,
"reward": 0.1428571492433548,
"reward_std": 0.17555983364582062,
"rewards/accuracy_reward/mean": 0.14814814925193787,
"rewards/accuracy_reward/std": 0.35607197880744934,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2901785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3022.0,
"completions/mean_length": 1568.509033203125,
"completions/mean_terminated_length": 953.8742065429688,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.009259950713165559,
"grad_norm": 0.06759945303201675,
"kl": 0.0002391338348388672,
"learning_rate": 8.83022221559489e-07,
"loss": 0.0138,
"num_tokens": 12567686.0,
"reward": 0.1116071492433548,
"reward_std": 0.07350331544876099,
"rewards/accuracy_reward/mean": 0.1116071417927742,
"rewards/accuracy_reward/std": 0.31558772921562195,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2723214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2847.0,
"completions/mean_length": 1535.107177734375,
"completions/mean_terminated_length": 959.950927734375,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.009558658800687028,
"grad_norm": 0.07123875617980957,
"kl": 0.00024700164794921875,
"learning_rate": 8.71572412738697e-07,
"loss": 0.0526,
"num_tokens": 12953174.0,
"reward": 0.0892857164144516,
"reward_std": 0.11394162476062775,
"rewards/accuracy_reward/mean": 0.09259258955717087,
"rewards/accuracy_reward/std": 0.29053398966789246,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3035714285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 1552.263427734375,
"completions/mean_terminated_length": 889.8140869140625,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.009857366888208497,
"grad_norm": 0.05994417518377304,
"kl": 0.00028204917907714844,
"learning_rate": 8.596699001693255e-07,
"loss": 0.0441,
"num_tokens": 13335825.0,
"reward": 0.0892857164144516,
"reward_std": 0.10504640638828278,
"rewards/accuracy_reward/mean": 0.09259258955717087,
"rewards/accuracy_reward/std": 0.29053395986557007,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2767857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 1505.9910888671875,
"completions/mean_terminated_length": 906.6543579101562,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.010156074975729968,
"grad_norm": 0.06331104040145874,
"kl": 0.00026798248291015625,
"learning_rate": 8.473291852294986e-07,
"loss": 0.1067,
"num_tokens": 13708551.0,
"reward": 0.125,
"reward_std": 0.1584138572216034,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3314596116542816,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2946428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3050.0,
"completions/mean_length": 1632.8795166015625,
"completions/mean_terminated_length": 1031.7279052734375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.010454783063251438,
"grad_norm": 0.03785333409905434,
"kl": 0.0002741813659667969,
"learning_rate": 8.34565303179429e-07,
"loss": 0.0503,
"num_tokens": 14113332.0,
"reward": 0.0491071455180645,
"reward_std": 0.10010232776403427,
"rewards/accuracy_reward/mean": 0.0491071417927742,
"rewards/accuracy_reward/std": 0.21657568216323853,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 1601.3929443359375,
"completions/mean_terminated_length": 932.93505859375,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.010753491150772907,
"grad_norm": 0.07039965689182281,
"kl": 0.0002465248107910156,
"learning_rate": 8.213938048432696e-07,
"loss": 0.0907,
"num_tokens": 14508548.0,
"reward": 0.133928582072258,
"reward_std": 0.15811721980571747,
"rewards/accuracy_reward/mean": 0.1339285671710968,
"rewards/accuracy_reward/std": 0.3413383364677429,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2053571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2849.0,
"completions/mean_length": 1399.2188720703125,
"completions/mean_terminated_length": 966.927001953125,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.011052199238294378,
"grad_norm": 0.06250524520874023,
"kl": 0.00029087066650390625,
"learning_rate": 8.07830737662829e-07,
"loss": 0.0706,
"num_tokens": 14859805.0,
"reward": 0.125,
"reward_std": 0.133773535490036,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3314596116542816,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 1635.884033203125,
"completions/mean_terminated_length": 1061.4375,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.011350907325815847,
"grad_norm": 0.028273796662688255,
"kl": 0.0002434253692626953,
"learning_rate": 7.938926261462365e-07,
"loss": 0.0175,
"num_tokens": 15263779.0,
"reward": 0.0357142873108387,
"reward_std": 0.04764331132173538,
"rewards/accuracy_reward/mean": 0.0357142873108387,
"rewards/accuracy_reward/std": 0.18599249422550201,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2767857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2456.0,
"completions/mean_length": 1450.90185546875,
"completions/mean_terminated_length": 830.4815063476562,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.011649615413337316,
"grad_norm": 0.057446062564849854,
"kl": 0.0003275871276855469,
"learning_rate": 7.795964517353733e-07,
"loss": 0.0511,
"num_tokens": 15624173.0,
"reward": 0.0803571492433548,
"reward_std": 0.10851971060037613,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2589285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3027.0,
"completions/mean_length": 1514.7723388671875,
"completions/mean_terminated_length": 970.6806640625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.011948323500858785,
"grad_norm": 0.06986912339925766,
"kl": 0.0003490447998046875,
"learning_rate": 7.649596321166024e-07,
"loss": 0.0786,
"num_tokens": 16002314.0,
"reward": 0.133928582072258,
"reward_std": 0.17373399436473846,
"rewards/accuracy_reward/mean": 0.1339285671710968,
"rewards/accuracy_reward/std": 0.3413383364677429,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2544642857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3011.0,
"completions/mean_length": 1548.4732666015625,
"completions/mean_terminated_length": 1028.4671630859375,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.012247031588380256,
"grad_norm": 0.40365350246429443,
"kl": 0.0009794235229492188,
"learning_rate": 7.5e-07,
"loss": 0.0019,
"num_tokens": 16389036.0,
"reward": 0.0401785746216774,
"reward_std": 0.0689915269613266,
"rewards/accuracy_reward/mean": 0.0401785708963871,
"rewards/accuracy_reward/std": 0.19681765139102936,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1741071428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2763.0,
"completions/mean_length": 1306.65185546875,
"completions/mean_terminated_length": 934.497314453125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.012545739675901725,
"grad_norm": 0.04599951580166817,
"kl": 0.0003376007080078125,
"learning_rate": 7.347357813929454e-07,
"loss": 0.0255,
"num_tokens": 16717726.0,
"reward": 0.0580357164144516,
"reward_std": 0.0787569135427475,
"rewards/accuracy_reward/mean": 0.0580357126891613,
"rewards/accuracy_reward/std": 0.23433461785316467,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2946428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3070.0,
"completions/mean_length": 1608.08935546875,
"completions/mean_terminated_length": 996.582275390625,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.012844447763423195,
"grad_norm": 0.042162321507930756,
"kl": 0.0003190040588378906,
"learning_rate": 7.191855733945386e-07,
"loss": 0.0291,
"num_tokens": 17115178.0,
"reward": 0.0357142873108387,
"reward_std": 0.060876406729221344,
"rewards/accuracy_reward/mean": 0.0357142873108387,
"rewards/accuracy_reward/std": 0.18599249422550201,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2098214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2964.0,
"completions/mean_length": 1279.0848388671875,
"completions/mean_terminated_length": 803.0,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.013143155850944664,
"grad_norm": 0.08639585971832275,
"kl": 0.00032329559326171875,
"learning_rate": 7.033683215379002e-07,
"loss": 0.0171,
"num_tokens": 17437797.0,
"reward": 0.0803571492433548,
"reward_std": 0.14158472418785095,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 1557.5804443359375,
"completions/mean_terminated_length": 951.8125,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.013441863938466135,
"grad_norm": 0.0711236298084259,
"kl": 0.00033283233642578125,
"learning_rate": 6.87303296707956e-07,
"loss": 0.0433,
"num_tokens": 17823671.0,
"reward": 0.1428571492433548,
"reward_std": 0.14066898822784424,
"rewards/accuracy_reward/mean": 0.1428571492433548,
"rewards/accuracy_reward/std": 0.35071080923080444,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2767857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 1589.357177734375,
"completions/mean_terminated_length": 1021.9259033203125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.013740572025987604,
"grad_norm": 0.055200062692165375,
"kl": 0.0003032684326171875,
"learning_rate": 6.710100716628344e-07,
"loss": 0.0797,
"num_tokens": 18216415.0,
"reward": 0.098214291036129,
"reward_std": 0.13225442171096802,
"rewards/accuracy_reward/mean": 0.0982142835855484,
"rewards/accuracy_reward/std": 0.29827070236206055,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2678571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2984.0,
"completions/mean_length": 1515.5535888671875,
"completions/mean_terminated_length": 946.1218872070312,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.014039280113509073,
"grad_norm": 0.07808107137680054,
"kl": 0.00030994415283203125,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0337,
"num_tokens": 18590435.0,
"reward": 0.0848214328289032,
"reward_std": 0.10205654054880142,
"rewards/accuracy_reward/mean": 0.0848214253783226,
"rewards/accuracy_reward/std": 0.2792397737503052,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2633928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2920.0,
"completions/mean_length": 1490.2501220703125,
"completions/mean_terminated_length": 924.654541015625,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.014337988201030542,
"grad_norm": 0.06969325244426727,
"kl": 0.0003323554992675781,
"learning_rate": 6.378186779084995e-07,
"loss": 0.0863,
"num_tokens": 18963555.0,
"reward": 0.1383928656578064,
"reward_std": 0.16006861627101898,
"rewards/accuracy_reward/mean": 0.1383928507566452,
"rewards/accuracy_reward/std": 0.34608522057533264,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2276785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3034.0,
"completions/mean_length": 1513.2232666015625,
"completions/mean_terminated_length": 1053.6993408203125,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.014636696288552013,
"grad_norm": 0.0502396859228611,
"kl": 0.00036144256591796875,
"learning_rate": 6.209609477998338e-07,
"loss": 0.0331,
"num_tokens": 19336581.0,
"reward": 0.1116071492433548,
"reward_std": 0.10174980014562607,
"rewards/accuracy_reward/mean": 0.1116071417927742,
"rewards/accuracy_reward/std": 0.31558772921562195,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2142857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3013.0,
"completions/mean_length": 1356.0491943359375,
"completions/mean_terminated_length": 888.0625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.014935404376073482,
"grad_norm": 0.07083041965961456,
"kl": 0.0003476142883300781,
"learning_rate": 6.039558454088795e-07,
"loss": 0.0898,
"num_tokens": 19674280.0,
"reward": 0.1428571492433548,
"reward_std": 0.17659832537174225,
"rewards/accuracy_reward/mean": 0.1428571492433548,
"rewards/accuracy_reward/std": 0.35071080923080444,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2678571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3060.0,
"completions/mean_length": 1581.5045166015625,
"completions/mean_terminated_length": 1036.201171875,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.015234112463594952,
"grad_norm": 0.07027444243431091,
"kl": 0.00034427642822265625,
"learning_rate": 5.868240888334652e-07,
"loss": 0.0775,
"num_tokens": 20068041.0,
"reward": 0.165178582072258,
"reward_std": 0.163971409201622,
"rewards/accuracy_reward/mean": 0.1651785671710968,
"rewards/accuracy_reward/std": 0.37217333912849426,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2053571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2985.0,
"completions/mean_length": 1486.7366943359375,
"completions/mean_terminated_length": 1077.061767578125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.01553282055111642,
"grad_norm": 0.04818105697631836,
"kl": 0.00038242340087890625,
"learning_rate": 5.695865504800327e-07,
"loss": 0.0411,
"num_tokens": 20435622.0,
"reward": 0.0535714328289032,
"reward_std": 0.06673339754343033,
"rewards/accuracy_reward/mean": 0.0535714291036129,
"rewards/accuracy_reward/std": 0.2256743162870407,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2633928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3045.0,
"completions/mean_length": 1494.6429443359375,
"completions/mean_terminated_length": 930.6181640625,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.01583152863863789,
"grad_norm": 0.046202462166547775,
"kl": 0.00034809112548828125,
"learning_rate": 5.522642316338268e-07,
"loss": 0.0624,
"num_tokens": 20804750.0,
"reward": 0.0758928582072258,
"reward_std": 0.10100797563791275,
"rewards/accuracy_reward/mean": 0.0758928582072258,
"rewards/accuracy_reward/std": 0.26541972160339355,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1830357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3039.0,
"completions/mean_length": 1359.74560546875,
"completions/mean_terminated_length": 976.1256713867188,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.01613023672615936,
"grad_norm": 0.07204340398311615,
"kl": 0.0004181861877441406,
"learning_rate": 5.348782368720625e-07,
"loss": 0.0153,
"num_tokens": 21144645.0,
"reward": 0.0625,
"reward_std": 0.0978442057967186,
"rewards/accuracy_reward/mean": 0.0625,
"rewards/accuracy_reward/std": 0.2426035851240158,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3303571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 1705.763427734375,
"completions/mean_terminated_length": 1031.75341796875,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.016428944813680832,
"grad_norm": 0.029594114050269127,
"kl": 0.0003113746643066406,
"learning_rate": 5.174497483512505e-07,
"loss": 0.0516,
"num_tokens": 21564768.0,
"reward": 0.0446428582072258,
"reward_std": 0.06222161650657654,
"rewards/accuracy_reward/mean": 0.0446428582072258,
"rewards/accuracy_reward/std": 0.20698098838329315,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2276785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 1459.009033203125,
"completions/mean_terminated_length": 983.5028686523438,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.0167276529012023,
"grad_norm": 0.04346403852105141,
"kl": 0.0004267692565917969,
"learning_rate": 5e-07,
"loss": 0.0202,
"num_tokens": 21926530.0,
"reward": 0.098214291036129,
"reward_std": 0.0417863167822361,
"rewards/accuracy_reward/mean": 0.0982142835855484,
"rewards/accuracy_reward/std": 0.29827070236206055,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2589285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3001.0,
"completions/mean_length": 1461.7991943359375,
"completions/mean_terminated_length": 899.1987915039062,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.01702636098872377,
"grad_norm": 0.04805866628885269,
"kl": 0.0003714561462402344,
"learning_rate": 4.825502516487496e-07,
"loss": 0.0097,
"num_tokens": 22286485.0,
"reward": 0.0446428582072258,
"reward_std": 0.07740890979766846,
"rewards/accuracy_reward/mean": 0.0446428582072258,
"rewards/accuracy_reward/std": 0.20698098838329315,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2232142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3008.0,
"completions/mean_length": 1363.43310546875,
"completions/mean_terminated_length": 872.4655151367188,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.017325069076245238,
"grad_norm": 0.0761556327342987,
"kl": 0.00038814544677734375,
"learning_rate": 4.6512176312793735e-07,
"loss": 0.0167,
"num_tokens": 22629270.0,
"reward": 0.1160714328289032,
"reward_std": 0.13225442171096802,
"rewards/accuracy_reward/mean": 0.1160714253783226,
"rewards/accuracy_reward/std": 0.321027934551239,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2098214285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3065.0,
"completions/mean_length": 1447.0179443359375,
"completions/mean_terminated_length": 1015.5254516601562,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.01762377716376671,
"grad_norm": 0.08180122822523117,
"kl": 0.0004076957702636719,
"learning_rate": 4.477357683661733e-07,
"loss": 0.1042,
"num_tokens": 22988322.0,
"reward": 0.1473214328289032,
"reward_std": 0.2089243084192276,
"rewards/accuracy_reward/mean": 0.1473214328289032,
"rewards/accuracy_reward/std": 0.35521984100341797,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2142857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3057.0,
"completions/mean_length": 1429.5491943359375,
"completions/mean_terminated_length": 981.6079711914062,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.01792248525128818,
"grad_norm": 0.08614456653594971,
"kl": 0.0004153251647949219,
"learning_rate": 4.304134495199674e-07,
"loss": 0.1011,
"num_tokens": 23345885.0,
"reward": 0.1116071492433548,
"reward_std": 0.1331673562526703,
"rewards/accuracy_reward/mean": 0.1116071417927742,
"rewards/accuracy_reward/std": 0.31558772921562195,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2965.0,
"completions/mean_length": 1379.759033203125,
"completions/mean_terminated_length": 905.931396484375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.018221193338809647,
"grad_norm": 0.07557892054319382,
"kl": 0.0004553794860839844,
"learning_rate": 4.131759111665348e-07,
"loss": 0.0796,
"num_tokens": 23690391.0,
"reward": 0.1294642984867096,
"reward_std": 0.15225742757320404,
"rewards/accuracy_reward/mean": 0.1294642835855484,
"rewards/accuracy_reward/std": 0.3364649713039398,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2885.0,
"completions/mean_length": 1448.169677734375,
"completions/mean_terminated_length": 906.8928833007812,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.018519901426331118,
"grad_norm": 0.051801372319459915,
"kl": 0.00043392181396484375,
"learning_rate": 3.960441545911204e-07,
"loss": 0.046,
"num_tokens": 24052941.0,
"reward": 0.0758928582072258,
"reward_std": 0.09333522617816925,
"rewards/accuracy_reward/mean": 0.0758928582072258,
"rewards/accuracy_reward/std": 0.26541972160339355,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2321428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3043.0,
"completions/mean_length": 1465.83935546875,
"completions/mean_terminated_length": 980.2557983398438,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.01881860951385259,
"grad_norm": 0.11123108118772507,
"kl": 0.0003695487976074219,
"learning_rate": 3.790390522001662e-07,
"loss": 0.0854,
"num_tokens": 24414641.0,
"reward": 0.1696428656578064,
"reward_std": 0.15706866979599,
"rewards/accuracy_reward/mean": 0.1696428507566452,
"rewards/accuracy_reward/std": 0.37615931034088135,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3258928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2902.0,
"completions/mean_length": 1662.165283203125,
"completions/mean_terminated_length": 980.5894165039062,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.019117317601374056,
"grad_norm": 0.04303564503788948,
"kl": 0.0003743171691894531,
"learning_rate": 3.621813220915004e-07,
"loss": 0.0592,
"num_tokens": 24823998.0,
"reward": 0.0892857164144516,
"reward_std": 0.11181911826133728,
"rewards/accuracy_reward/mean": 0.0892857164144516,
"rewards/accuracy_reward/std": 0.28579434752464294,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2455357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2906.0,
"completions/mean_length": 1489.8482666015625,
"completions/mean_terminated_length": 974.94677734375,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.019416025688895527,
"grad_norm": 0.06886183470487595,
"kl": 0.000438690185546875,
"learning_rate": 3.454915028125263e-07,
"loss": 0.0292,
"num_tokens": 25193332.0,
"reward": 0.0803571492433548,
"reward_std": 0.11572191119194031,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2053571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3026.0,
"completions/mean_length": 1461.888427734375,
"completions/mean_terminated_length": 1045.7921142578125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.019714733776416995,
"grad_norm": 0.05718955025076866,
"kl": 0.00038433074951171875,
"learning_rate": 3.2898992833716563e-07,
"loss": 0.0252,
"num_tokens": 25563507.0,
"reward": 0.0803571492433548,
"reward_std": 0.11437670141458511,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2008928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2723.0,
"completions/mean_length": 1381.7098388671875,
"completions/mean_terminated_length": 956.7764892578125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.020013441863938466,
"grad_norm": 0.07317039370536804,
"kl": 0.0004258155822753906,
"learning_rate": 3.1269670329204393e-07,
"loss": 0.0637,
"num_tokens": 25908066.0,
"reward": 0.1383928656578064,
"reward_std": 0.148354634642601,
"rewards/accuracy_reward/mean": 0.1383928507566452,
"rewards/accuracy_reward/std": 0.34608522057533264,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2767857142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2957.0,
"completions/mean_length": 1621.2723388671875,
"completions/mean_terminated_length": 1066.0555419921875,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.020312149951459937,
"grad_norm": 0.05922617018222809,
"kl": 0.0003914833068847656,
"learning_rate": 2.9663167846209996e-07,
"loss": 0.062,
"num_tokens": 26307151.0,
"reward": 0.1160714328289032,
"reward_std": 0.1546439230442047,
"rewards/accuracy_reward/mean": 0.1160714253783226,
"rewards/accuracy_reward/std": 0.321027934551239,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2276785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3031.0,
"completions/mean_length": 1374.5179443359375,
"completions/mean_terminated_length": 874.10400390625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.020610858038981404,
"grad_norm": 0.04980211332440376,
"kl": 0.0004363059997558594,
"learning_rate": 2.808144266054612e-07,
"loss": 0.0071,
"num_tokens": 26653419.0,
"reward": 0.0401785746216774,
"reward_std": 0.06808140873908997,
"rewards/accuracy_reward/mean": 0.0401785708963871,
"rewards/accuracy_reward/std": 0.19681765139102936,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2053571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2942.0,
"completions/mean_length": 1441.7410888671875,
"completions/mean_terminated_length": 1020.438232421875,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.020909566126502875,
"grad_norm": 12926.7119140625,
"kl": 17.625338554382324,
"learning_rate": 2.6526421860705473e-07,
"loss": 0.8373,
"num_tokens": 27010817.0,
"reward": 0.196428582072258,
"reward_std": 0.20935659110546112,
"rewards/accuracy_reward/mean": 0.1964285671710968,
"rewards/accuracy_reward/std": 0.3981861472129822,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1964285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2713.0,
"completions/mean_length": 1265.977783203125,
"completions/mean_terminated_length": 824.5055541992188,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.021208274214024346,
"grad_norm": 0.03522089868783951,
"kl": 0.0004029273986816406,
"learning_rate": 2.500000000000001e-07,
"loss": 0.0525,
"num_tokens": 27329436.0,
"reward": 0.15625,
"reward_std": 0.06612721085548401,
"rewards/accuracy_reward/mean": 0.15625,
"rewards/accuracy_reward/std": 0.3639053702354431,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2868.0,
"completions/mean_length": 1227.24560546875,
"completions/mean_terminated_length": 801.532958984375,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.021506982301545814,
"grad_norm": 0.059741340577602386,
"kl": 0.00044918060302734375,
"learning_rate": 2.350403678833976e-07,
"loss": 0.0907,
"num_tokens": 27643499.0,
"reward": 0.125,
"reward_std": 0.14744171500205994,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3314596116542816,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2321428571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3000.0,
"completions/mean_length": 1325.4107666015625,
"completions/mean_terminated_length": 797.3720703125,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.021805690389067284,
"grad_norm": 0.15673233568668365,
"kl": 0.0005125999450683594,
"learning_rate": 2.2040354826462664e-07,
"loss": 0.138,
"num_tokens": 27977223.0,
"reward": 0.28125,
"reward_std": 0.19404374063014984,
"rewards/accuracy_reward/mean": 0.28125,
"rewards/accuracy_reward/std": 0.45061618089675903,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2410714285714286,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3061.0,
"completions/mean_length": 1447.68310546875,
"completions/mean_terminated_length": 931.7235717773438,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.022104398476588755,
"grad_norm": 0.0585806779563427,
"kl": 0.000385284423828125,
"learning_rate": 2.0610737385376348e-07,
"loss": 0.0809,
"num_tokens": 28339184.0,
"reward": 0.1116071492433548,
"reward_std": 0.14127518236637115,
"rewards/accuracy_reward/mean": 0.1116071417927742,
"rewards/accuracy_reward/std": 0.31558772921562195,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1964285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2937.0,
"completions/mean_length": 1326.0045166015625,
"completions/mean_terminated_length": 899.20556640625,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.022403106564110223,
"grad_norm": 0.042158063501119614,
"kl": 0.0004096031188964844,
"learning_rate": 1.9216926233717084e-07,
"loss": 0.0529,
"num_tokens": 28673249.0,
"reward": 0.1026785746216774,
"reward_std": 0.10309500992298126,
"rewards/accuracy_reward/mean": 0.1026785746216774,
"rewards/accuracy_reward/std": 0.3042183816432953,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3025.0,
"completions/mean_length": 1532.821533203125,
"completions/mean_terminated_length": 1019.7619018554688,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.022701814651631694,
"grad_norm": 0.05144224688410759,
"kl": 0.0004096031188964844,
"learning_rate": 1.7860619515673032e-07,
"loss": 0.0708,
"num_tokens": 29049481.0,
"reward": 0.0580357164144516,
"reward_std": 0.1079135313630104,
"rewards/accuracy_reward/mean": 0.0580357126891613,
"rewards/accuracy_reward/std": 0.23433461785316467,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2889.0,
"completions/mean_length": 1556.80810546875,
"completions/mean_terminated_length": 950.7312622070312,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.02300052273915316,
"grad_norm": 0.08849991112947464,
"kl": 0.0004596710205078125,
"learning_rate": 1.6543469682057104e-07,
"loss": 0.09,
"num_tokens": 29432390.0,
"reward": 0.1071428656578064,
"reward_std": 0.133773535490036,
"rewards/accuracy_reward/mean": 0.1071428582072258,
"rewards/accuracy_reward/std": 0.30998748540878296,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2455357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2941.0,
"completions/mean_length": 1509.65185546875,
"completions/mean_terminated_length": 1001.1952514648438,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.023299230826674632,
"grad_norm": 0.07996930927038193,
"kl": 0.00043010711669921875,
"learning_rate": 1.5267081477050131e-07,
"loss": 0.0554,
"num_tokens": 29806576.0,
"reward": 0.1473214328289032,
"reward_std": 0.15225742757320404,
"rewards/accuracy_reward/mean": 0.1527777761220932,
"rewards/accuracy_reward/std": 0.36060887575149536,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1964285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 1378.2679443359375,
"completions/mean_terminated_length": 964.2444458007812,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.023597938914196103,
"grad_norm": 0.058245718479156494,
"kl": 0.0004696846008300781,
"learning_rate": 1.4033009983067452e-07,
"loss": 0.0367,
"num_tokens": 30154956.0,
"reward": 0.0848214328289032,
"reward_std": 0.0913810282945633,
"rewards/accuracy_reward/mean": 0.0848214253783226,
"rewards/accuracy_reward/std": 0.27923980355262756,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3032.0,
"completions/mean_length": 1506.0982666015625,
"completions/mean_terminated_length": 984.1309814453125,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.02389664700171757,
"grad_norm": 0.047782279551029205,
"kl": 0.0004444122314453125,
"learning_rate": 1.284275872613028e-07,
"loss": 0.0613,
"num_tokens": 30530354.0,
"reward": 0.1116071492433548,
"reward_std": 0.10174980014562607,
"rewards/accuracy_reward/mean": 0.1116071417927742,
"rewards/accuracy_reward/std": 0.31558772921562195,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2366071428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 1465.12060546875,
"completions/mean_terminated_length": 967.0818481445312,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.02419535508923904,
"grad_norm": 0.04319946467876434,
"kl": 0.00041961669921875,
"learning_rate": 1.1697777844051104e-07,
"loss": 0.035,
"num_tokens": 30893933.0,
"reward": 0.0714285746216774,
"reward_std": 0.08070831745862961,
"rewards/accuracy_reward/mean": 0.0714285746216774,
"rewards/accuracy_reward/std": 0.2581161558628082,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2633928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3035.0,
"completions/mean_length": 1520.5535888671875,
"completions/mean_terminated_length": 965.7938842773438,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.024494063176760512,
"grad_norm": 0.0543820746243,
"kl": 0.00045108795166015625,
"learning_rate": 1.0599462319663904e-07,
"loss": 0.0783,
"num_tokens": 31275169.0,
"reward": 0.1071428656578064,
"reward_std": 0.10040179640054703,
"rewards/accuracy_reward/mean": 0.1071428582072258,
"rewards/accuracy_reward/std": 0.30998748540878296,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1741071428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2949.0,
"completions/mean_length": 1234.84375,
"completions/mean_terminated_length": 847.5513916015625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.02479277126428198,
"grad_norm": 0.07529015839099884,
"kl": 0.0005130767822265625,
"learning_rate": 9.549150281252632e-08,
"loss": -0.0128,
"num_tokens": 31593070.0,
"reward": 0.098214291036129,
"reward_std": 0.09528662264347076,
"rewards/accuracy_reward/mean": 0.0982142835855484,
"rewards/accuracy_reward/std": 0.29827070236206055,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1428571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3036.0,
"completions/mean_length": 1257.71875,
"completions/mean_terminated_length": 955.3385620117188,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.02509147935180345,
"grad_norm": 0.07526399195194244,
"kl": 0.0004706382751464844,
"learning_rate": 8.548121372247919e-08,
"loss": 0.021,
"num_tokens": 31910783.0,
"reward": 0.125,
"reward_std": 0.1480451077222824,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3314596116542816,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2802.0,
"completions/mean_length": 1385.544677734375,
"completions/mean_terminated_length": 913.337158203125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.02539018743932492,
"grad_norm": 0.08289183676242828,
"kl": 0.00047397613525390625,
"learning_rate": 7.597595192178702e-08,
"loss": 0.0353,
"num_tokens": 32255185.0,
"reward": 0.0758928582072258,
"reward_std": 0.09333522617816925,
"rewards/accuracy_reward/mean": 0.0758928582072258,
"rewards/accuracy_reward/std": 0.26541972160339355,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1428571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 1268.4910888671875,
"completions/mean_terminated_length": 967.90625,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.02568889552684639,
"grad_norm": 0.10912561416625977,
"kl": 0.0005125999450683594,
"learning_rate": 6.698729810778064e-08,
"loss": 0.0096,
"num_tokens": 32578655.0,
"reward": 0.0937500074505806,
"reward_std": 0.1361600160598755,
"rewards/accuracy_reward/mean": 0.09375,
"rewards/accuracy_reward/std": 0.2921334207057953,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2857142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2813.0,
"completions/mean_length": 1568.419677734375,
"completions/mean_terminated_length": 966.9874877929688,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.02598760361436786,
"grad_norm": 0.05263072997331619,
"kl": 0.00043487548828125,
"learning_rate": 5.8526203570536504e-08,
"loss": 0.041,
"num_tokens": 32962037.0,
"reward": 0.1071428656578064,
"reward_std": 0.1477484405040741,
"rewards/accuracy_reward/mean": 0.1071428582072258,
"rewards/accuracy_reward/std": 0.30998748540878296,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2232142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2898.0,
"completions/mean_length": 1397.7188720703125,
"completions/mean_terminated_length": 916.6034545898438,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.026286311701889328,
"grad_norm": 0.0686383992433548,
"kl": 0.0004343986511230469,
"learning_rate": 5.060297685041659e-08,
"loss": 0.0171,
"num_tokens": 33310942.0,
"reward": 0.0848214328289032,
"reward_std": 0.11498290300369263,
"rewards/accuracy_reward/mean": 0.0848214253783226,
"rewards/accuracy_reward/std": 0.2792397737503052,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1383928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2961.0,
"completions/mean_length": 1205.0179443359375,
"completions/mean_terminated_length": 905.139892578125,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0265850197894108,
"grad_norm": 0.06757447123527527,
"kl": 0.0004744529724121094,
"learning_rate": 4.322727117869951e-08,
"loss": 0.0952,
"num_tokens": 33623122.0,
"reward": 0.1517857164144516,
"reward_std": 0.14383558928966522,
"rewards/accuracy_reward/mean": 0.15740740299224854,
"rewards/accuracy_reward/std": 0.365030437707901,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2053571428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3054.0,
"completions/mean_length": 1399.7813720703125,
"completions/mean_terminated_length": 967.6348266601562,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.02688372787693227,
"grad_norm": 0.06952133029699326,
"kl": 0.0004596710205078125,
"learning_rate": 3.6408072716606345e-08,
"loss": 0.077,
"num_tokens": 33975369.0,
"reward": 0.1026785746216774,
"reward_std": 0.1513473093509674,
"rewards/accuracy_reward/mean": 0.1026785746216774,
"rewards/accuracy_reward/std": 0.3042184114456177,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2232142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2935.0,
"completions/mean_length": 1382.477783203125,
"completions/mean_terminated_length": 896.9827270507812,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.027182435964453737,
"grad_norm": 0.04722387343645096,
"kl": 0.00042724609375,
"learning_rate": 3.015368960704584e-08,
"loss": 0.0283,
"num_tokens": 34320844.0,
"reward": 0.0625,
"reward_std": 0.08942963182926178,
"rewards/accuracy_reward/mean": 0.0625,
"rewards/accuracy_reward/std": 0.2426035851240158,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3080357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3052.0,
"completions/mean_length": 1598.888427734375,
"completions/mean_terminated_length": 943.1160888671875,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.027481144051975208,
"grad_norm": 0.0545211099088192,
"kl": 0.0004267692565917969,
"learning_rate": 2.4471741852423233e-08,
"loss": 0.0618,
"num_tokens": 34720275.0,
"reward": 0.165178582072258,
"reward_std": 0.10461412370204926,
"rewards/accuracy_reward/mean": 0.1651785671710968,
"rewards/accuracy_reward/std": 0.37217333912849426,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2008928571428571,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3048.0,
"completions/mean_length": 1342.759033203125,
"completions/mean_terminated_length": 908.0335083007812,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.027779852139496675,
"grad_norm": 0.08833102881908417,
"kl": 0.0004458427429199219,
"learning_rate": 1.936915203084055e-08,
"loss": 0.1027,
"num_tokens": 35055293.0,
"reward": 0.2008928656578064,
"reward_std": 0.20320014655590057,
"rewards/accuracy_reward/mean": 0.2008928507566452,
"rewards/accuracy_reward/std": 0.40156546235084534,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2276785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2886.0,
"completions/mean_length": 1383.7232666015625,
"completions/mean_terminated_length": 886.0230712890625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.028078560227018146,
"grad_norm": 0.05874941125512123,
"kl": 0.00044918060302734375,
"learning_rate": 1.4852136862001763e-08,
"loss": 0.0177,
"num_tokens": 35402255.0,
"reward": 0.1160714328289032,
"reward_std": 0.07485132664442062,
"rewards/accuracy_reward/mean": 0.1160714253783226,
"rewards/accuracy_reward/std": 0.321027934551239,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2544642857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2787.0,
"completions/mean_length": 1439.9866943359375,
"completions/mean_terminated_length": 882.9521484375,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.028377268314539617,
"grad_norm": 0.06310974061489105,
"kl": 0.0004143714904785156,
"learning_rate": 1.0926199633097154e-08,
"loss": 0.0354,
"num_tokens": 35758540.0,
"reward": 0.0714285746216774,
"reward_std": 0.08131170272827148,
"rewards/accuracy_reward/mean": 0.0714285746216774,
"rewards/accuracy_reward/std": 0.2581161558628082,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2366071428571429,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2926.0,
"completions/mean_length": 1451.446533203125,
"completions/mean_terminated_length": 949.1696166992188,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.028675976402061085,
"grad_norm": 0.5383374094963074,
"kl": 0.0004506111145019531,
"learning_rate": 7.59612349389599e-09,
"loss": 0.0656,
"num_tokens": 36119232.0,
"reward": 0.0803571492433548,
"reward_std": 0.12054044008255005,
"rewards/accuracy_reward/mean": 0.0803571417927742,
"rewards/accuracy_reward/std": 0.2724541425704956,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1964285714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3056.0,
"completions/mean_length": 1336.75,
"completions/mean_terminated_length": 912.5778198242188,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.028974684489582556,
"grad_norm": 0.05063558369874954,
"kl": 0.0004563331604003906,
"learning_rate": 4.865965629214819e-09,
"loss": 0.0335,
"num_tokens": 36454472.0,
"reward": 0.0580357164144516,
"reward_std": 0.06478200107812881,
"rewards/accuracy_reward/mean": 0.0580357126891613,
"rewards/accuracy_reward/std": 0.23433460295200348,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2232142857142857,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3010.0,
"completions/mean_length": 1458.107177734375,
"completions/mean_terminated_length": 994.3448486328125,
"completions/min_length": 17.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.029273392577104027,
"grad_norm": 0.08015123009681702,
"kl": 0.00043582916259765625,
"learning_rate": 2.739052315863355e-09,
"loss": 0.0717,
"num_tokens": 36820608.0,
"reward": 0.1383928656578064,
"reward_std": 0.11632810533046722,
"rewards/accuracy_reward/mean": 0.1383928507566452,
"rewards/accuracy_reward/std": 0.34608522057533264,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2276785714285714,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3033.0,
"completions/mean_length": 1310.1741943359375,
"completions/mean_terminated_length": 790.7918701171875,
"completions/min_length": 53.0,
"completions/min_terminated_length": 53.0,
"epoch": 0.029572100664625494,
"grad_norm": 0.052152227610349655,
"kl": 0.0004668235778808594,
"learning_rate": 1.217974870087901e-09,
"loss": 0.0223,
"num_tokens": 37151047.0,
"reward": 0.1205357164144516,
"reward_std": 0.11047111451625824,
"rewards/accuracy_reward/mean": 0.1205357164144516,
"rewards/accuracy_reward/std": 0.32631614804267883,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2455357142857143,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2885.0,
"completions/mean_length": 1361.1473388671875,
"completions/mean_terminated_length": 804.3609619140625,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.029870808752146965,
"grad_norm": 0.08615954965353012,
"kl": 0.0004239082336425781,
"learning_rate": 3.0458649045211894e-10,
"loss": 0.1212,
"num_tokens": 37502112.0,
"reward": 0.165178582072258,
"reward_std": 0.19959399104118347,
"rewards/accuracy_reward/mean": 0.1651785671710968,
"rewards/accuracy_reward/std": 0.37217333912849426,
"step": 100
},
{
"epoch": 0.029870808752146965,
"step": 100,
"total_flos": 0.0,
"train_loss": 0.06171137083787471,
"train_runtime": 3149.8828,
"train_samples_per_second": 7.111,
"train_steps_per_second": 0.032
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 37502112,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}