|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.029870808752146965, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2946428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2953.0, |
|
"completions/mean_length": 1557.727783203125, |
|
"completions/mean_terminated_length": 925.18359375, |
|
"completions/min_length": 46.0, |
|
"completions/min_terminated_length": 46.0, |
|
"epoch": 0.00029870808752146963, |
|
"grad_norm": 0.08854348212480545, |
|
"kl": 0.0002200603485107422, |
|
"learning_rate": 0.0, |
|
"loss": 0.1118, |
|
"num_tokens": 384987.0, |
|
"reward": 0.1071428656578064, |
|
"reward_std": 0.15226024389266968, |
|
"rewards/accuracy_reward/mean": 0.1071428582072258, |
|
"rewards/accuracy_reward/std": 0.30998751521110535, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3839285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2925.0, |
|
"completions/mean_length": 1786.2501220703125, |
|
"completions/mean_terminated_length": 984.9855346679688, |
|
"completions/min_length": 57.0, |
|
"completions/min_terminated_length": 57.0, |
|
"epoch": 0.0005974161750429393, |
|
"grad_norm": 0.0577642060816288, |
|
"kl": 0.00018739700317382812, |
|
"learning_rate": 1e-07, |
|
"loss": 0.026, |
|
"num_tokens": 819379.0, |
|
"reward": 0.0535714328289032, |
|
"reward_std": 0.056364625692367554, |
|
"rewards/accuracy_reward/mean": 0.0535714291036129, |
|
"rewards/accuracy_reward/std": 0.2256743162870407, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3705357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3003.0, |
|
"completions/mean_length": 1748.2188720703125, |
|
"completions/mean_terminated_length": 968.9716186523438, |
|
"completions/min_length": 36.0, |
|
"completions/min_terminated_length": 36.0, |
|
"epoch": 0.0008961242625644089, |
|
"grad_norm": 0.0502021387219429, |
|
"kl": 0.0002162456512451172, |
|
"learning_rate": 2e-07, |
|
"loss": 0.0674, |
|
"num_tokens": 1249588.0, |
|
"reward": 0.0714285746216774, |
|
"reward_std": 0.10370119661092758, |
|
"rewards/accuracy_reward/mean": 0.0714285746216774, |
|
"rewards/accuracy_reward/std": 0.2581161558628082, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2633928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2901.0, |
|
"completions/mean_length": 1507.15185546875, |
|
"completions/mean_terminated_length": 947.5999755859375, |
|
"completions/min_length": 29.0, |
|
"completions/min_terminated_length": 29.0, |
|
"epoch": 0.0011948323500858785, |
|
"grad_norm": 0.05528466776013374, |
|
"kl": 0.0002148151397705078, |
|
"learning_rate": 3e-07, |
|
"loss": 0.0171, |
|
"num_tokens": 1627094.0, |
|
"reward": 0.0491071455180645, |
|
"reward_std": 0.07936029881238937, |
|
"rewards/accuracy_reward/mean": 0.0491071417927742, |
|
"rewards/accuracy_reward/std": 0.21657568216323853, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.34375, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3027.0, |
|
"completions/mean_length": 1706.196533203125, |
|
"completions/mean_terminated_length": 990.7755126953125, |
|
"completions/min_length": 30.0, |
|
"completions/min_terminated_length": 30.0, |
|
"epoch": 0.0014935404376073482, |
|
"grad_norm": 0.05157456174492836, |
|
"kl": 0.00016391277313232422, |
|
"learning_rate": 4e-07, |
|
"loss": 0.0591, |
|
"num_tokens": 2044514.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.11693429946899414, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3036.0, |
|
"completions/mean_length": 1646.27685546875, |
|
"completions/mean_terminated_length": 998.2207641601562, |
|
"completions/min_length": 65.0, |
|
"completions/min_terminated_length": 65.0, |
|
"epoch": 0.0017922485251288178, |
|
"grad_norm": 0.12111014127731323, |
|
"kl": 0.00024390220642089844, |
|
"learning_rate": 5e-07, |
|
"loss": 0.1069, |
|
"num_tokens": 2451360.0, |
|
"reward": 0.125, |
|
"reward_std": 0.14548751711845398, |
|
"rewards/accuracy_reward/mean": 0.12962962687015533, |
|
"rewards/accuracy_reward/std": 0.336675763130188, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2857142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2999.0, |
|
"completions/mean_length": 1562.5938720703125, |
|
"completions/mean_terminated_length": 958.8312377929688, |
|
"completions/min_length": 8.0, |
|
"completions/min_terminated_length": 8.0, |
|
"epoch": 0.0020909566126502874, |
|
"grad_norm": 0.05596686899662018, |
|
"kl": 0.00021219253540039062, |
|
"learning_rate": 6e-07, |
|
"loss": 0.0483, |
|
"num_tokens": 2843053.0, |
|
"reward": 0.066964291036129, |
|
"reward_std": 0.11663484573364258, |
|
"rewards/accuracy_reward/mean": 0.0669642835855484, |
|
"rewards/accuracy_reward/std": 0.2505199611186981, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3392857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3026.0, |
|
"completions/mean_length": 1740.3126220703125, |
|
"completions/mean_terminated_length": 1056.4730224609375, |
|
"completions/min_length": 46.0, |
|
"completions/min_terminated_length": 46.0, |
|
"epoch": 0.002389664700171757, |
|
"grad_norm": 0.07892504334449768, |
|
"kl": 0.00020313262939453125, |
|
"learning_rate": 7e-07, |
|
"loss": 0.013, |
|
"num_tokens": 3267483.0, |
|
"reward": 0.0580357164144516, |
|
"reward_std": 0.07350330799818039, |
|
"rewards/accuracy_reward/mean": 0.06018518656492233, |
|
"rewards/accuracy_reward/std": 0.23838205635547638, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2723214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2990.0, |
|
"completions/mean_length": 1496.6295166015625, |
|
"completions/mean_terminated_length": 907.0736083984375, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 0.0026883727876932267, |
|
"grad_norm": 0.07782948017120361, |
|
"kl": 0.0001881122589111328, |
|
"learning_rate": 8e-07, |
|
"loss": 0.0718, |
|
"num_tokens": 3639680.0, |
|
"reward": 0.1116071492433548, |
|
"reward_std": 0.14939311146736145, |
|
"rewards/accuracy_reward/mean": 0.1116071417927742, |
|
"rewards/accuracy_reward/std": 0.31558772921562195, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3348214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3015.0, |
|
"completions/mean_length": 1665.102783203125, |
|
"completions/mean_terminated_length": 956.932861328125, |
|
"completions/min_length": 29.0, |
|
"completions/min_terminated_length": 29.0, |
|
"epoch": 0.0029870808752146963, |
|
"grad_norm": 0.0631859079003334, |
|
"kl": 0.00020241737365722656, |
|
"learning_rate": 9e-07, |
|
"loss": 0.0873, |
|
"num_tokens": 4046735.0, |
|
"reward": 0.0937500074505806, |
|
"reward_std": 0.14353612065315247, |
|
"rewards/accuracy_reward/mean": 0.09375, |
|
"rewards/accuracy_reward/std": 0.2921334207057953, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3571428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2965.0, |
|
"completions/mean_length": 1728.6785888671875, |
|
"completions/mean_terminated_length": 982.388916015625, |
|
"completions/min_length": 34.0, |
|
"completions/min_terminated_length": 34.0, |
|
"epoch": 0.003285788962736166, |
|
"grad_norm": 0.053209614008665085, |
|
"kl": 0.00017750263214111328, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0142, |
|
"num_tokens": 4469191.0, |
|
"reward": 0.0401785746216774, |
|
"reward_std": 0.07350330799818039, |
|
"rewards/accuracy_reward/mean": 0.0401785708963871, |
|
"rewards/accuracy_reward/std": 0.19681765139102936, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3392857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3024.0, |
|
"completions/mean_length": 1664.825927734375, |
|
"completions/mean_terminated_length": 942.2230224609375, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.0035844970502576356, |
|
"grad_norm": 0.04564949870109558, |
|
"kl": 0.00018143653869628906, |
|
"learning_rate": 9.996954135095478e-07, |
|
"loss": 0.0166, |
|
"num_tokens": 4875240.0, |
|
"reward": 0.0357142873108387, |
|
"reward_std": 0.07289712876081467, |
|
"rewards/accuracy_reward/mean": 0.0357142873108387, |
|
"rewards/accuracy_reward/std": 0.18599249422550201, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2723214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3041.0, |
|
"completions/mean_length": 1547.21435546875, |
|
"completions/mean_terminated_length": 976.5889282226562, |
|
"completions/min_length": 9.0, |
|
"completions/min_terminated_length": 9.0, |
|
"epoch": 0.003883205137779105, |
|
"grad_norm": 0.04013432562351227, |
|
"kl": 0.00025653839111328125, |
|
"learning_rate": 9.98782025129912e-07, |
|
"loss": 0.0329, |
|
"num_tokens": 5256416.0, |
|
"reward": 0.0848214328289032, |
|
"reward_std": 0.07936029881238937, |
|
"rewards/accuracy_reward/mean": 0.0848214253783226, |
|
"rewards/accuracy_reward/std": 0.2792397737503052, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3883928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2960.0, |
|
"completions/mean_length": 1803.071533203125, |
|
"completions/mean_terminated_length": 997.2554931640625, |
|
"completions/min_length": 27.0, |
|
"completions/min_terminated_length": 27.0, |
|
"epoch": 0.004181913225300575, |
|
"grad_norm": 85.07632446289062, |
|
"kl": 0.2520885467529297, |
|
"learning_rate": 9.972609476841365e-07, |
|
"loss": 0.0295, |
|
"num_tokens": 5695616.0, |
|
"reward": 0.0446428582072258, |
|
"reward_std": 0.0417863167822361, |
|
"rewards/accuracy_reward/mean": 0.0446428582072258, |
|
"rewards/accuracy_reward/std": 0.20698098838329315, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3348214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3021.0, |
|
"completions/mean_length": 1632.0223388671875, |
|
"completions/mean_terminated_length": 907.2013549804688, |
|
"completions/min_length": 90.0, |
|
"completions/min_terminated_length": 90.0, |
|
"epoch": 0.004480621312822045, |
|
"grad_norm": 0.0776248499751091, |
|
"kl": 0.0002193450927734375, |
|
"learning_rate": 9.95134034370785e-07, |
|
"loss": 0.0752, |
|
"num_tokens": 6096357.0, |
|
"reward": 0.1205357164144516, |
|
"reward_std": 0.14683552086353302, |
|
"rewards/accuracy_reward/mean": 0.1205357164144516, |
|
"rewards/accuracy_reward/std": 0.32631614804267883, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2857142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3062.0, |
|
"completions/mean_length": 1543.3482666015625, |
|
"completions/mean_terminated_length": 931.8875122070312, |
|
"completions/min_length": 32.0, |
|
"completions/min_terminated_length": 32.0, |
|
"epoch": 0.004779329400343514, |
|
"grad_norm": 0.05424968898296356, |
|
"kl": 0.00020742416381835938, |
|
"learning_rate": 9.92403876506104e-07, |
|
"loss": 0.0186, |
|
"num_tokens": 6478547.0, |
|
"reward": 0.0491071455180645, |
|
"reward_std": 0.07936030626296997, |
|
"rewards/accuracy_reward/mean": 0.0491071417927742, |
|
"rewards/accuracy_reward/std": 0.21657569706439972, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3035714285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2991.0, |
|
"completions/mean_length": 1667.509033203125, |
|
"completions/mean_terminated_length": 1055.294921875, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.005078037487864984, |
|
"grad_norm": 0.07909969985485077, |
|
"kl": 0.00016796588897705078, |
|
"learning_rate": 9.890738003669027e-07, |
|
"loss": 0.1047, |
|
"num_tokens": 6889301.0, |
|
"reward": 0.15625, |
|
"reward_std": 0.15616022050380707, |
|
"rewards/accuracy_reward/mean": 0.15625, |
|
"rewards/accuracy_reward/std": 0.3639053702354431, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3169642857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3032.0, |
|
"completions/mean_length": 1650.3751220703125, |
|
"completions/mean_terminated_length": 990.6666870117188, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 0.005376745575386453, |
|
"grad_norm": 0.09321057796478271, |
|
"kl": 0.00021791458129882812, |
|
"learning_rate": 9.851478631379982e-07, |
|
"loss": 0.1088, |
|
"num_tokens": 7294537.0, |
|
"reward": 0.1607142984867096, |
|
"reward_std": 0.1844095140695572, |
|
"rewards/accuracy_reward/mean": 0.1607142835855484, |
|
"rewards/accuracy_reward/std": 0.3680897653102875, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.4776785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3022.0, |
|
"completions/mean_length": 2020.1876220703125, |
|
"completions/mean_terminated_length": 1058.2735595703125, |
|
"completions/min_length": 62.0, |
|
"completions/min_terminated_length": 62.0, |
|
"epoch": 0.0056754536629079234, |
|
"grad_norm": 0.06517499685287476, |
|
"kl": 0.0001583099365234375, |
|
"learning_rate": 9.806308479691594e-07, |
|
"loss": 0.0896, |
|
"num_tokens": 7782355.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.122494637966156, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2321428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3035.0, |
|
"completions/mean_length": 1522.8973388671875, |
|
"completions/mean_terminated_length": 1054.56396484375, |
|
"completions/min_length": 7.0, |
|
"completions/min_terminated_length": 7.0, |
|
"epoch": 0.005974161750429393, |
|
"grad_norm": 0.16769619286060333, |
|
"kl": 0.0007884502410888672, |
|
"learning_rate": 9.755282581475767e-07, |
|
"loss": 0.064, |
|
"num_tokens": 8161596.0, |
|
"reward": 0.1473214328289032, |
|
"reward_std": 0.163971409201622, |
|
"rewards/accuracy_reward/mean": 0.1473214328289032, |
|
"rewards/accuracy_reward/std": 0.35521984100341797, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3392857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3044.0, |
|
"completions/mean_length": 1634.37060546875, |
|
"completions/mean_terminated_length": 896.12841796875, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.006272869837950863, |
|
"grad_norm": 0.09035351872444153, |
|
"kl": 0.00022339820861816406, |
|
"learning_rate": 9.698463103929541e-07, |
|
"loss": 0.0312, |
|
"num_tokens": 8564495.0, |
|
"reward": 0.0535714328289032, |
|
"reward_std": 0.05050762742757797, |
|
"rewards/accuracy_reward/mean": 0.0535714291036129, |
|
"rewards/accuracy_reward/std": 0.2256743162870407, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3080357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3040.0, |
|
"completions/mean_length": 1619.3660888671875, |
|
"completions/mean_terminated_length": 972.7096557617188, |
|
"completions/min_length": 19.0, |
|
"completions/min_terminated_length": 19.0, |
|
"epoch": 0.006571577925472332, |
|
"grad_norm": 0.044720232486724854, |
|
"kl": 0.00024008750915527344, |
|
"learning_rate": 9.635919272833937e-07, |
|
"loss": 0.0381, |
|
"num_tokens": 8963833.0, |
|
"reward": 0.0401785746216774, |
|
"reward_std": 0.08222462981939316, |
|
"rewards/accuracy_reward/mean": 0.0401785708963871, |
|
"rewards/accuracy_reward/std": 0.19681765139102936, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3348214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2990.0, |
|
"completions/mean_length": 1700.1473388671875, |
|
"completions/mean_terminated_length": 1009.617431640625, |
|
"completions/min_length": 16.0, |
|
"completions/min_terminated_length": 16.0, |
|
"epoch": 0.006870286012993802, |
|
"grad_norm": 0.03696313127875328, |
|
"kl": 0.0001952648162841797, |
|
"learning_rate": 9.567727288213004e-07, |
|
"loss": 0.0385, |
|
"num_tokens": 9378978.0, |
|
"reward": 0.0223214291036129, |
|
"reward_std": 0.06313453614711761, |
|
"rewards/accuracy_reward/mean": 0.0223214291036129, |
|
"rewards/accuracy_reward/std": 0.14805756509304047, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2946428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3046.0, |
|
"completions/mean_length": 1579.0001220703125, |
|
"completions/mean_terminated_length": 955.341796875, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.007168994100515271, |
|
"grad_norm": 0.04915790259838104, |
|
"kl": 0.0002124309539794922, |
|
"learning_rate": 9.493970231495834e-07, |
|
"loss": 0.0689, |
|
"num_tokens": 9767946.0, |
|
"reward": 0.0937500074505806, |
|
"reward_std": 0.08747823536396027, |
|
"rewards/accuracy_reward/mean": 0.09375, |
|
"rewards/accuracy_reward/std": 0.2921334207057953, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3258928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2958.0, |
|
"completions/mean_length": 1655.321533203125, |
|
"completions/mean_terminated_length": 970.4370727539062, |
|
"completions/min_length": 69.0, |
|
"completions/min_terminated_length": 69.0, |
|
"epoch": 0.007467702188036741, |
|
"grad_norm": 0.08162926882505417, |
|
"kl": 0.00021219253540039062, |
|
"learning_rate": 9.414737964294634e-07, |
|
"loss": 0.106, |
|
"num_tokens": 10176490.0, |
|
"reward": 0.165178582072258, |
|
"reward_std": 0.2200292944908142, |
|
"rewards/accuracy_reward/mean": 0.1651785671710968, |
|
"rewards/accuracy_reward/std": 0.37217333912849426, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3571428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3072.0, |
|
"completions/mean_length": 1757.1474609375, |
|
"completions/mean_terminated_length": 1026.673583984375, |
|
"completions/min_length": 37.0, |
|
"completions/min_terminated_length": 37.0, |
|
"epoch": 0.00776641027555821, |
|
"grad_norm": 0.05597545579075813, |
|
"kl": 0.00016999244689941406, |
|
"learning_rate": 9.330127018922193e-07, |
|
"loss": 0.0569, |
|
"num_tokens": 10604371.0, |
|
"reward": 0.1205357164144516, |
|
"reward_std": 0.1474389135837555, |
|
"rewards/accuracy_reward/mean": 0.1205357164144516, |
|
"rewards/accuracy_reward/std": 0.32631614804267883, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3080357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2834.0, |
|
"completions/mean_length": 1574.700927734375, |
|
"completions/mean_terminated_length": 908.1612548828125, |
|
"completions/min_length": 15.0, |
|
"completions/min_terminated_length": 15.0, |
|
"epoch": 0.00806511836307968, |
|
"grad_norm": 0.09308373928070068, |
|
"kl": 0.00026679039001464844, |
|
"learning_rate": 9.240240480782129e-07, |
|
"loss": -0.0172, |
|
"num_tokens": 10994768.0, |
|
"reward": 0.0714285746216774, |
|
"reward_std": 0.08161844313144684, |
|
"rewards/accuracy_reward/mean": 0.0714285746216774, |
|
"rewards/accuracy_reward/std": 0.2581161558628082, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2991071428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2953.0, |
|
"completions/mean_length": 1551.4241943359375, |
|
"completions/mean_terminated_length": 902.5159301757812, |
|
"completions/min_length": 13.0, |
|
"completions/min_terminated_length": 13.0, |
|
"epoch": 0.00836382645060115, |
|
"grad_norm": 0.03204449638724327, |
|
"kl": 0.00022554397583007812, |
|
"learning_rate": 9.145187862775208e-07, |
|
"loss": 0.0183, |
|
"num_tokens": 11379607.0, |
|
"reward": 0.0223214291036129, |
|
"reward_std": 0.04569191485643387, |
|
"rewards/accuracy_reward/mean": 0.0223214291036129, |
|
"rewards/accuracy_reward/std": 0.14805756509304047, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3035714285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3051.0, |
|
"completions/mean_length": 1574.294677734375, |
|
"completions/mean_terminated_length": 921.44873046875, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.008662534538122619, |
|
"grad_norm": 0.14834517240524292, |
|
"kl": 0.0002460479736328125, |
|
"learning_rate": 9.045084971874737e-07, |
|
"loss": 0.0947, |
|
"num_tokens": 11767841.0, |
|
"reward": 0.1160714328289032, |
|
"reward_std": 0.139630526304245, |
|
"rewards/accuracy_reward/mean": 0.1160714253783226, |
|
"rewards/accuracy_reward/std": 0.321027934551239, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3348214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2951.0, |
|
"completions/mean_length": 1691.6563720703125, |
|
"completions/mean_terminated_length": 996.8523559570312, |
|
"completions/min_length": 55.0, |
|
"completions/min_terminated_length": 55.0, |
|
"epoch": 0.00896124262564409, |
|
"grad_norm": 0.06935401260852814, |
|
"kl": 0.00021028518676757812, |
|
"learning_rate": 8.940053768033608e-07, |
|
"loss": 0.077, |
|
"num_tokens": 12182348.0, |
|
"reward": 0.1428571492433548, |
|
"reward_std": 0.17555983364582062, |
|
"rewards/accuracy_reward/mean": 0.14814814925193787, |
|
"rewards/accuracy_reward/std": 0.35607197880744934, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2901785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3022.0, |
|
"completions/mean_length": 1568.509033203125, |
|
"completions/mean_terminated_length": 953.8742065429688, |
|
"completions/min_length": 14.0, |
|
"completions/min_terminated_length": 14.0, |
|
"epoch": 0.009259950713165559, |
|
"grad_norm": 0.06759945303201675, |
|
"kl": 0.0002391338348388672, |
|
"learning_rate": 8.83022221559489e-07, |
|
"loss": 0.0138, |
|
"num_tokens": 12567686.0, |
|
"reward": 0.1116071492433548, |
|
"reward_std": 0.07350331544876099, |
|
"rewards/accuracy_reward/mean": 0.1116071417927742, |
|
"rewards/accuracy_reward/std": 0.31558772921562195, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2723214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2847.0, |
|
"completions/mean_length": 1535.107177734375, |
|
"completions/mean_terminated_length": 959.950927734375, |
|
"completions/min_length": 7.0, |
|
"completions/min_terminated_length": 7.0, |
|
"epoch": 0.009558658800687028, |
|
"grad_norm": 0.07123875617980957, |
|
"kl": 0.00024700164794921875, |
|
"learning_rate": 8.71572412738697e-07, |
|
"loss": 0.0526, |
|
"num_tokens": 12953174.0, |
|
"reward": 0.0892857164144516, |
|
"reward_std": 0.11394162476062775, |
|
"rewards/accuracy_reward/mean": 0.09259258955717087, |
|
"rewards/accuracy_reward/std": 0.29053398966789246, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3035714285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3050.0, |
|
"completions/mean_length": 1552.263427734375, |
|
"completions/mean_terminated_length": 889.8140869140625, |
|
"completions/min_length": 7.0, |
|
"completions/min_terminated_length": 7.0, |
|
"epoch": 0.009857366888208497, |
|
"grad_norm": 0.05994417518377304, |
|
"kl": 0.00028204917907714844, |
|
"learning_rate": 8.596699001693255e-07, |
|
"loss": 0.0441, |
|
"num_tokens": 13335825.0, |
|
"reward": 0.0892857164144516, |
|
"reward_std": 0.10504640638828278, |
|
"rewards/accuracy_reward/mean": 0.09259258955717087, |
|
"rewards/accuracy_reward/std": 0.29053395986557007, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2767857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2990.0, |
|
"completions/mean_length": 1505.9910888671875, |
|
"completions/mean_terminated_length": 906.6543579101562, |
|
"completions/min_length": 20.0, |
|
"completions/min_terminated_length": 20.0, |
|
"epoch": 0.010156074975729968, |
|
"grad_norm": 0.06331104040145874, |
|
"kl": 0.00026798248291015625, |
|
"learning_rate": 8.473291852294986e-07, |
|
"loss": 0.1067, |
|
"num_tokens": 13708551.0, |
|
"reward": 0.125, |
|
"reward_std": 0.1584138572216034, |
|
"rewards/accuracy_reward/mean": 0.125, |
|
"rewards/accuracy_reward/std": 0.3314596116542816, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2946428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3050.0, |
|
"completions/mean_length": 1632.8795166015625, |
|
"completions/mean_terminated_length": 1031.7279052734375, |
|
"completions/min_length": 77.0, |
|
"completions/min_terminated_length": 77.0, |
|
"epoch": 0.010454783063251438, |
|
"grad_norm": 0.03785333409905434, |
|
"kl": 0.0002741813659667969, |
|
"learning_rate": 8.34565303179429e-07, |
|
"loss": 0.0503, |
|
"num_tokens": 14113332.0, |
|
"reward": 0.0491071455180645, |
|
"reward_std": 0.10010232776403427, |
|
"rewards/accuracy_reward/mean": 0.0491071417927742, |
|
"rewards/accuracy_reward/std": 0.21657568216323853, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3125, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3060.0, |
|
"completions/mean_length": 1601.3929443359375, |
|
"completions/mean_terminated_length": 932.93505859375, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.010753491150772907, |
|
"grad_norm": 0.07039965689182281, |
|
"kl": 0.0002465248107910156, |
|
"learning_rate": 8.213938048432696e-07, |
|
"loss": 0.0907, |
|
"num_tokens": 14508548.0, |
|
"reward": 0.133928582072258, |
|
"reward_std": 0.15811721980571747, |
|
"rewards/accuracy_reward/mean": 0.1339285671710968, |
|
"rewards/accuracy_reward/std": 0.3413383364677429, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2053571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2849.0, |
|
"completions/mean_length": 1399.2188720703125, |
|
"completions/mean_terminated_length": 966.927001953125, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 0.011052199238294378, |
|
"grad_norm": 0.06250524520874023, |
|
"kl": 0.00029087066650390625, |
|
"learning_rate": 8.07830737662829e-07, |
|
"loss": 0.0706, |
|
"num_tokens": 14859805.0, |
|
"reward": 0.125, |
|
"reward_std": 0.133773535490036, |
|
"rewards/accuracy_reward/mean": 0.125, |
|
"rewards/accuracy_reward/std": 0.3314596116542816, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2857142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3013.0, |
|
"completions/mean_length": 1635.884033203125, |
|
"completions/mean_terminated_length": 1061.4375, |
|
"completions/min_length": 23.0, |
|
"completions/min_terminated_length": 23.0, |
|
"epoch": 0.011350907325815847, |
|
"grad_norm": 0.028273796662688255, |
|
"kl": 0.0002434253692626953, |
|
"learning_rate": 7.938926261462365e-07, |
|
"loss": 0.0175, |
|
"num_tokens": 15263779.0, |
|
"reward": 0.0357142873108387, |
|
"reward_std": 0.04764331132173538, |
|
"rewards/accuracy_reward/mean": 0.0357142873108387, |
|
"rewards/accuracy_reward/std": 0.18599249422550201, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2767857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2456.0, |
|
"completions/mean_length": 1450.90185546875, |
|
"completions/mean_terminated_length": 830.4815063476562, |
|
"completions/min_length": 25.0, |
|
"completions/min_terminated_length": 25.0, |
|
"epoch": 0.011649615413337316, |
|
"grad_norm": 0.057446062564849854, |
|
"kl": 0.0003275871276855469, |
|
"learning_rate": 7.795964517353733e-07, |
|
"loss": 0.0511, |
|
"num_tokens": 15624173.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.10851971060037613, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2589285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3027.0, |
|
"completions/mean_length": 1514.7723388671875, |
|
"completions/mean_terminated_length": 970.6806640625, |
|
"completions/min_length": 31.0, |
|
"completions/min_terminated_length": 31.0, |
|
"epoch": 0.011948323500858785, |
|
"grad_norm": 0.06986912339925766, |
|
"kl": 0.0003490447998046875, |
|
"learning_rate": 7.649596321166024e-07, |
|
"loss": 0.0786, |
|
"num_tokens": 16002314.0, |
|
"reward": 0.133928582072258, |
|
"reward_std": 0.17373399436473846, |
|
"rewards/accuracy_reward/mean": 0.1339285671710968, |
|
"rewards/accuracy_reward/std": 0.3413383364677429, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2544642857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3011.0, |
|
"completions/mean_length": 1548.4732666015625, |
|
"completions/mean_terminated_length": 1028.4671630859375, |
|
"completions/min_length": 55.0, |
|
"completions/min_terminated_length": 55.0, |
|
"epoch": 0.012247031588380256, |
|
"grad_norm": 0.40365350246429443, |
|
"kl": 0.0009794235229492188, |
|
"learning_rate": 7.5e-07, |
|
"loss": 0.0019, |
|
"num_tokens": 16389036.0, |
|
"reward": 0.0401785746216774, |
|
"reward_std": 0.0689915269613266, |
|
"rewards/accuracy_reward/mean": 0.0401785708963871, |
|
"rewards/accuracy_reward/std": 0.19681765139102936, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1741071428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2763.0, |
|
"completions/mean_length": 1306.65185546875, |
|
"completions/mean_terminated_length": 934.497314453125, |
|
"completions/min_length": 34.0, |
|
"completions/min_terminated_length": 34.0, |
|
"epoch": 0.012545739675901725, |
|
"grad_norm": 0.04599951580166817, |
|
"kl": 0.0003376007080078125, |
|
"learning_rate": 7.347357813929454e-07, |
|
"loss": 0.0255, |
|
"num_tokens": 16717726.0, |
|
"reward": 0.0580357164144516, |
|
"reward_std": 0.0787569135427475, |
|
"rewards/accuracy_reward/mean": 0.0580357126891613, |
|
"rewards/accuracy_reward/std": 0.23433461785316467, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2946428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3070.0, |
|
"completions/mean_length": 1608.08935546875, |
|
"completions/mean_terminated_length": 996.582275390625, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 0.012844447763423195, |
|
"grad_norm": 0.042162321507930756, |
|
"kl": 0.0003190040588378906, |
|
"learning_rate": 7.191855733945386e-07, |
|
"loss": 0.0291, |
|
"num_tokens": 17115178.0, |
|
"reward": 0.0357142873108387, |
|
"reward_std": 0.060876406729221344, |
|
"rewards/accuracy_reward/mean": 0.0357142873108387, |
|
"rewards/accuracy_reward/std": 0.18599249422550201, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2098214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2964.0, |
|
"completions/mean_length": 1279.0848388671875, |
|
"completions/mean_terminated_length": 803.0, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.013143155850944664, |
|
"grad_norm": 0.08639585971832275, |
|
"kl": 0.00032329559326171875, |
|
"learning_rate": 7.033683215379002e-07, |
|
"loss": 0.0171, |
|
"num_tokens": 17437797.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.14158472418785095, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2857142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3066.0, |
|
"completions/mean_length": 1557.5804443359375, |
|
"completions/mean_terminated_length": 951.8125, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.013441863938466135, |
|
"grad_norm": 0.0711236298084259, |
|
"kl": 0.00033283233642578125, |
|
"learning_rate": 6.87303296707956e-07, |
|
"loss": 0.0433, |
|
"num_tokens": 17823671.0, |
|
"reward": 0.1428571492433548, |
|
"reward_std": 0.14066898822784424, |
|
"rewards/accuracy_reward/mean": 0.1428571492433548, |
|
"rewards/accuracy_reward/std": 0.35071080923080444, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2767857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3038.0, |
|
"completions/mean_length": 1589.357177734375, |
|
"completions/mean_terminated_length": 1021.9259033203125, |
|
"completions/min_length": 6.0, |
|
"completions/min_terminated_length": 6.0, |
|
"epoch": 0.013740572025987604, |
|
"grad_norm": 0.055200062692165375, |
|
"kl": 0.0003032684326171875, |
|
"learning_rate": 6.710100716628344e-07, |
|
"loss": 0.0797, |
|
"num_tokens": 18216415.0, |
|
"reward": 0.098214291036129, |
|
"reward_std": 0.13225442171096802, |
|
"rewards/accuracy_reward/mean": 0.0982142835855484, |
|
"rewards/accuracy_reward/std": 0.29827070236206055, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2678571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2984.0, |
|
"completions/mean_length": 1515.5535888671875, |
|
"completions/mean_terminated_length": 946.1218872070312, |
|
"completions/min_length": 33.0, |
|
"completions/min_terminated_length": 33.0, |
|
"epoch": 0.014039280113509073, |
|
"grad_norm": 0.07808107137680054, |
|
"kl": 0.00030994415283203125, |
|
"learning_rate": 6.545084971874736e-07, |
|
"loss": 0.0337, |
|
"num_tokens": 18590435.0, |
|
"reward": 0.0848214328289032, |
|
"reward_std": 0.10205654054880142, |
|
"rewards/accuracy_reward/mean": 0.0848214253783226, |
|
"rewards/accuracy_reward/std": 0.2792397737503052, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2633928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2920.0, |
|
"completions/mean_length": 1490.2501220703125, |
|
"completions/mean_terminated_length": 924.654541015625, |
|
"completions/min_length": 8.0, |
|
"completions/min_terminated_length": 8.0, |
|
"epoch": 0.014337988201030542, |
|
"grad_norm": 0.06969325244426727, |
|
"kl": 0.0003323554992675781, |
|
"learning_rate": 6.378186779084995e-07, |
|
"loss": 0.0863, |
|
"num_tokens": 18963555.0, |
|
"reward": 0.1383928656578064, |
|
"reward_std": 0.16006861627101898, |
|
"rewards/accuracy_reward/mean": 0.1383928507566452, |
|
"rewards/accuracy_reward/std": 0.34608522057533264, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2276785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3034.0, |
|
"completions/mean_length": 1513.2232666015625, |
|
"completions/mean_terminated_length": 1053.6993408203125, |
|
"completions/min_length": 39.0, |
|
"completions/min_terminated_length": 39.0, |
|
"epoch": 0.014636696288552013, |
|
"grad_norm": 0.0502396859228611, |
|
"kl": 0.00036144256591796875, |
|
"learning_rate": 6.209609477998338e-07, |
|
"loss": 0.0331, |
|
"num_tokens": 19336581.0, |
|
"reward": 0.1116071492433548, |
|
"reward_std": 0.10174980014562607, |
|
"rewards/accuracy_reward/mean": 0.1116071417927742, |
|
"rewards/accuracy_reward/std": 0.31558772921562195, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2142857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3013.0, |
|
"completions/mean_length": 1356.0491943359375, |
|
"completions/mean_terminated_length": 888.0625, |
|
"completions/min_length": 14.0, |
|
"completions/min_terminated_length": 14.0, |
|
"epoch": 0.014935404376073482, |
|
"grad_norm": 0.07083041965961456, |
|
"kl": 0.0003476142883300781, |
|
"learning_rate": 6.039558454088795e-07, |
|
"loss": 0.0898, |
|
"num_tokens": 19674280.0, |
|
"reward": 0.1428571492433548, |
|
"reward_std": 0.17659832537174225, |
|
"rewards/accuracy_reward/mean": 0.1428571492433548, |
|
"rewards/accuracy_reward/std": 0.35071080923080444, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2678571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3060.0, |
|
"completions/mean_length": 1581.5045166015625, |
|
"completions/mean_terminated_length": 1036.201171875, |
|
"completions/min_length": 24.0, |
|
"completions/min_terminated_length": 24.0, |
|
"epoch": 0.015234112463594952, |
|
"grad_norm": 0.07027444243431091, |
|
"kl": 0.00034427642822265625, |
|
"learning_rate": 5.868240888334652e-07, |
|
"loss": 0.0775, |
|
"num_tokens": 20068041.0, |
|
"reward": 0.165178582072258, |
|
"reward_std": 0.163971409201622, |
|
"rewards/accuracy_reward/mean": 0.1651785671710968, |
|
"rewards/accuracy_reward/std": 0.37217333912849426, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2053571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2985.0, |
|
"completions/mean_length": 1486.7366943359375, |
|
"completions/mean_terminated_length": 1077.061767578125, |
|
"completions/min_length": 27.0, |
|
"completions/min_terminated_length": 27.0, |
|
"epoch": 0.01553282055111642, |
|
"grad_norm": 0.04818105697631836, |
|
"kl": 0.00038242340087890625, |
|
"learning_rate": 5.695865504800327e-07, |
|
"loss": 0.0411, |
|
"num_tokens": 20435622.0, |
|
"reward": 0.0535714328289032, |
|
"reward_std": 0.06673339754343033, |
|
"rewards/accuracy_reward/mean": 0.0535714291036129, |
|
"rewards/accuracy_reward/std": 0.2256743162870407, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2633928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3045.0, |
|
"completions/mean_length": 1494.6429443359375, |
|
"completions/mean_terminated_length": 930.6181640625, |
|
"completions/min_length": 55.0, |
|
"completions/min_terminated_length": 55.0, |
|
"epoch": 0.01583152863863789, |
|
"grad_norm": 0.046202462166547775, |
|
"kl": 0.00034809112548828125, |
|
"learning_rate": 5.522642316338268e-07, |
|
"loss": 0.0624, |
|
"num_tokens": 20804750.0, |
|
"reward": 0.0758928582072258, |
|
"reward_std": 0.10100797563791275, |
|
"rewards/accuracy_reward/mean": 0.0758928582072258, |
|
"rewards/accuracy_reward/std": 0.26541972160339355, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1830357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3039.0, |
|
"completions/mean_length": 1359.74560546875, |
|
"completions/mean_terminated_length": 976.1256713867188, |
|
"completions/min_length": 9.0, |
|
"completions/min_terminated_length": 9.0, |
|
"epoch": 0.01613023672615936, |
|
"grad_norm": 0.07204340398311615, |
|
"kl": 0.0004181861877441406, |
|
"learning_rate": 5.348782368720625e-07, |
|
"loss": 0.0153, |
|
"num_tokens": 21144645.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.0978442057967186, |
|
"rewards/accuracy_reward/mean": 0.0625, |
|
"rewards/accuracy_reward/std": 0.2426035851240158, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3303571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2961.0, |
|
"completions/mean_length": 1705.763427734375, |
|
"completions/mean_terminated_length": 1031.75341796875, |
|
"completions/min_length": 14.0, |
|
"completions/min_terminated_length": 14.0, |
|
"epoch": 0.016428944813680832, |
|
"grad_norm": 0.029594114050269127, |
|
"kl": 0.0003113746643066406, |
|
"learning_rate": 5.174497483512505e-07, |
|
"loss": 0.0516, |
|
"num_tokens": 21564768.0, |
|
"reward": 0.0446428582072258, |
|
"reward_std": 0.06222161650657654, |
|
"rewards/accuracy_reward/mean": 0.0446428582072258, |
|
"rewards/accuracy_reward/std": 0.20698098838329315, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2276785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3035.0, |
|
"completions/mean_length": 1459.009033203125, |
|
"completions/mean_terminated_length": 983.5028686523438, |
|
"completions/min_length": 59.0, |
|
"completions/min_terminated_length": 59.0, |
|
"epoch": 0.0167276529012023, |
|
"grad_norm": 0.04346403852105141, |
|
"kl": 0.0004267692565917969, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0202, |
|
"num_tokens": 21926530.0, |
|
"reward": 0.098214291036129, |
|
"reward_std": 0.0417863167822361, |
|
"rewards/accuracy_reward/mean": 0.0982142835855484, |
|
"rewards/accuracy_reward/std": 0.29827070236206055, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2589285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3001.0, |
|
"completions/mean_length": 1461.7991943359375, |
|
"completions/mean_terminated_length": 899.1987915039062, |
|
"completions/min_length": 5.0, |
|
"completions/min_terminated_length": 5.0, |
|
"epoch": 0.01702636098872377, |
|
"grad_norm": 0.04805866628885269, |
|
"kl": 0.0003714561462402344, |
|
"learning_rate": 4.825502516487496e-07, |
|
"loss": 0.0097, |
|
"num_tokens": 22286485.0, |
|
"reward": 0.0446428582072258, |
|
"reward_std": 0.07740890979766846, |
|
"rewards/accuracy_reward/mean": 0.0446428582072258, |
|
"rewards/accuracy_reward/std": 0.20698098838329315, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2232142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3008.0, |
|
"completions/mean_length": 1363.43310546875, |
|
"completions/mean_terminated_length": 872.4655151367188, |
|
"completions/min_length": 6.0, |
|
"completions/min_terminated_length": 6.0, |
|
"epoch": 0.017325069076245238, |
|
"grad_norm": 0.0761556327342987, |
|
"kl": 0.00038814544677734375, |
|
"learning_rate": 4.6512176312793735e-07, |
|
"loss": 0.0167, |
|
"num_tokens": 22629270.0, |
|
"reward": 0.1160714328289032, |
|
"reward_std": 0.13225442171096802, |
|
"rewards/accuracy_reward/mean": 0.1160714253783226, |
|
"rewards/accuracy_reward/std": 0.321027934551239, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2098214285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3065.0, |
|
"completions/mean_length": 1447.0179443359375, |
|
"completions/mean_terminated_length": 1015.5254516601562, |
|
"completions/min_length": 41.0, |
|
"completions/min_terminated_length": 41.0, |
|
"epoch": 0.01762377716376671, |
|
"grad_norm": 0.08180122822523117, |
|
"kl": 0.0004076957702636719, |
|
"learning_rate": 4.477357683661733e-07, |
|
"loss": 0.1042, |
|
"num_tokens": 22988322.0, |
|
"reward": 0.1473214328289032, |
|
"reward_std": 0.2089243084192276, |
|
"rewards/accuracy_reward/mean": 0.1473214328289032, |
|
"rewards/accuracy_reward/std": 0.35521984100341797, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2142857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3057.0, |
|
"completions/mean_length": 1429.5491943359375, |
|
"completions/mean_terminated_length": 981.6079711914062, |
|
"completions/min_length": 59.0, |
|
"completions/min_terminated_length": 59.0, |
|
"epoch": 0.01792248525128818, |
|
"grad_norm": 0.08614456653594971, |
|
"kl": 0.0004153251647949219, |
|
"learning_rate": 4.304134495199674e-07, |
|
"loss": 0.1011, |
|
"num_tokens": 23345885.0, |
|
"reward": 0.1116071492433548, |
|
"reward_std": 0.1331673562526703, |
|
"rewards/accuracy_reward/mean": 0.1116071417927742, |
|
"rewards/accuracy_reward/std": 0.31558772921562195, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2965.0, |
|
"completions/mean_length": 1379.759033203125, |
|
"completions/mean_terminated_length": 905.931396484375, |
|
"completions/min_length": 2.0, |
|
"completions/min_terminated_length": 2.0, |
|
"epoch": 0.018221193338809647, |
|
"grad_norm": 0.07557892054319382, |
|
"kl": 0.0004553794860839844, |
|
"learning_rate": 4.131759111665348e-07, |
|
"loss": 0.0796, |
|
"num_tokens": 23690391.0, |
|
"reward": 0.1294642984867096, |
|
"reward_std": 0.15225742757320404, |
|
"rewards/accuracy_reward/mean": 0.1294642835855484, |
|
"rewards/accuracy_reward/std": 0.3364649713039398, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2885.0, |
|
"completions/mean_length": 1448.169677734375, |
|
"completions/mean_terminated_length": 906.8928833007812, |
|
"completions/min_length": 33.0, |
|
"completions/min_terminated_length": 33.0, |
|
"epoch": 0.018519901426331118, |
|
"grad_norm": 0.051801372319459915, |
|
"kl": 0.00043392181396484375, |
|
"learning_rate": 3.960441545911204e-07, |
|
"loss": 0.046, |
|
"num_tokens": 24052941.0, |
|
"reward": 0.0758928582072258, |
|
"reward_std": 0.09333522617816925, |
|
"rewards/accuracy_reward/mean": 0.0758928582072258, |
|
"rewards/accuracy_reward/std": 0.26541972160339355, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2321428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3043.0, |
|
"completions/mean_length": 1465.83935546875, |
|
"completions/mean_terminated_length": 980.2557983398438, |
|
"completions/min_length": 27.0, |
|
"completions/min_terminated_length": 27.0, |
|
"epoch": 0.01881860951385259, |
|
"grad_norm": 0.11123108118772507, |
|
"kl": 0.0003695487976074219, |
|
"learning_rate": 3.790390522001662e-07, |
|
"loss": 0.0854, |
|
"num_tokens": 24414641.0, |
|
"reward": 0.1696428656578064, |
|
"reward_std": 0.15706866979599, |
|
"rewards/accuracy_reward/mean": 0.1696428507566452, |
|
"rewards/accuracy_reward/std": 0.37615931034088135, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3258928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2902.0, |
|
"completions/mean_length": 1662.165283203125, |
|
"completions/mean_terminated_length": 980.5894165039062, |
|
"completions/min_length": 12.0, |
|
"completions/min_terminated_length": 12.0, |
|
"epoch": 0.019117317601374056, |
|
"grad_norm": 0.04303564503788948, |
|
"kl": 0.0003743171691894531, |
|
"learning_rate": 3.621813220915004e-07, |
|
"loss": 0.0592, |
|
"num_tokens": 24823998.0, |
|
"reward": 0.0892857164144516, |
|
"reward_std": 0.11181911826133728, |
|
"rewards/accuracy_reward/mean": 0.0892857164144516, |
|
"rewards/accuracy_reward/std": 0.28579434752464294, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2455357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2906.0, |
|
"completions/mean_length": 1489.8482666015625, |
|
"completions/mean_terminated_length": 974.94677734375, |
|
"completions/min_length": 36.0, |
|
"completions/min_terminated_length": 36.0, |
|
"epoch": 0.019416025688895527, |
|
"grad_norm": 0.06886183470487595, |
|
"kl": 0.000438690185546875, |
|
"learning_rate": 3.454915028125263e-07, |
|
"loss": 0.0292, |
|
"num_tokens": 25193332.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.11572191119194031, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2053571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3026.0, |
|
"completions/mean_length": 1461.888427734375, |
|
"completions/mean_terminated_length": 1045.7921142578125, |
|
"completions/min_length": 2.0, |
|
"completions/min_terminated_length": 2.0, |
|
"epoch": 0.019714733776416995, |
|
"grad_norm": 0.05718955025076866, |
|
"kl": 0.00038433074951171875, |
|
"learning_rate": 3.2898992833716563e-07, |
|
"loss": 0.0252, |
|
"num_tokens": 25563507.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.11437670141458511, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2008928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2723.0, |
|
"completions/mean_length": 1381.7098388671875, |
|
"completions/mean_terminated_length": 956.7764892578125, |
|
"completions/min_length": 34.0, |
|
"completions/min_terminated_length": 34.0, |
|
"epoch": 0.020013441863938466, |
|
"grad_norm": 0.07317039370536804, |
|
"kl": 0.0004258155822753906, |
|
"learning_rate": 3.1269670329204393e-07, |
|
"loss": 0.0637, |
|
"num_tokens": 25908066.0, |
|
"reward": 0.1383928656578064, |
|
"reward_std": 0.148354634642601, |
|
"rewards/accuracy_reward/mean": 0.1383928507566452, |
|
"rewards/accuracy_reward/std": 0.34608522057533264, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2767857142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2957.0, |
|
"completions/mean_length": 1621.2723388671875, |
|
"completions/mean_terminated_length": 1066.0555419921875, |
|
"completions/min_length": 46.0, |
|
"completions/min_terminated_length": 46.0, |
|
"epoch": 0.020312149951459937, |
|
"grad_norm": 0.05922617018222809, |
|
"kl": 0.0003914833068847656, |
|
"learning_rate": 2.9663167846209996e-07, |
|
"loss": 0.062, |
|
"num_tokens": 26307151.0, |
|
"reward": 0.1160714328289032, |
|
"reward_std": 0.1546439230442047, |
|
"rewards/accuracy_reward/mean": 0.1160714253783226, |
|
"rewards/accuracy_reward/std": 0.321027934551239, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2276785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3031.0, |
|
"completions/mean_length": 1374.5179443359375, |
|
"completions/mean_terminated_length": 874.10400390625, |
|
"completions/min_length": 16.0, |
|
"completions/min_terminated_length": 16.0, |
|
"epoch": 0.020610858038981404, |
|
"grad_norm": 0.04980211332440376, |
|
"kl": 0.0004363059997558594, |
|
"learning_rate": 2.808144266054612e-07, |
|
"loss": 0.0071, |
|
"num_tokens": 26653419.0, |
|
"reward": 0.0401785746216774, |
|
"reward_std": 0.06808140873908997, |
|
"rewards/accuracy_reward/mean": 0.0401785708963871, |
|
"rewards/accuracy_reward/std": 0.19681765139102936, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2053571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2942.0, |
|
"completions/mean_length": 1441.7410888671875, |
|
"completions/mean_terminated_length": 1020.438232421875, |
|
"completions/min_length": 59.0, |
|
"completions/min_terminated_length": 59.0, |
|
"epoch": 0.020909566126502875, |
|
"grad_norm": 12926.7119140625, |
|
"kl": 17.625338554382324, |
|
"learning_rate": 2.6526421860705473e-07, |
|
"loss": 0.8373, |
|
"num_tokens": 27010817.0, |
|
"reward": 0.196428582072258, |
|
"reward_std": 0.20935659110546112, |
|
"rewards/accuracy_reward/mean": 0.1964285671710968, |
|
"rewards/accuracy_reward/std": 0.3981861472129822, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1964285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2713.0, |
|
"completions/mean_length": 1265.977783203125, |
|
"completions/mean_terminated_length": 824.5055541992188, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.021208274214024346, |
|
"grad_norm": 0.03522089868783951, |
|
"kl": 0.0004029273986816406, |
|
"learning_rate": 2.500000000000001e-07, |
|
"loss": 0.0525, |
|
"num_tokens": 27329436.0, |
|
"reward": 0.15625, |
|
"reward_std": 0.06612721085548401, |
|
"rewards/accuracy_reward/mean": 0.15625, |
|
"rewards/accuracy_reward/std": 0.3639053702354431, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1875, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2868.0, |
|
"completions/mean_length": 1227.24560546875, |
|
"completions/mean_terminated_length": 801.532958984375, |
|
"completions/min_length": 71.0, |
|
"completions/min_terminated_length": 71.0, |
|
"epoch": 0.021506982301545814, |
|
"grad_norm": 0.059741340577602386, |
|
"kl": 0.00044918060302734375, |
|
"learning_rate": 2.350403678833976e-07, |
|
"loss": 0.0907, |
|
"num_tokens": 27643499.0, |
|
"reward": 0.125, |
|
"reward_std": 0.14744171500205994, |
|
"rewards/accuracy_reward/mean": 0.125, |
|
"rewards/accuracy_reward/std": 0.3314596116542816, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2321428571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3000.0, |
|
"completions/mean_length": 1325.4107666015625, |
|
"completions/mean_terminated_length": 797.3720703125, |
|
"completions/min_length": 8.0, |
|
"completions/min_terminated_length": 8.0, |
|
"epoch": 0.021805690389067284, |
|
"grad_norm": 0.15673233568668365, |
|
"kl": 0.0005125999450683594, |
|
"learning_rate": 2.2040354826462664e-07, |
|
"loss": 0.138, |
|
"num_tokens": 27977223.0, |
|
"reward": 0.28125, |
|
"reward_std": 0.19404374063014984, |
|
"rewards/accuracy_reward/mean": 0.28125, |
|
"rewards/accuracy_reward/std": 0.45061618089675903, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2410714285714286, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3061.0, |
|
"completions/mean_length": 1447.68310546875, |
|
"completions/mean_terminated_length": 931.7235717773438, |
|
"completions/min_length": 32.0, |
|
"completions/min_terminated_length": 32.0, |
|
"epoch": 0.022104398476588755, |
|
"grad_norm": 0.0585806779563427, |
|
"kl": 0.000385284423828125, |
|
"learning_rate": 2.0610737385376348e-07, |
|
"loss": 0.0809, |
|
"num_tokens": 28339184.0, |
|
"reward": 0.1116071492433548, |
|
"reward_std": 0.14127518236637115, |
|
"rewards/accuracy_reward/mean": 0.1116071417927742, |
|
"rewards/accuracy_reward/std": 0.31558772921562195, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1964285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2937.0, |
|
"completions/mean_length": 1326.0045166015625, |
|
"completions/mean_terminated_length": 899.20556640625, |
|
"completions/min_length": 84.0, |
|
"completions/min_terminated_length": 84.0, |
|
"epoch": 0.022403106564110223, |
|
"grad_norm": 0.042158063501119614, |
|
"kl": 0.0004096031188964844, |
|
"learning_rate": 1.9216926233717084e-07, |
|
"loss": 0.0529, |
|
"num_tokens": 28673249.0, |
|
"reward": 0.1026785746216774, |
|
"reward_std": 0.10309500992298126, |
|
"rewards/accuracy_reward/mean": 0.1026785746216774, |
|
"rewards/accuracy_reward/std": 0.3042183816432953, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3025.0, |
|
"completions/mean_length": 1532.821533203125, |
|
"completions/mean_terminated_length": 1019.7619018554688, |
|
"completions/min_length": 38.0, |
|
"completions/min_terminated_length": 38.0, |
|
"epoch": 0.022701814651631694, |
|
"grad_norm": 0.05144224688410759, |
|
"kl": 0.0004096031188964844, |
|
"learning_rate": 1.7860619515673032e-07, |
|
"loss": 0.0708, |
|
"num_tokens": 29049481.0, |
|
"reward": 0.0580357164144516, |
|
"reward_std": 0.1079135313630104, |
|
"rewards/accuracy_reward/mean": 0.0580357126891613, |
|
"rewards/accuracy_reward/std": 0.23433461785316467, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2857142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2889.0, |
|
"completions/mean_length": 1556.80810546875, |
|
"completions/mean_terminated_length": 950.7312622070312, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 0.02300052273915316, |
|
"grad_norm": 0.08849991112947464, |
|
"kl": 0.0004596710205078125, |
|
"learning_rate": 1.6543469682057104e-07, |
|
"loss": 0.09, |
|
"num_tokens": 29432390.0, |
|
"reward": 0.1071428656578064, |
|
"reward_std": 0.133773535490036, |
|
"rewards/accuracy_reward/mean": 0.1071428582072258, |
|
"rewards/accuracy_reward/std": 0.30998748540878296, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2455357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2941.0, |
|
"completions/mean_length": 1509.65185546875, |
|
"completions/mean_terminated_length": 1001.1952514648438, |
|
"completions/min_length": 12.0, |
|
"completions/min_terminated_length": 12.0, |
|
"epoch": 0.023299230826674632, |
|
"grad_norm": 0.07996930927038193, |
|
"kl": 0.00043010711669921875, |
|
"learning_rate": 1.5267081477050131e-07, |
|
"loss": 0.0554, |
|
"num_tokens": 29806576.0, |
|
"reward": 0.1473214328289032, |
|
"reward_std": 0.15225742757320404, |
|
"rewards/accuracy_reward/mean": 0.1527777761220932, |
|
"rewards/accuracy_reward/std": 0.36060887575149536, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1964285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3066.0, |
|
"completions/mean_length": 1378.2679443359375, |
|
"completions/mean_terminated_length": 964.2444458007812, |
|
"completions/min_length": 4.0, |
|
"completions/min_terminated_length": 4.0, |
|
"epoch": 0.023597938914196103, |
|
"grad_norm": 0.058245718479156494, |
|
"kl": 0.0004696846008300781, |
|
"learning_rate": 1.4033009983067452e-07, |
|
"loss": 0.0367, |
|
"num_tokens": 30154956.0, |
|
"reward": 0.0848214328289032, |
|
"reward_std": 0.0913810282945633, |
|
"rewards/accuracy_reward/mean": 0.0848214253783226, |
|
"rewards/accuracy_reward/std": 0.27923980355262756, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.25, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3032.0, |
|
"completions/mean_length": 1506.0982666015625, |
|
"completions/mean_terminated_length": 984.1309814453125, |
|
"completions/min_length": 76.0, |
|
"completions/min_terminated_length": 76.0, |
|
"epoch": 0.02389664700171757, |
|
"grad_norm": 0.047782279551029205, |
|
"kl": 0.0004444122314453125, |
|
"learning_rate": 1.284275872613028e-07, |
|
"loss": 0.0613, |
|
"num_tokens": 30530354.0, |
|
"reward": 0.1116071492433548, |
|
"reward_std": 0.10174980014562607, |
|
"rewards/accuracy_reward/mean": 0.1116071417927742, |
|
"rewards/accuracy_reward/std": 0.31558772921562195, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2366071428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3046.0, |
|
"completions/mean_length": 1465.12060546875, |
|
"completions/mean_terminated_length": 967.0818481445312, |
|
"completions/min_length": 11.0, |
|
"completions/min_terminated_length": 11.0, |
|
"epoch": 0.02419535508923904, |
|
"grad_norm": 0.04319946467876434, |
|
"kl": 0.00041961669921875, |
|
"learning_rate": 1.1697777844051104e-07, |
|
"loss": 0.035, |
|
"num_tokens": 30893933.0, |
|
"reward": 0.0714285746216774, |
|
"reward_std": 0.08070831745862961, |
|
"rewards/accuracy_reward/mean": 0.0714285746216774, |
|
"rewards/accuracy_reward/std": 0.2581161558628082, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2633928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3035.0, |
|
"completions/mean_length": 1520.5535888671875, |
|
"completions/mean_terminated_length": 965.7938842773438, |
|
"completions/min_length": 26.0, |
|
"completions/min_terminated_length": 26.0, |
|
"epoch": 0.024494063176760512, |
|
"grad_norm": 0.0543820746243, |
|
"kl": 0.00045108795166015625, |
|
"learning_rate": 1.0599462319663904e-07, |
|
"loss": 0.0783, |
|
"num_tokens": 31275169.0, |
|
"reward": 0.1071428656578064, |
|
"reward_std": 0.10040179640054703, |
|
"rewards/accuracy_reward/mean": 0.1071428582072258, |
|
"rewards/accuracy_reward/std": 0.30998748540878296, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1741071428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2949.0, |
|
"completions/mean_length": 1234.84375, |
|
"completions/mean_terminated_length": 847.5513916015625, |
|
"completions/min_length": 31.0, |
|
"completions/min_terminated_length": 31.0, |
|
"epoch": 0.02479277126428198, |
|
"grad_norm": 0.07529015839099884, |
|
"kl": 0.0005130767822265625, |
|
"learning_rate": 9.549150281252632e-08, |
|
"loss": -0.0128, |
|
"num_tokens": 31593070.0, |
|
"reward": 0.098214291036129, |
|
"reward_std": 0.09528662264347076, |
|
"rewards/accuracy_reward/mean": 0.0982142835855484, |
|
"rewards/accuracy_reward/std": 0.29827070236206055, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1428571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3036.0, |
|
"completions/mean_length": 1257.71875, |
|
"completions/mean_terminated_length": 955.3385620117188, |
|
"completions/min_length": 6.0, |
|
"completions/min_terminated_length": 6.0, |
|
"epoch": 0.02509147935180345, |
|
"grad_norm": 0.07526399195194244, |
|
"kl": 0.0004706382751464844, |
|
"learning_rate": 8.548121372247919e-08, |
|
"loss": 0.021, |
|
"num_tokens": 31910783.0, |
|
"reward": 0.125, |
|
"reward_std": 0.1480451077222824, |
|
"rewards/accuracy_reward/mean": 0.125, |
|
"rewards/accuracy_reward/std": 0.3314596116542816, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.21875, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2802.0, |
|
"completions/mean_length": 1385.544677734375, |
|
"completions/mean_terminated_length": 913.337158203125, |
|
"completions/min_length": 35.0, |
|
"completions/min_terminated_length": 35.0, |
|
"epoch": 0.02539018743932492, |
|
"grad_norm": 0.08289183676242828, |
|
"kl": 0.00047397613525390625, |
|
"learning_rate": 7.597595192178702e-08, |
|
"loss": 0.0353, |
|
"num_tokens": 32255185.0, |
|
"reward": 0.0758928582072258, |
|
"reward_std": 0.09333522617816925, |
|
"rewards/accuracy_reward/mean": 0.0758928582072258, |
|
"rewards/accuracy_reward/std": 0.26541972160339355, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1428571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3038.0, |
|
"completions/mean_length": 1268.4910888671875, |
|
"completions/mean_terminated_length": 967.90625, |
|
"completions/min_length": 20.0, |
|
"completions/min_terminated_length": 20.0, |
|
"epoch": 0.02568889552684639, |
|
"grad_norm": 0.10912561416625977, |
|
"kl": 0.0005125999450683594, |
|
"learning_rate": 6.698729810778064e-08, |
|
"loss": 0.0096, |
|
"num_tokens": 32578655.0, |
|
"reward": 0.0937500074505806, |
|
"reward_std": 0.1361600160598755, |
|
"rewards/accuracy_reward/mean": 0.09375, |
|
"rewards/accuracy_reward/std": 0.2921334207057953, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2857142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2813.0, |
|
"completions/mean_length": 1568.419677734375, |
|
"completions/mean_terminated_length": 966.9874877929688, |
|
"completions/min_length": 76.0, |
|
"completions/min_terminated_length": 76.0, |
|
"epoch": 0.02598760361436786, |
|
"grad_norm": 0.05263072997331619, |
|
"kl": 0.00043487548828125, |
|
"learning_rate": 5.8526203570536504e-08, |
|
"loss": 0.041, |
|
"num_tokens": 32962037.0, |
|
"reward": 0.1071428656578064, |
|
"reward_std": 0.1477484405040741, |
|
"rewards/accuracy_reward/mean": 0.1071428582072258, |
|
"rewards/accuracy_reward/std": 0.30998748540878296, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2232142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2898.0, |
|
"completions/mean_length": 1397.7188720703125, |
|
"completions/mean_terminated_length": 916.6034545898438, |
|
"completions/min_length": 13.0, |
|
"completions/min_terminated_length": 13.0, |
|
"epoch": 0.026286311701889328, |
|
"grad_norm": 0.0686383992433548, |
|
"kl": 0.0004343986511230469, |
|
"learning_rate": 5.060297685041659e-08, |
|
"loss": 0.0171, |
|
"num_tokens": 33310942.0, |
|
"reward": 0.0848214328289032, |
|
"reward_std": 0.11498290300369263, |
|
"rewards/accuracy_reward/mean": 0.0848214253783226, |
|
"rewards/accuracy_reward/std": 0.2792397737503052, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1383928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2961.0, |
|
"completions/mean_length": 1205.0179443359375, |
|
"completions/mean_terminated_length": 905.139892578125, |
|
"completions/min_length": 12.0, |
|
"completions/min_terminated_length": 12.0, |
|
"epoch": 0.0265850197894108, |
|
"grad_norm": 0.06757447123527527, |
|
"kl": 0.0004744529724121094, |
|
"learning_rate": 4.322727117869951e-08, |
|
"loss": 0.0952, |
|
"num_tokens": 33623122.0, |
|
"reward": 0.1517857164144516, |
|
"reward_std": 0.14383558928966522, |
|
"rewards/accuracy_reward/mean": 0.15740740299224854, |
|
"rewards/accuracy_reward/std": 0.365030437707901, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2053571428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3054.0, |
|
"completions/mean_length": 1399.7813720703125, |
|
"completions/mean_terminated_length": 967.6348266601562, |
|
"completions/min_length": 48.0, |
|
"completions/min_terminated_length": 48.0, |
|
"epoch": 0.02688372787693227, |
|
"grad_norm": 0.06952133029699326, |
|
"kl": 0.0004596710205078125, |
|
"learning_rate": 3.6408072716606345e-08, |
|
"loss": 0.077, |
|
"num_tokens": 33975369.0, |
|
"reward": 0.1026785746216774, |
|
"reward_std": 0.1513473093509674, |
|
"rewards/accuracy_reward/mean": 0.1026785746216774, |
|
"rewards/accuracy_reward/std": 0.3042184114456177, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2232142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2935.0, |
|
"completions/mean_length": 1382.477783203125, |
|
"completions/mean_terminated_length": 896.9827270507812, |
|
"completions/min_length": 37.0, |
|
"completions/min_terminated_length": 37.0, |
|
"epoch": 0.027182435964453737, |
|
"grad_norm": 0.04722387343645096, |
|
"kl": 0.00042724609375, |
|
"learning_rate": 3.015368960704584e-08, |
|
"loss": 0.0283, |
|
"num_tokens": 34320844.0, |
|
"reward": 0.0625, |
|
"reward_std": 0.08942963182926178, |
|
"rewards/accuracy_reward/mean": 0.0625, |
|
"rewards/accuracy_reward/std": 0.2426035851240158, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.3080357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3052.0, |
|
"completions/mean_length": 1598.888427734375, |
|
"completions/mean_terminated_length": 943.1160888671875, |
|
"completions/min_length": 37.0, |
|
"completions/min_terminated_length": 37.0, |
|
"epoch": 0.027481144051975208, |
|
"grad_norm": 0.0545211099088192, |
|
"kl": 0.0004267692565917969, |
|
"learning_rate": 2.4471741852423233e-08, |
|
"loss": 0.0618, |
|
"num_tokens": 34720275.0, |
|
"reward": 0.165178582072258, |
|
"reward_std": 0.10461412370204926, |
|
"rewards/accuracy_reward/mean": 0.1651785671710968, |
|
"rewards/accuracy_reward/std": 0.37217333912849426, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2008928571428571, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3048.0, |
|
"completions/mean_length": 1342.759033203125, |
|
"completions/mean_terminated_length": 908.0335083007812, |
|
"completions/min_length": 10.0, |
|
"completions/min_terminated_length": 10.0, |
|
"epoch": 0.027779852139496675, |
|
"grad_norm": 0.08833102881908417, |
|
"kl": 0.0004458427429199219, |
|
"learning_rate": 1.936915203084055e-08, |
|
"loss": 0.1027, |
|
"num_tokens": 35055293.0, |
|
"reward": 0.2008928656578064, |
|
"reward_std": 0.20320014655590057, |
|
"rewards/accuracy_reward/mean": 0.2008928507566452, |
|
"rewards/accuracy_reward/std": 0.40156546235084534, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2276785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2886.0, |
|
"completions/mean_length": 1383.7232666015625, |
|
"completions/mean_terminated_length": 886.0230712890625, |
|
"completions/min_length": 59.0, |
|
"completions/min_terminated_length": 59.0, |
|
"epoch": 0.028078560227018146, |
|
"grad_norm": 0.05874941125512123, |
|
"kl": 0.00044918060302734375, |
|
"learning_rate": 1.4852136862001763e-08, |
|
"loss": 0.0177, |
|
"num_tokens": 35402255.0, |
|
"reward": 0.1160714328289032, |
|
"reward_std": 0.07485132664442062, |
|
"rewards/accuracy_reward/mean": 0.1160714253783226, |
|
"rewards/accuracy_reward/std": 0.321027934551239, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2544642857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2787.0, |
|
"completions/mean_length": 1439.9866943359375, |
|
"completions/mean_terminated_length": 882.9521484375, |
|
"completions/min_length": 6.0, |
|
"completions/min_terminated_length": 6.0, |
|
"epoch": 0.028377268314539617, |
|
"grad_norm": 0.06310974061489105, |
|
"kl": 0.0004143714904785156, |
|
"learning_rate": 1.0926199633097154e-08, |
|
"loss": 0.0354, |
|
"num_tokens": 35758540.0, |
|
"reward": 0.0714285746216774, |
|
"reward_std": 0.08131170272827148, |
|
"rewards/accuracy_reward/mean": 0.0714285746216774, |
|
"rewards/accuracy_reward/std": 0.2581161558628082, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2366071428571429, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2926.0, |
|
"completions/mean_length": 1451.446533203125, |
|
"completions/mean_terminated_length": 949.1696166992188, |
|
"completions/min_length": 56.0, |
|
"completions/min_terminated_length": 56.0, |
|
"epoch": 0.028675976402061085, |
|
"grad_norm": 0.5383374094963074, |
|
"kl": 0.0004506111145019531, |
|
"learning_rate": 7.59612349389599e-09, |
|
"loss": 0.0656, |
|
"num_tokens": 36119232.0, |
|
"reward": 0.0803571492433548, |
|
"reward_std": 0.12054044008255005, |
|
"rewards/accuracy_reward/mean": 0.0803571417927742, |
|
"rewards/accuracy_reward/std": 0.2724541425704956, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.1964285714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3056.0, |
|
"completions/mean_length": 1336.75, |
|
"completions/mean_terminated_length": 912.5778198242188, |
|
"completions/min_length": 2.0, |
|
"completions/min_terminated_length": 2.0, |
|
"epoch": 0.028974684489582556, |
|
"grad_norm": 0.05063558369874954, |
|
"kl": 0.0004563331604003906, |
|
"learning_rate": 4.865965629214819e-09, |
|
"loss": 0.0335, |
|
"num_tokens": 36454472.0, |
|
"reward": 0.0580357164144516, |
|
"reward_std": 0.06478200107812881, |
|
"rewards/accuracy_reward/mean": 0.0580357126891613, |
|
"rewards/accuracy_reward/std": 0.23433460295200348, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2232142857142857, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3010.0, |
|
"completions/mean_length": 1458.107177734375, |
|
"completions/mean_terminated_length": 994.3448486328125, |
|
"completions/min_length": 17.0, |
|
"completions/min_terminated_length": 17.0, |
|
"epoch": 0.029273392577104027, |
|
"grad_norm": 0.08015123009681702, |
|
"kl": 0.00043582916259765625, |
|
"learning_rate": 2.739052315863355e-09, |
|
"loss": 0.0717, |
|
"num_tokens": 36820608.0, |
|
"reward": 0.1383928656578064, |
|
"reward_std": 0.11632810533046722, |
|
"rewards/accuracy_reward/mean": 0.1383928507566452, |
|
"rewards/accuracy_reward/std": 0.34608522057533264, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2276785714285714, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 3033.0, |
|
"completions/mean_length": 1310.1741943359375, |
|
"completions/mean_terminated_length": 790.7918701171875, |
|
"completions/min_length": 53.0, |
|
"completions/min_terminated_length": 53.0, |
|
"epoch": 0.029572100664625494, |
|
"grad_norm": 0.052152227610349655, |
|
"kl": 0.0004668235778808594, |
|
"learning_rate": 1.217974870087901e-09, |
|
"loss": 0.0223, |
|
"num_tokens": 37151047.0, |
|
"reward": 0.1205357164144516, |
|
"reward_std": 0.11047111451625824, |
|
"rewards/accuracy_reward/mean": 0.1205357164144516, |
|
"rewards/accuracy_reward/std": 0.32631614804267883, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio/high_max": 0.0, |
|
"clip_ratio/high_mean": 0.0, |
|
"clip_ratio/low_mean": 0.0, |
|
"clip_ratio/low_min": 0.0, |
|
"clip_ratio/region_mean": 0.0, |
|
"completions/clipped_ratio": 0.2455357142857143, |
|
"completions/max_length": 3072.0, |
|
"completions/max_terminated_length": 2885.0, |
|
"completions/mean_length": 1361.1473388671875, |
|
"completions/mean_terminated_length": 804.3609619140625, |
|
"completions/min_length": 13.0, |
|
"completions/min_terminated_length": 13.0, |
|
"epoch": 0.029870808752146965, |
|
"grad_norm": 0.08615954965353012, |
|
"kl": 0.0004239082336425781, |
|
"learning_rate": 3.0458649045211894e-10, |
|
"loss": 0.1212, |
|
"num_tokens": 37502112.0, |
|
"reward": 0.165178582072258, |
|
"reward_std": 0.19959399104118347, |
|
"rewards/accuracy_reward/mean": 0.1651785671710968, |
|
"rewards/accuracy_reward/std": 0.37217333912849426, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.029870808752146965, |
|
"step": 100, |
|
"total_flos": 0.0, |
|
"train_loss": 0.06171137083787471, |
|
"train_runtime": 3149.8828, |
|
"train_samples_per_second": 7.111, |
|
"train_steps_per_second": 0.032 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 37502112, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|