|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 180, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 2945.7202758789062, |
|
"epoch": 0.027972027972027972, |
|
"grad_norm": 0.08992995321750641, |
|
"kl": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"reward": 0.5104166641831398, |
|
"reward_std": 0.2102233674377203, |
|
"rewards/accuracy_reward": 0.0892857164144516, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4211309626698494, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 3191.761962890625, |
|
"epoch": 0.055944055944055944, |
|
"grad_norm": 0.30593544244766235, |
|
"kl": 0.0, |
|
"learning_rate": 5.555555555555555e-08, |
|
"loss": 0.0, |
|
"reward": 0.5022321417927742, |
|
"reward_std": 0.20526811853051186, |
|
"rewards/accuracy_reward": 0.0922619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4099702462553978, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 2838.3720703125, |
|
"epoch": 0.08391608391608392, |
|
"grad_norm": 0.09959172457456589, |
|
"kl": 4.9114227294921875e-05, |
|
"learning_rate": 1.111111111111111e-07, |
|
"loss": 0.0, |
|
"reward": 0.583333358168602, |
|
"reward_std": 0.22632932662963867, |
|
"rewards/accuracy_reward": 0.13095238152891397, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4523809626698494, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 2734.3572387695312, |
|
"epoch": 0.11188811188811189, |
|
"grad_norm": 0.11217548698186874, |
|
"kl": 5.0961971282958984e-05, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": 0.0, |
|
"reward": 0.5424107313156128, |
|
"reward_std": 0.2021841686218977, |
|
"rewards/accuracy_reward": 0.10416667023673654, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4382440522313118, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 2863.5357666015625, |
|
"epoch": 0.13986013986013987, |
|
"grad_norm": 0.11380404978990555, |
|
"kl": 5.334615707397461e-05, |
|
"learning_rate": 2.222222222222222e-07, |
|
"loss": 0.0, |
|
"reward": 0.5967262014746666, |
|
"reward_std": 0.22172891348600388, |
|
"rewards/accuracy_reward": 0.15476190764456987, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.441964291036129, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 3099.0416870117188, |
|
"epoch": 0.16783216783216784, |
|
"grad_norm": 0.104108065366745, |
|
"kl": 5.179643630981445e-05, |
|
"learning_rate": 2.7777777777777776e-07, |
|
"loss": 0.0, |
|
"reward": 0.5126488134264946, |
|
"reward_std": 0.22282272577285767, |
|
"rewards/accuracy_reward": 0.09226190578192472, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4203869178891182, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 3416.1934814453125, |
|
"epoch": 0.1958041958041958, |
|
"grad_norm": 0.10306566208600998, |
|
"kl": 5.620718002319336e-05, |
|
"learning_rate": 3.333333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.510416679084301, |
|
"reward_std": 0.261018592864275, |
|
"rewards/accuracy_reward": 0.11607143003493547, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3943452462553978, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 3140.9405517578125, |
|
"epoch": 0.22377622377622378, |
|
"grad_norm": 0.09083328396081924, |
|
"kl": 5.40614128112793e-05, |
|
"learning_rate": 3.888888888888889e-07, |
|
"loss": 0.0, |
|
"reward": 0.5208333283662796, |
|
"reward_std": 0.2423064224421978, |
|
"rewards/accuracy_reward": 0.1011904776096344, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4196428582072258, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 3093.7916870117188, |
|
"epoch": 0.2517482517482518, |
|
"grad_norm": 0.0941944494843483, |
|
"kl": 4.83393669128418e-05, |
|
"learning_rate": 4.444444444444444e-07, |
|
"loss": 0.0, |
|
"reward": 0.568452388048172, |
|
"reward_std": 0.2969280257821083, |
|
"rewards/accuracy_reward": 0.1488095261156559, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4196428656578064, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 3059.0924072265625, |
|
"epoch": 0.27972027972027974, |
|
"grad_norm": 3.2292351722717285, |
|
"kl": 4.953145980834961e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.5967262089252472, |
|
"reward_std": 0.29494407773017883, |
|
"rewards/accuracy_reward": 0.1666666716337204, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4300595298409462, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 3060.21728515625, |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.09246377646923065, |
|
"kl": 4.6312808990478516e-05, |
|
"learning_rate": 5.555555555555555e-07, |
|
"loss": 0.0, |
|
"reward": 0.4940476343035698, |
|
"reward_std": 0.20687389746308327, |
|
"rewards/accuracy_reward": 0.07440476445481181, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4196428656578064, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 3174.196533203125, |
|
"epoch": 0.3356643356643357, |
|
"grad_norm": 0.10116475075483322, |
|
"kl": 5.650520324707031e-05, |
|
"learning_rate": 6.111111111111112e-07, |
|
"loss": 0.0, |
|
"reward": 0.5066964402794838, |
|
"reward_std": 0.22321293503046036, |
|
"rewards/accuracy_reward": 0.09821428754366934, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4084821566939354, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 3070.0089721679688, |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.10479767620563507, |
|
"kl": 0.0002524256706237793, |
|
"learning_rate": 6.666666666666666e-07, |
|
"loss": 0.0, |
|
"reward": 0.6614583432674408, |
|
"reward_std": 0.2681122124195099, |
|
"rewards/accuracy_reward": 0.24404762126505375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4174107164144516, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 3290.7738647460938, |
|
"epoch": 0.3916083916083916, |
|
"grad_norm": 0.19513735175132751, |
|
"kl": 5.537271499633789e-05, |
|
"learning_rate": 7.222222222222221e-07, |
|
"loss": 0.0, |
|
"reward": 0.6562500074505806, |
|
"reward_std": 0.3831389471888542, |
|
"rewards/accuracy_reward": 0.25595238618552685, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4002976268529892, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 3630.5714721679688, |
|
"epoch": 0.4195804195804196, |
|
"grad_norm": 0.08842650800943375, |
|
"kl": 4.839897155761719e-05, |
|
"learning_rate": 7.777777777777778e-07, |
|
"loss": 0.0, |
|
"reward": 0.545386902987957, |
|
"reward_std": 0.27424266561865807, |
|
"rewards/accuracy_reward": 0.17559524066746235, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3697916716337204, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 3649.27685546875, |
|
"epoch": 0.44755244755244755, |
|
"grad_norm": 0.09835156053304672, |
|
"kl": 4.208087921142578e-05, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.0, |
|
"reward": 0.6138392984867096, |
|
"reward_std": 0.3248288035392761, |
|
"rewards/accuracy_reward": 0.2440476231276989, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3697916716337204, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 3623.0596313476562, |
|
"epoch": 0.4755244755244755, |
|
"grad_norm": 0.11101175099611282, |
|
"kl": 4.845857620239258e-05, |
|
"learning_rate": 8.888888888888888e-07, |
|
"loss": 0.0, |
|
"reward": 0.5372024029493332, |
|
"reward_std": 0.2965564988553524, |
|
"rewards/accuracy_reward": 0.1726190485060215, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3645833507180214, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 3498.8096313476562, |
|
"epoch": 0.5034965034965035, |
|
"grad_norm": 0.10944987833499908, |
|
"kl": 4.649162292480469e-05, |
|
"learning_rate": 9.444444444444444e-07, |
|
"loss": 0.0, |
|
"reward": 0.537202388048172, |
|
"reward_std": 0.3150208666920662, |
|
"rewards/accuracy_reward": 0.17261905316263437, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3645833358168602, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 3287.4048461914062, |
|
"epoch": 0.5314685314685315, |
|
"grad_norm": 0.1434430330991745, |
|
"kl": 5.739927291870117e-05, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0, |
|
"reward": 0.6160714402794838, |
|
"reward_std": 0.3258791044354439, |
|
"rewards/accuracy_reward": 0.22619047947227955, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3898809477686882, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 3627.2202758789062, |
|
"epoch": 0.5594405594405595, |
|
"grad_norm": 0.09357653558254242, |
|
"kl": 4.5299530029296875e-05, |
|
"learning_rate": 9.999153867018255e-07, |
|
"loss": 0.0, |
|
"reward": 0.5297619104385376, |
|
"reward_std": 0.3098462224006653, |
|
"rewards/accuracy_reward": 0.16369047947227955, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3660714402794838, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 3486.3453369140625, |
|
"epoch": 0.5874125874125874, |
|
"grad_norm": 0.09195814281702042, |
|
"kl": 4.369020462036133e-05, |
|
"learning_rate": 9.996615786269034e-07, |
|
"loss": 0.0, |
|
"reward": 0.574404776096344, |
|
"reward_std": 0.3131341114640236, |
|
"rewards/accuracy_reward": 0.1964285783469677, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.377976194024086, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 3686.1131591796875, |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.10805041342973709, |
|
"kl": 3.5136938095092773e-05, |
|
"learning_rate": 9.992386712220707e-07, |
|
"loss": 0.0, |
|
"reward": 0.5855654701590538, |
|
"reward_std": 0.317733321338892, |
|
"rewards/accuracy_reward": 0.2172619104385376, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3683035746216774, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 3132.3988037109375, |
|
"epoch": 0.6433566433566433, |
|
"grad_norm": 0.10081591457128525, |
|
"kl": 4.07099723815918e-05, |
|
"learning_rate": 9.986468235255064e-07, |
|
"loss": 0.0, |
|
"reward": 0.6860119104385376, |
|
"reward_std": 0.3946758955717087, |
|
"rewards/accuracy_reward": 0.2797619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4062500074505806, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 3288.7650146484375, |
|
"epoch": 0.6713286713286714, |
|
"grad_norm": 0.08800289034843445, |
|
"kl": 3.781914710998535e-05, |
|
"learning_rate": 9.978862581069245e-07, |
|
"loss": 0.0, |
|
"reward": 0.6815476417541504, |
|
"reward_std": 0.3288978040218353, |
|
"rewards/accuracy_reward": 0.2857142947614193, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3958333432674408, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 3099.306640625, |
|
"epoch": 0.6993006993006993, |
|
"grad_norm": 0.11318381130695343, |
|
"kl": 3.314018249511719e-05, |
|
"learning_rate": 9.969572609838744e-07, |
|
"loss": 0.0, |
|
"reward": 0.7291666716337204, |
|
"reward_std": 0.3738822266459465, |
|
"rewards/accuracy_reward": 0.3125000074505806, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4166666716337204, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 3236.0416870117188, |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.10187575966119766, |
|
"kl": 3.30805778503418e-05, |
|
"learning_rate": 9.958601815141803e-07, |
|
"loss": 0.0, |
|
"reward": 0.6294642984867096, |
|
"reward_std": 0.352918803691864, |
|
"rewards/accuracy_reward": 0.2410714365541935, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3883928656578064, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 2626.6101684570312, |
|
"epoch": 0.7552447552447552, |
|
"grad_norm": 0.1044318675994873, |
|
"kl": 5.5670738220214844e-05, |
|
"learning_rate": 9.94595432264564e-07, |
|
"loss": 0.0, |
|
"reward": 0.6822916865348816, |
|
"reward_std": 0.3088056966662407, |
|
"rewards/accuracy_reward": 0.23809524439275265, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4441964402794838, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 3134.5774536132812, |
|
"epoch": 0.7832167832167832, |
|
"grad_norm": 0.08211353421211243, |
|
"kl": 5.906820297241211e-05, |
|
"learning_rate": 9.931634888554935e-07, |
|
"loss": 0.0, |
|
"reward": 0.4985119104385376, |
|
"reward_std": 0.1905420981347561, |
|
"rewards/accuracy_reward": 0.08630952518433332, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.412202388048172, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 2987.3304443359375, |
|
"epoch": 0.8111888111888111, |
|
"grad_norm": 0.08840525150299072, |
|
"kl": 5.1856040954589844e-05, |
|
"learning_rate": 9.915648897823232e-07, |
|
"loss": 0.0, |
|
"reward": 0.6510416716337204, |
|
"reward_std": 0.2952599339187145, |
|
"rewards/accuracy_reward": 0.20833334140479565, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4427083358168602, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 3049.0328369140625, |
|
"epoch": 0.8391608391608392, |
|
"grad_norm": 0.09037017077207565, |
|
"kl": 7.843971252441406e-05, |
|
"learning_rate": 9.89800236212786e-07, |
|
"loss": 0.0, |
|
"reward": 0.574404776096344, |
|
"reward_std": 0.26248788461089134, |
|
"rewards/accuracy_reward": 0.15476190787740052, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4196428582072258, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 2818.0387573242188, |
|
"epoch": 0.8671328671328671, |
|
"grad_norm": 0.08912839740514755, |
|
"kl": 8.428096771240234e-05, |
|
"learning_rate": 9.878701917609207e-07, |
|
"loss": 0.0, |
|
"reward": 0.6257440596818924, |
|
"reward_std": 0.2523465231060982, |
|
"rewards/accuracy_reward": 0.17261905036866665, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4531250074505806, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 2675.1845703125, |
|
"epoch": 0.8951048951048951, |
|
"grad_norm": 0.0868036225438118, |
|
"kl": 0.00010216236114501953, |
|
"learning_rate": 9.857754822375126e-07, |
|
"loss": 0.0, |
|
"reward": 0.6904762089252472, |
|
"reward_std": 0.25569987669587135, |
|
"rewards/accuracy_reward": 0.2410714328289032, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.449404776096344, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 2967.2411499023438, |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.11090683937072754, |
|
"kl": 0.0001150369644165039, |
|
"learning_rate": 9.83516895377146e-07, |
|
"loss": 0.0, |
|
"reward": 0.6324404925107956, |
|
"reward_std": 0.2739677280187607, |
|
"rewards/accuracy_reward": 0.2083333358168602, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4241071492433548, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 2569.0238647460938, |
|
"epoch": 0.951048951048951, |
|
"grad_norm": 0.10315605998039246, |
|
"kl": 0.0001518726348876953, |
|
"learning_rate": 9.8109528054197e-07, |
|
"loss": 0.0, |
|
"reward": 0.7061012089252472, |
|
"reward_std": 0.30017876625061035, |
|
"rewards/accuracy_reward": 0.2172619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4888392984867096, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 3555.8452758789062, |
|
"epoch": 0.9790209790209791, |
|
"grad_norm": 0.14011836051940918, |
|
"kl": 0.00016379356384277344, |
|
"learning_rate": 9.785115484022869e-07, |
|
"loss": 0.0, |
|
"reward": 0.4531250149011612, |
|
"reward_std": 0.23945146799087524, |
|
"rewards/accuracy_reward": 0.09226190857589245, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3608631044626236, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 3795.0263671875, |
|
"epoch": 1.0, |
|
"grad_norm": 0.14011836051940918, |
|
"kl": 0.00016848246256510416, |
|
"learning_rate": 9.757666705940878e-07, |
|
"loss": 0.0, |
|
"reward": 0.3928571542104085, |
|
"reward_std": 0.16382549703121185, |
|
"rewards/accuracy_reward": 0.0436507947742939, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3492063581943512, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 2951.9464111328125, |
|
"epoch": 1.027972027972028, |
|
"grad_norm": 0.1335798054933548, |
|
"kl": 0.00022125244140625, |
|
"learning_rate": 9.728616793536587e-07, |
|
"loss": 0.0, |
|
"reward": 0.497023805975914, |
|
"reward_std": 0.16415998339653015, |
|
"rewards/accuracy_reward": 0.07142857275903225, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4255952388048172, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 3158.5267944335938, |
|
"epoch": 1.055944055944056, |
|
"grad_norm": 0.16623912751674652, |
|
"kl": 0.00020551681518554688, |
|
"learning_rate": 9.697976671294003e-07, |
|
"loss": 0.0, |
|
"reward": 0.5148809626698494, |
|
"reward_std": 0.22238681092858315, |
|
"rewards/accuracy_reward": 0.0922619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4226190596818924, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 2795.3155517578125, |
|
"epoch": 1.083916083916084, |
|
"grad_norm": 0.12368790060281754, |
|
"kl": 0.0002484321594238281, |
|
"learning_rate": 9.665757861710007e-07, |
|
"loss": 0.0, |
|
"reward": 0.5699404999613762, |
|
"reward_std": 0.22224155068397522, |
|
"rewards/accuracy_reward": 0.11309523973613977, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.456845261156559, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 2715.77392578125, |
|
"epoch": 1.1118881118881119, |
|
"grad_norm": 0.09115960448980331, |
|
"kl": 0.00038814544677734375, |
|
"learning_rate": 9.631972480961233e-07, |
|
"loss": 0.0, |
|
"reward": 0.5424107313156128, |
|
"reward_std": 0.21795128658413887, |
|
"rewards/accuracy_reward": 0.09226190415211022, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4501488283276558, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 2979.4970092773438, |
|
"epoch": 1.1398601398601398, |
|
"grad_norm": 0.17517763376235962, |
|
"kl": 0.0002455711364746094, |
|
"learning_rate": 9.59663323434766e-07, |
|
"loss": 0.0, |
|
"reward": 0.5796131044626236, |
|
"reward_std": 0.22455461882054806, |
|
"rewards/accuracy_reward": 0.1517857164144516, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.427827388048172, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 3172.4226684570312, |
|
"epoch": 1.167832167832168, |
|
"grad_norm": 0.09061767905950546, |
|
"kl": 0.00025773048400878906, |
|
"learning_rate": 9.55975341151467e-07, |
|
"loss": 0.0, |
|
"reward": 0.4962797835469246, |
|
"reward_std": 0.2209780216217041, |
|
"rewards/accuracy_reward": 0.07738095452077687, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4188988208770752, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 3369.6339721679688, |
|
"epoch": 1.1958041958041958, |
|
"grad_norm": 0.07707580924034119, |
|
"kl": 0.00022912025451660156, |
|
"learning_rate": 9.521346881455354e-07, |
|
"loss": 0.0, |
|
"reward": 0.5066964328289032, |
|
"reward_std": 0.221993587911129, |
|
"rewards/accuracy_reward": 0.1011904776096344, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4055059626698494, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 3192.27685546875, |
|
"epoch": 1.2237762237762237, |
|
"grad_norm": 0.10000985860824585, |
|
"kl": 0.00026798248291015625, |
|
"learning_rate": 9.481428087294959e-07, |
|
"loss": 0.0, |
|
"reward": 0.5111607164144516, |
|
"reward_std": 0.22597463242709637, |
|
"rewards/accuracy_reward": 0.095238097012043, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4159226268529892, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 3079.33935546875, |
|
"epoch": 1.2517482517482517, |
|
"grad_norm": 0.10022323578596115, |
|
"kl": 0.00036716461181640625, |
|
"learning_rate": 9.440012040859408e-07, |
|
"loss": 0.0, |
|
"reward": 0.5535714477300644, |
|
"reward_std": 0.2452528141438961, |
|
"rewards/accuracy_reward": 0.13988095615059137, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4136904776096344, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 3041.0208740234375, |
|
"epoch": 1.2797202797202798, |
|
"grad_norm": 0.08870735764503479, |
|
"kl": 0.0003390312194824219, |
|
"learning_rate": 9.397114317029974e-07, |
|
"loss": 0.0, |
|
"reward": 0.5625000149011612, |
|
"reward_std": 0.2692300006747246, |
|
"rewards/accuracy_reward": 0.13392857648432255, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4285714402794838, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 3139.7202758789062, |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 0.1289481371641159, |
|
"kl": 0.0003085136413574219, |
|
"learning_rate": 9.352751047886198e-07, |
|
"loss": 0.0, |
|
"reward": 0.490327388048172, |
|
"reward_std": 0.2173246443271637, |
|
"rewards/accuracy_reward": 0.07142857275903225, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4188988134264946, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 3241.5535888671875, |
|
"epoch": 1.3356643356643356, |
|
"grad_norm": 0.08511940389871597, |
|
"kl": 0.0003581047058105469, |
|
"learning_rate": 9.306938916639285e-07, |
|
"loss": 0.0, |
|
"reward": 0.504464291036129, |
|
"reward_std": 0.2454623058438301, |
|
"rewards/accuracy_reward": 0.09523809747770429, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.409226194024086, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 3153.041748046875, |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.11432822048664093, |
|
"kl": 0.00035762786865234375, |
|
"learning_rate": 9.259695151358214e-07, |
|
"loss": 0.0, |
|
"reward": 0.6778274029493332, |
|
"reward_std": 0.29422780871391296, |
|
"rewards/accuracy_reward": 0.2619047723710537, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4159226343035698, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 3244.0416870117188, |
|
"epoch": 1.3916083916083917, |
|
"grad_norm": 0.07997111976146698, |
|
"kl": 0.0005092620849609375, |
|
"learning_rate": 9.21103751849098e-07, |
|
"loss": 0.0, |
|
"reward": 0.721726194024086, |
|
"reward_std": 0.4153262600302696, |
|
"rewards/accuracy_reward": 0.3125000037252903, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.409226194024086, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 3661.2827758789062, |
|
"epoch": 1.4195804195804196, |
|
"grad_norm": 0.09987284988164902, |
|
"kl": 0.0004477500915527344, |
|
"learning_rate": 9.160984316183354e-07, |
|
"loss": 0.0, |
|
"reward": 0.5461309626698494, |
|
"reward_std": 0.27763326466083527, |
|
"rewards/accuracy_reward": 0.18750000558793545, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3586309552192688, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 3599.571533203125, |
|
"epoch": 1.4475524475524475, |
|
"grad_norm": 0.09226205199956894, |
|
"kl": 0.000530242919921875, |
|
"learning_rate": 9.109554367397697e-07, |
|
"loss": 0.0, |
|
"reward": 0.636904776096344, |
|
"reward_std": 0.34169958531856537, |
|
"rewards/accuracy_reward": 0.2500000037252903, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3869047611951828, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 3564.58935546875, |
|
"epoch": 1.4755244755244754, |
|
"grad_norm": 0.09048299491405487, |
|
"kl": 0.000576019287109375, |
|
"learning_rate": 9.056767012834416e-07, |
|
"loss": 0.0, |
|
"reward": 0.5989583507180214, |
|
"reward_std": 0.33351704478263855, |
|
"rewards/accuracy_reward": 0.223214291036129, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3757440596818924, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 3535.7650146484375, |
|
"epoch": 1.5034965034965035, |
|
"grad_norm": 0.09732116013765335, |
|
"kl": 0.0005936622619628906, |
|
"learning_rate": 9.002642103658719e-07, |
|
"loss": 0.0, |
|
"reward": 0.57738097012043, |
|
"reward_std": 0.3000512942671776, |
|
"rewards/accuracy_reward": 0.19642857182770967, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.380952388048172, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 3258.33935546875, |
|
"epoch": 1.5314685314685315, |
|
"grad_norm": 0.08897067606449127, |
|
"kl": 0.0005841255187988281, |
|
"learning_rate": 8.9471999940354e-07, |
|
"loss": 0.0, |
|
"reward": 0.6555059626698494, |
|
"reward_std": 0.31323162093758583, |
|
"rewards/accuracy_reward": 0.25297619588673115, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.402529776096344, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 3488.7828369140625, |
|
"epoch": 1.5594405594405596, |
|
"grad_norm": 0.11047809571027756, |
|
"kl": 0.0006580352783203125, |
|
"learning_rate": 8.890461533474473e-07, |
|
"loss": 0.0, |
|
"reward": 0.6056547611951828, |
|
"reward_std": 0.3212001100182533, |
|
"rewards/accuracy_reward": 0.220238097012043, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.385416679084301, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 3451.3245239257812, |
|
"epoch": 1.5874125874125875, |
|
"grad_norm": 0.09569097310304642, |
|
"kl": 0.0006022453308105469, |
|
"learning_rate": 8.832448058990521e-07, |
|
"loss": 0.0, |
|
"reward": 0.596726194024086, |
|
"reward_std": 0.3251044377684593, |
|
"rewards/accuracy_reward": 0.2142857164144516, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.382440485060215, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 3633.193603515625, |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 0.09286098182201385, |
|
"kl": 0.0006651878356933594, |
|
"learning_rate": 8.773181387078719e-07, |
|
"loss": 0.0, |
|
"reward": 0.6272321566939354, |
|
"reward_std": 0.3380197770893574, |
|
"rewards/accuracy_reward": 0.2410714291036129, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3861607238650322, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 3144.7053833007812, |
|
"epoch": 1.6433566433566433, |
|
"grad_norm": 0.1019466370344162, |
|
"kl": 0.0008211135864257812, |
|
"learning_rate": 8.712683805510545e-07, |
|
"loss": 0.0, |
|
"reward": 0.7075893133878708, |
|
"reward_std": 0.3953409940004349, |
|
"rewards/accuracy_reward": 0.2916666716337204, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4159226268529892, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 3188.6845703125, |
|
"epoch": 1.6713286713286712, |
|
"grad_norm": 0.11080282181501389, |
|
"kl": 0.0006132125854492188, |
|
"learning_rate": 8.650978064952258e-07, |
|
"loss": 0.0, |
|
"reward": 0.7604166865348816, |
|
"reward_std": 0.3040749914944172, |
|
"rewards/accuracy_reward": 0.3571428656578064, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4032738134264946, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 3010.1488647460938, |
|
"epoch": 1.6993006993006992, |
|
"grad_norm": 0.08334992080926895, |
|
"kl": 0.0007467269897460938, |
|
"learning_rate": 8.588087370409302e-07, |
|
"loss": 0.0, |
|
"reward": 0.7819940596818924, |
|
"reward_std": 0.3632723242044449, |
|
"rewards/accuracy_reward": 0.3571428656578064, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.424851194024086, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 3139.3452758789062, |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 0.1026916578412056, |
|
"kl": 0.0007228851318359375, |
|
"learning_rate": 8.52403537249985e-07, |
|
"loss": 0.0, |
|
"reward": 0.7075893133878708, |
|
"reward_std": 0.32955069839954376, |
|
"rewards/accuracy_reward": 0.2916666716337204, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4159226343035698, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 2651.9703369140625, |
|
"epoch": 1.7552447552447552, |
|
"grad_norm": 0.11016727238893509, |
|
"kl": 0.0008554458618164062, |
|
"learning_rate": 8.458846158560786e-07, |
|
"loss": 0.0, |
|
"reward": 0.731398805975914, |
|
"reward_std": 0.3333098441362381, |
|
"rewards/accuracy_reward": 0.2708333395421505, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.460565485060215, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 3171.544677734375, |
|
"epoch": 1.7832167832167833, |
|
"grad_norm": 0.199759379029274, |
|
"kl": 0.0009202957153320312, |
|
"learning_rate": 8.392544243589427e-07, |
|
"loss": 0.0, |
|
"reward": 0.5431547686457634, |
|
"reward_std": 0.22581714019179344, |
|
"rewards/accuracy_reward": 0.12202381296083331, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4211309626698494, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 3002.52685546875, |
|
"epoch": 1.8111888111888113, |
|
"grad_norm": 0.09409494698047638, |
|
"kl": 0.000827789306640625, |
|
"learning_rate": 8.325154561024443e-07, |
|
"loss": 0.0, |
|
"reward": 0.6986607313156128, |
|
"reward_std": 0.3044434152543545, |
|
"rewards/accuracy_reward": 0.25297619588673115, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4456845223903656, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 3002.6904907226562, |
|
"epoch": 1.8391608391608392, |
|
"grad_norm": 0.08244930952787399, |
|
"kl": 0.0010738372802734375, |
|
"learning_rate": 8.256702453369412e-07, |
|
"loss": 0.0, |
|
"reward": 0.5766369253396988, |
|
"reward_std": 0.2669289857149124, |
|
"rewards/accuracy_reward": 0.14583334070630372, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.430803582072258, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 2784.8601684570312, |
|
"epoch": 1.867132867132867, |
|
"grad_norm": 0.10322090983390808, |
|
"kl": 0.0010652542114257812, |
|
"learning_rate": 8.187213662662538e-07, |
|
"loss": 0.0, |
|
"reward": 0.6480654776096344, |
|
"reward_std": 0.2842121906578541, |
|
"rewards/accuracy_reward": 0.1934523843228817, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.454613097012043, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 2688.4434814453125, |
|
"epoch": 1.895104895104895, |
|
"grad_norm": 0.10172371566295624, |
|
"kl": 0.001178741455078125, |
|
"learning_rate": 8.11671432079612e-07, |
|
"loss": 0.0, |
|
"reward": 0.7075892984867096, |
|
"reward_std": 0.2647625356912613, |
|
"rewards/accuracy_reward": 0.2410714291036129, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4665178582072258, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 2745.5892944335938, |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 0.10732463002204895, |
|
"kl": 0.0011472702026367188, |
|
"learning_rate": 8.045230939689424e-07, |
|
"loss": 0.0, |
|
"reward": 0.7194940596818924, |
|
"reward_std": 0.3091561570763588, |
|
"rewards/accuracy_reward": 0.2559523805975914, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.463541679084301, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 2508.4762573242188, |
|
"epoch": 1.951048951048951, |
|
"grad_norm": 0.10578262805938721, |
|
"kl": 0.0013446807861328125, |
|
"learning_rate": 7.972790401318627e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7916666716337204, |
|
"reward_std": 0.34562500566244125, |
|
"rewards/accuracy_reward": 0.2738095298409462, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5178571492433548, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 3271.014892578125, |
|
"epoch": 1.9790209790209792, |
|
"grad_norm": 0.11061587184667587, |
|
"kl": 0.0012416839599609375, |
|
"learning_rate": 7.899419947607611e-07, |
|
"loss": 0.0, |
|
"reward": 0.5305059626698494, |
|
"reward_std": 0.3043720945715904, |
|
"rewards/accuracy_reward": 0.127976194024086, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4025297686457634, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 3785.77197265625, |
|
"epoch": 2.0, |
|
"grad_norm": 0.15183548629283905, |
|
"kl": 0.001277923583984375, |
|
"learning_rate": 7.825147170183384e-07, |
|
"loss": 0.0, |
|
"reward": 0.4117063581943512, |
|
"reward_std": 0.199227308233579, |
|
"rewards/accuracy_reward": 0.059523810942967735, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3521825472513835, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 2841.7887573242188, |
|
"epoch": 2.027972027972028, |
|
"grad_norm": 0.1057605892419815, |
|
"kl": 0.0014410018920898438, |
|
"learning_rate": 7.75e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5319940447807312, |
|
"reward_std": 0.18857014551758766, |
|
"rewards/accuracy_reward": 0.09226190880872309, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4397321492433548, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 3105.610107421875, |
|
"epoch": 2.055944055944056, |
|
"grad_norm": 0.14314356446266174, |
|
"kl": 0.0013837814331054688, |
|
"learning_rate": 7.674006696834872e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5357143059372902, |
|
"reward_std": 0.23290237039327621, |
|
"rewards/accuracy_reward": 0.0922619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.443452388048172, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 2653.3482666015625, |
|
"epoch": 2.0839160839160837, |
|
"grad_norm": 0.16954892873764038, |
|
"kl": 0.00127410888671875, |
|
"learning_rate": 7.597195838661425e-07, |
|
"loss": 0.0001, |
|
"reward": 0.59300597012043, |
|
"reward_std": 0.2353062480688095, |
|
"rewards/accuracy_reward": 0.11011904804036021, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.482886902987957, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 2528.6578369140625, |
|
"epoch": 2.111888111888112, |
|
"grad_norm": 0.0982322096824646, |
|
"kl": 0.00152587890625, |
|
"learning_rate": 7.51959631090208e-07, |
|
"loss": 0.0001, |
|
"reward": 0.565476194024086, |
|
"reward_std": 0.2114737629890442, |
|
"rewards/accuracy_reward": 0.08928571362048388, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.476190485060215, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 2843.4524536132812, |
|
"epoch": 2.13986013986014, |
|
"grad_norm": 0.13836292922496796, |
|
"kl": 0.0012874603271484375, |
|
"learning_rate": 7.441237295565641e-07, |
|
"loss": 0.0001, |
|
"reward": 0.605654776096344, |
|
"reward_std": 0.240522138774395, |
|
"rewards/accuracy_reward": 0.1458333358168602, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4598214328289032, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 3015.6488647460938, |
|
"epoch": 2.167832167832168, |
|
"grad_norm": 0.4612804055213928, |
|
"kl": 0.0013256072998046875, |
|
"learning_rate": 7.362148260273126e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5468750074505806, |
|
"reward_std": 0.2539355792105198, |
|
"rewards/accuracy_reward": 0.09821428940631449, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4486607164144516, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 3276.9346313476562, |
|
"epoch": 2.195804195804196, |
|
"grad_norm": 0.09492038935422897, |
|
"kl": 0.0012264251708984375, |
|
"learning_rate": 7.282358947176205e-07, |
|
"loss": 0.0, |
|
"reward": 0.5208333358168602, |
|
"reward_std": 0.2553631514310837, |
|
"rewards/accuracy_reward": 0.10416666883975267, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4166666716337204, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 3023.4345703125, |
|
"epoch": 2.2237762237762237, |
|
"grad_norm": 0.09587059915065765, |
|
"kl": 0.0013256072998046875, |
|
"learning_rate": 7.201899361772391e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5803571492433548, |
|
"reward_std": 0.268021535128355, |
|
"rewards/accuracy_reward": 0.13392857275903225, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4464285746216774, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 3000.0030517578125, |
|
"epoch": 2.2517482517482517, |
|
"grad_norm": 0.10489190369844437, |
|
"kl": 0.001300811767578125, |
|
"learning_rate": 7.120799761621197e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6220238208770752, |
|
"reward_std": 0.29535772278904915, |
|
"rewards/accuracy_reward": 0.17559524066746235, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.446428582072258, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 2823.2530517578125, |
|
"epoch": 2.2797202797202796, |
|
"grad_norm": 0.102653369307518, |
|
"kl": 0.0018138885498046875, |
|
"learning_rate": 7.039090644965509e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6383928507566452, |
|
"reward_std": 0.32175029069185257, |
|
"rewards/accuracy_reward": 0.15773809887468815, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4806547686457634, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 2965.2232666015625, |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.109110988676548, |
|
"kl": 0.0014629364013671875, |
|
"learning_rate": 6.956802739262445e-07, |
|
"loss": 0.0001, |
|
"reward": 0.534226194024086, |
|
"reward_std": 0.24012810364365578, |
|
"rewards/accuracy_reward": 0.08035714481957257, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4538690522313118, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 3099.669677734375, |
|
"epoch": 2.335664335664336, |
|
"grad_norm": 0.09192686527967453, |
|
"kl": 0.0014801025390625, |
|
"learning_rate": 6.873966989628009e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5416666716337204, |
|
"reward_std": 0.25171075016260147, |
|
"rewards/accuracy_reward": 0.09523809631355107, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.446428582072258, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 2962.357177734375, |
|
"epoch": 2.3636363636363638, |
|
"grad_norm": 0.10174567997455597, |
|
"kl": 0.00141143798828125, |
|
"learning_rate": 6.790614547199906e-07, |
|
"loss": 0.0001, |
|
"reward": 0.709077388048172, |
|
"reward_std": 0.2970610596239567, |
|
"rewards/accuracy_reward": 0.2619047686457634, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4471726268529892, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 3212.949462890625, |
|
"epoch": 2.3916083916083917, |
|
"grad_norm": 0.09201247245073318, |
|
"kl": 0.001979827880859375, |
|
"learning_rate": 6.706776757422868e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7433035671710968, |
|
"reward_std": 0.4256303459405899, |
|
"rewards/accuracy_reward": 0.3244047723710537, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4188988208770752, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 3537.5238647460938, |
|
"epoch": 2.4195804195804196, |
|
"grad_norm": 0.10975628346204758, |
|
"kl": 0.0016155242919921875, |
|
"learning_rate": 6.622485148260915e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6011904776096344, |
|
"reward_std": 0.24968973733484745, |
|
"rewards/accuracy_reward": 0.2083333432674408, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3928571492433548, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 3494.4881591796875, |
|
"epoch": 2.4475524475524475, |
|
"grad_norm": 0.09692779183387756, |
|
"kl": 0.00200653076171875, |
|
"learning_rate": 6.537771418340981e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6830357313156128, |
|
"reward_std": 0.37514135241508484, |
|
"rewards/accuracy_reward": 0.2738095298409462, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.409226194024086, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 3433.0982666015625, |
|
"epoch": 2.4755244755244754, |
|
"grad_norm": 0.10614696890115738, |
|
"kl": 0.00193023681640625, |
|
"learning_rate": 6.452667425032349e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6532738283276558, |
|
"reward_std": 0.38044291734695435, |
|
"rewards/accuracy_reward": 0.24107143096625805, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.412202388048172, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 3480.40185546875, |
|
"epoch": 2.5034965034965033, |
|
"grad_norm": 0.13003912568092346, |
|
"kl": 0.0018329620361328125, |
|
"learning_rate": 6.367205172466403e-07, |
|
"loss": 0.0001, |
|
"reward": 0.643601194024086, |
|
"reward_std": 0.31658271327614784, |
|
"rewards/accuracy_reward": 0.24702381901443005, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3965773805975914, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 3084.0804443359375, |
|
"epoch": 2.5314685314685317, |
|
"grad_norm": 0.09549430012702942, |
|
"kl": 0.002101898193359375, |
|
"learning_rate": 6.281416799501187e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7269345223903656, |
|
"reward_std": 0.36572954058647156, |
|
"rewards/accuracy_reward": 0.27678571827709675, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4501488134264946, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 3418.8155517578125, |
|
"epoch": 2.5594405594405596, |
|
"grad_norm": 0.11826770007610321, |
|
"kl": 0.0019245147705078125, |
|
"learning_rate": 6.195334567635283e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6346726343035698, |
|
"reward_std": 0.34816064313054085, |
|
"rewards/accuracy_reward": 0.23511905036866665, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3995535746216774, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 3348.9732666015625, |
|
"epoch": 2.5874125874125875, |
|
"grad_norm": 0.09275460988283157, |
|
"kl": 0.001911163330078125, |
|
"learning_rate": 6.10899084887559e-07, |
|
"loss": 0.0001, |
|
"reward": 0.70238097012043, |
|
"reward_std": 0.3649657368659973, |
|
"rewards/accuracy_reward": 0.2827381044626236, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4196428656578064, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 3493.886962890625, |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 0.09432340413331985, |
|
"kl": 0.001922607421875, |
|
"learning_rate": 6.022418113563535e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6912202388048172, |
|
"reward_std": 0.3915746212005615, |
|
"rewards/accuracy_reward": 0.27380952425301075, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4174107164144516, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 2935.6726684570312, |
|
"epoch": 2.6433566433566433, |
|
"grad_norm": 0.09711038321256638, |
|
"kl": 0.0020694732666015625, |
|
"learning_rate": 5.935648918164306e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8415178656578064, |
|
"reward_std": 0.3873477354645729, |
|
"rewards/accuracy_reward": 0.3928571566939354, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4486607238650322, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 3010.7738037109375, |
|
"epoch": 2.6713286713286712, |
|
"grad_norm": 0.09161806106567383, |
|
"kl": 0.0018405914306640625, |
|
"learning_rate": 5.848715893023689e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8355654925107956, |
|
"reward_std": 0.2888724021613598, |
|
"rewards/accuracy_reward": 0.3720238208770752, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4635416865348816, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 2880.8810424804688, |
|
"epoch": 2.699300699300699, |
|
"grad_norm": 0.10187753289937973, |
|
"kl": 0.002147674560546875, |
|
"learning_rate": 5.761651730097142e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8497024178504944, |
|
"reward_std": 0.36032697558403015, |
|
"rewards/accuracy_reward": 0.3869047686457634, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4627976194024086, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 3113.1726684570312, |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.09418516606092453, |
|
"kl": 0.0018100738525390625, |
|
"learning_rate": 5.674489170655675e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7410714477300644, |
|
"reward_std": 0.3313647545874119, |
|
"rewards/accuracy_reward": 0.2976190485060215, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.443452388048172, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 2379.4732666015625, |
|
"epoch": 2.755244755244755, |
|
"grad_norm": 0.10177874565124512, |
|
"kl": 0.002620697021484375, |
|
"learning_rate": 5.587260992973209e-07, |
|
"loss": 0.0001, |
|
"reward": 0.820684552192688, |
|
"reward_std": 0.3514714166522026, |
|
"rewards/accuracy_reward": 0.3035714328289032, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.517113097012043, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 2891.5149536132812, |
|
"epoch": 2.7832167832167833, |
|
"grad_norm": 0.1491205096244812, |
|
"kl": 0.0019989013671875, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5982143133878708, |
|
"reward_std": 0.26152127981185913, |
|
"rewards/accuracy_reward": 0.13095238455571234, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4672619104385376, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 2804.0357666015625, |
|
"epoch": 2.8111888111888113, |
|
"grad_norm": 0.11217369139194489, |
|
"kl": 0.002048492431640625, |
|
"learning_rate": 5.41273900702679e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7849702537059784, |
|
"reward_std": 0.3465358465909958, |
|
"rewards/accuracy_reward": 0.2946428656578064, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.490327388048172, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 2823.2500610351562, |
|
"epoch": 2.839160839160839, |
|
"grad_norm": 0.10132594406604767, |
|
"kl": 0.002750396728515625, |
|
"learning_rate": 5.325510829344324e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6532738134264946, |
|
"reward_std": 0.31015192717313766, |
|
"rewards/accuracy_reward": 0.15773809794336557, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4955357238650322, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 2617.6280517578125, |
|
"epoch": 2.867132867132867, |
|
"grad_norm": 0.10968906432390213, |
|
"kl": 0.002490997314453125, |
|
"learning_rate": 5.238348269902859e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7165178805589676, |
|
"reward_std": 0.2918235771358013, |
|
"rewards/accuracy_reward": 0.2261904776096344, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4903273805975914, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 2468.5327758789062, |
|
"epoch": 2.895104895104895, |
|
"grad_norm": 0.10988292843103409, |
|
"kl": 0.00304412841796875, |
|
"learning_rate": 5.151284106976311e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7790178805589676, |
|
"reward_std": 0.29311515390872955, |
|
"rewards/accuracy_reward": 0.2678571455180645, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5111607164144516, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 2649.8631591796875, |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 0.11911734938621521, |
|
"kl": 0.00260162353515625, |
|
"learning_rate": 5.064351081835694e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7857142984867096, |
|
"reward_std": 0.3036133013665676, |
|
"rewards/accuracy_reward": 0.2946428656578064, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4910714477300644, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 2294.0923767089844, |
|
"epoch": 2.951048951048951, |
|
"grad_norm": 0.10718347877264023, |
|
"kl": 0.00302886962890625, |
|
"learning_rate": 4.977581886436462e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8206845372915268, |
|
"reward_std": 0.37444857507944107, |
|
"rewards/accuracy_reward": 0.255952388048172, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.564732164144516, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 3227.6697387695312, |
|
"epoch": 2.979020979020979, |
|
"grad_norm": 0.13584300875663757, |
|
"kl": 0.002513885498046875, |
|
"learning_rate": 4.891009151124411e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5438988208770752, |
|
"reward_std": 0.2875536195933819, |
|
"rewards/accuracy_reward": 0.1160714328289032, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.427827388048172, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 3472.122802734375, |
|
"epoch": 3.0, |
|
"grad_norm": 0.14357814192771912, |
|
"kl": 0.0024566650390625, |
|
"learning_rate": 4.804665432364719e-07, |
|
"loss": 0.0001, |
|
"reward": 0.4494047661622365, |
|
"reward_std": 0.2378531942764918, |
|
"rewards/accuracy_reward": 0.06349206529557705, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3859127064545949, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 2802.714324951172, |
|
"epoch": 3.027972027972028, |
|
"grad_norm": 0.10655632615089417, |
|
"kl": 0.002201080322265625, |
|
"learning_rate": 4.7185832004988133e-07, |
|
"loss": 0.0001, |
|
"reward": 0.594494067132473, |
|
"reward_std": 0.2724935933947563, |
|
"rewards/accuracy_reward": 0.10714285774156451, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4873512014746666, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 3067.919677734375, |
|
"epoch": 3.055944055944056, |
|
"grad_norm": 0.17373289167881012, |
|
"kl": 0.0023937225341796875, |
|
"learning_rate": 4.632794827533596e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5587797686457634, |
|
"reward_std": 0.26902300491929054, |
|
"rewards/accuracy_reward": 0.0922619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4665178656578064, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 2598.9345703125, |
|
"epoch": 3.0839160839160837, |
|
"grad_norm": 0.13991814851760864, |
|
"kl": 0.002651214599609375, |
|
"learning_rate": 4.547332574967653e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6733631193637848, |
|
"reward_std": 0.2784017063677311, |
|
"rewards/accuracy_reward": 0.13095238571986556, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5424107164144516, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 2481.2262573242188, |
|
"epoch": 3.111888111888112, |
|
"grad_norm": 0.12796764075756073, |
|
"kl": 0.00284576416015625, |
|
"learning_rate": 4.4622285816590186e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6428571492433548, |
|
"reward_std": 0.25486208125948906, |
|
"rewards/accuracy_reward": 0.10416666907258332, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5386904925107956, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 2844.6786499023438, |
|
"epoch": 3.13986013986014, |
|
"grad_norm": 0.11658414453268051, |
|
"kl": 0.0024261474609375, |
|
"learning_rate": 4.3775148517390846e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.2692863494157791, |
|
"rewards/accuracy_reward": 0.16964286006987095, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.482142873108387, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 2809.416748046875, |
|
"epoch": 3.167832167832168, |
|
"grad_norm": 0.12356197834014893, |
|
"kl": 0.002544403076171875, |
|
"learning_rate": 4.293223242577131e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6004464477300644, |
|
"reward_std": 0.26328422501683235, |
|
"rewards/accuracy_reward": 0.09821428754366934, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5022321492433548, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 3025.8661499023438, |
|
"epoch": 3.195804195804196, |
|
"grad_norm": 0.11549896001815796, |
|
"kl": 0.00232696533203125, |
|
"learning_rate": 4.209385452800095e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.3151152990758419, |
|
"rewards/accuracy_reward": 0.1428571455180645, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4642857313156128, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 2975.202392578125, |
|
"epoch": 3.2237762237762237, |
|
"grad_norm": 0.10660137236118317, |
|
"kl": 0.00252532958984375, |
|
"learning_rate": 4.126033010371991e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5825892984867096, |
|
"reward_std": 0.2717975974082947, |
|
"rewards/accuracy_reward": 0.12202380993403494, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.460565485060215, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 2774.3155517578125, |
|
"epoch": 3.2517482517482517, |
|
"grad_norm": 0.1331847459077835, |
|
"kl": 0.002536773681640625, |
|
"learning_rate": 4.043197260737555e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6711309552192688, |
|
"reward_std": 0.2978888005018234, |
|
"rewards/accuracy_reward": 0.18750000093132257, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4836309626698494, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 2632.1339721679688, |
|
"epoch": 3.2797202797202796, |
|
"grad_norm": 0.11804653704166412, |
|
"kl": 0.002796173095703125, |
|
"learning_rate": 3.9609093550344907e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6644345372915268, |
|
"reward_std": 0.33424488455057144, |
|
"rewards/accuracy_reward": 0.1547619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.509672611951828, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 2838.5297241210938, |
|
"epoch": 3.3076923076923075, |
|
"grad_norm": 0.13138364255428314, |
|
"kl": 0.002597808837890625, |
|
"learning_rate": 3.8792002383788036e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5818452388048172, |
|
"reward_std": 0.259741447865963, |
|
"rewards/accuracy_reward": 0.08928571850992739, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4925595372915268, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 3014.5476684570312, |
|
"epoch": 3.335664335664336, |
|
"grad_norm": 0.10719572007656097, |
|
"kl": 0.00262451171875, |
|
"learning_rate": 3.7981006382276093e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6049107164144516, |
|
"reward_std": 0.27329112216830254, |
|
"rewards/accuracy_reward": 0.11904762033373117, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4858631044626236, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 2792.014892578125, |
|
"epoch": 3.3636363636363638, |
|
"grad_norm": 0.09954708069562912, |
|
"kl": 0.002727508544921875, |
|
"learning_rate": 3.7176410528237945e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7752976417541504, |
|
"reward_std": 0.3618427440524101, |
|
"rewards/accuracy_reward": 0.2797619104385376, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4955357238650322, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 2741.3720703125, |
|
"epoch": 3.3916083916083917, |
|
"grad_norm": 0.11019442230463028, |
|
"kl": 0.003749847412109375, |
|
"learning_rate": 3.637851739726874e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8489583432674408, |
|
"reward_std": 0.4548647478222847, |
|
"rewards/accuracy_reward": 0.3511904813349247, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4977678582072258, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 3301.1488647460938, |
|
"epoch": 3.4195804195804196, |
|
"grad_norm": 0.10979685932397842, |
|
"kl": 0.003269195556640625, |
|
"learning_rate": 3.5587627044343604e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6569940596818924, |
|
"reward_std": 0.29442668706178665, |
|
"rewards/accuracy_reward": 0.23511905409395695, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4218750149011612, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 3237.297607421875, |
|
"epoch": 3.4475524475524475, |
|
"grad_norm": 0.09944824874401093, |
|
"kl": 0.003009796142578125, |
|
"learning_rate": 3.4804036890979205e-07, |
|
"loss": 0.0001, |
|
"reward": 0.762648805975914, |
|
"reward_std": 0.35693658888339996, |
|
"rewards/accuracy_reward": 0.3184523917734623, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4441964402794838, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 3247.4404907226562, |
|
"epoch": 3.4755244755244754, |
|
"grad_norm": 0.1071368008852005, |
|
"kl": 0.003139495849609375, |
|
"learning_rate": 3.402804161338577e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7090774029493332, |
|
"reward_std": 0.39585674554109573, |
|
"rewards/accuracy_reward": 0.2708333358168602, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4382440522313118, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 3228.3363647460938, |
|
"epoch": 3.5034965034965033, |
|
"grad_norm": 0.10697885602712631, |
|
"kl": 0.003299713134765625, |
|
"learning_rate": 3.3259933031651266e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6659226417541504, |
|
"reward_std": 0.379042886197567, |
|
"rewards/accuracy_reward": 0.23511904664337635, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.430803582072258, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 2906.5477294921875, |
|
"epoch": 3.5314685314685317, |
|
"grad_norm": 0.12582722306251526, |
|
"kl": 0.003780364990234375, |
|
"learning_rate": 3.250000000000001e-07, |
|
"loss": 0.0002, |
|
"reward": 0.793898805975914, |
|
"reward_std": 0.3561762161552906, |
|
"rewards/accuracy_reward": 0.3244047611951828, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.469494067132473, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 3254.8929443359375, |
|
"epoch": 3.5594405594405596, |
|
"grad_norm": 0.09814245998859406, |
|
"kl": 0.003406524658203125, |
|
"learning_rate": 3.1748528298166164e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7239583432674408, |
|
"reward_std": 0.37790394574403763, |
|
"rewards/accuracy_reward": 0.2797619067132473, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4441964253783226, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 3201.9077758789062, |
|
"epoch": 3.5874125874125875, |
|
"grad_norm": 0.10079911351203918, |
|
"kl": 0.0031280517578125, |
|
"learning_rate": 3.1005800523923903e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7693452537059784, |
|
"reward_std": 0.406753808259964, |
|
"rewards/accuracy_reward": 0.3244047611951828, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.444940485060215, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 3339.5059814453125, |
|
"epoch": 3.6153846153846154, |
|
"grad_norm": 0.09590104967355728, |
|
"kl": 0.003513336181640625, |
|
"learning_rate": 3.027209598681373e-07, |
|
"loss": 0.0001, |
|
"reward": 0.725446455180645, |
|
"reward_std": 0.38120192289352417, |
|
"rewards/accuracy_reward": 0.29464286752045155, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.430803582072258, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 2748.104248046875, |
|
"epoch": 3.6433566433566433, |
|
"grad_norm": 0.13206864893436432, |
|
"kl": 0.004322052001953125, |
|
"learning_rate": 2.954769060310577e-07, |
|
"loss": 0.0002, |
|
"reward": 0.879464328289032, |
|
"reward_std": 0.4047969654202461, |
|
"rewards/accuracy_reward": 0.3928571492433548, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4866071566939354, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 3002.3394165039062, |
|
"epoch": 3.6713286713286712, |
|
"grad_norm": 0.10522563755512238, |
|
"kl": 0.00333404541015625, |
|
"learning_rate": 2.8832856792038794e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8526785969734192, |
|
"reward_std": 0.361838236451149, |
|
"rewards/accuracy_reward": 0.3750000149011612, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.477678582072258, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 2796.1934814453125, |
|
"epoch": 3.699300699300699, |
|
"grad_norm": 0.11168068647384644, |
|
"kl": 0.003910064697265625, |
|
"learning_rate": 2.812786337337463e-07, |
|
"loss": 0.0002, |
|
"reward": 0.903273805975914, |
|
"reward_std": 0.395388700067997, |
|
"rewards/accuracy_reward": 0.410714291036129, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4925595298409462, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 2953.2053833007812, |
|
"epoch": 3.7272727272727275, |
|
"grad_norm": 0.09826097637414932, |
|
"kl": 0.00336456298828125, |
|
"learning_rate": 2.743297546630587e-07, |
|
"loss": 0.0001, |
|
"reward": 0.7938988208770752, |
|
"reward_std": 0.3516330271959305, |
|
"rewards/accuracy_reward": 0.3214285783469677, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4724702537059784, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 2285.59521484375, |
|
"epoch": 3.755244755244755, |
|
"grad_norm": 0.11822542548179626, |
|
"kl": 0.0056304931640625, |
|
"learning_rate": 2.674845438975557e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8407738357782364, |
|
"reward_std": 0.3399865999817848, |
|
"rewards/accuracy_reward": 0.2976190522313118, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.543154776096344, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 2728.6280517578125, |
|
"epoch": 3.7832167832167833, |
|
"grad_norm": 0.17046403884887695, |
|
"kl": 0.003498077392578125, |
|
"learning_rate": 2.6074557564105724e-07, |
|
"loss": 0.0001, |
|
"reward": 0.649553582072258, |
|
"reward_std": 0.288466639816761, |
|
"rewards/accuracy_reward": 0.14285714365541935, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5066964402794838, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 2683.8005981445312, |
|
"epoch": 3.8111888111888113, |
|
"grad_norm": 0.11535181850194931, |
|
"kl": 0.00335693359375, |
|
"learning_rate": 2.541153841439214e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8236607313156128, |
|
"reward_std": 0.337150476872921, |
|
"rewards/accuracy_reward": 0.30059524066746235, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.523065485060215, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 2738.6964721679688, |
|
"epoch": 3.839160839160839, |
|
"grad_norm": 0.11976780742406845, |
|
"kl": 0.00394439697265625, |
|
"learning_rate": 2.475964627500149e-07, |
|
"loss": 0.0002, |
|
"reward": 0.703125, |
|
"reward_std": 0.33021562546491623, |
|
"rewards/accuracy_reward": 0.19047619588673115, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5126488208770752, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 2560.4107666015625, |
|
"epoch": 3.867132867132867, |
|
"grad_norm": 0.11340122669935226, |
|
"kl": 0.003749847412109375, |
|
"learning_rate": 2.411912629590699e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7388392984867096, |
|
"reward_std": 0.3105376362800598, |
|
"rewards/accuracy_reward": 0.2321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5066964402794838, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 2278.681640625, |
|
"epoch": 3.895104895104895, |
|
"grad_norm": 0.11167583614587784, |
|
"kl": 0.005157470703125, |
|
"learning_rate": 2.349021935047742e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8244047909975052, |
|
"reward_std": 0.2984638176858425, |
|
"rewards/accuracy_reward": 0.2857142873108387, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5386904925107956, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 2529.0982666015625, |
|
"epoch": 3.9230769230769234, |
|
"grad_norm": 0.15898652374744415, |
|
"kl": 0.0040130615234375, |
|
"learning_rate": 2.287316194489455e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7641369104385376, |
|
"reward_std": 0.32191672176122665, |
|
"rewards/accuracy_reward": 0.2589285783469677, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5052083507180214, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 2281.9910888671875, |
|
"epoch": 3.951048951048951, |
|
"grad_norm": 0.11731097847223282, |
|
"kl": 0.0043792724609375, |
|
"learning_rate": 2.2268186129212807e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9226190745830536, |
|
"reward_std": 0.3625694811344147, |
|
"rewards/accuracy_reward": 0.318452388048172, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6041666716337204, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 3223.9078979492188, |
|
"epoch": 3.979020979020979, |
|
"grad_norm": 0.1330711990594864, |
|
"kl": 0.0041351318359375, |
|
"learning_rate": 2.16755194100948e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5401785746216774, |
|
"reward_std": 0.319940485060215, |
|
"rewards/accuracy_reward": 0.11607143096625805, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4241071492433548, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 3505.614013671875, |
|
"epoch": 4.0, |
|
"grad_norm": 0.15241242945194244, |
|
"kl": 0.0035298665364583335, |
|
"learning_rate": 2.1095384665255267e-07, |
|
"loss": 0.0001, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.2562485933303833, |
|
"rewards/accuracy_reward": 0.08333333457509677, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3988095323244731, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 2669.4762573242188, |
|
"epoch": 4.027972027972028, |
|
"grad_norm": 0.132435142993927, |
|
"kl": 0.003795623779296875, |
|
"learning_rate": 2.0528000059645995e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6242559552192688, |
|
"reward_std": 0.2846153862774372, |
|
"rewards/accuracy_reward": 0.11904762126505375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5052083432674408, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 2907.2738647460938, |
|
"epoch": 4.055944055944056, |
|
"grad_norm": 0.14874334633350372, |
|
"kl": 0.003658294677734375, |
|
"learning_rate": 1.99735789634128e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5632440596818924, |
|
"reward_std": 0.27348505705595016, |
|
"rewards/accuracy_reward": 0.07440476352348924, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.488839291036129, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 2507.997100830078, |
|
"epoch": 4.083916083916084, |
|
"grad_norm": 0.1229638084769249, |
|
"kl": 0.004901885986328125, |
|
"learning_rate": 1.9432329871655836e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6912202537059784, |
|
"reward_std": 0.2840711995959282, |
|
"rewards/accuracy_reward": 0.15773809887468815, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5334821566939354, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 2388.607177734375, |
|
"epoch": 4.111888111888112, |
|
"grad_norm": 0.13819031417369843, |
|
"kl": 0.0043792724609375, |
|
"learning_rate": 1.8904456326023027e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6636904925107956, |
|
"reward_std": 0.2932458780705929, |
|
"rewards/accuracy_reward": 0.09821428940631449, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.565476194024086, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 2596.8035888671875, |
|
"epoch": 4.13986013986014, |
|
"grad_norm": 0.12725792825222015, |
|
"kl": 0.00445556640625, |
|
"learning_rate": 1.8390156838166462e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6793154925107956, |
|
"reward_std": 0.27951210737228394, |
|
"rewards/accuracy_reward": 0.17857143096625805, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5007440522313118, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 2657.2232666015625, |
|
"epoch": 4.1678321678321675, |
|
"grad_norm": 0.1211203783750534, |
|
"kl": 0.00418853759765625, |
|
"learning_rate": 1.7889624815090195e-07, |
|
"loss": 0.0002, |
|
"reward": 0.621279776096344, |
|
"reward_std": 0.293570376932621, |
|
"rewards/accuracy_reward": 0.11011905060149729, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5111607313156128, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 2903.3482666015625, |
|
"epoch": 4.195804195804196, |
|
"grad_norm": 0.14065128564834595, |
|
"kl": 0.003757476806640625, |
|
"learning_rate": 1.7403048486417868e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6443452388048172, |
|
"reward_std": 0.30524395406246185, |
|
"rewards/accuracy_reward": 0.1428571455180645, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.501488097012043, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 2863.4404907226562, |
|
"epoch": 4.223776223776224, |
|
"grad_norm": 0.12928026914596558, |
|
"kl": 0.003902435302734375, |
|
"learning_rate": 1.693061083360715e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6190476194024086, |
|
"reward_std": 0.306563138961792, |
|
"rewards/accuracy_reward": 0.1339285746216774, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4851190522313118, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 2692.7857666015625, |
|
"epoch": 4.251748251748252, |
|
"grad_norm": 0.12153156846761703, |
|
"kl": 0.00408935546875, |
|
"learning_rate": 1.6472489521138015e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6763392984867096, |
|
"reward_std": 0.31852778047323227, |
|
"rewards/accuracy_reward": 0.15773809980601072, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5186012014746666, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 2442.0684204101562, |
|
"epoch": 4.27972027972028, |
|
"grad_norm": 0.12675337493419647, |
|
"kl": 0.00487518310546875, |
|
"learning_rate": 1.6028856829700258e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7380952537059784, |
|
"reward_std": 0.3266712352633476, |
|
"rewards/accuracy_reward": 0.1815476231276989, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5565476417541504, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 2662.907745361328, |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 0.15777888894081116, |
|
"kl": 0.004787445068359375, |
|
"learning_rate": 1.5599879591405916e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6049107313156128, |
|
"reward_std": 0.23991192504763603, |
|
"rewards/accuracy_reward": 0.08333333604969084, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5215773954987526, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 2794.9673461914062, |
|
"epoch": 4.335664335664336, |
|
"grad_norm": 0.1465885192155838, |
|
"kl": 0.00409698486328125, |
|
"learning_rate": 1.5185719127050398e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5833333432674408, |
|
"reward_std": 0.2825283370912075, |
|
"rewards/accuracy_reward": 0.08928571594879031, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4940476343035698, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 2754.6488647460938, |
|
"epoch": 4.363636363636363, |
|
"grad_norm": 0.09911419451236725, |
|
"kl": 0.00395965576171875, |
|
"learning_rate": 1.4786531185446452e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7857142984867096, |
|
"reward_std": 0.3356376476585865, |
|
"rewards/accuracy_reward": 0.2976190559566021, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4880952537059784, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 2780.6726684570312, |
|
"epoch": 4.391608391608392, |
|
"grad_norm": 0.11424998939037323, |
|
"kl": 0.0051422119140625, |
|
"learning_rate": 1.4402465884853301e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8787202537059784, |
|
"reward_std": 0.41560350358486176, |
|
"rewards/accuracy_reward": 0.3779762014746666, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5007440522313118, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 3287.794677734375, |
|
"epoch": 4.41958041958042, |
|
"grad_norm": 0.1019367203116417, |
|
"kl": 0.00496673583984375, |
|
"learning_rate": 1.4033667656523404e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6815476268529892, |
|
"reward_std": 0.3411427028477192, |
|
"rewards/accuracy_reward": 0.24107143096625805, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.440476194024086, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 3285.3780517578125, |
|
"epoch": 4.4475524475524475, |
|
"grad_norm": 0.09824172407388687, |
|
"kl": 0.00437164306640625, |
|
"learning_rate": 1.3680275190387675e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7886904925107956, |
|
"reward_std": 0.3618383854627609, |
|
"rewards/accuracy_reward": 0.3452381044626236, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4434523954987526, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 3261.59228515625, |
|
"epoch": 4.475524475524476, |
|
"grad_norm": 0.12426801025867462, |
|
"kl": 0.004497528076171875, |
|
"learning_rate": 1.3342421382899935e-07, |
|
"loss": 0.0002, |
|
"reward": 0.699404776096344, |
|
"reward_std": 0.3835337683558464, |
|
"rewards/accuracy_reward": 0.2619047649204731, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375000149011612, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 3172.2529907226562, |
|
"epoch": 4.503496503496503, |
|
"grad_norm": 0.12890060245990753, |
|
"kl": 0.004352569580078125, |
|
"learning_rate": 1.3020233287059976e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7105654776096344, |
|
"reward_std": 0.35477447509765625, |
|
"rewards/accuracy_reward": 0.25595238991081715, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4546131044626236, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 2853.4256591796875, |
|
"epoch": 4.531468531468532, |
|
"grad_norm": 0.13204464316368103, |
|
"kl": 0.00543975830078125, |
|
"learning_rate": 1.2713832064634125e-07, |
|
"loss": 0.0002, |
|
"reward": 0.814732164144516, |
|
"reward_std": 0.4248877093195915, |
|
"rewards/accuracy_reward": 0.3154761977493763, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.49925597012043, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 3321.5536499023438, |
|
"epoch": 4.559440559440559, |
|
"grad_norm": 0.09118322283029556, |
|
"kl": 0.00458526611328125, |
|
"learning_rate": 1.2423332940591238e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7172619253396988, |
|
"reward_std": 0.342393409460783, |
|
"rewards/accuracy_reward": 0.2708333395421505, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4464285746216774, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 3086.2708740234375, |
|
"epoch": 4.5874125874125875, |
|
"grad_norm": 0.09752269089221954, |
|
"kl": 0.00424957275390625, |
|
"learning_rate": 1.2148845159771312e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7566964328289032, |
|
"reward_std": 0.3746805787086487, |
|
"rewards/accuracy_reward": 0.3095238134264946, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4471726268529892, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 3220.9583740234375, |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.09589620679616928, |
|
"kl": 0.0044403076171875, |
|
"learning_rate": 1.1890471945802999e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7388393133878708, |
|
"reward_std": 0.3828039579093456, |
|
"rewards/accuracy_reward": 0.28571428917348385, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.453125, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 2777.7857666015625, |
|
"epoch": 4.643356643356643, |
|
"grad_norm": 0.11369551718235016, |
|
"kl": 0.0052642822265625, |
|
"learning_rate": 1.1648310462285385e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8772321492433548, |
|
"reward_std": 0.41676195710897446, |
|
"rewards/accuracy_reward": 0.3958333432674408, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4813988283276558, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 2835.4881591796875, |
|
"epoch": 4.671328671328672, |
|
"grad_norm": 0.11145463585853577, |
|
"kl": 0.004909515380859375, |
|
"learning_rate": 1.142245177624874e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8630952537059784, |
|
"reward_std": 0.3876107409596443, |
|
"rewards/accuracy_reward": 0.3779762014746666, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4851190596818924, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 2691.9882202148438, |
|
"epoch": 4.699300699300699, |
|
"grad_norm": 0.10534343868494034, |
|
"kl": 0.0055389404296875, |
|
"learning_rate": 1.1212980823907929e-07, |
|
"loss": 0.0002, |
|
"reward": 0.9337797909975052, |
|
"reward_std": 0.3816476985812187, |
|
"rewards/accuracy_reward": 0.4375, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4962797686457634, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 2880.0149536132812, |
|
"epoch": 4.7272727272727275, |
|
"grad_norm": 0.11654610186815262, |
|
"kl": 0.00537872314453125, |
|
"learning_rate": 1.1019976378721399e-07, |
|
"loss": 0.0002, |
|
"reward": 0.815476194024086, |
|
"reward_std": 0.3700762465596199, |
|
"rewards/accuracy_reward": 0.3303571529686451, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4851190522313118, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 2267.869140625, |
|
"epoch": 4.755244755244755, |
|
"grad_norm": 0.19363805651664734, |
|
"kl": 0.00749969482421875, |
|
"learning_rate": 1.0843511021767689e-07, |
|
"loss": 0.0003, |
|
"reward": 0.8407738357782364, |
|
"reward_std": 0.34815119206905365, |
|
"rewards/accuracy_reward": 0.2797619178891182, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5610119253396988, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 2730.889892578125, |
|
"epoch": 4.783216783216783, |
|
"grad_norm": 0.11546841263771057, |
|
"kl": 0.004669189453125, |
|
"learning_rate": 1.068365111445064e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6510416716337204, |
|
"reward_std": 0.29459198564291, |
|
"rewards/accuracy_reward": 0.13988095801323652, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5111607164144516, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 2624.4077758789062, |
|
"epoch": 4.811188811188811, |
|
"grad_norm": 0.09517718106508255, |
|
"kl": 0.00469207763671875, |
|
"learning_rate": 1.0540456773543595e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8214285969734192, |
|
"reward_std": 0.3938767686486244, |
|
"rewards/accuracy_reward": 0.27678571827709675, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5446428582072258, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 2643.919677734375, |
|
"epoch": 4.839160839160839, |
|
"grad_norm": 0.11820892244577408, |
|
"kl": 0.0055084228515625, |
|
"learning_rate": 1.041398184858196e-07, |
|
"loss": 0.0002, |
|
"reward": 0.6852678656578064, |
|
"reward_std": 0.30904605984687805, |
|
"rewards/accuracy_reward": 0.1666666662786156, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.518601194024086, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 2524.047607421875, |
|
"epoch": 4.867132867132867, |
|
"grad_norm": 0.1165422648191452, |
|
"kl": 0.00485992431640625, |
|
"learning_rate": 1.0304273901612565e-07, |
|
"loss": 0.0002, |
|
"reward": 0.7284226268529892, |
|
"reward_std": 0.3345734477043152, |
|
"rewards/accuracy_reward": 0.2142857164144516, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5141369178891182, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 2217.1279907226562, |
|
"epoch": 4.895104895104895, |
|
"grad_norm": 0.13623353838920593, |
|
"kl": 0.00641632080078125, |
|
"learning_rate": 1.0211374189307538e-07, |
|
"loss": 0.0003, |
|
"reward": 0.8489583432674408, |
|
"reward_std": 0.33298908174037933, |
|
"rewards/accuracy_reward": 0.2797619104385376, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5691964328289032, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 2418.5833129882812, |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.12970149517059326, |
|
"kl": 0.005584716796875, |
|
"learning_rate": 1.013531764744936e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8043155074119568, |
|
"reward_std": 0.3505142778158188, |
|
"rewards/accuracy_reward": 0.2738095261156559, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5305059626698494, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 2166.7738342285156, |
|
"epoch": 4.951048951048951, |
|
"grad_norm": 0.12993088364601135, |
|
"kl": 0.00594329833984375, |
|
"learning_rate": 1.0076132877792932e-07, |
|
"loss": 0.0002, |
|
"reward": 0.8958333432674408, |
|
"reward_std": 0.3779018819332123, |
|
"rewards/accuracy_reward": 0.2886904813349247, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6071428805589676, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 3136.8482360839844, |
|
"epoch": 4.979020979020979, |
|
"grad_norm": 0.139509916305542, |
|
"kl": 0.0050506591796875, |
|
"learning_rate": 1.0033842137309648e-07, |
|
"loss": 0.0002, |
|
"reward": 0.5885416865348816, |
|
"reward_std": 0.3464737571775913, |
|
"rewards/accuracy_reward": 0.13095238152891397, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.457589291036129, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 3431.8421223958335, |
|
"epoch": 5.0, |
|
"grad_norm": 0.139509916305542, |
|
"kl": 0.004852294921875, |
|
"learning_rate": 1.000846132981744e-07, |
|
"loss": 0.0001, |
|
"reward": 0.5496031840642294, |
|
"reward_std": 0.2647865464289983, |
|
"rewards/accuracy_reward": 0.11904762188593547, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4305555621782939, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 180, |
|
"total_flos": 0.0, |
|
"train_loss": 8.540082282778706e-05, |
|
"train_runtime": 37307.6007, |
|
"train_samples_per_second": 0.134, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 180, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 12, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|