{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 2945.7202758789062, "epoch": 0.027972027972027972, "grad_norm": 0.08992995321750641, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.5104166641831398, "reward_std": 0.2102233674377203, "rewards/accuracy_reward": 0.0892857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4211309626698494, "step": 1 }, { "completion_length": 3191.761962890625, "epoch": 0.055944055944055944, "grad_norm": 0.30593544244766235, "kl": 0.0, "learning_rate": 5.555555555555555e-08, "loss": 0.0, "reward": 0.5022321417927742, "reward_std": 0.20526811853051186, "rewards/accuracy_reward": 0.0922619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4099702462553978, "step": 2 }, { "completion_length": 2838.3720703125, "epoch": 0.08391608391608392, "grad_norm": 0.09959172457456589, "kl": 4.9114227294921875e-05, "learning_rate": 1.111111111111111e-07, "loss": 0.0, "reward": 0.583333358168602, "reward_std": 0.22632932662963867, "rewards/accuracy_reward": 0.13095238152891397, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4523809626698494, "step": 3 }, { "completion_length": 2734.3572387695312, "epoch": 0.11188811188811189, "grad_norm": 0.11217548698186874, "kl": 5.0961971282958984e-05, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.5424107313156128, "reward_std": 0.2021841686218977, "rewards/accuracy_reward": 0.10416667023673654, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4382440522313118, "step": 4 }, { "completion_length": 2863.5357666015625, "epoch": 0.13986013986013987, "grad_norm": 0.11380404978990555, "kl": 5.334615707397461e-05, "learning_rate": 2.222222222222222e-07, "loss": 0.0, "reward": 0.5967262014746666, "reward_std": 0.22172891348600388, "rewards/accuracy_reward": 0.15476190764456987, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.441964291036129, "step": 5 }, { "completion_length": 3099.0416870117188, "epoch": 0.16783216783216784, "grad_norm": 0.104108065366745, "kl": 5.179643630981445e-05, "learning_rate": 2.7777777777777776e-07, "loss": 0.0, "reward": 0.5126488134264946, "reward_std": 0.22282272577285767, "rewards/accuracy_reward": 0.09226190578192472, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4203869178891182, "step": 6 }, { "completion_length": 3416.1934814453125, "epoch": 0.1958041958041958, "grad_norm": 0.10306566208600998, "kl": 5.620718002319336e-05, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.510416679084301, "reward_std": 0.261018592864275, "rewards/accuracy_reward": 0.11607143003493547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3943452462553978, "step": 7 }, { "completion_length": 3140.9405517578125, "epoch": 0.22377622377622378, "grad_norm": 0.09083328396081924, "kl": 5.40614128112793e-05, "learning_rate": 3.888888888888889e-07, "loss": 0.0, "reward": 0.5208333283662796, "reward_std": 0.2423064224421978, "rewards/accuracy_reward": 0.1011904776096344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428582072258, "step": 8 }, { "completion_length": 3093.7916870117188, "epoch": 0.2517482517482518, "grad_norm": 0.0941944494843483, "kl": 4.83393669128418e-05, "learning_rate": 4.444444444444444e-07, "loss": 0.0, "reward": 0.568452388048172, "reward_std": 0.2969280257821083, "rewards/accuracy_reward": 0.1488095261156559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428656578064, "step": 9 }, { "completion_length": 3059.0924072265625, "epoch": 0.27972027972027974, "grad_norm": 3.2292351722717285, "kl": 4.953145980834961e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.5967262089252472, "reward_std": 0.29494407773017883, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4300595298409462, "step": 10 }, { "completion_length": 3060.21728515625, "epoch": 0.3076923076923077, "grad_norm": 0.09246377646923065, "kl": 4.6312808990478516e-05, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "reward": 0.4940476343035698, "reward_std": 0.20687389746308327, "rewards/accuracy_reward": 0.07440476445481181, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428656578064, "step": 11 }, { "completion_length": 3174.196533203125, "epoch": 0.3356643356643357, "grad_norm": 0.10116475075483322, "kl": 5.650520324707031e-05, "learning_rate": 6.111111111111112e-07, "loss": 0.0, "reward": 0.5066964402794838, "reward_std": 0.22321293503046036, "rewards/accuracy_reward": 0.09821428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4084821566939354, "step": 12 }, { "completion_length": 3070.0089721679688, "epoch": 0.36363636363636365, "grad_norm": 0.10479767620563507, "kl": 0.0002524256706237793, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.6614583432674408, "reward_std": 0.2681122124195099, "rewards/accuracy_reward": 0.24404762126505375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4174107164144516, "step": 13 }, { "completion_length": 3290.7738647460938, "epoch": 0.3916083916083916, "grad_norm": 0.19513735175132751, "kl": 5.537271499633789e-05, "learning_rate": 7.222222222222221e-07, "loss": 0.0, "reward": 0.6562500074505806, "reward_std": 0.3831389471888542, "rewards/accuracy_reward": 0.25595238618552685, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4002976268529892, "step": 14 }, { "completion_length": 3630.5714721679688, "epoch": 0.4195804195804196, "grad_norm": 0.08842650800943375, "kl": 4.839897155761719e-05, "learning_rate": 7.777777777777778e-07, "loss": 0.0, "reward": 0.545386902987957, "reward_std": 0.27424266561865807, "rewards/accuracy_reward": 0.17559524066746235, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3697916716337204, "step": 15 }, { "completion_length": 3649.27685546875, "epoch": 0.44755244755244755, "grad_norm": 0.09835156053304672, "kl": 4.208087921142578e-05, "learning_rate": 8.333333333333333e-07, "loss": 0.0, "reward": 0.6138392984867096, "reward_std": 0.3248288035392761, "rewards/accuracy_reward": 0.2440476231276989, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3697916716337204, "step": 16 }, { "completion_length": 3623.0596313476562, "epoch": 0.4755244755244755, "grad_norm": 0.11101175099611282, "kl": 4.845857620239258e-05, "learning_rate": 8.888888888888888e-07, "loss": 0.0, "reward": 0.5372024029493332, "reward_std": 0.2965564988553524, "rewards/accuracy_reward": 0.1726190485060215, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3645833507180214, "step": 17 }, { "completion_length": 3498.8096313476562, "epoch": 0.5034965034965035, "grad_norm": 0.10944987833499908, "kl": 4.649162292480469e-05, "learning_rate": 9.444444444444444e-07, "loss": 0.0, "reward": 0.537202388048172, "reward_std": 0.3150208666920662, "rewards/accuracy_reward": 0.17261905316263437, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3645833358168602, "step": 18 }, { "completion_length": 3287.4048461914062, "epoch": 0.5314685314685315, "grad_norm": 0.1434430330991745, "kl": 5.739927291870117e-05, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.6160714402794838, "reward_std": 0.3258791044354439, "rewards/accuracy_reward": 0.22619047947227955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3898809477686882, "step": 19 }, { "completion_length": 3627.2202758789062, "epoch": 0.5594405594405595, "grad_norm": 0.09357653558254242, "kl": 4.5299530029296875e-05, "learning_rate": 9.999153867018255e-07, "loss": 0.0, "reward": 0.5297619104385376, "reward_std": 0.3098462224006653, "rewards/accuracy_reward": 0.16369047947227955, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3660714402794838, "step": 20 }, { "completion_length": 3486.3453369140625, "epoch": 0.5874125874125874, "grad_norm": 0.09195814281702042, "kl": 4.369020462036133e-05, "learning_rate": 9.996615786269034e-07, "loss": 0.0, "reward": 0.574404776096344, "reward_std": 0.3131341114640236, "rewards/accuracy_reward": 0.1964285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.377976194024086, "step": 21 }, { "completion_length": 3686.1131591796875, "epoch": 0.6153846153846154, "grad_norm": 0.10805041342973709, "kl": 3.5136938095092773e-05, "learning_rate": 9.992386712220707e-07, "loss": 0.0, "reward": 0.5855654701590538, "reward_std": 0.317733321338892, "rewards/accuracy_reward": 0.2172619104385376, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3683035746216774, "step": 22 }, { "completion_length": 3132.3988037109375, "epoch": 0.6433566433566433, "grad_norm": 0.10081591457128525, "kl": 4.07099723815918e-05, "learning_rate": 9.986468235255064e-07, "loss": 0.0, "reward": 0.6860119104385376, "reward_std": 0.3946758955717087, "rewards/accuracy_reward": 0.2797619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4062500074505806, "step": 23 }, { "completion_length": 3288.7650146484375, "epoch": 0.6713286713286714, "grad_norm": 0.08800289034843445, "kl": 3.781914710998535e-05, "learning_rate": 9.978862581069245e-07, "loss": 0.0, "reward": 0.6815476417541504, "reward_std": 0.3288978040218353, "rewards/accuracy_reward": 0.2857142947614193, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3958333432674408, "step": 24 }, { "completion_length": 3099.306640625, "epoch": 0.6993006993006993, "grad_norm": 0.11318381130695343, "kl": 3.314018249511719e-05, "learning_rate": 9.969572609838744e-07, "loss": 0.0, "reward": 0.7291666716337204, "reward_std": 0.3738822266459465, "rewards/accuracy_reward": 0.3125000074505806, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4166666716337204, "step": 25 }, { "completion_length": 3236.0416870117188, "epoch": 0.7272727272727273, "grad_norm": 0.10187575966119766, "kl": 3.30805778503418e-05, "learning_rate": 9.958601815141803e-07, "loss": 0.0, "reward": 0.6294642984867096, "reward_std": 0.352918803691864, "rewards/accuracy_reward": 0.2410714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3883928656578064, "step": 26 }, { "completion_length": 2626.6101684570312, "epoch": 0.7552447552447552, "grad_norm": 0.1044318675994873, "kl": 5.5670738220214844e-05, "learning_rate": 9.94595432264564e-07, "loss": 0.0, "reward": 0.6822916865348816, "reward_std": 0.3088056966662407, "rewards/accuracy_reward": 0.23809524439275265, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964402794838, "step": 27 }, { "completion_length": 3134.5774536132812, "epoch": 0.7832167832167832, "grad_norm": 0.08211353421211243, "kl": 5.906820297241211e-05, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "reward": 0.4985119104385376, "reward_std": 0.1905420981347561, "rewards/accuracy_reward": 0.08630952518433332, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412202388048172, "step": 28 }, { "completion_length": 2987.3304443359375, "epoch": 0.8111888111888111, "grad_norm": 0.08840525150299072, "kl": 5.1856040954589844e-05, "learning_rate": 9.915648897823232e-07, "loss": 0.0, "reward": 0.6510416716337204, "reward_std": 0.2952599339187145, "rewards/accuracy_reward": 0.20833334140479565, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4427083358168602, "step": 29 }, { "completion_length": 3049.0328369140625, "epoch": 0.8391608391608392, "grad_norm": 0.09037017077207565, "kl": 7.843971252441406e-05, "learning_rate": 9.89800236212786e-07, "loss": 0.0, "reward": 0.574404776096344, "reward_std": 0.26248788461089134, "rewards/accuracy_reward": 0.15476190787740052, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428582072258, "step": 30 }, { "completion_length": 2818.0387573242188, "epoch": 0.8671328671328671, "grad_norm": 0.08912839740514755, "kl": 8.428096771240234e-05, "learning_rate": 9.878701917609207e-07, "loss": 0.0, "reward": 0.6257440596818924, "reward_std": 0.2523465231060982, "rewards/accuracy_reward": 0.17261905036866665, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4531250074505806, "step": 31 }, { "completion_length": 2675.1845703125, "epoch": 0.8951048951048951, "grad_norm": 0.0868036225438118, "kl": 0.00010216236114501953, "learning_rate": 9.857754822375126e-07, "loss": 0.0, "reward": 0.6904762089252472, "reward_std": 0.25569987669587135, "rewards/accuracy_reward": 0.2410714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.449404776096344, "step": 32 }, { "completion_length": 2967.2411499023438, "epoch": 0.9230769230769231, "grad_norm": 0.11090683937072754, "kl": 0.0001150369644165039, "learning_rate": 9.83516895377146e-07, "loss": 0.0, "reward": 0.6324404925107956, "reward_std": 0.2739677280187607, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071492433548, "step": 33 }, { "completion_length": 2569.0238647460938, "epoch": 0.951048951048951, "grad_norm": 0.10315605998039246, "kl": 0.0001518726348876953, "learning_rate": 9.8109528054197e-07, "loss": 0.0, "reward": 0.7061012089252472, "reward_std": 0.30017876625061035, "rewards/accuracy_reward": 0.2172619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4888392984867096, "step": 34 }, { "completion_length": 3555.8452758789062, "epoch": 0.9790209790209791, "grad_norm": 0.14011836051940918, "kl": 0.00016379356384277344, "learning_rate": 9.785115484022869e-07, "loss": 0.0, "reward": 0.4531250149011612, "reward_std": 0.23945146799087524, "rewards/accuracy_reward": 0.09226190857589245, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3608631044626236, "step": 35 }, { "completion_length": 3795.0263671875, "epoch": 1.0, "grad_norm": 0.14011836051940918, "kl": 0.00016848246256510416, "learning_rate": 9.757666705940878e-07, "loss": 0.0, "reward": 0.3928571542104085, "reward_std": 0.16382549703121185, "rewards/accuracy_reward": 0.0436507947742939, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3492063581943512, "step": 36 }, { "completion_length": 2951.9464111328125, "epoch": 1.027972027972028, "grad_norm": 0.1335798054933548, "kl": 0.00022125244140625, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.497023805975914, "reward_std": 0.16415998339653015, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4255952388048172, "step": 37 }, { "completion_length": 3158.5267944335938, "epoch": 1.055944055944056, "grad_norm": 0.16623912751674652, "kl": 0.00020551681518554688, "learning_rate": 9.697976671294003e-07, "loss": 0.0, "reward": 0.5148809626698494, "reward_std": 0.22238681092858315, "rewards/accuracy_reward": 0.0922619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4226190596818924, "step": 38 }, { "completion_length": 2795.3155517578125, "epoch": 1.083916083916084, "grad_norm": 0.12368790060281754, "kl": 0.0002484321594238281, "learning_rate": 9.665757861710007e-07, "loss": 0.0, "reward": 0.5699404999613762, "reward_std": 0.22224155068397522, "rewards/accuracy_reward": 0.11309523973613977, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.456845261156559, "step": 39 }, { "completion_length": 2715.77392578125, "epoch": 1.1118881118881119, "grad_norm": 0.09115960448980331, "kl": 0.00038814544677734375, "learning_rate": 9.631972480961233e-07, "loss": 0.0, "reward": 0.5424107313156128, "reward_std": 0.21795128658413887, "rewards/accuracy_reward": 0.09226190415211022, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4501488283276558, "step": 40 }, { "completion_length": 2979.4970092773438, "epoch": 1.1398601398601398, "grad_norm": 0.17517763376235962, "kl": 0.0002455711364746094, "learning_rate": 9.59663323434766e-07, "loss": 0.0, "reward": 0.5796131044626236, "reward_std": 0.22455461882054806, "rewards/accuracy_reward": 0.1517857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427827388048172, "step": 41 }, { "completion_length": 3172.4226684570312, "epoch": 1.167832167832168, "grad_norm": 0.09061767905950546, "kl": 0.00025773048400878906, "learning_rate": 9.55975341151467e-07, "loss": 0.0, "reward": 0.4962797835469246, "reward_std": 0.2209780216217041, "rewards/accuracy_reward": 0.07738095452077687, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4188988208770752, "step": 42 }, { "completion_length": 3369.6339721679688, "epoch": 1.1958041958041958, "grad_norm": 0.07707580924034119, "kl": 0.00022912025451660156, "learning_rate": 9.521346881455354e-07, "loss": 0.0, "reward": 0.5066964328289032, "reward_std": 0.221993587911129, "rewards/accuracy_reward": 0.1011904776096344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4055059626698494, "step": 43 }, { "completion_length": 3192.27685546875, "epoch": 1.2237762237762237, "grad_norm": 0.10000985860824585, "kl": 0.00026798248291015625, "learning_rate": 9.481428087294959e-07, "loss": 0.0, "reward": 0.5111607164144516, "reward_std": 0.22597463242709637, "rewards/accuracy_reward": 0.095238097012043, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4159226268529892, "step": 44 }, { "completion_length": 3079.33935546875, "epoch": 1.2517482517482517, "grad_norm": 0.10022323578596115, "kl": 0.00036716461181640625, "learning_rate": 9.440012040859408e-07, "loss": 0.0, "reward": 0.5535714477300644, "reward_std": 0.2452528141438961, "rewards/accuracy_reward": 0.13988095615059137, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4136904776096344, "step": 45 }, { "completion_length": 3041.0208740234375, "epoch": 1.2797202797202798, "grad_norm": 0.08870735764503479, "kl": 0.0003390312194824219, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "reward": 0.5625000149011612, "reward_std": 0.2692300006747246, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4285714402794838, "step": 46 }, { "completion_length": 3139.7202758789062, "epoch": 1.3076923076923077, "grad_norm": 0.1289481371641159, "kl": 0.0003085136413574219, "learning_rate": 9.352751047886198e-07, "loss": 0.0, "reward": 0.490327388048172, "reward_std": 0.2173246443271637, "rewards/accuracy_reward": 0.07142857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4188988134264946, "step": 47 }, { "completion_length": 3241.5535888671875, "epoch": 1.3356643356643356, "grad_norm": 0.08511940389871597, "kl": 0.0003581047058105469, "learning_rate": 9.306938916639285e-07, "loss": 0.0, "reward": 0.504464291036129, "reward_std": 0.2454623058438301, "rewards/accuracy_reward": 0.09523809747770429, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.409226194024086, "step": 48 }, { "completion_length": 3153.041748046875, "epoch": 1.3636363636363638, "grad_norm": 0.11432822048664093, "kl": 0.00035762786865234375, "learning_rate": 9.259695151358214e-07, "loss": 0.0, "reward": 0.6778274029493332, "reward_std": 0.29422780871391296, "rewards/accuracy_reward": 0.2619047723710537, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4159226343035698, "step": 49 }, { "completion_length": 3244.0416870117188, "epoch": 1.3916083916083917, "grad_norm": 0.07997111976146698, "kl": 0.0005092620849609375, "learning_rate": 9.21103751849098e-07, "loss": 0.0, "reward": 0.721726194024086, "reward_std": 0.4153262600302696, "rewards/accuracy_reward": 0.3125000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.409226194024086, "step": 50 }, { "completion_length": 3661.2827758789062, "epoch": 1.4195804195804196, "grad_norm": 0.09987284988164902, "kl": 0.0004477500915527344, "learning_rate": 9.160984316183354e-07, "loss": 0.0, "reward": 0.5461309626698494, "reward_std": 0.27763326466083527, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3586309552192688, "step": 51 }, { "completion_length": 3599.571533203125, "epoch": 1.4475524475524475, "grad_norm": 0.09226205199956894, "kl": 0.000530242919921875, "learning_rate": 9.109554367397697e-07, "loss": 0.0, "reward": 0.636904776096344, "reward_std": 0.34169958531856537, "rewards/accuracy_reward": 0.2500000037252903, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3869047611951828, "step": 52 }, { "completion_length": 3564.58935546875, "epoch": 1.4755244755244754, "grad_norm": 0.09048299491405487, "kl": 0.000576019287109375, "learning_rate": 9.056767012834416e-07, "loss": 0.0, "reward": 0.5989583507180214, "reward_std": 0.33351704478263855, "rewards/accuracy_reward": 0.223214291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3757440596818924, "step": 53 }, { "completion_length": 3535.7650146484375, "epoch": 1.5034965034965035, "grad_norm": 0.09732116013765335, "kl": 0.0005936622619628906, "learning_rate": 9.002642103658719e-07, "loss": 0.0, "reward": 0.57738097012043, "reward_std": 0.3000512942671776, "rewards/accuracy_reward": 0.19642857182770967, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.380952388048172, "step": 54 }, { "completion_length": 3258.33935546875, "epoch": 1.5314685314685315, "grad_norm": 0.08897067606449127, "kl": 0.0005841255187988281, "learning_rate": 8.9471999940354e-07, "loss": 0.0, "reward": 0.6555059626698494, "reward_std": 0.31323162093758583, "rewards/accuracy_reward": 0.25297619588673115, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.402529776096344, "step": 55 }, { "completion_length": 3488.7828369140625, "epoch": 1.5594405594405596, "grad_norm": 0.11047809571027756, "kl": 0.0006580352783203125, "learning_rate": 8.890461533474473e-07, "loss": 0.0, "reward": 0.6056547611951828, "reward_std": 0.3212001100182533, "rewards/accuracy_reward": 0.220238097012043, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.385416679084301, "step": 56 }, { "completion_length": 3451.3245239257812, "epoch": 1.5874125874125875, "grad_norm": 0.09569097310304642, "kl": 0.0006022453308105469, "learning_rate": 8.832448058990521e-07, "loss": 0.0, "reward": 0.596726194024086, "reward_std": 0.3251044377684593, "rewards/accuracy_reward": 0.2142857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.382440485060215, "step": 57 }, { "completion_length": 3633.193603515625, "epoch": 1.6153846153846154, "grad_norm": 0.09286098182201385, "kl": 0.0006651878356933594, "learning_rate": 8.773181387078719e-07, "loss": 0.0, "reward": 0.6272321566939354, "reward_std": 0.3380197770893574, "rewards/accuracy_reward": 0.2410714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3861607238650322, "step": 58 }, { "completion_length": 3144.7053833007812, "epoch": 1.6433566433566433, "grad_norm": 0.1019466370344162, "kl": 0.0008211135864257812, "learning_rate": 8.712683805510545e-07, "loss": 0.0, "reward": 0.7075893133878708, "reward_std": 0.3953409940004349, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4159226268529892, "step": 59 }, { "completion_length": 3188.6845703125, "epoch": 1.6713286713286712, "grad_norm": 0.11080282181501389, "kl": 0.0006132125854492188, "learning_rate": 8.650978064952258e-07, "loss": 0.0, "reward": 0.7604166865348816, "reward_std": 0.3040749914944172, "rewards/accuracy_reward": 0.3571428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4032738134264946, "step": 60 }, { "completion_length": 3010.1488647460938, "epoch": 1.6993006993006992, "grad_norm": 0.08334992080926895, "kl": 0.0007467269897460938, "learning_rate": 8.588087370409302e-07, "loss": 0.0, "reward": 0.7819940596818924, "reward_std": 0.3632723242044449, "rewards/accuracy_reward": 0.3571428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.424851194024086, "step": 61 }, { "completion_length": 3139.3452758789062, "epoch": 1.7272727272727273, "grad_norm": 0.1026916578412056, "kl": 0.0007228851318359375, "learning_rate": 8.52403537249985e-07, "loss": 0.0, "reward": 0.7075893133878708, "reward_std": 0.32955069839954376, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4159226343035698, "step": 62 }, { "completion_length": 2651.9703369140625, "epoch": 1.7552447552447552, "grad_norm": 0.11016727238893509, "kl": 0.0008554458618164062, "learning_rate": 8.458846158560786e-07, "loss": 0.0, "reward": 0.731398805975914, "reward_std": 0.3333098441362381, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.460565485060215, "step": 63 }, { "completion_length": 3171.544677734375, "epoch": 1.7832167832167833, "grad_norm": 0.199759379029274, "kl": 0.0009202957153320312, "learning_rate": 8.392544243589427e-07, "loss": 0.0, "reward": 0.5431547686457634, "reward_std": 0.22581714019179344, "rewards/accuracy_reward": 0.12202381296083331, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4211309626698494, "step": 64 }, { "completion_length": 3002.52685546875, "epoch": 1.8111888111888113, "grad_norm": 0.09409494698047638, "kl": 0.000827789306640625, "learning_rate": 8.325154561024443e-07, "loss": 0.0, "reward": 0.6986607313156128, "reward_std": 0.3044434152543545, "rewards/accuracy_reward": 0.25297619588673115, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4456845223903656, "step": 65 }, { "completion_length": 3002.6904907226562, "epoch": 1.8391608391608392, "grad_norm": 0.08244930952787399, "kl": 0.0010738372802734375, "learning_rate": 8.256702453369412e-07, "loss": 0.0, "reward": 0.5766369253396988, "reward_std": 0.2669289857149124, "rewards/accuracy_reward": 0.14583334070630372, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.430803582072258, "step": 66 }, { "completion_length": 2784.8601684570312, "epoch": 1.867132867132867, "grad_norm": 0.10322090983390808, "kl": 0.0010652542114257812, "learning_rate": 8.187213662662538e-07, "loss": 0.0, "reward": 0.6480654776096344, "reward_std": 0.2842121906578541, "rewards/accuracy_reward": 0.1934523843228817, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.454613097012043, "step": 67 }, { "completion_length": 2688.4434814453125, "epoch": 1.895104895104895, "grad_norm": 0.10172371566295624, "kl": 0.001178741455078125, "learning_rate": 8.11671432079612e-07, "loss": 0.0, "reward": 0.7075892984867096, "reward_std": 0.2647625356912613, "rewards/accuracy_reward": 0.2410714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178582072258, "step": 68 }, { "completion_length": 2745.5892944335938, "epoch": 1.9230769230769231, "grad_norm": 0.10732463002204895, "kl": 0.0011472702026367188, "learning_rate": 8.045230939689424e-07, "loss": 0.0, "reward": 0.7194940596818924, "reward_std": 0.3091561570763588, "rewards/accuracy_reward": 0.2559523805975914, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.463541679084301, "step": 69 }, { "completion_length": 2508.4762573242188, "epoch": 1.951048951048951, "grad_norm": 0.10578262805938721, "kl": 0.0013446807861328125, "learning_rate": 7.972790401318627e-07, "loss": 0.0001, "reward": 0.7916666716337204, "reward_std": 0.34562500566244125, "rewards/accuracy_reward": 0.2738095298409462, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5178571492433548, "step": 70 }, { "completion_length": 3271.014892578125, "epoch": 1.9790209790209792, "grad_norm": 0.11061587184667587, "kl": 0.0012416839599609375, "learning_rate": 7.899419947607611e-07, "loss": 0.0, "reward": 0.5305059626698494, "reward_std": 0.3043720945715904, "rewards/accuracy_reward": 0.127976194024086, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4025297686457634, "step": 71 }, { "completion_length": 3785.77197265625, "epoch": 2.0, "grad_norm": 0.15183548629283905, "kl": 0.001277923583984375, "learning_rate": 7.825147170183384e-07, "loss": 0.0, "reward": 0.4117063581943512, "reward_std": 0.199227308233579, "rewards/accuracy_reward": 0.059523810942967735, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3521825472513835, "step": 72 }, { "completion_length": 2841.7887573242188, "epoch": 2.027972027972028, "grad_norm": 0.1057605892419815, "kl": 0.0014410018920898438, "learning_rate": 7.75e-07, "loss": 0.0001, "reward": 0.5319940447807312, "reward_std": 0.18857014551758766, "rewards/accuracy_reward": 0.09226190880872309, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4397321492433548, "step": 73 }, { "completion_length": 3105.610107421875, "epoch": 2.055944055944056, "grad_norm": 0.14314356446266174, "kl": 0.0013837814331054688, "learning_rate": 7.674006696834872e-07, "loss": 0.0001, "reward": 0.5357143059372902, "reward_std": 0.23290237039327621, "rewards/accuracy_reward": 0.0922619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443452388048172, "step": 74 }, { "completion_length": 2653.3482666015625, "epoch": 2.0839160839160837, "grad_norm": 0.16954892873764038, "kl": 0.00127410888671875, "learning_rate": 7.597195838661425e-07, "loss": 0.0001, "reward": 0.59300597012043, "reward_std": 0.2353062480688095, "rewards/accuracy_reward": 0.11011904804036021, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482886902987957, "step": 75 }, { "completion_length": 2528.6578369140625, "epoch": 2.111888111888112, "grad_norm": 0.0982322096824646, "kl": 0.00152587890625, "learning_rate": 7.51959631090208e-07, "loss": 0.0001, "reward": 0.565476194024086, "reward_std": 0.2114737629890442, "rewards/accuracy_reward": 0.08928571362048388, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.476190485060215, "step": 76 }, { "completion_length": 2843.4524536132812, "epoch": 2.13986013986014, "grad_norm": 0.13836292922496796, "kl": 0.0012874603271484375, "learning_rate": 7.441237295565641e-07, "loss": 0.0001, "reward": 0.605654776096344, "reward_std": 0.240522138774395, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4598214328289032, "step": 77 }, { "completion_length": 3015.6488647460938, "epoch": 2.167832167832168, "grad_norm": 0.4612804055213928, "kl": 0.0013256072998046875, "learning_rate": 7.362148260273126e-07, "loss": 0.0001, "reward": 0.5468750074505806, "reward_std": 0.2539355792105198, "rewards/accuracy_reward": 0.09821428940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607164144516, "step": 78 }, { "completion_length": 3276.9346313476562, "epoch": 2.195804195804196, "grad_norm": 0.09492038935422897, "kl": 0.0012264251708984375, "learning_rate": 7.282358947176205e-07, "loss": 0.0, "reward": 0.5208333358168602, "reward_std": 0.2553631514310837, "rewards/accuracy_reward": 0.10416666883975267, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4166666716337204, "step": 79 }, { "completion_length": 3023.4345703125, "epoch": 2.2237762237762237, "grad_norm": 0.09587059915065765, "kl": 0.0013256072998046875, "learning_rate": 7.201899361772391e-07, "loss": 0.0001, "reward": 0.5803571492433548, "reward_std": 0.268021535128355, "rewards/accuracy_reward": 0.13392857275903225, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285746216774, "step": 80 }, { "completion_length": 3000.0030517578125, "epoch": 2.2517482517482517, "grad_norm": 0.10489190369844437, "kl": 0.001300811767578125, "learning_rate": 7.120799761621197e-07, "loss": 0.0001, "reward": 0.6220238208770752, "reward_std": 0.29535772278904915, "rewards/accuracy_reward": 0.17559524066746235, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.446428582072258, "step": 81 }, { "completion_length": 2823.2530517578125, "epoch": 2.2797202797202796, "grad_norm": 0.102653369307518, "kl": 0.0018138885498046875, "learning_rate": 7.039090644965509e-07, "loss": 0.0001, "reward": 0.6383928507566452, "reward_std": 0.32175029069185257, "rewards/accuracy_reward": 0.15773809887468815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4806547686457634, "step": 82 }, { "completion_length": 2965.2232666015625, "epoch": 2.3076923076923075, "grad_norm": 0.109110988676548, "kl": 0.0014629364013671875, "learning_rate": 6.956802739262445e-07, "loss": 0.0001, "reward": 0.534226194024086, "reward_std": 0.24012810364365578, "rewards/accuracy_reward": 0.08035714481957257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4538690522313118, "step": 83 }, { "completion_length": 3099.669677734375, "epoch": 2.335664335664336, "grad_norm": 0.09192686527967453, "kl": 0.0014801025390625, "learning_rate": 6.873966989628009e-07, "loss": 0.0001, "reward": 0.5416666716337204, "reward_std": 0.25171075016260147, "rewards/accuracy_reward": 0.09523809631355107, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.446428582072258, "step": 84 }, { "completion_length": 2962.357177734375, "epoch": 2.3636363636363638, "grad_norm": 0.10174567997455597, "kl": 0.00141143798828125, "learning_rate": 6.790614547199906e-07, "loss": 0.0001, "reward": 0.709077388048172, "reward_std": 0.2970610596239567, "rewards/accuracy_reward": 0.2619047686457634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4471726268529892, "step": 85 }, { "completion_length": 3212.949462890625, "epoch": 2.3916083916083917, "grad_norm": 0.09201247245073318, "kl": 0.001979827880859375, "learning_rate": 6.706776757422868e-07, "loss": 0.0001, "reward": 0.7433035671710968, "reward_std": 0.4256303459405899, "rewards/accuracy_reward": 0.3244047723710537, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4188988208770752, "step": 86 }, { "completion_length": 3537.5238647460938, "epoch": 2.4195804195804196, "grad_norm": 0.10975628346204758, "kl": 0.0016155242919921875, "learning_rate": 6.622485148260915e-07, "loss": 0.0001, "reward": 0.6011904776096344, "reward_std": 0.24968973733484745, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3928571492433548, "step": 87 }, { "completion_length": 3494.4881591796875, "epoch": 2.4475524475524475, "grad_norm": 0.09692779183387756, "kl": 0.00200653076171875, "learning_rate": 6.537771418340981e-07, "loss": 0.0001, "reward": 0.6830357313156128, "reward_std": 0.37514135241508484, "rewards/accuracy_reward": 0.2738095298409462, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.409226194024086, "step": 88 }, { "completion_length": 3433.0982666015625, "epoch": 2.4755244755244754, "grad_norm": 0.10614696890115738, "kl": 0.00193023681640625, "learning_rate": 6.452667425032349e-07, "loss": 0.0001, "reward": 0.6532738283276558, "reward_std": 0.38044291734695435, "rewards/accuracy_reward": 0.24107143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.412202388048172, "step": 89 }, { "completion_length": 3480.40185546875, "epoch": 2.5034965034965033, "grad_norm": 0.13003912568092346, "kl": 0.0018329620361328125, "learning_rate": 6.367205172466403e-07, "loss": 0.0001, "reward": 0.643601194024086, "reward_std": 0.31658271327614784, "rewards/accuracy_reward": 0.24702381901443005, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3965773805975914, "step": 90 }, { "completion_length": 3084.0804443359375, "epoch": 2.5314685314685317, "grad_norm": 0.09549430012702942, "kl": 0.002101898193359375, "learning_rate": 6.281416799501187e-07, "loss": 0.0001, "reward": 0.7269345223903656, "reward_std": 0.36572954058647156, "rewards/accuracy_reward": 0.27678571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4501488134264946, "step": 91 }, { "completion_length": 3418.8155517578125, "epoch": 2.5594405594405596, "grad_norm": 0.11826770007610321, "kl": 0.0019245147705078125, "learning_rate": 6.195334567635283e-07, "loss": 0.0001, "reward": 0.6346726343035698, "reward_std": 0.34816064313054085, "rewards/accuracy_reward": 0.23511905036866665, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3995535746216774, "step": 92 }, { "completion_length": 3348.9732666015625, "epoch": 2.5874125874125875, "grad_norm": 0.09275460988283157, "kl": 0.001911163330078125, "learning_rate": 6.10899084887559e-07, "loss": 0.0001, "reward": 0.70238097012043, "reward_std": 0.3649657368659973, "rewards/accuracy_reward": 0.2827381044626236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4196428656578064, "step": 93 }, { "completion_length": 3493.886962890625, "epoch": 2.6153846153846154, "grad_norm": 0.09432340413331985, "kl": 0.001922607421875, "learning_rate": 6.022418113563535e-07, "loss": 0.0001, "reward": 0.6912202388048172, "reward_std": 0.3915746212005615, "rewards/accuracy_reward": 0.27380952425301075, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4174107164144516, "step": 94 }, { "completion_length": 2935.6726684570312, "epoch": 2.6433566433566433, "grad_norm": 0.09711038321256638, "kl": 0.0020694732666015625, "learning_rate": 5.935648918164306e-07, "loss": 0.0001, "reward": 0.8415178656578064, "reward_std": 0.3873477354645729, "rewards/accuracy_reward": 0.3928571566939354, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4486607238650322, "step": 95 }, { "completion_length": 3010.7738037109375, "epoch": 2.6713286713286712, "grad_norm": 0.09161806106567383, "kl": 0.0018405914306640625, "learning_rate": 5.848715893023689e-07, "loss": 0.0001, "reward": 0.8355654925107956, "reward_std": 0.2888724021613598, "rewards/accuracy_reward": 0.3720238208770752, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4635416865348816, "step": 96 }, { "completion_length": 2880.8810424804688, "epoch": 2.699300699300699, "grad_norm": 0.10187753289937973, "kl": 0.002147674560546875, "learning_rate": 5.761651730097142e-07, "loss": 0.0001, "reward": 0.8497024178504944, "reward_std": 0.36032697558403015, "rewards/accuracy_reward": 0.3869047686457634, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4627976194024086, "step": 97 }, { "completion_length": 3113.1726684570312, "epoch": 2.7272727272727275, "grad_norm": 0.09418516606092453, "kl": 0.0018100738525390625, "learning_rate": 5.674489170655675e-07, "loss": 0.0001, "reward": 0.7410714477300644, "reward_std": 0.3313647545874119, "rewards/accuracy_reward": 0.2976190485060215, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.443452388048172, "step": 98 }, { "completion_length": 2379.4732666015625, "epoch": 2.755244755244755, "grad_norm": 0.10177874565124512, "kl": 0.002620697021484375, "learning_rate": 5.587260992973209e-07, "loss": 0.0001, "reward": 0.820684552192688, "reward_std": 0.3514714166522026, "rewards/accuracy_reward": 0.3035714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.517113097012043, "step": 99 }, { "completion_length": 2891.5149536132812, "epoch": 2.7832167832167833, "grad_norm": 0.1491205096244812, "kl": 0.0019989013671875, "learning_rate": 5.5e-07, "loss": 0.0001, "reward": 0.5982143133878708, "reward_std": 0.26152127981185913, "rewards/accuracy_reward": 0.13095238455571234, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4672619104385376, "step": 100 }, { "completion_length": 2804.0357666015625, "epoch": 2.8111888111888113, "grad_norm": 0.11217369139194489, "kl": 0.002048492431640625, "learning_rate": 5.41273900702679e-07, "loss": 0.0001, "reward": 0.7849702537059784, "reward_std": 0.3465358465909958, "rewards/accuracy_reward": 0.2946428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.490327388048172, "step": 101 }, { "completion_length": 2823.2500610351562, "epoch": 2.839160839160839, "grad_norm": 0.10132594406604767, "kl": 0.002750396728515625, "learning_rate": 5.325510829344324e-07, "loss": 0.0001, "reward": 0.6532738134264946, "reward_std": 0.31015192717313766, "rewards/accuracy_reward": 0.15773809794336557, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357238650322, "step": 102 }, { "completion_length": 2617.6280517578125, "epoch": 2.867132867132867, "grad_norm": 0.10968906432390213, "kl": 0.002490997314453125, "learning_rate": 5.238348269902859e-07, "loss": 0.0001, "reward": 0.7165178805589676, "reward_std": 0.2918235771358013, "rewards/accuracy_reward": 0.2261904776096344, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4903273805975914, "step": 103 }, { "completion_length": 2468.5327758789062, "epoch": 2.895104895104895, "grad_norm": 0.10988292843103409, "kl": 0.00304412841796875, "learning_rate": 5.151284106976311e-07, "loss": 0.0001, "reward": 0.7790178805589676, "reward_std": 0.29311515390872955, "rewards/accuracy_reward": 0.2678571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607164144516, "step": 104 }, { "completion_length": 2649.8631591796875, "epoch": 2.9230769230769234, "grad_norm": 0.11911734938621521, "kl": 0.00260162353515625, "learning_rate": 5.064351081835694e-07, "loss": 0.0001, "reward": 0.7857142984867096, "reward_std": 0.3036133013665676, "rewards/accuracy_reward": 0.2946428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4910714477300644, "step": 105 }, { "completion_length": 2294.0923767089844, "epoch": 2.951048951048951, "grad_norm": 0.10718347877264023, "kl": 0.00302886962890625, "learning_rate": 4.977581886436462e-07, "loss": 0.0001, "reward": 0.8206845372915268, "reward_std": 0.37444857507944107, "rewards/accuracy_reward": 0.255952388048172, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.564732164144516, "step": 106 }, { "completion_length": 3227.6697387695312, "epoch": 2.979020979020979, "grad_norm": 0.13584300875663757, "kl": 0.002513885498046875, "learning_rate": 4.891009151124411e-07, "loss": 0.0001, "reward": 0.5438988208770752, "reward_std": 0.2875536195933819, "rewards/accuracy_reward": 0.1160714328289032, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.427827388048172, "step": 107 }, { "completion_length": 3472.122802734375, "epoch": 3.0, "grad_norm": 0.14357814192771912, "kl": 0.0024566650390625, "learning_rate": 4.804665432364719e-07, "loss": 0.0001, "reward": 0.4494047661622365, "reward_std": 0.2378531942764918, "rewards/accuracy_reward": 0.06349206529557705, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3859127064545949, "step": 108 }, { "completion_length": 2802.714324951172, "epoch": 3.027972027972028, "grad_norm": 0.10655632615089417, "kl": 0.002201080322265625, "learning_rate": 4.7185832004988133e-07, "loss": 0.0001, "reward": 0.594494067132473, "reward_std": 0.2724935933947563, "rewards/accuracy_reward": 0.10714285774156451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4873512014746666, "step": 109 }, { "completion_length": 3067.919677734375, "epoch": 3.055944055944056, "grad_norm": 0.17373289167881012, "kl": 0.0023937225341796875, "learning_rate": 4.632794827533596e-07, "loss": 0.0001, "reward": 0.5587797686457634, "reward_std": 0.26902300491929054, "rewards/accuracy_reward": 0.0922619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4665178656578064, "step": 110 }, { "completion_length": 2598.9345703125, "epoch": 3.0839160839160837, "grad_norm": 0.13991814851760864, "kl": 0.002651214599609375, "learning_rate": 4.547332574967653e-07, "loss": 0.0001, "reward": 0.6733631193637848, "reward_std": 0.2784017063677311, "rewards/accuracy_reward": 0.13095238571986556, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5424107164144516, "step": 111 }, { "completion_length": 2481.2262573242188, "epoch": 3.111888111888112, "grad_norm": 0.12796764075756073, "kl": 0.00284576416015625, "learning_rate": 4.4622285816590186e-07, "loss": 0.0001, "reward": 0.6428571492433548, "reward_std": 0.25486208125948906, "rewards/accuracy_reward": 0.10416666907258332, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5386904925107956, "step": 112 }, { "completion_length": 2844.6786499023438, "epoch": 3.13986013986014, "grad_norm": 0.11658414453268051, "kl": 0.0024261474609375, "learning_rate": 4.3775148517390846e-07, "loss": 0.0001, "reward": 0.6517857313156128, "reward_std": 0.2692863494157791, "rewards/accuracy_reward": 0.16964286006987095, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.482142873108387, "step": 113 }, { "completion_length": 2809.416748046875, "epoch": 3.167832167832168, "grad_norm": 0.12356197834014893, "kl": 0.002544403076171875, "learning_rate": 4.293223242577131e-07, "loss": 0.0001, "reward": 0.6004464477300644, "reward_std": 0.26328422501683235, "rewards/accuracy_reward": 0.09821428754366934, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5022321492433548, "step": 114 }, { "completion_length": 3025.8661499023438, "epoch": 3.195804195804196, "grad_norm": 0.11549896001815796, "kl": 0.00232696533203125, "learning_rate": 4.209385452800095e-07, "loss": 0.0001, "reward": 0.6071428656578064, "reward_std": 0.3151152990758419, "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4642857313156128, "step": 115 }, { "completion_length": 2975.202392578125, "epoch": 3.2237762237762237, "grad_norm": 0.10660137236118317, "kl": 0.00252532958984375, "learning_rate": 4.126033010371991e-07, "loss": 0.0001, "reward": 0.5825892984867096, "reward_std": 0.2717975974082947, "rewards/accuracy_reward": 0.12202380993403494, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.460565485060215, "step": 116 }, { "completion_length": 2774.3155517578125, "epoch": 3.2517482517482517, "grad_norm": 0.1331847459077835, "kl": 0.002536773681640625, "learning_rate": 4.043197260737555e-07, "loss": 0.0001, "reward": 0.6711309552192688, "reward_std": 0.2978888005018234, "rewards/accuracy_reward": 0.18750000093132257, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4836309626698494, "step": 117 }, { "completion_length": 2632.1339721679688, "epoch": 3.2797202797202796, "grad_norm": 0.11804653704166412, "kl": 0.002796173095703125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0001, "reward": 0.6644345372915268, "reward_std": 0.33424488455057144, "rewards/accuracy_reward": 0.1547619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.509672611951828, "step": 118 }, { "completion_length": 2838.5297241210938, "epoch": 3.3076923076923075, "grad_norm": 0.13138364255428314, "kl": 0.002597808837890625, "learning_rate": 3.8792002383788036e-07, "loss": 0.0001, "reward": 0.5818452388048172, "reward_std": 0.259741447865963, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4925595372915268, "step": 119 }, { "completion_length": 3014.5476684570312, "epoch": 3.335664335664336, "grad_norm": 0.10719572007656097, "kl": 0.00262451171875, "learning_rate": 3.7981006382276093e-07, "loss": 0.0001, "reward": 0.6049107164144516, "reward_std": 0.27329112216830254, "rewards/accuracy_reward": 0.11904762033373117, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4858631044626236, "step": 120 }, { "completion_length": 2792.014892578125, "epoch": 3.3636363636363638, "grad_norm": 0.09954708069562912, "kl": 0.002727508544921875, "learning_rate": 3.7176410528237945e-07, "loss": 0.0001, "reward": 0.7752976417541504, "reward_std": 0.3618427440524101, "rewards/accuracy_reward": 0.2797619104385376, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4955357238650322, "step": 121 }, { "completion_length": 2741.3720703125, "epoch": 3.3916083916083917, "grad_norm": 0.11019442230463028, "kl": 0.003749847412109375, "learning_rate": 3.637851739726874e-07, "loss": 0.0001, "reward": 0.8489583432674408, "reward_std": 0.4548647478222847, "rewards/accuracy_reward": 0.3511904813349247, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4977678582072258, "step": 122 }, { "completion_length": 3301.1488647460938, "epoch": 3.4195804195804196, "grad_norm": 0.10979685932397842, "kl": 0.003269195556640625, "learning_rate": 3.5587627044343604e-07, "loss": 0.0001, "reward": 0.6569940596818924, "reward_std": 0.29442668706178665, "rewards/accuracy_reward": 0.23511905409395695, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4218750149011612, "step": 123 }, { "completion_length": 3237.297607421875, "epoch": 3.4475524475524475, "grad_norm": 0.09944824874401093, "kl": 0.003009796142578125, "learning_rate": 3.4804036890979205e-07, "loss": 0.0001, "reward": 0.762648805975914, "reward_std": 0.35693658888339996, "rewards/accuracy_reward": 0.3184523917734623, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964402794838, "step": 124 }, { "completion_length": 3247.4404907226562, "epoch": 3.4755244755244754, "grad_norm": 0.1071368008852005, "kl": 0.003139495849609375, "learning_rate": 3.402804161338577e-07, "loss": 0.0001, "reward": 0.7090774029493332, "reward_std": 0.39585674554109573, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4382440522313118, "step": 125 }, { "completion_length": 3228.3363647460938, "epoch": 3.5034965034965033, "grad_norm": 0.10697885602712631, "kl": 0.003299713134765625, "learning_rate": 3.3259933031651266e-07, "loss": 0.0001, "reward": 0.6659226417541504, "reward_std": 0.379042886197567, "rewards/accuracy_reward": 0.23511904664337635, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.430803582072258, "step": 126 }, { "completion_length": 2906.5477294921875, "epoch": 3.5314685314685317, "grad_norm": 0.12582722306251526, "kl": 0.003780364990234375, "learning_rate": 3.250000000000001e-07, "loss": 0.0002, "reward": 0.793898805975914, "reward_std": 0.3561762161552906, "rewards/accuracy_reward": 0.3244047611951828, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.469494067132473, "step": 127 }, { "completion_length": 3254.8929443359375, "epoch": 3.5594405594405596, "grad_norm": 0.09814245998859406, "kl": 0.003406524658203125, "learning_rate": 3.1748528298166164e-07, "loss": 0.0001, "reward": 0.7239583432674408, "reward_std": 0.37790394574403763, "rewards/accuracy_reward": 0.2797619067132473, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4441964253783226, "step": 128 }, { "completion_length": 3201.9077758789062, "epoch": 3.5874125874125875, "grad_norm": 0.10079911351203918, "kl": 0.0031280517578125, "learning_rate": 3.1005800523923903e-07, "loss": 0.0001, "reward": 0.7693452537059784, "reward_std": 0.406753808259964, "rewards/accuracy_reward": 0.3244047611951828, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.444940485060215, "step": 129 }, { "completion_length": 3339.5059814453125, "epoch": 3.6153846153846154, "grad_norm": 0.09590104967355728, "kl": 0.003513336181640625, "learning_rate": 3.027209598681373e-07, "loss": 0.0001, "reward": 0.725446455180645, "reward_std": 0.38120192289352417, "rewards/accuracy_reward": 0.29464286752045155, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.430803582072258, "step": 130 }, { "completion_length": 2748.104248046875, "epoch": 3.6433566433566433, "grad_norm": 0.13206864893436432, "kl": 0.004322052001953125, "learning_rate": 2.954769060310577e-07, "loss": 0.0002, "reward": 0.879464328289032, "reward_std": 0.4047969654202461, "rewards/accuracy_reward": 0.3928571492433548, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4866071566939354, "step": 131 }, { "completion_length": 3002.3394165039062, "epoch": 3.6713286713286712, "grad_norm": 0.10522563755512238, "kl": 0.00333404541015625, "learning_rate": 2.8832856792038794e-07, "loss": 0.0001, "reward": 0.8526785969734192, "reward_std": 0.361838236451149, "rewards/accuracy_reward": 0.3750000149011612, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.477678582072258, "step": 132 }, { "completion_length": 2796.1934814453125, "epoch": 3.699300699300699, "grad_norm": 0.11168068647384644, "kl": 0.003910064697265625, "learning_rate": 2.812786337337463e-07, "loss": 0.0002, "reward": 0.903273805975914, "reward_std": 0.395388700067997, "rewards/accuracy_reward": 0.410714291036129, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4925595298409462, "step": 133 }, { "completion_length": 2953.2053833007812, "epoch": 3.7272727272727275, "grad_norm": 0.09826097637414932, "kl": 0.00336456298828125, "learning_rate": 2.743297546630587e-07, "loss": 0.0001, "reward": 0.7938988208770752, "reward_std": 0.3516330271959305, "rewards/accuracy_reward": 0.3214285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4724702537059784, "step": 134 }, { "completion_length": 2285.59521484375, "epoch": 3.755244755244755, "grad_norm": 0.11822542548179626, "kl": 0.0056304931640625, "learning_rate": 2.674845438975557e-07, "loss": 0.0002, "reward": 0.8407738357782364, "reward_std": 0.3399865999817848, "rewards/accuracy_reward": 0.2976190522313118, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.543154776096344, "step": 135 }, { "completion_length": 2728.6280517578125, "epoch": 3.7832167832167833, "grad_norm": 0.17046403884887695, "kl": 0.003498077392578125, "learning_rate": 2.6074557564105724e-07, "loss": 0.0001, "reward": 0.649553582072258, "reward_std": 0.288466639816761, "rewards/accuracy_reward": 0.14285714365541935, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964402794838, "step": 136 }, { "completion_length": 2683.8005981445312, "epoch": 3.8111888111888113, "grad_norm": 0.11535181850194931, "kl": 0.00335693359375, "learning_rate": 2.541153841439214e-07, "loss": 0.0001, "reward": 0.8236607313156128, "reward_std": 0.337150476872921, "rewards/accuracy_reward": 0.30059524066746235, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.523065485060215, "step": 137 }, { "completion_length": 2738.6964721679688, "epoch": 3.839160839160839, "grad_norm": 0.11976780742406845, "kl": 0.00394439697265625, "learning_rate": 2.475964627500149e-07, "loss": 0.0002, "reward": 0.703125, "reward_std": 0.33021562546491623, "rewards/accuracy_reward": 0.19047619588673115, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5126488208770752, "step": 138 }, { "completion_length": 2560.4107666015625, "epoch": 3.867132867132867, "grad_norm": 0.11340122669935226, "kl": 0.003749847412109375, "learning_rate": 2.411912629590699e-07, "loss": 0.0002, "reward": 0.7388392984867096, "reward_std": 0.3105376362800598, "rewards/accuracy_reward": 0.2321428656578064, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5066964402794838, "step": 139 }, { "completion_length": 2278.681640625, "epoch": 3.895104895104895, "grad_norm": 0.11167583614587784, "kl": 0.005157470703125, "learning_rate": 2.349021935047742e-07, "loss": 0.0002, "reward": 0.8244047909975052, "reward_std": 0.2984638176858425, "rewards/accuracy_reward": 0.2857142873108387, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5386904925107956, "step": 140 }, { "completion_length": 2529.0982666015625, "epoch": 3.9230769230769234, "grad_norm": 0.15898652374744415, "kl": 0.0040130615234375, "learning_rate": 2.287316194489455e-07, "loss": 0.0002, "reward": 0.7641369104385376, "reward_std": 0.32191672176122665, "rewards/accuracy_reward": 0.2589285783469677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5052083507180214, "step": 141 }, { "completion_length": 2281.9910888671875, "epoch": 3.951048951048951, "grad_norm": 0.11731097847223282, "kl": 0.0043792724609375, "learning_rate": 2.2268186129212807e-07, "loss": 0.0002, "reward": 0.9226190745830536, "reward_std": 0.3625694811344147, "rewards/accuracy_reward": 0.318452388048172, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6041666716337204, "step": 142 }, { "completion_length": 3223.9078979492188, "epoch": 3.979020979020979, "grad_norm": 0.1330711990594864, "kl": 0.0041351318359375, "learning_rate": 2.16755194100948e-07, "loss": 0.0002, "reward": 0.5401785746216774, "reward_std": 0.319940485060215, "rewards/accuracy_reward": 0.11607143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4241071492433548, "step": 143 }, { "completion_length": 3505.614013671875, "epoch": 4.0, "grad_norm": 0.15241242945194244, "kl": 0.0035298665364583335, "learning_rate": 2.1095384665255267e-07, "loss": 0.0001, "reward": 0.4821428656578064, "reward_std": 0.2562485933303833, "rewards/accuracy_reward": 0.08333333457509677, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3988095323244731, "step": 144 }, { "completion_length": 2669.4762573242188, "epoch": 4.027972027972028, "grad_norm": 0.132435142993927, "kl": 0.003795623779296875, "learning_rate": 2.0528000059645995e-07, "loss": 0.0002, "reward": 0.6242559552192688, "reward_std": 0.2846153862774372, "rewards/accuracy_reward": 0.11904762126505375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5052083432674408, "step": 145 }, { "completion_length": 2907.2738647460938, "epoch": 4.055944055944056, "grad_norm": 0.14874334633350372, "kl": 0.003658294677734375, "learning_rate": 1.99735789634128e-07, "loss": 0.0001, "reward": 0.5632440596818924, "reward_std": 0.27348505705595016, "rewards/accuracy_reward": 0.07440476352348924, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.488839291036129, "step": 146 }, { "completion_length": 2507.997100830078, "epoch": 4.083916083916084, "grad_norm": 0.1229638084769249, "kl": 0.004901885986328125, "learning_rate": 1.9432329871655836e-07, "loss": 0.0002, "reward": 0.6912202537059784, "reward_std": 0.2840711995959282, "rewards/accuracy_reward": 0.15773809887468815, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5334821566939354, "step": 147 }, { "completion_length": 2388.607177734375, "epoch": 4.111888111888112, "grad_norm": 0.13819031417369843, "kl": 0.0043792724609375, "learning_rate": 1.8904456326023027e-07, "loss": 0.0002, "reward": 0.6636904925107956, "reward_std": 0.2932458780705929, "rewards/accuracy_reward": 0.09821428940631449, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.565476194024086, "step": 148 }, { "completion_length": 2596.8035888671875, "epoch": 4.13986013986014, "grad_norm": 0.12725792825222015, "kl": 0.00445556640625, "learning_rate": 1.8390156838166462e-07, "loss": 0.0002, "reward": 0.6793154925107956, "reward_std": 0.27951210737228394, "rewards/accuracy_reward": 0.17857143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5007440522313118, "step": 149 }, { "completion_length": 2657.2232666015625, "epoch": 4.1678321678321675, "grad_norm": 0.1211203783750534, "kl": 0.00418853759765625, "learning_rate": 1.7889624815090195e-07, "loss": 0.0002, "reward": 0.621279776096344, "reward_std": 0.293570376932621, "rewards/accuracy_reward": 0.11011905060149729, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607313156128, "step": 150 }, { "completion_length": 2903.3482666015625, "epoch": 4.195804195804196, "grad_norm": 0.14065128564834595, "kl": 0.003757476806640625, "learning_rate": 1.7403048486417868e-07, "loss": 0.0002, "reward": 0.6443452388048172, "reward_std": 0.30524395406246185, "rewards/accuracy_reward": 0.1428571455180645, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.501488097012043, "step": 151 }, { "completion_length": 2863.4404907226562, "epoch": 4.223776223776224, "grad_norm": 0.12928026914596558, "kl": 0.003902435302734375, "learning_rate": 1.693061083360715e-07, "loss": 0.0002, "reward": 0.6190476194024086, "reward_std": 0.306563138961792, "rewards/accuracy_reward": 0.1339285746216774, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4851190522313118, "step": 152 }, { "completion_length": 2692.7857666015625, "epoch": 4.251748251748252, "grad_norm": 0.12153156846761703, "kl": 0.00408935546875, "learning_rate": 1.6472489521138015e-07, "loss": 0.0002, "reward": 0.6763392984867096, "reward_std": 0.31852778047323227, "rewards/accuracy_reward": 0.15773809980601072, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5186012014746666, "step": 153 }, { "completion_length": 2442.0684204101562, "epoch": 4.27972027972028, "grad_norm": 0.12675337493419647, "kl": 0.00487518310546875, "learning_rate": 1.6028856829700258e-07, "loss": 0.0002, "reward": 0.7380952537059784, "reward_std": 0.3266712352633476, "rewards/accuracy_reward": 0.1815476231276989, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5565476417541504, "step": 154 }, { "completion_length": 2662.907745361328, "epoch": 4.3076923076923075, "grad_norm": 0.15777888894081116, "kl": 0.004787445068359375, "learning_rate": 1.5599879591405916e-07, "loss": 0.0002, "reward": 0.6049107313156128, "reward_std": 0.23991192504763603, "rewards/accuracy_reward": 0.08333333604969084, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5215773954987526, "step": 155 }, { "completion_length": 2794.9673461914062, "epoch": 4.335664335664336, "grad_norm": 0.1465885192155838, "kl": 0.00409698486328125, "learning_rate": 1.5185719127050398e-07, "loss": 0.0002, "reward": 0.5833333432674408, "reward_std": 0.2825283370912075, "rewards/accuracy_reward": 0.08928571594879031, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4940476343035698, "step": 156 }, { "completion_length": 2754.6488647460938, "epoch": 4.363636363636363, "grad_norm": 0.09911419451236725, "kl": 0.00395965576171875, "learning_rate": 1.4786531185446452e-07, "loss": 0.0002, "reward": 0.7857142984867096, "reward_std": 0.3356376476585865, "rewards/accuracy_reward": 0.2976190559566021, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4880952537059784, "step": 157 }, { "completion_length": 2780.6726684570312, "epoch": 4.391608391608392, "grad_norm": 0.11424998939037323, "kl": 0.0051422119140625, "learning_rate": 1.4402465884853301e-07, "loss": 0.0002, "reward": 0.8787202537059784, "reward_std": 0.41560350358486176, "rewards/accuracy_reward": 0.3779762014746666, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5007440522313118, "step": 158 }, { "completion_length": 3287.794677734375, "epoch": 4.41958041958042, "grad_norm": 0.1019367203116417, "kl": 0.00496673583984375, "learning_rate": 1.4033667656523404e-07, "loss": 0.0002, "reward": 0.6815476268529892, "reward_std": 0.3411427028477192, "rewards/accuracy_reward": 0.24107143096625805, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.440476194024086, "step": 159 }, { "completion_length": 3285.3780517578125, "epoch": 4.4475524475524475, "grad_norm": 0.09824172407388687, "kl": 0.00437164306640625, "learning_rate": 1.3680275190387675e-07, "loss": 0.0002, "reward": 0.7886904925107956, "reward_std": 0.3618383854627609, "rewards/accuracy_reward": 0.3452381044626236, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4434523954987526, "step": 160 }, { "completion_length": 3261.59228515625, "epoch": 4.475524475524476, "grad_norm": 0.12426801025867462, "kl": 0.004497528076171875, "learning_rate": 1.3342421382899935e-07, "loss": 0.0002, "reward": 0.699404776096344, "reward_std": 0.3835337683558464, "rewards/accuracy_reward": 0.2619047649204731, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375000149011612, "step": 161 }, { "completion_length": 3172.2529907226562, "epoch": 4.503496503496503, "grad_norm": 0.12890060245990753, "kl": 0.004352569580078125, "learning_rate": 1.3020233287059976e-07, "loss": 0.0002, "reward": 0.7105654776096344, "reward_std": 0.35477447509765625, "rewards/accuracy_reward": 0.25595238991081715, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4546131044626236, "step": 162 }, { "completion_length": 2853.4256591796875, "epoch": 4.531468531468532, "grad_norm": 0.13204464316368103, "kl": 0.00543975830078125, "learning_rate": 1.2713832064634125e-07, "loss": 0.0002, "reward": 0.814732164144516, "reward_std": 0.4248877093195915, "rewards/accuracy_reward": 0.3154761977493763, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49925597012043, "step": 163 }, { "completion_length": 3321.5536499023438, "epoch": 4.559440559440559, "grad_norm": 0.09118322283029556, "kl": 0.00458526611328125, "learning_rate": 1.2423332940591238e-07, "loss": 0.0002, "reward": 0.7172619253396988, "reward_std": 0.342393409460783, "rewards/accuracy_reward": 0.2708333395421505, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4464285746216774, "step": 164 }, { "completion_length": 3086.2708740234375, "epoch": 4.5874125874125875, "grad_norm": 0.09752269089221954, "kl": 0.00424957275390625, "learning_rate": 1.2148845159771312e-07, "loss": 0.0002, "reward": 0.7566964328289032, "reward_std": 0.3746805787086487, "rewards/accuracy_reward": 0.3095238134264946, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4471726268529892, "step": 165 }, { "completion_length": 3220.9583740234375, "epoch": 4.615384615384615, "grad_norm": 0.09589620679616928, "kl": 0.0044403076171875, "learning_rate": 1.1890471945802999e-07, "loss": 0.0002, "reward": 0.7388393133878708, "reward_std": 0.3828039579093456, "rewards/accuracy_reward": 0.28571428917348385, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 166 }, { "completion_length": 2777.7857666015625, "epoch": 4.643356643356643, "grad_norm": 0.11369551718235016, "kl": 0.0052642822265625, "learning_rate": 1.1648310462285385e-07, "loss": 0.0002, "reward": 0.8772321492433548, "reward_std": 0.41676195710897446, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4813988283276558, "step": 167 }, { "completion_length": 2835.4881591796875, "epoch": 4.671328671328672, "grad_norm": 0.11145463585853577, "kl": 0.004909515380859375, "learning_rate": 1.142245177624874e-07, "loss": 0.0002, "reward": 0.8630952537059784, "reward_std": 0.3876107409596443, "rewards/accuracy_reward": 0.3779762014746666, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4851190596818924, "step": 168 }, { "completion_length": 2691.9882202148438, "epoch": 4.699300699300699, "grad_norm": 0.10534343868494034, "kl": 0.0055389404296875, "learning_rate": 1.1212980823907929e-07, "loss": 0.0002, "reward": 0.9337797909975052, "reward_std": 0.3816476985812187, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4962797686457634, "step": 169 }, { "completion_length": 2880.0149536132812, "epoch": 4.7272727272727275, "grad_norm": 0.11654610186815262, "kl": 0.00537872314453125, "learning_rate": 1.1019976378721399e-07, "loss": 0.0002, "reward": 0.815476194024086, "reward_std": 0.3700762465596199, "rewards/accuracy_reward": 0.3303571529686451, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4851190522313118, "step": 170 }, { "completion_length": 2267.869140625, "epoch": 4.755244755244755, "grad_norm": 0.19363805651664734, "kl": 0.00749969482421875, "learning_rate": 1.0843511021767689e-07, "loss": 0.0003, "reward": 0.8407738357782364, "reward_std": 0.34815119206905365, "rewards/accuracy_reward": 0.2797619178891182, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5610119253396988, "step": 171 }, { "completion_length": 2730.889892578125, "epoch": 4.783216783216783, "grad_norm": 0.11546841263771057, "kl": 0.004669189453125, "learning_rate": 1.068365111445064e-07, "loss": 0.0002, "reward": 0.6510416716337204, "reward_std": 0.29459198564291, "rewards/accuracy_reward": 0.13988095801323652, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5111607164144516, "step": 172 }, { "completion_length": 2624.4077758789062, "epoch": 4.811188811188811, "grad_norm": 0.09517718106508255, "kl": 0.00469207763671875, "learning_rate": 1.0540456773543595e-07, "loss": 0.0002, "reward": 0.8214285969734192, "reward_std": 0.3938767686486244, "rewards/accuracy_reward": 0.27678571827709675, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5446428582072258, "step": 173 }, { "completion_length": 2643.919677734375, "epoch": 4.839160839160839, "grad_norm": 0.11820892244577408, "kl": 0.0055084228515625, "learning_rate": 1.041398184858196e-07, "loss": 0.0002, "reward": 0.6852678656578064, "reward_std": 0.30904605984687805, "rewards/accuracy_reward": 0.1666666662786156, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.518601194024086, "step": 174 }, { "completion_length": 2524.047607421875, "epoch": 4.867132867132867, "grad_norm": 0.1165422648191452, "kl": 0.00485992431640625, "learning_rate": 1.0304273901612565e-07, "loss": 0.0002, "reward": 0.7284226268529892, "reward_std": 0.3345734477043152, "rewards/accuracy_reward": 0.2142857164144516, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5141369178891182, "step": 175 }, { "completion_length": 2217.1279907226562, "epoch": 4.895104895104895, "grad_norm": 0.13623353838920593, "kl": 0.00641632080078125, "learning_rate": 1.0211374189307538e-07, "loss": 0.0003, "reward": 0.8489583432674408, "reward_std": 0.33298908174037933, "rewards/accuracy_reward": 0.2797619104385376, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5691964328289032, "step": 176 }, { "completion_length": 2418.5833129882812, "epoch": 4.923076923076923, "grad_norm": 0.12970149517059326, "kl": 0.005584716796875, "learning_rate": 1.013531764744936e-07, "loss": 0.0002, "reward": 0.8043155074119568, "reward_std": 0.3505142778158188, "rewards/accuracy_reward": 0.2738095261156559, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5305059626698494, "step": 177 }, { "completion_length": 2166.7738342285156, "epoch": 4.951048951048951, "grad_norm": 0.12993088364601135, "kl": 0.00594329833984375, "learning_rate": 1.0076132877792932e-07, "loss": 0.0002, "reward": 0.8958333432674408, "reward_std": 0.3779018819332123, "rewards/accuracy_reward": 0.2886904813349247, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6071428805589676, "step": 178 }, { "completion_length": 3136.8482360839844, "epoch": 4.979020979020979, "grad_norm": 0.139509916305542, "kl": 0.0050506591796875, "learning_rate": 1.0033842137309648e-07, "loss": 0.0002, "reward": 0.5885416865348816, "reward_std": 0.3464737571775913, "rewards/accuracy_reward": 0.13095238152891397, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.457589291036129, "step": 179 }, { "completion_length": 3431.8421223958335, "epoch": 5.0, "grad_norm": 0.139509916305542, "kl": 0.004852294921875, "learning_rate": 1.000846132981744e-07, "loss": 0.0001, "reward": 0.5496031840642294, "reward_std": 0.2647865464289983, "rewards/accuracy_reward": 0.11904762188593547, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4305555621782939, "step": 180 }, { "epoch": 5.0, "step": 180, "total_flos": 0.0, "train_loss": 8.540082282778706e-05, "train_runtime": 37307.6007, "train_samples_per_second": 0.134, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 180, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }