blacksunfm's picture
Model save
e1c8ae9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9986789960369881,
"eval_steps": 500,
"global_step": 378,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1179.875,
"epoch": 0.002642007926023778,
"grad_norm": 0.4997229278087616,
"kl": 0.0,
"learning_rate": 2.6315789473684208e-08,
"loss": 0.2467,
"reward": 1.19921875,
"reward_std": 0.13141997903585434,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.44921875,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 1057.625,
"epoch": 0.005284015852047556,
"grad_norm": 0.5586327910423279,
"kl": 0.0,
"learning_rate": 5.2631578947368416e-08,
"loss": 0.3641,
"reward": 0.95703125,
"reward_std": 0.12062124721705914,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1275.578125,
"epoch": 0.007926023778071334,
"grad_norm": 0.5091319680213928,
"kl": 0.0001016855239868164,
"learning_rate": 7.894736842105262e-08,
"loss": 0.3625,
"reward": 0.4140625,
"reward_std": 0.13219169899821281,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4140625,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 891.71875,
"epoch": 0.010568031704095112,
"grad_norm": 0.5754386782646179,
"kl": 0.00015115737915039062,
"learning_rate": 1.0526315789473683e-07,
"loss": 0.3083,
"reward": 0.99609375,
"reward_std": 0.116029754281044,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.49609375,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 1166.125,
"epoch": 0.013210039630118891,
"grad_norm": 0.5114976763725281,
"kl": 0.00011730194091796875,
"learning_rate": 1.3157894736842104e-07,
"loss": 0.33,
"reward": 0.9296875,
"reward_std": 0.11507641524076462,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4296875,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 1035.703125,
"epoch": 0.015852047556142668,
"grad_norm": 0.7084254026412964,
"kl": 0.00015091896057128906,
"learning_rate": 1.5789473684210525e-07,
"loss": 0.3363,
"reward": 0.7265625,
"reward_std": 0.12440211698412895,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4765625,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 977.3125,
"epoch": 0.018494055482166448,
"grad_norm": 0.3633577525615692,
"kl": 9.310245513916016e-05,
"learning_rate": 1.8421052631578946e-07,
"loss": 0.2085,
"reward": 1.0,
"reward_std": 0.13400040566921234,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 889.03125,
"epoch": 0.021136063408190225,
"grad_norm": 0.6363146901130676,
"kl": 0.0001055002212524414,
"learning_rate": 2.1052631578947366e-07,
"loss": 0.3436,
"reward": 0.984375,
"reward_std": 0.11146603152155876,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 1114.28125,
"epoch": 0.023778071334214,
"grad_norm": 0.6163086295127869,
"kl": 0.00010448694229125977,
"learning_rate": 2.3684210526315787e-07,
"loss": 0.387,
"reward": 0.45703125,
"reward_std": 0.11941792443394661,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 1325.578125,
"epoch": 0.026420079260237782,
"grad_norm": 0.45183688402175903,
"kl": 0.00015163421630859375,
"learning_rate": 2.631578947368421e-07,
"loss": 0.304,
"reward": 0.91015625,
"reward_std": 0.12797221168875694,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.41015625,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 1116.671875,
"epoch": 0.02906208718626156,
"grad_norm": 0.5506221055984497,
"kl": 0.0001614093780517578,
"learning_rate": 2.894736842105263e-07,
"loss": 0.2958,
"reward": 0.7109375,
"reward_std": 0.1341523937880993,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 1107.546875,
"epoch": 0.031704095112285335,
"grad_norm": 0.423910528421402,
"kl": 0.000125885009765625,
"learning_rate": 3.157894736842105e-07,
"loss": 0.2614,
"reward": 0.9609375,
"reward_std": 0.11495335027575493,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1075.453125,
"epoch": 0.034346103038309116,
"grad_norm": 0.6421769857406616,
"kl": 0.0001609325408935547,
"learning_rate": 3.4210526315789473e-07,
"loss": 0.3804,
"reward": 0.70703125,
"reward_std": 0.11874673143029213,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 1325.046875,
"epoch": 0.036988110964332896,
"grad_norm": 0.5751165151596069,
"kl": 0.00011897087097167969,
"learning_rate": 3.684210526315789e-07,
"loss": 0.3482,
"reward": 0.9296875,
"reward_std": 0.15341992676258087,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4296875,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 1087.34375,
"epoch": 0.03963011889035667,
"grad_norm": 0.6110666394233704,
"kl": 0.00010585784912109375,
"learning_rate": 3.9473684210526315e-07,
"loss": 0.3665,
"reward": 0.95703125,
"reward_std": 0.1287429742515087,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1314.15625,
"epoch": 0.04227212681638045,
"grad_norm": 0.5642758011817932,
"kl": 0.00013065338134765625,
"learning_rate": 4.2105263157894733e-07,
"loss": 0.4046,
"reward": 0.90625,
"reward_std": 0.13578036427497864,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.40625,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1289.359375,
"epoch": 0.04491413474240423,
"grad_norm": 0.4779168963432312,
"kl": 8.845329284667969e-05,
"learning_rate": 4.4736842105263156e-07,
"loss": 0.2965,
"reward": 1.421875,
"reward_std": 0.12279411032795906,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.421875,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 1349.9375,
"epoch": 0.047556142668428,
"grad_norm": 0.4716605842113495,
"kl": 0.00012004375457763672,
"learning_rate": 4.7368421052631574e-07,
"loss": 0.3496,
"reward": 0.66796875,
"reward_std": 0.14581536501646042,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.41796875,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 775.328125,
"epoch": 0.05019815059445178,
"grad_norm": 0.5275957584381104,
"kl": 9.936094284057617e-05,
"learning_rate": 5e-07,
"loss": 0.3465,
"reward": 0.734375,
"reward_std": 0.08240052312612534,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1096.671875,
"epoch": 0.052840158520475564,
"grad_norm": 0.622590959072113,
"kl": 0.00011599063873291016,
"learning_rate": 5.263157894736842e-07,
"loss": 0.3991,
"reward": 0.95703125,
"reward_std": 0.09287451207637787,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1145.40625,
"epoch": 0.05548216644649934,
"grad_norm": 0.5628076195716858,
"kl": 7.984042167663574e-05,
"learning_rate": 5.526315789473684e-07,
"loss": 0.3009,
"reward": 0.7109375,
"reward_std": 0.111817117780447,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 829.109375,
"epoch": 0.05812417437252312,
"grad_norm": 0.5253135561943054,
"kl": 0.0001367330551147461,
"learning_rate": 5.789473684210526e-07,
"loss": 0.243,
"reward": 1.4921875,
"reward_std": 0.1498083807528019,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 1060.53125,
"epoch": 0.0607661822985469,
"grad_norm": 0.624118983745575,
"kl": 7.021427154541016e-05,
"learning_rate": 6.052631578947368e-07,
"loss": 0.4002,
"reward": 1.21484375,
"reward_std": 0.1456764042377472,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46484375,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 976.578125,
"epoch": 0.06340819022457067,
"grad_norm": 0.46764305233955383,
"kl": 0.0001266002655029297,
"learning_rate": 6.31578947368421e-07,
"loss": 0.2928,
"reward": 1.20703125,
"reward_std": 0.096083864569664,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1207.015625,
"epoch": 0.06605019815059446,
"grad_norm": 0.39954128861427307,
"kl": 0.00010007619857788086,
"learning_rate": 6.578947368421053e-07,
"loss": 0.1622,
"reward": 0.953125,
"reward_std": 0.15208648890256882,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.453125,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 783.921875,
"epoch": 0.06869220607661823,
"grad_norm": 0.4758118689060211,
"kl": 8.118152618408203e-05,
"learning_rate": 6.842105263157895e-07,
"loss": 0.2011,
"reward": 0.96875,
"reward_std": 0.07889671996235847,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46875,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 920.515625,
"epoch": 0.071334214002642,
"grad_norm": 0.7195703387260437,
"kl": 9.21487808227539e-05,
"learning_rate": 7.105263157894736e-07,
"loss": 0.2896,
"reward": 0.984375,
"reward_std": 0.10958803817629814,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 1289.40625,
"epoch": 0.07397622192866579,
"grad_norm": 0.4253327548503876,
"kl": 9.363889694213867e-05,
"learning_rate": 7.368421052631578e-07,
"loss": 0.0989,
"reward": 0.9375,
"reward_std": 0.1678653284907341,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1002.25,
"epoch": 0.07661822985468957,
"grad_norm": 0.7329438924789429,
"kl": 0.0001462697982788086,
"learning_rate": 7.631578947368421e-07,
"loss": 0.4594,
"reward": 0.95703125,
"reward_std": 0.11983717978000641,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 1205.53125,
"epoch": 0.07926023778071334,
"grad_norm": 0.7603439092636108,
"kl": 0.00011014938354492188,
"learning_rate": 7.894736842105263e-07,
"loss": 0.4604,
"reward": 0.9375,
"reward_std": 0.1396191380918026,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1225.0625,
"epoch": 0.08190224570673713,
"grad_norm": 0.586107075214386,
"kl": 0.0001385211944580078,
"learning_rate": 8.157894736842105e-07,
"loss": 0.2906,
"reward": 0.7109375,
"reward_std": 0.15029004588723183,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4453125,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 1195.609375,
"epoch": 0.0845442536327609,
"grad_norm": 0.5367782711982727,
"kl": 0.00018310546875,
"learning_rate": 8.421052631578947e-07,
"loss": 0.3054,
"reward": 1.18359375,
"reward_std": 0.1250832974910736,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.43359375,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1193.53125,
"epoch": 0.08718626155878467,
"grad_norm": 0.6531537771224976,
"kl": 0.0001990795135498047,
"learning_rate": 8.684210526315789e-07,
"loss": 0.382,
"reward": 0.93359375,
"reward_std": 0.10596734657883644,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.43359375,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 975.796875,
"epoch": 0.08982826948480846,
"grad_norm": 0.7079041004180908,
"kl": 0.0002675056457519531,
"learning_rate": 8.947368421052631e-07,
"loss": 0.3162,
"reward": 0.9921875,
"reward_std": 0.11211910098791122,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 1131.34375,
"epoch": 0.09247027741083223,
"grad_norm": 0.5116021037101746,
"kl": 0.0003204345703125,
"learning_rate": 9.210526315789473e-07,
"loss": 0.3366,
"reward": 1.19140625,
"reward_std": 0.14293401315808296,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.44140625,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1189.828125,
"epoch": 0.095112285336856,
"grad_norm": 0.5107906460762024,
"kl": 0.0003094673156738281,
"learning_rate": 9.473684210526315e-07,
"loss": 0.328,
"reward": 1.20703125,
"reward_std": 0.15370117127895355,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1482.453125,
"epoch": 0.0977542932628798,
"grad_norm": 0.46826329827308655,
"kl": 0.0004634857177734375,
"learning_rate": 9.736842105263158e-07,
"loss": 0.2712,
"reward": 1.16015625,
"reward_std": 0.1653159111738205,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.41015625,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 1108.578125,
"epoch": 0.10039630118890357,
"grad_norm": 0.5141110420227051,
"kl": 0.0006732940673828125,
"learning_rate": 1e-06,
"loss": 0.1843,
"reward": 0.97265625,
"reward_std": 0.11588806286454201,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1196.65625,
"epoch": 0.10303830911492734,
"grad_norm": 0.5530170202255249,
"kl": 0.000946044921875,
"learning_rate": 9.999807902665155e-07,
"loss": 0.2593,
"reward": 0.9609375,
"reward_std": 0.1273726001381874,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 822.328125,
"epoch": 0.10568031704095113,
"grad_norm": 0.6078239679336548,
"kl": 0.001224517822265625,
"learning_rate": 9.999231627061236e-07,
"loss": 0.2837,
"reward": 0.9921875,
"reward_std": 0.10058118030428886,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 995.421875,
"epoch": 0.1083223249669749,
"grad_norm": 0.6204021573066711,
"kl": 0.001720428466796875,
"learning_rate": 9.998271222388693e-07,
"loss": 0.4368,
"reward": 1.2265625,
"reward_std": 0.13393215090036392,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4765625,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 1254.515625,
"epoch": 0.11096433289299867,
"grad_norm": 0.6290051937103271,
"kl": 0.0020294189453125,
"learning_rate": 9.996926770643603e-07,
"loss": 0.3358,
"reward": 0.94921875,
"reward_std": 0.13193362578749657,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.44921875,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 885.078125,
"epoch": 0.11360634081902246,
"grad_norm": 0.38145869970321655,
"kl": 0.0020084381103515625,
"learning_rate": 9.995198386610676e-07,
"loss": 0.1421,
"reward": 1.2421875,
"reward_std": 0.09872931987047195,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 1162.296875,
"epoch": 0.11624834874504623,
"grad_norm": 0.5801534056663513,
"kl": 0.00255584716796875,
"learning_rate": 9.993086217853452e-07,
"loss": 0.3938,
"reward": 0.9375,
"reward_std": 0.12491972371935844,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 946.984375,
"epoch": 0.11889035667107001,
"grad_norm": 0.7080899477005005,
"kl": 0.00287628173828125,
"learning_rate": 9.990590444701706e-07,
"loss": 0.3176,
"reward": 0.71484375,
"reward_std": 0.07072163559496403,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46484375,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 1258.78125,
"epoch": 0.1215323645970938,
"grad_norm": 0.6584539413452148,
"kl": 0.00337982177734375,
"learning_rate": 9.987711280236046e-07,
"loss": 0.3364,
"reward": 0.9296875,
"reward_std": 0.10684756934642792,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4296875,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 1006.984375,
"epoch": 0.12417437252311757,
"grad_norm": 0.5412375926971436,
"kl": 0.003643035888671875,
"learning_rate": 9.984448970269725e-07,
"loss": 0.2438,
"reward": 1.25390625,
"reward_std": 0.16918476670980453,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.50390625,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1165.140625,
"epoch": 0.12681638044914134,
"grad_norm": 0.5502119064331055,
"kl": 0.00435638427734375,
"learning_rate": 9.980803793327655e-07,
"loss": 0.329,
"reward": 0.73046875,
"reward_std": 0.17235729470849037,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46484375,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1094.59375,
"epoch": 0.12945838837516513,
"grad_norm": 0.6746593713760376,
"kl": 0.0046234130859375,
"learning_rate": 9.976776060622625e-07,
"loss": 0.2585,
"reward": 0.68359375,
"reward_std": 0.11046826094388962,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.43359375,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 894.875,
"epoch": 0.13210039630118892,
"grad_norm": 0.6030331254005432,
"kl": 0.0045623779296875,
"learning_rate": 9.972366116028733e-07,
"loss": 0.1373,
"reward": 1.2265625,
"reward_std": 0.11612267419695854,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4765625,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 660.078125,
"epoch": 0.13474240422721268,
"grad_norm": 0.7342778444290161,
"kl": 0.00536346435546875,
"learning_rate": 9.96757433605202e-07,
"loss": 0.2687,
"reward": 1.26171875,
"reward_std": 0.11859130859375,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 1063.71875,
"epoch": 0.13738441215323646,
"grad_norm": 0.7268034219741821,
"kl": 0.00653076171875,
"learning_rate": 9.962401129798343e-07,
"loss": 0.3436,
"reward": 0.98046875,
"reward_std": 0.15140536800026894,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48046875,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 1265.96875,
"epoch": 0.14002642007926025,
"grad_norm": 0.7652710676193237,
"kl": 0.00766754150390625,
"learning_rate": 9.956846938938422e-07,
"loss": 0.4375,
"reward": 0.91015625,
"reward_std": 0.1307620257139206,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.41015625,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 1342.03125,
"epoch": 0.142668428005284,
"grad_norm": 0.6607176065444946,
"kl": 0.0090179443359375,
"learning_rate": 9.950912237670157e-07,
"loss": 0.3436,
"reward": 0.90234375,
"reward_std": 0.1162625178694725,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.40234375,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1300.421875,
"epoch": 0.1453104359313078,
"grad_norm": 0.6878055930137634,
"kl": 0.01092529296875,
"learning_rate": 9.944597532678119e-07,
"loss": 0.3859,
"reward": 1.1640625,
"reward_std": 0.1533336602151394,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4140625,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 1351.71875,
"epoch": 0.14795244385733158,
"grad_norm": 0.6881883144378662,
"kl": 0.01397705078125,
"learning_rate": 9.93790336309031e-07,
"loss": 0.3671,
"reward": 0.92578125,
"reward_std": 0.15761961415410042,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.42578125,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 710.25,
"epoch": 0.15059445178335534,
"grad_norm": 0.5193164348602295,
"kl": 0.0154571533203125,
"learning_rate": 9.930830300432126e-07,
"loss": 0.1832,
"reward": 1.01953125,
"reward_std": 0.11765347048640251,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 1319.109375,
"epoch": 0.15323645970937913,
"grad_norm": 0.6145569086074829,
"kl": 0.0148468017578125,
"learning_rate": 9.923378948577558e-07,
"loss": 0.3036,
"reward": 0.9375,
"reward_std": 0.1474018730223179,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 1229.484375,
"epoch": 0.15587846763540292,
"grad_norm": 0.6062135100364685,
"kl": 0.0187530517578125,
"learning_rate": 9.915549943697644e-07,
"loss": 0.3039,
"reward": 0.92578125,
"reward_std": 0.12412451207637787,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.42578125,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 1122.96875,
"epoch": 0.15852047556142668,
"grad_norm": 0.7750731110572815,
"kl": 0.019989013671875,
"learning_rate": 9.907343954206146e-07,
"loss": 0.4269,
"reward": 0.4609375,
"reward_std": 0.15149712190032005,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1289.796875,
"epoch": 0.16116248348745046,
"grad_norm": 0.4260408282279968,
"kl": 0.023284912109375,
"learning_rate": 9.898761680702495e-07,
"loss": 0.2105,
"reward": 0.66015625,
"reward_std": 0.10409127548336983,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.41015625,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 1395.359375,
"epoch": 0.16380449141347425,
"grad_norm": 0.4302825629711151,
"kl": 0.022216796875,
"learning_rate": 9.889803855911965e-07,
"loss": 0.2882,
"reward": 0.69140625,
"reward_std": 0.17329547554254532,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.44140625,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 1214.203125,
"epoch": 0.166446499339498,
"grad_norm": 0.5709892511367798,
"kl": 0.025421142578125,
"learning_rate": 9.880471244623118e-07,
"loss": 0.2752,
"reward": 0.96484375,
"reward_std": 0.16381771862506866,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46484375,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 1456.640625,
"epoch": 0.1690885072655218,
"grad_norm": 0.4366983473300934,
"kl": 0.03094482421875,
"learning_rate": 9.87076464362251e-07,
"loss": 0.1409,
"reward": 1.21484375,
"reward_std": 0.1545065976679325,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46484375,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 907.890625,
"epoch": 0.17173051519154559,
"grad_norm": 0.5789319276809692,
"kl": 0.0296630859375,
"learning_rate": 9.860684881626674e-07,
"loss": 0.223,
"reward": 1.0234375,
"reward_std": 0.18188364803791046,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1124.03125,
"epoch": 0.17437252311756934,
"grad_norm": 0.8789018988609314,
"kl": 0.03033447265625,
"learning_rate": 9.850232819211343e-07,
"loss": -0.0662,
"reward": 0.9609375,
"reward_std": 0.16317331418395042,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1024.5,
"epoch": 0.17701453104359313,
"grad_norm": 0.7724674344062805,
"kl": 0.03656005859375,
"learning_rate": 9.839409348738e-07,
"loss": 0.2921,
"reward": 1.21875,
"reward_std": 0.12279859185218811,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46875,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 1002.6875,
"epoch": 0.17965653896961692,
"grad_norm": 1.0319114923477173,
"kl": 0.043212890625,
"learning_rate": 9.828215394277686e-07,
"loss": 0.3121,
"reward": 0.97265625,
"reward_std": 0.13220234587788582,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.47265625,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1255.0625,
"epoch": 0.18229854689564068,
"grad_norm": 0.7915776371955872,
"kl": 0.041290283203125,
"learning_rate": 9.816651911532093e-07,
"loss": 0.3672,
"reward": 0.93359375,
"reward_std": 0.16574888676404953,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.43359375,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 1128.546875,
"epoch": 0.18494055482166447,
"grad_norm": 0.577376127243042,
"kl": 0.040679931640625,
"learning_rate": 9.804719887751984e-07,
"loss": 0.1898,
"reward": 1.0078125,
"reward_std": 0.17545727640390396,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1146.234375,
"epoch": 0.18758256274768825,
"grad_norm": 0.5707401633262634,
"kl": 0.034698486328125,
"learning_rate": 9.792420341652901e-07,
"loss": 0.269,
"reward": 1.1796875,
"reward_std": 0.11014671996235847,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4296875,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 1104.234375,
"epoch": 0.190224570673712,
"grad_norm": 0.5689163208007812,
"kl": 0.0369873046875,
"learning_rate": 9.779754323328192e-07,
"loss": 0.3013,
"reward": 0.73046875,
"reward_std": 0.1631980687379837,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48046875,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1476.96875,
"epoch": 0.1928665785997358,
"grad_norm": 0.5846036672592163,
"kl": 0.033660888671875,
"learning_rate": 9.766722914159345e-07,
"loss": 0.2798,
"reward": 0.8984375,
"reward_std": 0.1427699662744999,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3984375,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 1340.578125,
"epoch": 0.1955085865257596,
"grad_norm": 0.4723777174949646,
"kl": 0.035400390625,
"learning_rate": 9.753327226723687e-07,
"loss": 0.2281,
"reward": 0.64453125,
"reward_std": 0.09241959825158119,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.39453125,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 1429.40625,
"epoch": 0.19815059445178335,
"grad_norm": 0.6316815614700317,
"kl": 0.03790283203125,
"learning_rate": 9.73956840469937e-07,
"loss": 0.2594,
"reward": 1.1640625,
"reward_std": 0.14494511112570763,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4140625,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1395.609375,
"epoch": 0.20079260237780713,
"grad_norm": 0.4536829888820648,
"kl": 0.036865234375,
"learning_rate": 9.725447622767754e-07,
"loss": 0.257,
"reward": 1.24609375,
"reward_std": 0.24476346373558044,
"rewards/accuracy_reward": 0.765625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48046875,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1369.703125,
"epoch": 0.20343461030383092,
"grad_norm": 0.519792914390564,
"kl": 0.04010009765625,
"learning_rate": 9.710966086513085e-07,
"loss": 0.2693,
"reward": 0.93359375,
"reward_std": 0.15936565026640892,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.43359375,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 1090.53125,
"epoch": 0.20607661822985468,
"grad_norm": 0.7418442368507385,
"kl": 0.04974365234375,
"learning_rate": 9.6961250323196e-07,
"loss": 0.3581,
"reward": 1.203125,
"reward_std": 0.14408493414521217,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.453125,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1075.609375,
"epoch": 0.20871862615587847,
"grad_norm": 0.4650673270225525,
"kl": 0.046630859375,
"learning_rate": 9.680925727265944e-07,
"loss": 0.1385,
"reward": 0.984375,
"reward_std": 0.13037987425923347,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 1463.03125,
"epoch": 0.21136063408190225,
"grad_norm": 0.44249987602233887,
"kl": 0.047119140625,
"learning_rate": 9.665369469017002e-07,
"loss": 0.1594,
"reward": 0.8984375,
"reward_std": 0.16113372519612312,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3984375,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1128.3125,
"epoch": 0.21400264200792601,
"grad_norm": 0.543846070766449,
"kl": 0.05157470703125,
"learning_rate": 9.649457585713108e-07,
"loss": 0.2237,
"reward": 1.234375,
"reward_std": 0.1662597917020321,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 960.765625,
"epoch": 0.2166446499339498,
"grad_norm": 0.7787006497383118,
"kl": 0.0552978515625,
"learning_rate": 9.633191435856653e-07,
"loss": 0.3572,
"reward": 1.2109375,
"reward_std": 0.12929406948387623,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1094.390625,
"epoch": 0.2192866578599736,
"grad_norm": 0.9358471632003784,
"kl": 0.060302734375,
"learning_rate": 9.616572408196093e-07,
"loss": 0.3621,
"reward": 0.73046875,
"reward_std": 0.18469755724072456,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48046875,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 868.265625,
"epoch": 0.22192866578599735,
"grad_norm": 1.0493205785751343,
"kl": 0.06304931640625,
"learning_rate": 9.599601921607397e-07,
"loss": 0.3486,
"reward": 0.5078125,
"reward_std": 0.16107311472296715,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1090.9375,
"epoch": 0.22457067371202113,
"grad_norm": 0.9199777245521545,
"kl": 0.06231689453125,
"learning_rate": 9.582281424972892e-07,
"loss": 0.3608,
"reward": 0.96484375,
"reward_std": 0.129608154296875,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46484375,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1144.140625,
"epoch": 0.22721268163804492,
"grad_norm": 0.7876753807067871,
"kl": 0.067138671875,
"learning_rate": 9.56461239705758e-07,
"loss": 0.2158,
"reward": 0.44921875,
"reward_std": 0.11367761343717575,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.44921875,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 1346.328125,
"epoch": 0.22985468956406868,
"grad_norm": 0.8156364560127258,
"kl": 0.06951904296875,
"learning_rate": 9.546596346382864e-07,
"loss": 0.2484,
"reward": 0.92578125,
"reward_std": 0.14216843992471695,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.42578125,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 906.40625,
"epoch": 0.23249669749009247,
"grad_norm": 0.6532436013221741,
"kl": 0.083984375,
"learning_rate": 9.528234811097781e-07,
"loss": 0.1984,
"reward": 1.24609375,
"reward_std": 0.10012037679553032,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.49609375,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1103.65625,
"epoch": 0.23513870541611626,
"grad_norm": 0.6433841586112976,
"kl": 0.0770263671875,
"learning_rate": 9.509529358847654e-07,
"loss": 0.1822,
"reward": 0.70703125,
"reward_std": 0.12630900368094444,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 1273.546875,
"epoch": 0.23778071334214002,
"grad_norm": 1.185502529144287,
"kl": 0.106201171875,
"learning_rate": 9.490481586640278e-07,
"loss": 0.3498,
"reward": 0.91796875,
"reward_std": 0.14778802916407585,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.41796875,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1223.328125,
"epoch": 0.2404227212681638,
"grad_norm": 0.6358450055122375,
"kl": 0.1009521484375,
"learning_rate": 9.47109312070955e-07,
"loss": 0.1773,
"reward": 0.74609375,
"reward_std": 0.18448476120829582,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48046875,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 700.5625,
"epoch": 0.2430647291941876,
"grad_norm": 0.750359058380127,
"kl": 0.1322021484375,
"learning_rate": 9.45136561637664e-07,
"loss": 0.1891,
"reward": 1.046875,
"reward_std": 0.14496402069926262,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 863.90625,
"epoch": 0.24570673712021135,
"grad_norm": 0.557322084903717,
"kl": 0.1099853515625,
"learning_rate": 9.431300757908663e-07,
"loss": 0.1089,
"reward": 1.30078125,
"reward_std": 0.15019455552101135,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 970.015625,
"epoch": 0.24834874504623514,
"grad_norm": 0.731271505355835,
"kl": 0.12158203125,
"learning_rate": 9.410900258374876e-07,
"loss": 0.1692,
"reward": 0.76953125,
"reward_std": 0.17832617834210396,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 924.6875,
"epoch": 0.2509907529722589,
"grad_norm": 1.327541708946228,
"kl": 0.14990234375,
"learning_rate": 9.390165859500435e-07,
"loss": 0.2367,
"reward": 0.5234375,
"reward_std": 0.1663740910589695,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 1177.421875,
"epoch": 0.2536327608982827,
"grad_norm": 1.7957454919815063,
"kl": 0.165771484375,
"learning_rate": 9.369099331517676e-07,
"loss": 0.3655,
"reward": 0.9453125,
"reward_std": 0.17608627676963806,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4453125,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1124.71875,
"epoch": 0.2562747688243065,
"grad_norm": 1.353155493736267,
"kl": 0.1519775390625,
"learning_rate": 9.34770247301499e-07,
"loss": 0.2683,
"reward": 1.2109375,
"reward_std": 0.11838950589299202,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4609375,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 710.703125,
"epoch": 0.25891677675033026,
"grad_norm": 1.821932077407837,
"kl": 0.19970703125,
"learning_rate": 9.325977110783263e-07,
"loss": 0.1213,
"reward": 1.52734375,
"reward_std": 0.14770140498876572,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 910.125,
"epoch": 0.261558784676354,
"grad_norm": 0.8406642079353333,
"kl": 0.185546875,
"learning_rate": 9.30392509965991e-07,
"loss": 0.1623,
"reward": 1.015625,
"reward_std": 0.1544700786471367,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.515625,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1078.859375,
"epoch": 0.26420079260237783,
"grad_norm": 1.6371651887893677,
"kl": 0.225341796875,
"learning_rate": 9.281548322370517e-07,
"loss": 0.2703,
"reward": 0.72265625,
"reward_std": 0.14984130859375,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.47265625,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 532.15625,
"epoch": 0.2668428005284016,
"grad_norm": 2.1254074573516846,
"kl": 0.246826171875,
"learning_rate": 9.258848689368094e-07,
"loss": 0.2214,
"reward": 1.2578125,
"reward_std": 0.10374833643436432,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 848.5625,
"epoch": 0.26948480845442535,
"grad_norm": 2.5907938480377197,
"kl": 0.2958984375,
"learning_rate": 9.235828138669978e-07,
"loss": 0.3198,
"reward": 1.01171875,
"reward_std": 0.137377567589283,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1073.3125,
"epoch": 0.27212681638044917,
"grad_norm": 2.286487102508545,
"kl": 0.2607421875,
"learning_rate": 9.21248863569236e-07,
"loss": 0.3082,
"reward": 0.97265625,
"reward_std": 0.15867146104574203,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.47265625,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 858.796875,
"epoch": 0.2747688243064729,
"grad_norm": 1.7667056322097778,
"kl": 0.33837890625,
"learning_rate": 9.188832173082495e-07,
"loss": 0.2436,
"reward": 0.71875,
"reward_std": 0.10251419246196747,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46875,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1045.796875,
"epoch": 0.2774108322324967,
"grad_norm": 2.42461895942688,
"kl": 0.40380859375,
"learning_rate": 9.164860770548567e-07,
"loss": 0.2974,
"reward": 0.9921875,
"reward_std": 0.16395077854394913,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 893.453125,
"epoch": 0.2800528401585205,
"grad_norm": 10.34216594696045,
"kl": 0.474609375,
"learning_rate": 9.140576474687263e-07,
"loss": 0.294,
"reward": 0.97265625,
"reward_std": 0.1429976001381874,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.47265625,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1402.546875,
"epoch": 0.28269484808454426,
"grad_norm": 5.165650367736816,
"kl": 0.5849609375,
"learning_rate": 9.11598135880903e-07,
"loss": 0.3739,
"reward": 0.6484375,
"reward_std": 0.16659503430128098,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3984375,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 959.703125,
"epoch": 0.285336856010568,
"grad_norm": 5.434719562530518,
"kl": 0.6767578125,
"learning_rate": 9.091077522761078e-07,
"loss": 0.421,
"reward": 0.9765625,
"reward_std": 0.13730589486658573,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4765625,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1104.828125,
"epoch": 0.28797886393659183,
"grad_norm": 1.7607016563415527,
"kl": 0.40234375,
"learning_rate": 9.065867092748082e-07,
"loss": 0.205,
"reward": 0.71875,
"reward_std": 0.16618655994534492,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46875,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 1261.84375,
"epoch": 0.2906208718626156,
"grad_norm": 3.3362314701080322,
"kl": 0.57373046875,
"learning_rate": 9.040352221150674e-07,
"loss": 0.3039,
"reward": 0.71875,
"reward_std": 0.2016766332089901,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46875,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 850.640625,
"epoch": 0.29326287978863935,
"grad_norm": 3.9499456882476807,
"kl": 0.54296875,
"learning_rate": 9.014535086341669e-07,
"loss": 0.3804,
"reward": 1.234375,
"reward_std": 0.14762691780924797,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 871.765625,
"epoch": 0.29590488771466317,
"grad_norm": 4.223949432373047,
"kl": 0.5234375,
"learning_rate": 8.988417892500083e-07,
"loss": 0.3621,
"reward": 1.2734375,
"reward_std": 0.18184370175004005,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 711.078125,
"epoch": 0.2985468956406869,
"grad_norm": 10.757521629333496,
"kl": 0.53955078125,
"learning_rate": 8.962002869422955e-07,
"loss": 0.6943,
"reward": 0.484375,
"reward_std": 0.17551938444375992,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.484375,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 485.96875,
"epoch": 0.3011889035667107,
"grad_norm": 6.041623592376709,
"kl": 0.59326171875,
"learning_rate": 8.935292272334963e-07,
"loss": 0.4734,
"reward": 0.76953125,
"reward_std": 0.13621540740132332,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 615.203125,
"epoch": 0.3038309114927345,
"grad_norm": 2.360245943069458,
"kl": 0.60302734375,
"learning_rate": 8.908288381695892e-07,
"loss": 0.2661,
"reward": 1.2578125,
"reward_std": 0.1489735022187233,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 851.28125,
"epoch": 0.30647291941875826,
"grad_norm": 6.270340442657471,
"kl": 0.8740234375,
"learning_rate": 8.88099350300593e-07,
"loss": 0.5072,
"reward": 0.73046875,
"reward_std": 0.15848717093467712,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48046875,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1406.65625,
"epoch": 0.309114927344782,
"grad_norm": 4.970353126525879,
"kl": 1.427734375,
"learning_rate": 8.853409966608831e-07,
"loss": 0.3739,
"reward": 0.65234375,
"reward_std": 0.15436260029673576,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.40234375,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 682.796875,
"epoch": 0.31175693527080584,
"grad_norm": 11.649397850036621,
"kl": 1.416015625,
"learning_rate": 8.825540127492965e-07,
"loss": 0.582,
"reward": 1.2734375,
"reward_std": 0.16201764903962612,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 346.234375,
"epoch": 0.3143989431968296,
"grad_norm": 6.038275241851807,
"kl": 1.6640625,
"learning_rate": 8.797386365090252e-07,
"loss": 0.4335,
"reward": 1.3046875,
"reward_std": 0.16278167814016342,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 878.03125,
"epoch": 0.31704095112285335,
"grad_norm": 12.164133071899414,
"kl": 2.13671875,
"learning_rate": 8.768951083073009e-07,
"loss": 0.8115,
"reward": 0.9921875,
"reward_std": 0.1910713165998459,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4921875,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 404.671875,
"epoch": 0.31968295904887717,
"grad_norm": 9.305420875549316,
"kl": 2.453125,
"learning_rate": 8.740236709148745e-07,
"loss": 0.6232,
"reward": 1.29296875,
"reward_std": 0.1861564740538597,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.54296875,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 390.578125,
"epoch": 0.32232496697490093,
"grad_norm": 11.043706893920898,
"kl": 2.4150390625,
"learning_rate": 8.711245694852886e-07,
"loss": 0.4605,
"reward": 1.296875,
"reward_std": 0.20820768922567368,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 501.625,
"epoch": 0.3249669749009247,
"grad_norm": 10.729813575744629,
"kl": 2.490234375,
"learning_rate": 8.681980515339463e-07,
"loss": 0.6364,
"reward": 0.8359375,
"reward_std": 0.23206235468387604,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 466.9375,
"epoch": 0.3276089828269485,
"grad_norm": 7.306431770324707,
"kl": 2.515625,
"learning_rate": 8.652443669169809e-07,
"loss": 0.5031,
"reward": 0.5625,
"reward_std": 0.18624207936227322,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 435.640625,
"epoch": 0.33025099075297226,
"grad_norm": 9.731188774108887,
"kl": 3.28515625,
"learning_rate": 8.622637678099224e-07,
"loss": 0.7344,
"reward": 1.01171875,
"reward_std": 0.16986817121505737,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 291.671875,
"epoch": 0.332892998678996,
"grad_norm": 11.137627601623535,
"kl": 3.392578125,
"learning_rate": 8.592565086861681e-07,
"loss": 0.3762,
"reward": 1.01953125,
"reward_std": 0.1285141110420227,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 543.8125,
"epoch": 0.33553500660501984,
"grad_norm": 16.820133209228516,
"kl": 3.1875,
"learning_rate": 8.562228462952576e-07,
"loss": 0.2899,
"reward": 1.28125,
"reward_std": 0.1833672672510147,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53125,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 232.859375,
"epoch": 0.3381770145310436,
"grad_norm": 10.55738353729248,
"kl": 2.62939453125,
"learning_rate": 8.531630396409507e-07,
"loss": 0.2709,
"reward": 1.06640625,
"reward_std": 0.12935607135295868,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 718.671875,
"epoch": 0.34081902245706736,
"grad_norm": 10.954379081726074,
"kl": 3.91015625,
"learning_rate": 8.500773499591156e-07,
"loss": 0.3251,
"reward": 0.5078125,
"reward_std": 0.10781864821910858,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 489.109375,
"epoch": 0.34346103038309117,
"grad_norm": 10.081979751586914,
"kl": 2.50390625,
"learning_rate": 8.469660406954252e-07,
"loss": 0.4498,
"reward": 0.796875,
"reward_std": 0.20939984917640686,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 377.484375,
"epoch": 0.34610303830911493,
"grad_norm": 4.734899520874023,
"kl": 1.208984375,
"learning_rate": 8.438293774828649e-07,
"loss": 0.2461,
"reward": 1.3046875,
"reward_std": 0.16797470301389694,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 353.15625,
"epoch": 0.3487450462351387,
"grad_norm": 3.945875883102417,
"kl": 1.7080078125,
"learning_rate": 8.406676281190542e-07,
"loss": 0.2267,
"reward": 0.83984375,
"reward_std": 0.172641359269619,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 632.921875,
"epoch": 0.3513870541611625,
"grad_norm": 26.704730987548828,
"kl": 1.767578125,
"learning_rate": 8.374810625433825e-07,
"loss": 0.7894,
"reward": 1.02734375,
"reward_std": 0.21192153729498386,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 478.828125,
"epoch": 0.35402906208718626,
"grad_norm": 23.016502380371094,
"kl": 1.65234375,
"learning_rate": 8.342699528139628e-07,
"loss": 0.5162,
"reward": 1.015625,
"reward_std": 0.1322025004774332,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.515625,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 613.46875,
"epoch": 0.35667107001321,
"grad_norm": 5.931519985198975,
"kl": 2.02734375,
"learning_rate": 8.310345730844047e-07,
"loss": 0.4553,
"reward": 1.3125,
"reward_std": 0.21167393401265144,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 500.46875,
"epoch": 0.35931307793923384,
"grad_norm": 7.461983680725098,
"kl": 1.9765625,
"learning_rate": 8.277751995804067e-07,
"loss": 0.3654,
"reward": 1.0234375,
"reward_std": 0.1544732078909874,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 439.890625,
"epoch": 0.3619550858652576,
"grad_norm": 3.8175482749938965,
"kl": 2.041015625,
"learning_rate": 8.244921105761755e-07,
"loss": 0.3475,
"reward": 1.07421875,
"reward_std": 0.23262840881943703,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 442.796875,
"epoch": 0.36459709379128136,
"grad_norm": 11.061271667480469,
"kl": 1.546875,
"learning_rate": 8.211855863706654e-07,
"loss": 0.5592,
"reward": 1.2890625,
"reward_std": 0.17124063521623611,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5390625,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 334.875,
"epoch": 0.36723910171730517,
"grad_norm": 12.917343139648438,
"kl": 2.42578125,
"learning_rate": 8.178559092636484e-07,
"loss": 0.1005,
"reward": 0.6015625,
"reward_std": 0.1888568513095379,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 815.125,
"epoch": 0.36988110964332893,
"grad_norm": 4.946498394012451,
"kl": 2.6484375,
"learning_rate": 8.145033635316128e-07,
"loss": 0.4205,
"reward": 0.51171875,
"reward_std": 0.19404659420251846,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 358.640625,
"epoch": 0.3725231175693527,
"grad_norm": 4.1423869132995605,
"kl": 2.376953125,
"learning_rate": 8.111282354034921e-07,
"loss": 0.362,
"reward": 1.0546875,
"reward_std": 0.1854284517467022,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 220.828125,
"epoch": 0.3751651254953765,
"grad_norm": 14.8277006149292,
"kl": 3.98828125,
"learning_rate": 8.077308130362273e-07,
"loss": 0.1853,
"reward": 1.0390625,
"reward_std": 0.12213464453816414,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5390625,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 880.515625,
"epoch": 0.37780713342140027,
"grad_norm": 4.2313103675842285,
"kl": 2.3984375,
"learning_rate": 8.043113864901663e-07,
"loss": 0.4005,
"reward": 1.20703125,
"reward_std": 0.1507197804749012,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.45703125,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 561.265625,
"epoch": 0.380449141347424,
"grad_norm": 7.7739458084106445,
"kl": 2.126953125,
"learning_rate": 8.008702477042985e-07,
"loss": 0.4939,
"reward": 1.3203125,
"reward_std": 0.20398560166358948,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 698.015625,
"epoch": 0.38309114927344784,
"grad_norm": 6.112682342529297,
"kl": 2.115234375,
"learning_rate": 7.974076904713301e-07,
"loss": 0.4279,
"reward": 0.73828125,
"reward_std": 0.09649410098791122,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48828125,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 359.640625,
"epoch": 0.3857331571994716,
"grad_norm": 11.499645233154297,
"kl": 1.916015625,
"learning_rate": 7.939240104126022e-07,
"loss": 0.4661,
"reward": 1.04296875,
"reward_std": 0.1618601270020008,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.54296875,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 364.5,
"epoch": 0.38837516512549536,
"grad_norm": 5.250813961029053,
"kl": 2.73046875,
"learning_rate": 7.904195049528497e-07,
"loss": 0.4228,
"reward": 1.09765625,
"reward_std": 0.2164350003004074,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 624.75,
"epoch": 0.3910171730515192,
"grad_norm": 12.445371627807617,
"kl": 1.9091796875,
"learning_rate": 7.8689447329481e-07,
"loss": 0.5554,
"reward": 1.015625,
"reward_std": 0.20019326359033585,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.515625,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 388.015625,
"epoch": 0.39365918097754293,
"grad_norm": 14.60313606262207,
"kl": 3.12890625,
"learning_rate": 7.833492163936773e-07,
"loss": 0.2208,
"reward": 1.01953125,
"reward_std": 0.15205424278974533,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 278.0625,
"epoch": 0.3963011889035667,
"grad_norm": 9.847626686096191,
"kl": 2.1591796875,
"learning_rate": 7.797840369314081e-07,
"loss": 0.5313,
"reward": 0.5546875,
"reward_std": 0.17377189174294472,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 413.171875,
"epoch": 0.3989431968295905,
"grad_norm": 10.47969913482666,
"kl": 3.029296875,
"learning_rate": 7.761992392908791e-07,
"loss": 0.391,
"reward": 0.78515625,
"reward_std": 0.1711183786392212,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53515625,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 256.0,
"epoch": 0.40158520475561427,
"grad_norm": 27.210330963134766,
"kl": 3.328125,
"learning_rate": 7.725951295299005e-07,
"loss": 0.8581,
"reward": 1.56640625,
"reward_std": 0.18129342049360275,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 608.375,
"epoch": 0.404227212681638,
"grad_norm": 7.893120765686035,
"kl": 3.9453125,
"learning_rate": 7.689720153550853e-07,
"loss": 0.5819,
"reward": 0.73828125,
"reward_std": 0.13392486423254013,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.48828125,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 431.484375,
"epoch": 0.40686922060766184,
"grad_norm": 6.096236705780029,
"kl": 3.18359375,
"learning_rate": 7.653302060955789e-07,
"loss": 0.4258,
"reward": 1.078125,
"reward_std": 0.20535630360245705,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 403.328125,
"epoch": 0.4095112285336856,
"grad_norm": 9.526097297668457,
"kl": 3.87890625,
"learning_rate": 7.616700126766492e-07,
"loss": 0.6043,
"reward": 1.05078125,
"reward_std": 0.15629850327968597,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 392.953125,
"epoch": 0.41215323645970936,
"grad_norm": 13.829514503479004,
"kl": 4.109375,
"learning_rate": 7.579917475931409e-07,
"loss": 0.3873,
"reward": 0.52734375,
"reward_std": 0.18767033517360687,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 331.265625,
"epoch": 0.4147952443857332,
"grad_norm": 12.386381149291992,
"kl": 3.4296875,
"learning_rate": 7.54295724882796e-07,
"loss": 0.7169,
"reward": 1.328125,
"reward_std": 0.2166232354938984,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 457.0,
"epoch": 0.41743725231175693,
"grad_norm": 7.208274841308594,
"kl": 4.09375,
"learning_rate": 7.505822600994423e-07,
"loss": 0.6254,
"reward": 1.28515625,
"reward_std": 0.17519249208271503,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53515625,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 385.28125,
"epoch": 0.4200792602377807,
"grad_norm": 10.335708618164062,
"kl": 4.54296875,
"learning_rate": 7.468516702860519e-07,
"loss": 0.5237,
"reward": 0.51953125,
"reward_std": 0.18916139006614685,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 517.96875,
"epoch": 0.4227212681638045,
"grad_norm": 15.89622688293457,
"kl": 3.72265625,
"learning_rate": 7.43104273947674e-07,
"loss": 0.3898,
"reward": 1.01953125,
"reward_std": 0.17299087904393673,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 307.828125,
"epoch": 0.42536327608982827,
"grad_norm": 8.838927268981934,
"kl": 2.689453125,
"learning_rate": 7.393403910242418e-07,
"loss": 0.4323,
"reward": 1.02734375,
"reward_std": 0.13064508698880672,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 431.3125,
"epoch": 0.42800528401585203,
"grad_norm": 15.761492729187012,
"kl": 2.98828125,
"learning_rate": 7.355603428632565e-07,
"loss": 0.23,
"reward": 1.3671875,
"reward_std": 0.22000113874673843,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 582.75,
"epoch": 0.43064729194187584,
"grad_norm": 14.52424144744873,
"kl": 2.109375,
"learning_rate": 7.317644521923526e-07,
"loss": 0.5996,
"reward": 0.7578125,
"reward_std": 0.1417398639023304,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 364.53125,
"epoch": 0.4332892998678996,
"grad_norm": 12.958600044250488,
"kl": 1.623046875,
"learning_rate": 7.279530430917441e-07,
"loss": 0.0741,
"reward": 0.796875,
"reward_std": 0.1477682925760746,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 416.53125,
"epoch": 0.43593130779392336,
"grad_norm": 9.96493911743164,
"kl": 1.7197265625,
"learning_rate": 7.241264409665554e-07,
"loss": 0.441,
"reward": 0.82421875,
"reward_std": 0.21464627608656883,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 582.5625,
"epoch": 0.4385733157199472,
"grad_norm": 7.888613224029541,
"kl": 1.72265625,
"learning_rate": 7.202849725190397e-07,
"loss": 0.3068,
"reward": 1.0078125,
"reward_std": 0.17024145647883415,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 534.1875,
"epoch": 0.44121532364597094,
"grad_norm": 8.195699691772461,
"kl": 1.58642578125,
"learning_rate": 7.16428965720686e-07,
"loss": 0.3543,
"reward": 0.8046875,
"reward_std": 0.2195490226149559,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 262.46875,
"epoch": 0.4438573315719947,
"grad_norm": 11.303885459899902,
"kl": 0.970703125,
"learning_rate": 7.125587497842189e-07,
"loss": 0.4021,
"reward": 0.80078125,
"reward_std": 0.1908670738339424,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 507.84375,
"epoch": 0.4464993394980185,
"grad_norm": 18.99937629699707,
"kl": 1.3828125,
"learning_rate": 7.086746551354895e-07,
"loss": 0.5214,
"reward": 0.76953125,
"reward_std": 0.1896660476922989,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 481.09375,
"epoch": 0.44914134742404227,
"grad_norm": 16.57875633239746,
"kl": 1.5625,
"learning_rate": 7.047770133852676e-07,
"loss": 0.4899,
"reward": 1.0546875,
"reward_std": 0.19582437723875046,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 434.171875,
"epoch": 0.45178335535006603,
"grad_norm": 6.7548298835754395,
"kl": 1.451171875,
"learning_rate": 7.008661573009273e-07,
"loss": 0.3438,
"reward": 1.30078125,
"reward_std": 0.1738675981760025,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 398.4375,
"epoch": 0.45442536327608984,
"grad_norm": 11.90649127960205,
"kl": 1.791015625,
"learning_rate": 6.969424207780374e-07,
"loss": 0.1403,
"reward": 1.3515625,
"reward_std": 0.2295953370630741,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 567.515625,
"epoch": 0.4570673712021136,
"grad_norm": 4.553245544433594,
"kl": 2.3828125,
"learning_rate": 6.930061388118557e-07,
"loss": 0.4131,
"reward": 1.05859375,
"reward_std": 0.21736154332756996,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55859375,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 522.15625,
"epoch": 0.45970937912813736,
"grad_norm": 10.5054931640625,
"kl": 2.76171875,
"learning_rate": 6.890576474687263e-07,
"loss": 0.2456,
"reward": 0.76171875,
"reward_std": 0.17176654934883118,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 760.015625,
"epoch": 0.4623513870541612,
"grad_norm": 12.109650611877441,
"kl": 4.10546875,
"learning_rate": 6.850972838573888e-07,
"loss": 0.4345,
"reward": 0.7578125,
"reward_std": 0.17381427809596062,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 670.84375,
"epoch": 0.46499339498018494,
"grad_norm": 9.500724792480469,
"kl": 3.11328125,
"learning_rate": 6.811253861001961e-07,
"loss": 0.448,
"reward": 0.8125,
"reward_std": 0.2038702666759491,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 668.578125,
"epoch": 0.4676354029062087,
"grad_norm": 3.1513185501098633,
"kl": 2.83984375,
"learning_rate": 6.771422933042477e-07,
"loss": 0.4486,
"reward": 0.7734375,
"reward_std": 0.19701149314641953,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 460.640625,
"epoch": 0.4702774108322325,
"grad_norm": 3.928485631942749,
"kl": 2.52734375,
"learning_rate": 6.731483455324374e-07,
"loss": 0.4601,
"reward": 0.55078125,
"reward_std": 0.1819697804749012,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 500.734375,
"epoch": 0.47291941875825627,
"grad_norm": 5.9308905601501465,
"kl": 2.89453125,
"learning_rate": 6.691438837744191e-07,
"loss": 0.5959,
"reward": 1.0859375,
"reward_std": 0.24082761257886887,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 318.859375,
"epoch": 0.47556142668428003,
"grad_norm": 8.880630493164062,
"kl": 2.07373046875,
"learning_rate": 6.651292499174959e-07,
"loss": 0.2224,
"reward": 1.0703125,
"reward_std": 0.18467539176344872,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 504.203125,
"epoch": 0.47820343461030385,
"grad_norm": 7.29809045791626,
"kl": 2.671875,
"learning_rate": 6.611047867174298e-07,
"loss": 0.5424,
"reward": 0.796875,
"reward_std": 0.19480633921921253,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 693.046875,
"epoch": 0.4808454425363276,
"grad_norm": 7.5113844871521,
"kl": 3.078125,
"learning_rate": 6.570708377691783e-07,
"loss": 0.6193,
"reward": 1.5859375,
"reward_std": 0.2526575177907944,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 504.484375,
"epoch": 0.48348745046235136,
"grad_norm": 8.909899711608887,
"kl": 2.7265625,
"learning_rate": 6.530277474775602e-07,
"loss": 0.572,
"reward": 1.31640625,
"reward_std": 0.20270539075136185,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 451.421875,
"epoch": 0.4861294583883752,
"grad_norm": 20.32670021057129,
"kl": 2.5546875,
"learning_rate": 6.489758610278509e-07,
"loss": 0.4425,
"reward": 1.08203125,
"reward_std": 0.21750707924365997,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 586.625,
"epoch": 0.48877146631439894,
"grad_norm": 6.589134693145752,
"kl": 2.375,
"learning_rate": 6.449155243563114e-07,
"loss": 0.4211,
"reward": 0.546875,
"reward_std": 0.2208508811891079,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53125,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 534.640625,
"epoch": 0.4914134742404227,
"grad_norm": 9.064754486083984,
"kl": 2.705078125,
"learning_rate": 6.408470841206545e-07,
"loss": 0.2999,
"reward": 1.015625,
"reward_std": 0.10510582849383354,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.515625,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 758.078125,
"epoch": 0.4940554821664465,
"grad_norm": 14.509212493896484,
"kl": 3.865234375,
"learning_rate": 6.367708876704476e-07,
"loss": 0.494,
"reward": 1.02734375,
"reward_std": 0.20098446309566498,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 387.09375,
"epoch": 0.4966974900924703,
"grad_norm": 14.154923439025879,
"kl": 2.201171875,
"learning_rate": 6.326872830174566e-07,
"loss": 0.1712,
"reward": 1.0859375,
"reward_std": 0.19368236511945724,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 423.234375,
"epoch": 0.49933949801849403,
"grad_norm": 17.86855125427246,
"kl": 2.376953125,
"learning_rate": 6.285966188059355e-07,
"loss": 0.6533,
"reward": 1.09375,
"reward_std": 0.2263101488351822,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 512.125,
"epoch": 0.5019815059445178,
"grad_norm": 8.82755184173584,
"kl": 3.04296875,
"learning_rate": 6.244992442828585e-07,
"loss": 0.3686,
"reward": 0.7734375,
"reward_std": 0.1519293300807476,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 560.03125,
"epoch": 0.5046235138705416,
"grad_norm": 15.707466125488281,
"kl": 3.029296875,
"learning_rate": 6.203955092681039e-07,
"loss": 0.3194,
"reward": 1.0703125,
"reward_std": 0.1986095793545246,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 768.546875,
"epoch": 0.5072655217965654,
"grad_norm": 11.438809394836426,
"kl": 2.88671875,
"learning_rate": 6.162857641245869e-07,
"loss": 0.6017,
"reward": 1.28125,
"reward_std": 0.21250617876648903,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53125,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 408.859375,
"epoch": 0.5099075297225891,
"grad_norm": 5.250596523284912,
"kl": 1.41015625,
"learning_rate": 6.12170359728347e-07,
"loss": 0.2562,
"reward": 1.33203125,
"reward_std": 0.20339645817875862,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 460.921875,
"epoch": 0.512549537648613,
"grad_norm": 8.758655548095703,
"kl": 1.962890625,
"learning_rate": 6.080496474385916e-07,
"loss": 0.34,
"reward": 0.79296875,
"reward_std": 0.19175675138831139,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.54296875,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 426.734375,
"epoch": 0.5151915455746368,
"grad_norm": 13.022716522216797,
"kl": 1.361328125,
"learning_rate": 6.039239790676974e-07,
"loss": 0.49,
"reward": 1.1484375,
"reward_std": 0.2307521291077137,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6484375,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 519.953125,
"epoch": 0.5178335535006605,
"grad_norm": 14.834174156188965,
"kl": 2.318359375,
"learning_rate": 5.997937068511754e-07,
"loss": 0.1528,
"reward": 1.06640625,
"reward_std": 0.14010578021407127,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 568.75,
"epoch": 0.5204755614266843,
"grad_norm": 10.123536109924316,
"kl": 2.12109375,
"learning_rate": 5.956591834175964e-07,
"loss": 0.5013,
"reward": 1.31640625,
"reward_std": 0.21957488358020782,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 658.484375,
"epoch": 0.523117569352708,
"grad_norm": 6.424520015716553,
"kl": 3.1796875,
"learning_rate": 5.915207617584858e-07,
"loss": 0.4787,
"reward": 1.3125,
"reward_std": 0.22040452808141708,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 594.515625,
"epoch": 0.5257595772787318,
"grad_norm": 5.053133010864258,
"kl": 2.666015625,
"learning_rate": 5.873787951981868e-07,
"loss": 0.4661,
"reward": 0.75390625,
"reward_std": 0.17793777957558632,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.50390625,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 339.03125,
"epoch": 0.5284015852047557,
"grad_norm": 4.2198944091796875,
"kl": 1.740234375,
"learning_rate": 5.832336373636933e-07,
"loss": 0.3366,
"reward": 1.28515625,
"reward_std": 0.17389780096709728,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53515625,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 468.40625,
"epoch": 0.5310435931307794,
"grad_norm": 2.89648175239563,
"kl": 1.6396484375,
"learning_rate": 5.790856421544598e-07,
"loss": 0.3048,
"reward": 1.5859375,
"reward_std": 0.19600137695670128,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 475.765625,
"epoch": 0.5336856010568032,
"grad_norm": 6.781806468963623,
"kl": 2.189453125,
"learning_rate": 5.749351637121865e-07,
"loss": 0.3492,
"reward": 0.828125,
"reward_std": 0.20571819692850113,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 503.40625,
"epoch": 0.5363276089828269,
"grad_norm": 3.5012331008911133,
"kl": 2.72265625,
"learning_rate": 5.707825563905828e-07,
"loss": 0.4152,
"reward": 1.30078125,
"reward_std": 0.17533257603645325,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 374.390625,
"epoch": 0.5389696169088507,
"grad_norm": 16.517194747924805,
"kl": 1.6416015625,
"learning_rate": 5.666281747251153e-07,
"loss": 0.4345,
"reward": 1.2890625,
"reward_std": 0.18729007616639137,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5390625,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 344.546875,
"epoch": 0.5416116248348745,
"grad_norm": 4.214947700500488,
"kl": 1.6279296875,
"learning_rate": 5.624723734027373e-07,
"loss": 0.3469,
"reward": 1.01171875,
"reward_std": 0.1350011769682169,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 640.53125,
"epoch": 0.5442536327608983,
"grad_norm": 4.432642936706543,
"kl": 2.634765625,
"learning_rate": 5.583155072316085e-07,
"loss": 0.3449,
"reward": 1.01953125,
"reward_std": 0.14237725362181664,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 346.4375,
"epoch": 0.5468956406869221,
"grad_norm": 6.426868915557861,
"kl": 2.21875,
"learning_rate": 5.541579311108009e-07,
"loss": 0.4081,
"reward": 1.33203125,
"reward_std": 0.20600395277142525,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 599.890625,
"epoch": 0.5495376486129459,
"grad_norm": 9.497568130493164,
"kl": 2.8671875,
"learning_rate": 5.5e-07,
"loss": 0.594,
"reward": 1.0390625,
"reward_std": 0.2189657799899578,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5390625,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 587.703125,
"epoch": 0.5521796565389696,
"grad_norm": 2.5981221199035645,
"kl": 2.576171875,
"learning_rate": 5.458420688891992e-07,
"loss": 0.3634,
"reward": 1.34765625,
"reward_std": 0.2173020839691162,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 251.59375,
"epoch": 0.5548216644649934,
"grad_norm": 12.541109085083008,
"kl": 1.94140625,
"learning_rate": 5.416844927683916e-07,
"loss": 0.482,
"reward": 1.33984375,
"reward_std": 0.22426774725317955,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 353.359375,
"epoch": 0.5574636723910171,
"grad_norm": 21.176788330078125,
"kl": 2.33203125,
"learning_rate": 5.375276265972627e-07,
"loss": 0.2879,
"reward": 1.05078125,
"reward_std": 0.18691154941916466,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 407.1875,
"epoch": 0.560105680317041,
"grad_norm": 4.283320903778076,
"kl": 2.701171875,
"learning_rate": 5.333718252748849e-07,
"loss": 0.3272,
"reward": 1.5546875,
"reward_std": 0.1786573100835085,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 391.40625,
"epoch": 0.5627476882430648,
"grad_norm": 7.3552470207214355,
"kl": 2.76953125,
"learning_rate": 5.292174436094172e-07,
"loss": 0.4091,
"reward": 1.05859375,
"reward_std": 0.19953873381018639,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55859375,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 599.421875,
"epoch": 0.5653896961690885,
"grad_norm": 7.531975746154785,
"kl": 4.07421875,
"learning_rate": 5.250648362878135e-07,
"loss": 0.6474,
"reward": 1.3359375,
"reward_std": 0.22002986446022987,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 442.046875,
"epoch": 0.5680317040951123,
"grad_norm": 9.658491134643555,
"kl": 2.96875,
"learning_rate": 5.209143578455401e-07,
"loss": 0.3931,
"reward": 1.31640625,
"reward_std": 0.21046040952205658,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 323.140625,
"epoch": 0.570673712021136,
"grad_norm": 16.756044387817383,
"kl": 2.46484375,
"learning_rate": 5.167663626363066e-07,
"loss": 0.1497,
"reward": 1.328125,
"reward_std": 0.19799000024795532,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 485.28125,
"epoch": 0.5733157199471598,
"grad_norm": 14.802947998046875,
"kl": 2.94921875,
"learning_rate": 5.126212048018133e-07,
"loss": 0.3226,
"reward": 0.5546875,
"reward_std": 0.17373281717300415,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 553.296875,
"epoch": 0.5759577278731837,
"grad_norm": 6.547313213348389,
"kl": 3.35546875,
"learning_rate": 5.084792382415141e-07,
"loss": 0.7209,
"reward": 0.5703125,
"reward_std": 0.20446551591157913,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 580.078125,
"epoch": 0.5785997357992074,
"grad_norm": 7.502042293548584,
"kl": 2.875,
"learning_rate": 5.043408165824037e-07,
"loss": 0.522,
"reward": 1.07421875,
"reward_std": 0.2559613697230816,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 291.125,
"epoch": 0.5812417437252312,
"grad_norm": 9.088134765625,
"kl": 1.806640625,
"learning_rate": 5.002062931488247e-07,
"loss": 0.5338,
"reward": 0.8046875,
"reward_std": 0.18990932404994965,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 533.640625,
"epoch": 0.583883751651255,
"grad_norm": 11.220687866210938,
"kl": 2.3984375,
"learning_rate": 4.960760209323026e-07,
"loss": 0.6041,
"reward": 0.5234375,
"reward_std": 0.19436774030327797,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 612.890625,
"epoch": 0.5865257595772787,
"grad_norm": 6.296652317047119,
"kl": 3.07421875,
"learning_rate": 4.919503525614086e-07,
"loss": 0.5521,
"reward": 0.76953125,
"reward_std": 0.18084516376256943,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51953125,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 265.609375,
"epoch": 0.5891677675033025,
"grad_norm": 3.475614309310913,
"kl": 1.50390625,
"learning_rate": 4.878296402716531e-07,
"loss": 0.2643,
"reward": 1.38671875,
"reward_std": 0.20747815072536469,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.63671875,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 714.1875,
"epoch": 0.5918097754293263,
"grad_norm": 6.395312786102295,
"kl": 3.357421875,
"learning_rate": 4.837142358754131e-07,
"loss": 0.6176,
"reward": 1.2734375,
"reward_std": 0.21194355189800262,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 414.859375,
"epoch": 0.5944517833553501,
"grad_norm": 6.891757488250732,
"kl": 2.8984375,
"learning_rate": 4.79604490731896e-07,
"loss": 0.42,
"reward": 1.06640625,
"reward_std": 0.2256414033472538,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 581.984375,
"epoch": 0.5970937912813739,
"grad_norm": 7.385695934295654,
"kl": 3.4140625,
"learning_rate": 4.755007557171414e-07,
"loss": 0.6208,
"reward": 1.05078125,
"reward_std": 0.19489648565649986,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 346.296875,
"epoch": 0.5997357992073976,
"grad_norm": 5.268566608428955,
"kl": 2.427734375,
"learning_rate": 4.7140338119406455e-07,
"loss": 0.3306,
"reward": 1.109375,
"reward_std": 0.22719038277864456,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 454.625,
"epoch": 0.6023778071334214,
"grad_norm": 11.538866996765137,
"kl": 2.423828125,
"learning_rate": 4.6731271698254326e-07,
"loss": 0.664,
"reward": 1.109375,
"reward_std": 0.21347813308238983,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 398.4375,
"epoch": 0.6050198150594451,
"grad_norm": 10.027405738830566,
"kl": 2.166015625,
"learning_rate": 4.632291123295524e-07,
"loss": 0.3504,
"reward": 1.3125,
"reward_std": 0.2073436863720417,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 778.328125,
"epoch": 0.607661822985469,
"grad_norm": 8.903005599975586,
"kl": 4.5234375,
"learning_rate": 4.5915291587934547e-07,
"loss": 0.6184,
"reward": 1.0234375,
"reward_std": 0.21458512544631958,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 496.296875,
"epoch": 0.6103038309114928,
"grad_norm": 19.55433464050293,
"kl": 4.23046875,
"learning_rate": 4.5508447564368856e-07,
"loss": 0.6321,
"reward": 1.33984375,
"reward_std": 0.22301983460783958,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 686.265625,
"epoch": 0.6129458388375165,
"grad_norm": 6.192388534545898,
"kl": 3.7890625,
"learning_rate": 4.510241389721493e-07,
"loss": 0.5918,
"reward": 1.5859375,
"reward_std": 0.2616988569498062,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 540.578125,
"epoch": 0.6155878467635403,
"grad_norm": 7.43271017074585,
"kl": 3.13671875,
"learning_rate": 4.4697225252243976e-07,
"loss": 0.6237,
"reward": 1.3515625,
"reward_std": 0.24065708369016647,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 521.265625,
"epoch": 0.618229854689564,
"grad_norm": 7.898358345031738,
"kl": 2.81640625,
"learning_rate": 4.4292916223082165e-07,
"loss": 0.5285,
"reward": 1.3046875,
"reward_std": 0.2356991246342659,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 493.28125,
"epoch": 0.6208718626155878,
"grad_norm": 10.038056373596191,
"kl": 2.90234375,
"learning_rate": 4.388952132825701e-07,
"loss": 0.2489,
"reward": 1.140625,
"reward_std": 0.2295135334134102,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.640625,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 431.984375,
"epoch": 0.6235138705416117,
"grad_norm": 4.178317546844482,
"kl": 2.68359375,
"learning_rate": 4.3487075008250397e-07,
"loss": 0.4859,
"reward": 0.79296875,
"reward_std": 0.2021397091448307,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.54296875,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 698.25,
"epoch": 0.6261558784676354,
"grad_norm": 7.887820243835449,
"kl": 4.2421875,
"learning_rate": 4.3085611622558084e-07,
"loss": 0.6169,
"reward": 1.28125,
"reward_std": 0.21125948429107666,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53125,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 548.328125,
"epoch": 0.6287978863936592,
"grad_norm": 5.685881614685059,
"kl": 2.59375,
"learning_rate": 4.268516544675628e-07,
"loss": 0.3334,
"reward": 1.0625,
"reward_std": 0.20200148969888687,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 414.75,
"epoch": 0.631439894319683,
"grad_norm": 11.868870735168457,
"kl": 2.0859375,
"learning_rate": 4.228577066957522e-07,
"loss": 0.1258,
"reward": 1.3671875,
"reward_std": 0.22833861783146858,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 599.859375,
"epoch": 0.6340819022457067,
"grad_norm": 5.297094345092773,
"kl": 2.8125,
"learning_rate": 4.1887461389980394e-07,
"loss": 0.3444,
"reward": 1.046875,
"reward_std": 0.22738776728510857,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 460.609375,
"epoch": 0.6367239101717305,
"grad_norm": 9.069931983947754,
"kl": 2.166015625,
"learning_rate": 4.149027161426113e-07,
"loss": 0.5227,
"reward": 1.34375,
"reward_std": 0.21560321748256683,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 753.9375,
"epoch": 0.6393659180977543,
"grad_norm": 3.11356258392334,
"kl": 2.849609375,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.4795,
"reward": 1.046875,
"reward_std": 0.20162740349769592,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 626.859375,
"epoch": 0.6420079260237781,
"grad_norm": 4.849280834197998,
"kl": 2.39453125,
"learning_rate": 4.069938611881443e-07,
"loss": 0.5037,
"reward": 0.796875,
"reward_std": 0.18199804052710533,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 552.296875,
"epoch": 0.6446499339498019,
"grad_norm": 5.1860456466674805,
"kl": 2.4404296875,
"learning_rate": 4.030575792219626e-07,
"loss": 0.3665,
"reward": 1.296875,
"reward_std": 0.1943066604435444,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 638.5625,
"epoch": 0.6472919418758256,
"grad_norm": 9.586490631103516,
"kl": 2.599609375,
"learning_rate": 3.9913384269907293e-07,
"loss": 0.2958,
"reward": 1.33203125,
"reward_std": 0.22680200263857841,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 608.640625,
"epoch": 0.6499339498018494,
"grad_norm": 7.131601810455322,
"kl": 2.166015625,
"learning_rate": 3.952229866147323e-07,
"loss": 0.2385,
"reward": 1.375,
"reward_std": 0.2418774701654911,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.625,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 661.015625,
"epoch": 0.6525759577278731,
"grad_norm": 5.848790645599365,
"kl": 2.306640625,
"learning_rate": 3.913253448645103e-07,
"loss": 0.4711,
"reward": 1.08203125,
"reward_std": 0.22584940120577812,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 531.96875,
"epoch": 0.655217965653897,
"grad_norm": 5.778437614440918,
"kl": 1.859375,
"learning_rate": 3.8744125021578123e-07,
"loss": 0.3466,
"reward": 1.2734375,
"reward_std": 0.1622530035674572,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 525.40625,
"epoch": 0.6578599735799208,
"grad_norm": 3.1933047771453857,
"kl": 1.833984375,
"learning_rate": 3.835710342793139e-07,
"loss": 0.2862,
"reward": 1.30078125,
"reward_std": 0.15551739931106567,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 480.75,
"epoch": 0.6605019815059445,
"grad_norm": 8.949792861938477,
"kl": 1.197265625,
"learning_rate": 3.797150274809604e-07,
"loss": 0.326,
"reward": 1.3359375,
"reward_std": 0.2217497080564499,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 428.203125,
"epoch": 0.6631439894319683,
"grad_norm": 3.1499345302581787,
"kl": 1.2763671875,
"learning_rate": 3.7587355903344466e-07,
"loss": 0.1597,
"reward": 0.875,
"reward_std": 0.21982388943433762,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.625,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 369.6875,
"epoch": 0.665785997357992,
"grad_norm": 4.168592929840088,
"kl": 1.3583984375,
"learning_rate": 3.7204695690825593e-07,
"loss": 0.1939,
"reward": 1.28125,
"reward_std": 0.1477414984256029,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53125,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 544.015625,
"epoch": 0.6684280052840158,
"grad_norm": 7.520803451538086,
"kl": 1.921875,
"learning_rate": 3.682355478076473e-07,
"loss": 0.2638,
"reward": 0.82421875,
"reward_std": 0.2656807042658329,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55859375,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 561.34375,
"epoch": 0.6710700132100397,
"grad_norm": 6.172038555145264,
"kl": 2.318359375,
"learning_rate": 3.6443965713674354e-07,
"loss": 0.3545,
"reward": 1.02734375,
"reward_std": 0.19002593867480755,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 521.8125,
"epoch": 0.6737120211360634,
"grad_norm": 6.321176528930664,
"kl": 1.609375,
"learning_rate": 3.606596089757583e-07,
"loss": 0.3466,
"reward": 1.58984375,
"reward_std": 0.2514568492770195,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 307.46875,
"epoch": 0.6763540290620872,
"grad_norm": 4.846172332763672,
"kl": 1.09765625,
"learning_rate": 3.5689572605232597e-07,
"loss": 0.2335,
"reward": 1.3359375,
"reward_std": 0.20273161679506302,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 468.5,
"epoch": 0.678996036988111,
"grad_norm": 12.14126968383789,
"kl": 1.138671875,
"learning_rate": 3.531483297139481e-07,
"loss": 0.1721,
"reward": 0.80078125,
"reward_std": 0.1630447916686535,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 565.203125,
"epoch": 0.6816380449141347,
"grad_norm": 3.9592182636260986,
"kl": 1.837890625,
"learning_rate": 3.4941773990055777e-07,
"loss": 0.2977,
"reward": 1.10546875,
"reward_std": 0.25015248730778694,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.60546875,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 480.484375,
"epoch": 0.6842800528401585,
"grad_norm": 9.579623222351074,
"kl": 1.62109375,
"learning_rate": 3.45704275117204e-07,
"loss": 0.4312,
"reward": 1.08203125,
"reward_std": 0.24054544791579247,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 496.796875,
"epoch": 0.6869220607661823,
"grad_norm": 4.918056964874268,
"kl": 1.14306640625,
"learning_rate": 3.4200825240685914e-07,
"loss": 0.1878,
"reward": 1.1015625,
"reward_std": 0.22064152732491493,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 515.96875,
"epoch": 0.6895640686922061,
"grad_norm": 11.338505744934082,
"kl": 1.765625,
"learning_rate": 3.3832998732335085e-07,
"loss": 0.4868,
"reward": 1.0859375,
"reward_std": 0.21507646515965462,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 579.875,
"epoch": 0.6922060766182299,
"grad_norm": 10.862038612365723,
"kl": 2.357421875,
"learning_rate": 3.346697939044211e-07,
"loss": 0.6303,
"reward": 0.77734375,
"reward_std": 0.20420579984784126,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.52734375,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 641.796875,
"epoch": 0.6948480845442536,
"grad_norm": 7.440125465393066,
"kl": 2.716796875,
"learning_rate": 3.310279846449147e-07,
"loss": 0.5692,
"reward": 0.83203125,
"reward_std": 0.2302125133574009,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 490.625,
"epoch": 0.6974900924702774,
"grad_norm": 11.042434692382812,
"kl": 1.890625,
"learning_rate": 3.2740487047009954e-07,
"loss": 0.575,
"reward": 0.8203125,
"reward_std": 0.21583595871925354,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 535.375,
"epoch": 0.7001321003963011,
"grad_norm": 9.307427406311035,
"kl": 1.8515625,
"learning_rate": 3.23800760709121e-07,
"loss": 0.2549,
"reward": 1.0625,
"reward_std": 0.19687864929437637,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 573.34375,
"epoch": 0.702774108322325,
"grad_norm": 4.253864765167236,
"kl": 2.693359375,
"learning_rate": 3.2021596306859195e-07,
"loss": 0.4737,
"reward": 0.8125,
"reward_std": 0.1992315910756588,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 648.3125,
"epoch": 0.7054161162483488,
"grad_norm": 7.490243911743164,
"kl": 3.2275390625,
"learning_rate": 3.1665078360632254e-07,
"loss": 0.377,
"reward": 1.078125,
"reward_std": 0.22863000631332397,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 489.296875,
"epoch": 0.7080581241743725,
"grad_norm": 4.917722702026367,
"kl": 2.056640625,
"learning_rate": 3.1310552670518987e-07,
"loss": 0.3075,
"reward": 1.12109375,
"reward_std": 0.23855430632829666,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.62109375,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 450.921875,
"epoch": 0.7107001321003963,
"grad_norm": 3.3728554248809814,
"kl": 2.087890625,
"learning_rate": 3.0958049504715024e-07,
"loss": 0.3534,
"reward": 1.07421875,
"reward_std": 0.20587731152772903,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 550.140625,
"epoch": 0.71334214002642,
"grad_norm": 6.581082344055176,
"kl": 2.974609375,
"learning_rate": 3.0607598958739777e-07,
"loss": 0.3513,
"reward": 1.08203125,
"reward_std": 0.21218016743659973,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 666.0625,
"epoch": 0.7159841479524438,
"grad_norm": 3.782729387283325,
"kl": 3.47265625,
"learning_rate": 3.0259230952866976e-07,
"loss": 0.5161,
"reward": 0.8515625,
"reward_std": 0.266521442681551,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 589.734375,
"epoch": 0.7186261558784677,
"grad_norm": 12.191798210144043,
"kl": 2.857421875,
"learning_rate": 2.991297522957015e-07,
"loss": 0.257,
"reward": 1.05859375,
"reward_std": 0.1889869049191475,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55859375,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 476.515625,
"epoch": 0.7212681638044914,
"grad_norm": 5.739687442779541,
"kl": 2.828125,
"learning_rate": 2.9568861350983365e-07,
"loss": 0.3424,
"reward": 0.578125,
"reward_std": 0.20889347046613693,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 606.921875,
"epoch": 0.7239101717305152,
"grad_norm": 8.41596794128418,
"kl": 2.6015625,
"learning_rate": 2.922691869637727e-07,
"loss": 0.2616,
"reward": 1.1171875,
"reward_std": 0.24007226526737213,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 491.59375,
"epoch": 0.726552179656539,
"grad_norm": 4.1023335456848145,
"kl": 1.966796875,
"learning_rate": 2.88871764596508e-07,
"loss": 0.2751,
"reward": 1.3515625,
"reward_std": 0.2043364755809307,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 565.84375,
"epoch": 0.7291941875825627,
"grad_norm": 5.3786540031433105,
"kl": 2.720703125,
"learning_rate": 2.854966364683872e-07,
"loss": 0.3457,
"reward": 0.828125,
"reward_std": 0.20211807265877724,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 663.78125,
"epoch": 0.7318361955085865,
"grad_norm": 4.460934638977051,
"kl": 3.201171875,
"learning_rate": 2.821440907363516e-07,
"loss": 0.4525,
"reward": 0.8203125,
"reward_std": 0.23223434761166573,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 641.265625,
"epoch": 0.7344782034346103,
"grad_norm": 16.07205581665039,
"kl": 2.826171875,
"learning_rate": 2.7881441362933464e-07,
"loss": 0.334,
"reward": 1.0625,
"reward_std": 0.19014282897114754,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 676.0625,
"epoch": 0.7371202113606341,
"grad_norm": 11.935088157653809,
"kl": 2.81640625,
"learning_rate": 2.755078894238245e-07,
"loss": 0.23,
"reward": 0.78515625,
"reward_std": 0.20001451671123505,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53515625,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 603.546875,
"epoch": 0.7397622192866579,
"grad_norm": 9.738125801086426,
"kl": 2.033203125,
"learning_rate": 2.722248004195932e-07,
"loss": 0.2735,
"reward": 1.09375,
"reward_std": 0.20607677102088928,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 732.6875,
"epoch": 0.7424042272126816,
"grad_norm": 7.031618118286133,
"kl": 2.41015625,
"learning_rate": 2.689654269155955e-07,
"loss": 0.2994,
"reward": 0.82421875,
"reward_std": 0.20312216132879257,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 578.875,
"epoch": 0.7450462351387054,
"grad_norm": 5.801688194274902,
"kl": 1.40234375,
"learning_rate": 2.657300471860372e-07,
"loss": 0.2932,
"reward": 1.05078125,
"reward_std": 0.20492718927562237,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55078125,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 721.109375,
"epoch": 0.7476882430647291,
"grad_norm": 11.897012710571289,
"kl": 2.43359375,
"learning_rate": 2.625189374566175e-07,
"loss": 0.5936,
"reward": 0.7578125,
"reward_std": 0.15211578272283077,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 641.53125,
"epoch": 0.750330250990753,
"grad_norm": 5.453853130340576,
"kl": 1.376953125,
"learning_rate": 2.593323718809458e-07,
"loss": 0.3039,
"reward": 1.3671875,
"reward_std": 0.2303219847381115,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 596.28125,
"epoch": 0.7529722589167768,
"grad_norm": 5.665752410888672,
"kl": 1.35546875,
"learning_rate": 2.561706225171352e-07,
"loss": 0.3616,
"reward": 1.04296875,
"reward_std": 0.17159553244709969,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.54296875,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 536.78125,
"epoch": 0.7556142668428005,
"grad_norm": 3.726806879043579,
"kl": 1.5693359375,
"learning_rate": 2.5303395930457494e-07,
"loss": 0.2881,
"reward": 1.3203125,
"reward_std": 0.2022528052330017,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 495.984375,
"epoch": 0.7582562747688243,
"grad_norm": 3.6658847332000732,
"kl": 1.1884765625,
"learning_rate": 2.499226500408845e-07,
"loss": 0.1181,
"reward": 1.1171875,
"reward_std": 0.1793758161365986,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 859.984375,
"epoch": 0.760898282694848,
"grad_norm": 4.845893383026123,
"kl": 2.955078125,
"learning_rate": 2.4683696035904926e-07,
"loss": 0.4852,
"reward": 1.0078125,
"reward_std": 0.1604960411787033,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5078125,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 440.8125,
"epoch": 0.7635402906208718,
"grad_norm": 2.4910755157470703,
"kl": 0.85302734375,
"learning_rate": 2.437771537047423e-07,
"loss": 0.3161,
"reward": 1.07421875,
"reward_std": 0.2174788936972618,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 669.734375,
"epoch": 0.7661822985468957,
"grad_norm": 4.620151519775391,
"kl": 1.90234375,
"learning_rate": 2.407434913138318e-07,
"loss": 0.3675,
"reward": 0.5859375,
"reward_std": 0.22324015572667122,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 531.953125,
"epoch": 0.7688243064729194,
"grad_norm": 11.40556526184082,
"kl": 1.4501953125,
"learning_rate": 2.377362321900777e-07,
"loss": 0.0233,
"reward": 1.36328125,
"reward_std": 0.21594615280628204,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 696.34375,
"epoch": 0.7714663143989432,
"grad_norm": 3.5709707736968994,
"kl": 1.853515625,
"learning_rate": 2.3475563308301908e-07,
"loss": 0.2536,
"reward": 0.84765625,
"reward_std": 0.20635812729597092,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 604.40625,
"epoch": 0.774108322324967,
"grad_norm": 6.535892486572266,
"kl": 1.3740234375,
"learning_rate": 2.3180194846605364e-07,
"loss": 0.1969,
"reward": 1.1171875,
"reward_std": 0.23528173938393593,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 703.734375,
"epoch": 0.7767503302509907,
"grad_norm": 6.631422996520996,
"kl": 2.017578125,
"learning_rate": 2.288754305147115e-07,
"loss": 0.3918,
"reward": 1.296875,
"reward_std": 0.20271231979131699,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 550.125,
"epoch": 0.7793923381770145,
"grad_norm": 5.805858612060547,
"kl": 1.369140625,
"learning_rate": 2.259763290851255e-07,
"loss": 0.3276,
"reward": 1.0625,
"reward_std": 0.18768509849905968,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 776.859375,
"epoch": 0.7820343461030383,
"grad_norm": 6.1796135902404785,
"kl": 2.36328125,
"learning_rate": 2.231048916926992e-07,
"loss": 0.2911,
"reward": 1.3203125,
"reward_std": 0.2180866338312626,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 459.90625,
"epoch": 0.7846763540290621,
"grad_norm": 4.840709686279297,
"kl": 1.15234375,
"learning_rate": 2.2026136349097495e-07,
"loss": 0.2601,
"reward": 0.86328125,
"reward_std": 0.21641594916582108,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 626.34375,
"epoch": 0.7873183619550859,
"grad_norm": 4.876105308532715,
"kl": 2.0615234375,
"learning_rate": 2.1744598725070347e-07,
"loss": 0.403,
"reward": 1.28515625,
"reward_std": 0.17794826440513134,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53515625,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 560.78125,
"epoch": 0.7899603698811096,
"grad_norm": 5.7457451820373535,
"kl": 1.310546875,
"learning_rate": 2.146590033391168e-07,
"loss": 0.259,
"reward": 1.32421875,
"reward_std": 0.20343545079231262,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 666.5625,
"epoch": 0.7926023778071334,
"grad_norm": 4.766579627990723,
"kl": 1.6201171875,
"learning_rate": 2.11900649699407e-07,
"loss": 0.1752,
"reward": 1.109375,
"reward_std": 0.2358247935771942,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 631.734375,
"epoch": 0.7952443857331571,
"grad_norm": 3.2293262481689453,
"kl": 1.62890625,
"learning_rate": 2.0917116183041074e-07,
"loss": 0.2575,
"reward": 1.33984375,
"reward_std": 0.22996815666556358,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 740.484375,
"epoch": 0.797886393659181,
"grad_norm": 3.1481125354766846,
"kl": 2.294921875,
"learning_rate": 2.0647077276650366e-07,
"loss": 0.3915,
"reward": 0.828125,
"reward_std": 0.22289753332734108,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 472.625,
"epoch": 0.8005284015852048,
"grad_norm": 14.101240158081055,
"kl": 1.4130859375,
"learning_rate": 2.037997130577045e-07,
"loss": 0.5247,
"reward": 0.86328125,
"reward_std": 0.24362235516309738,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 579.9375,
"epoch": 0.8031704095112285,
"grad_norm": 2.720280885696411,
"kl": 1.720703125,
"learning_rate": 2.0115821074999156e-07,
"loss": 0.2849,
"reward": 1.3359375,
"reward_std": 0.21295345574617386,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 592.1875,
"epoch": 0.8058124174372523,
"grad_norm": 4.275804042816162,
"kl": 1.8828125,
"learning_rate": 1.9854649136583307e-07,
"loss": 0.3054,
"reward": 1.09765625,
"reward_std": 0.222886573523283,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 468.53125,
"epoch": 0.808454425363276,
"grad_norm": 5.911637306213379,
"kl": 1.4951171875,
"learning_rate": 1.9596477788493254e-07,
"loss": 0.2116,
"reward": 1.109375,
"reward_std": 0.2025398500263691,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 548.859375,
"epoch": 0.8110964332892999,
"grad_norm": 5.387912273406982,
"kl": 1.599609375,
"learning_rate": 1.9341329072519176e-07,
"loss": 0.351,
"reward": 0.6171875,
"reward_std": 0.22198385372757912,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 573.5625,
"epoch": 0.8137384412153237,
"grad_norm": 5.202173709869385,
"kl": 1.78125,
"learning_rate": 1.9089224772389223e-07,
"loss": 0.3517,
"reward": 1.09375,
"reward_std": 0.23804370686411858,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 652.28125,
"epoch": 0.8163804491413474,
"grad_norm": 4.832318305969238,
"kl": 1.6396484375,
"learning_rate": 1.884018641190968e-07,
"loss": 0.2776,
"reward": 1.69921875,
"reward_std": 0.27570171654224396,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.69921875,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 653.1875,
"epoch": 0.8190224570673712,
"grad_norm": 5.5447211265563965,
"kl": 2.037109375,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.247,
"reward": 1.046875,
"reward_std": 0.21413858234882355,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 555.765625,
"epoch": 0.821664464993395,
"grad_norm": 10.55873966217041,
"kl": 2.12109375,
"learning_rate": 1.8351392294514326e-07,
"loss": 0.4554,
"reward": 1.2890625,
"reward_std": 0.15378709696233273,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5390625,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 454.5625,
"epoch": 0.8243064729194187,
"grad_norm": 2.300844669342041,
"kl": 1.0029296875,
"learning_rate": 1.8111678269175055e-07,
"loss": 0.1514,
"reward": 1.11328125,
"reward_std": 0.2071386780589819,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 672.796875,
"epoch": 0.8269484808454426,
"grad_norm": 5.112921237945557,
"kl": 2.4970703125,
"learning_rate": 1.78751136430764e-07,
"loss": 0.4767,
"reward": 1.078125,
"reward_std": 0.20955145359039307,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 444.484375,
"epoch": 0.8295904887714664,
"grad_norm": 4.7589569091796875,
"kl": 1.42578125,
"learning_rate": 1.7641718613300228e-07,
"loss": 0.2688,
"reward": 0.640625,
"reward_std": 0.238662201911211,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.640625,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 814.015625,
"epoch": 0.8322324966974901,
"grad_norm": 10.08535385131836,
"kl": 3.3828125,
"learning_rate": 1.7411513106319058e-07,
"loss": 0.3937,
"reward": 0.78125,
"reward_std": 0.20346562936902046,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53125,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 613.03125,
"epoch": 0.8348745046235139,
"grad_norm": 12.75075912475586,
"kl": 2.302734375,
"learning_rate": 1.7184516776294832e-07,
"loss": 0.2161,
"reward": 0.8828125,
"reward_std": 0.26399971544742584,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.015625,
"rewards/tag_count_reward": 0.6171875,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 529.59375,
"epoch": 0.8375165125495376,
"grad_norm": 9.653738975524902,
"kl": 1.8046875,
"learning_rate": 1.6960749003400892e-07,
"loss": 0.1588,
"reward": 0.84375,
"reward_std": 0.16583861783146858,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 583.625,
"epoch": 0.8401585204755614,
"grad_norm": 4.075193405151367,
"kl": 1.640625,
"learning_rate": 1.674022889216737e-07,
"loss": 0.1898,
"reward": 1.3125,
"reward_std": 0.1740352250635624,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 669.0625,
"epoch": 0.8428005284015853,
"grad_norm": 4.472336292266846,
"kl": 2.494140625,
"learning_rate": 1.6522975269850104e-07,
"loss": 0.3193,
"reward": 0.85546875,
"reward_std": 0.21766092255711555,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.60546875,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 721.015625,
"epoch": 0.845442536327609,
"grad_norm": 6.250655174255371,
"kl": 3.150390625,
"learning_rate": 1.6309006684823239e-07,
"loss": 0.5334,
"reward": 1.0234375,
"reward_std": 0.1688866000622511,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 675.921875,
"epoch": 0.8480845442536328,
"grad_norm": 1.8639191389083862,
"kl": 2.427734375,
"learning_rate": 1.6098341404995647e-07,
"loss": 0.3932,
"reward": 0.62890625,
"reward_std": 0.24960599094629288,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.62890625,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 600.421875,
"epoch": 0.8507265521796565,
"grad_norm": 4.137293338775635,
"kl": 2.146484375,
"learning_rate": 1.5890997416251224e-07,
"loss": 0.351,
"reward": 1.04296875,
"reward_std": 0.1972101591527462,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.54296875,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 565.4375,
"epoch": 0.8533685601056803,
"grad_norm": 10.063258171081543,
"kl": 1.478515625,
"learning_rate": 1.5686992420913372e-07,
"loss": 0.0225,
"reward": 0.86328125,
"reward_std": 0.2034553661942482,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 647.328125,
"epoch": 0.8560105680317041,
"grad_norm": 9.994471549987793,
"kl": 2.05859375,
"learning_rate": 1.5486343836233595e-07,
"loss": 0.2504,
"reward": 1.328125,
"reward_std": 0.21247531473636627,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 690.234375,
"epoch": 0.8586525759577279,
"grad_norm": 9.103864669799805,
"kl": 2.4921875,
"learning_rate": 1.5289068792904495e-07,
"loss": 0.483,
"reward": 0.82421875,
"reward_std": 0.2072843722999096,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 732.96875,
"epoch": 0.8612945838837517,
"grad_norm": 7.12535285949707,
"kl": 1.994140625,
"learning_rate": 1.5095184133597217e-07,
"loss": 0.4435,
"reward": 1.08984375,
"reward_std": 0.2667161263525486,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 402.9375,
"epoch": 0.8639365918097754,
"grad_norm": 12.984781265258789,
"kl": 1.0556640625,
"learning_rate": 1.4904706411523448e-07,
"loss": 0.3994,
"reward": 1.32421875,
"reward_std": 0.18335551768541336,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 828.71875,
"epoch": 0.8665785997357992,
"grad_norm": 12.132417678833008,
"kl": 1.8466796875,
"learning_rate": 1.47176518890222e-07,
"loss": 0.182,
"reward": 1.0390625,
"reward_std": 0.16892226040363312,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5390625,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 822.609375,
"epoch": 0.869220607661823,
"grad_norm": 4.648046493530273,
"kl": 2.0146484375,
"learning_rate": 1.453403653617135e-07,
"loss": 0.4329,
"reward": 0.796875,
"reward_std": 0.20767118781805038,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.546875,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 590.765625,
"epoch": 0.8718626155878467,
"grad_norm": 3.794019937515259,
"kl": 1.7001953125,
"learning_rate": 1.4353876029424202e-07,
"loss": 0.371,
"reward": 1.09375,
"reward_std": 0.216283418238163,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 712.96875,
"epoch": 0.8745046235138706,
"grad_norm": 5.229684352874756,
"kl": 2.5732421875,
"learning_rate": 1.4177185750271055e-07,
"loss": 0.3925,
"reward": 1.09375,
"reward_std": 0.23571135476231575,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 646.328125,
"epoch": 0.8771466314398944,
"grad_norm": 5.142683506011963,
"kl": 2.0380859375,
"learning_rate": 1.400398078392602e-07,
"loss": 0.4217,
"reward": 0.828125,
"reward_std": 0.2310670204460621,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 580.5,
"epoch": 0.8797886393659181,
"grad_norm": 9.393284797668457,
"kl": 1.46875,
"learning_rate": 1.3834275918039055e-07,
"loss": 0.3297,
"reward": 1.33984375,
"reward_std": 0.18817520886659622,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 584.625,
"epoch": 0.8824306472919419,
"grad_norm": 6.900231838226318,
"kl": 1.951171875,
"learning_rate": 1.3668085641433462e-07,
"loss": 0.2931,
"reward": 0.86328125,
"reward_std": 0.2518454007804394,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 430.796875,
"epoch": 0.8850726552179656,
"grad_norm": 9.600037574768066,
"kl": 1.091796875,
"learning_rate": 1.3505424142868897e-07,
"loss": 0.3829,
"reward": 1.41796875,
"reward_std": 0.23616278544068336,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.66796875,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 631.875,
"epoch": 0.8877146631439894,
"grad_norm": 5.003634929656982,
"kl": 1.6171875,
"learning_rate": 1.334630530982997e-07,
"loss": 0.2516,
"reward": 1.3046875,
"reward_std": 0.21555107831954956,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5546875,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 614.03125,
"epoch": 0.8903566710700133,
"grad_norm": 16.881690979003906,
"kl": 1.8984375,
"learning_rate": 1.319074272734056e-07,
"loss": 0.0975,
"reward": 1.08984375,
"reward_std": 0.19282393157482147,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 666.0,
"epoch": 0.892998678996037,
"grad_norm": 5.620565414428711,
"kl": 2.3154296875,
"learning_rate": 1.303874967680399e-07,
"loss": 0.2757,
"reward": 1.62109375,
"reward_std": 0.2326289601624012,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.62109375,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 476.203125,
"epoch": 0.8956406869220608,
"grad_norm": 5.114979267120361,
"kl": 1.1298828125,
"learning_rate": 1.289033913486914e-07,
"loss": 0.1405,
"reward": 1.0703125,
"reward_std": 0.1810067780315876,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 713.8125,
"epoch": 0.8982826948480845,
"grad_norm": 3.9009175300598145,
"kl": 2.587890625,
"learning_rate": 1.2745523772322461e-07,
"loss": 0.4324,
"reward": 1.31640625,
"reward_std": 0.1788315549492836,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 642.796875,
"epoch": 0.9009247027741083,
"grad_norm": 5.570927619934082,
"kl": 1.9873046875,
"learning_rate": 1.2604315953006266e-07,
"loss": 0.34,
"reward": 0.86328125,
"reward_std": 0.24456297606229782,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 637.578125,
"epoch": 0.9035667107001321,
"grad_norm": 8.186066627502441,
"kl": 1.923828125,
"learning_rate": 1.2466727732763125e-07,
"loss": 0.4781,
"reward": 0.8671875,
"reward_std": 0.23449090123176575,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6171875,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 775.796875,
"epoch": 0.9062087186261559,
"grad_norm": 5.553122043609619,
"kl": 3.125,
"learning_rate": 1.2332770858406538e-07,
"loss": 0.5849,
"reward": 0.78515625,
"reward_std": 0.21501468122005463,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.53515625,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 445.453125,
"epoch": 0.9088507265521797,
"grad_norm": 4.708739757537842,
"kl": 1.2822265625,
"learning_rate": 1.220245676671809e-07,
"loss": 0.1695,
"reward": 1.078125,
"reward_std": 0.15526169911026955,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 752.78125,
"epoch": 0.9114927344782034,
"grad_norm": 3.9118199348449707,
"kl": 1.9716796875,
"learning_rate": 1.2075796583470984e-07,
"loss": 0.3416,
"reward": 1.06640625,
"reward_std": 0.21211567521095276,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 651.375,
"epoch": 0.9141347424042272,
"grad_norm": 5.419198513031006,
"kl": 2.326171875,
"learning_rate": 1.1952801122480167e-07,
"loss": 0.2937,
"reward": 0.59765625,
"reward_std": 0.2001628838479519,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 662.203125,
"epoch": 0.916776750330251,
"grad_norm": 10.185606002807617,
"kl": 2.2119140625,
"learning_rate": 1.183348088467908e-07,
"loss": 0.2272,
"reward": 1.01171875,
"reward_std": 0.15968638472259045,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.51171875,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 476.359375,
"epoch": 0.9194187582562747,
"grad_norm": 5.287563323974609,
"kl": 1.537109375,
"learning_rate": 1.1717846057223143e-07,
"loss": 0.1921,
"reward": 0.60546875,
"reward_std": 0.22014086320996284,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.60546875,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 593.1875,
"epoch": 0.9220607661822986,
"grad_norm": 4.420534133911133,
"kl": 1.7568359375,
"learning_rate": 1.1605906512619983e-07,
"loss": 0.3432,
"reward": 1.3515625,
"reward_std": 0.23761418834328651,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 709.6875,
"epoch": 0.9247027741083224,
"grad_norm": 4.137857437133789,
"kl": 2.36328125,
"learning_rate": 1.1497671807886567e-07,
"loss": 0.3999,
"reward": 1.0703125,
"reward_std": 0.19854220747947693,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 447.515625,
"epoch": 0.9273447820343461,
"grad_norm": 5.883572578430176,
"kl": 1.359375,
"learning_rate": 1.139315118373326e-07,
"loss": 0.3009,
"reward": 0.859375,
"reward_std": 0.21957654133439064,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 535.09375,
"epoch": 0.9299867899603699,
"grad_norm": 9.422240257263184,
"kl": 1.3564453125,
"learning_rate": 1.1292353563774873e-07,
"loss": 0.3162,
"reward": 1.08984375,
"reward_std": 0.22193554788827896,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 603.21875,
"epoch": 0.9326287978863936,
"grad_norm": 4.772337913513184,
"kl": 2.2646484375,
"learning_rate": 1.1195287553768821e-07,
"loss": 0.2438,
"reward": 0.62890625,
"reward_std": 0.28237032890319824,
"rewards/accuracy_reward": 0.015625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.61328125,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 714.09375,
"epoch": 0.9352708058124174,
"grad_norm": 9.603926658630371,
"kl": 2.470703125,
"learning_rate": 1.1101961440880352e-07,
"loss": 0.3789,
"reward": 1.05859375,
"reward_std": 0.19248899817466736,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.55859375,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 628.40625,
"epoch": 0.9379128137384413,
"grad_norm": 16.06355857849121,
"kl": 2.0009765625,
"learning_rate": 1.1012383192975041e-07,
"loss": 0.0823,
"reward": 1.33203125,
"reward_std": 0.18909762054681778,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 578.390625,
"epoch": 0.940554821664465,
"grad_norm": 3.9636921882629395,
"kl": 1.8291015625,
"learning_rate": 1.0926560457938536e-07,
"loss": 0.2746,
"reward": 1.3125,
"reward_std": 0.2061732206493616,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5625,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 520.203125,
"epoch": 0.9431968295904888,
"grad_norm": 6.897830486297607,
"kl": 1.431640625,
"learning_rate": 1.084450056302357e-07,
"loss": 0.1525,
"reward": 0.83203125,
"reward_std": 0.21859385818243027,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 524.21875,
"epoch": 0.9458388375165125,
"grad_norm": 11.090557098388672,
"kl": 1.40234375,
"learning_rate": 1.0766210514224419e-07,
"loss": 0.0591,
"reward": 1.1328125,
"reward_std": 0.23101669549942017,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6328125,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 578.65625,
"epoch": 0.9484808454425363,
"grad_norm": 13.82530689239502,
"kl": 2.111328125,
"learning_rate": 1.0691696995678738e-07,
"loss": 0.2682,
"reward": 1.109375,
"reward_std": 0.22573107481002808,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 720.625,
"epoch": 0.9511228533685601,
"grad_norm": 6.005599021911621,
"kl": 2.166015625,
"learning_rate": 1.0620966369096884e-07,
"loss": 0.3217,
"reward": 1.34375,
"reward_std": 0.2211884669959545,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59375,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 664.5625,
"epoch": 0.9537648612945839,
"grad_norm": 2.9504928588867188,
"kl": 1.896484375,
"learning_rate": 1.0554024673218806e-07,
"loss": 0.3339,
"reward": 1.31640625,
"reward_std": 0.21037080883979797,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 719.90625,
"epoch": 0.9564068692206077,
"grad_norm": 3.942823886871338,
"kl": 1.5712890625,
"learning_rate": 1.0490877623298431e-07,
"loss": 0.3399,
"reward": 0.8515625,
"reward_std": 0.23859936743974686,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6015625,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 497.296875,
"epoch": 0.9590488771466315,
"grad_norm": 11.69743824005127,
"kl": 1.6708984375,
"learning_rate": 1.0431530610615772e-07,
"loss": 0.1801,
"reward": 1.37109375,
"reward_std": 0.20750074833631516,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.62109375,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 662.953125,
"epoch": 0.9616908850726552,
"grad_norm": 5.648345470428467,
"kl": 2.005859375,
"learning_rate": 1.0375988702016576e-07,
"loss": 0.3905,
"reward": 0.8203125,
"reward_std": 0.21815017238259315,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 556.640625,
"epoch": 0.964332892998679,
"grad_norm": 3.6928138732910156,
"kl": 1.544921875,
"learning_rate": 1.0324256639479797e-07,
"loss": 0.1847,
"reward": 1.3359375,
"reward_std": 0.2146303877234459,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5859375,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 528.46875,
"epoch": 0.9669749009247027,
"grad_norm": 4.1989336013793945,
"kl": 1.3134765625,
"learning_rate": 1.0276338839712688e-07,
"loss": 0.2739,
"reward": 0.859375,
"reward_std": 0.212420754134655,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 807.125,
"epoch": 0.9696169088507266,
"grad_norm": 5.855282306671143,
"kl": 2.8173828125,
"learning_rate": 1.023223939377375e-07,
"loss": 0.3144,
"reward": 0.83203125,
"reward_std": 0.2185688391327858,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58203125,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 711.046875,
"epoch": 0.9722589167767504,
"grad_norm": 6.813151836395264,
"kl": 1.77734375,
"learning_rate": 1.0191962066723448e-07,
"loss": 0.1714,
"reward": 1.3203125,
"reward_std": 0.18526797741651535,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5703125,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 835.46875,
"epoch": 0.9749009247027741,
"grad_norm": 4.6733317375183105,
"kl": 2.62109375,
"learning_rate": 1.0155510297302745e-07,
"loss": 0.4741,
"reward": 0.7265625,
"reward_std": 0.1361106839030981,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4765625,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 501.609375,
"epoch": 0.9775429326287979,
"grad_norm": 7.580297946929932,
"kl": 1.306640625,
"learning_rate": 1.0122887197639539e-07,
"loss": 0.106,
"reward": 0.8828125,
"reward_std": 0.21267065405845642,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6328125,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 730.546875,
"epoch": 0.9801849405548216,
"grad_norm": 2.7990424633026123,
"kl": 1.625,
"learning_rate": 1.0094095552982936e-07,
"loss": 0.1954,
"reward": 1.06640625,
"reward_std": 0.15350224822759628,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 634.125,
"epoch": 0.9828269484808454,
"grad_norm": 5.10625696182251,
"kl": 1.578125,
"learning_rate": 1.0069137821465474e-07,
"loss": 0.3279,
"reward": 1.59765625,
"reward_std": 0.24609044939279556,
"rewards/accuracy_reward": 1.0,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 521.46875,
"epoch": 0.9854689564068693,
"grad_norm": 2.8827366828918457,
"kl": 1.173828125,
"learning_rate": 1.0048016133893242e-07,
"loss": 0.2295,
"reward": 0.81640625,
"reward_std": 0.1789581961929798,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 528.515625,
"epoch": 0.988110964332893,
"grad_norm": 4.933093070983887,
"kl": 1.3515625,
"learning_rate": 1.0030732293563969e-07,
"loss": 0.1593,
"reward": 1.31640625,
"reward_std": 0.18777159228920937,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.56640625,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 486.390625,
"epoch": 0.9907529722589168,
"grad_norm": 5.345139980316162,
"kl": 1.306640625,
"learning_rate": 1.0017287776113066e-07,
"loss": 0.2942,
"reward": 1.34765625,
"reward_std": 0.23156387358903885,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.59765625,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 832.59375,
"epoch": 0.9933949801849405,
"grad_norm": 5.978093147277832,
"kl": 2.80859375,
"learning_rate": 1.0007683729387628e-07,
"loss": 0.562,
"reward": 0.7734375,
"reward_std": 0.20706837996840477,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5234375,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 601.359375,
"epoch": 0.9960369881109643,
"grad_norm": 4.996700763702393,
"kl": 1.537109375,
"learning_rate": 1.0001920973348446e-07,
"loss": 0.3616,
"reward": 1.33984375,
"reward_std": 0.2210528589785099,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.58984375,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 651.0499954223633,
"epoch": 0.9986789960369881,
"grad_norm": 10.63793659210205,
"kl": 1.486328125,
"learning_rate": 1e-07,
"loss": 0.16,
"reward": 1.32421875,
"reward_std": 0.1949087455868721,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.57421875,
"step": 378
},
{
"epoch": 0.9986789960369881,
"step": 378,
"total_flos": 0.0,
"train_loss": 0.3501640140083889,
"train_runtime": 20695.6892,
"train_samples_per_second": 0.073,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 378,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}