|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9986789960369881, |
|
"eval_steps": 500, |
|
"global_step": 378, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1179.875, |
|
"epoch": 0.002642007926023778, |
|
"grad_norm": 0.4997229278087616, |
|
"kl": 0.0, |
|
"learning_rate": 2.6315789473684208e-08, |
|
"loss": 0.2467, |
|
"reward": 1.19921875, |
|
"reward_std": 0.13141997903585434, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.44921875, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1057.625, |
|
"epoch": 0.005284015852047556, |
|
"grad_norm": 0.5586327910423279, |
|
"kl": 0.0, |
|
"learning_rate": 5.2631578947368416e-08, |
|
"loss": 0.3641, |
|
"reward": 0.95703125, |
|
"reward_std": 0.12062124721705914, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1275.578125, |
|
"epoch": 0.007926023778071334, |
|
"grad_norm": 0.5091319680213928, |
|
"kl": 0.0001016855239868164, |
|
"learning_rate": 7.894736842105262e-08, |
|
"loss": 0.3625, |
|
"reward": 0.4140625, |
|
"reward_std": 0.13219169899821281, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4140625, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 891.71875, |
|
"epoch": 0.010568031704095112, |
|
"grad_norm": 0.5754386782646179, |
|
"kl": 0.00015115737915039062, |
|
"learning_rate": 1.0526315789473683e-07, |
|
"loss": 0.3083, |
|
"reward": 0.99609375, |
|
"reward_std": 0.116029754281044, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.49609375, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1166.125, |
|
"epoch": 0.013210039630118891, |
|
"grad_norm": 0.5114976763725281, |
|
"kl": 0.00011730194091796875, |
|
"learning_rate": 1.3157894736842104e-07, |
|
"loss": 0.33, |
|
"reward": 0.9296875, |
|
"reward_std": 0.11507641524076462, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4296875, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1035.703125, |
|
"epoch": 0.015852047556142668, |
|
"grad_norm": 0.7084254026412964, |
|
"kl": 0.00015091896057128906, |
|
"learning_rate": 1.5789473684210525e-07, |
|
"loss": 0.3363, |
|
"reward": 0.7265625, |
|
"reward_std": 0.12440211698412895, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4765625, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 977.3125, |
|
"epoch": 0.018494055482166448, |
|
"grad_norm": 0.3633577525615692, |
|
"kl": 9.310245513916016e-05, |
|
"learning_rate": 1.8421052631578946e-07, |
|
"loss": 0.2085, |
|
"reward": 1.0, |
|
"reward_std": 0.13400040566921234, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 889.03125, |
|
"epoch": 0.021136063408190225, |
|
"grad_norm": 0.6363146901130676, |
|
"kl": 0.0001055002212524414, |
|
"learning_rate": 2.1052631578947366e-07, |
|
"loss": 0.3436, |
|
"reward": 0.984375, |
|
"reward_std": 0.11146603152155876, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1114.28125, |
|
"epoch": 0.023778071334214, |
|
"grad_norm": 0.6163086295127869, |
|
"kl": 0.00010448694229125977, |
|
"learning_rate": 2.3684210526315787e-07, |
|
"loss": 0.387, |
|
"reward": 0.45703125, |
|
"reward_std": 0.11941792443394661, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1325.578125, |
|
"epoch": 0.026420079260237782, |
|
"grad_norm": 0.45183688402175903, |
|
"kl": 0.00015163421630859375, |
|
"learning_rate": 2.631578947368421e-07, |
|
"loss": 0.304, |
|
"reward": 0.91015625, |
|
"reward_std": 0.12797221168875694, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.41015625, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1116.671875, |
|
"epoch": 0.02906208718626156, |
|
"grad_norm": 0.5506221055984497, |
|
"kl": 0.0001614093780517578, |
|
"learning_rate": 2.894736842105263e-07, |
|
"loss": 0.2958, |
|
"reward": 0.7109375, |
|
"reward_std": 0.1341523937880993, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1107.546875, |
|
"epoch": 0.031704095112285335, |
|
"grad_norm": 0.423910528421402, |
|
"kl": 0.000125885009765625, |
|
"learning_rate": 3.157894736842105e-07, |
|
"loss": 0.2614, |
|
"reward": 0.9609375, |
|
"reward_std": 0.11495335027575493, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1075.453125, |
|
"epoch": 0.034346103038309116, |
|
"grad_norm": 0.6421769857406616, |
|
"kl": 0.0001609325408935547, |
|
"learning_rate": 3.4210526315789473e-07, |
|
"loss": 0.3804, |
|
"reward": 0.70703125, |
|
"reward_std": 0.11874673143029213, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1325.046875, |
|
"epoch": 0.036988110964332896, |
|
"grad_norm": 0.5751165151596069, |
|
"kl": 0.00011897087097167969, |
|
"learning_rate": 3.684210526315789e-07, |
|
"loss": 0.3482, |
|
"reward": 0.9296875, |
|
"reward_std": 0.15341992676258087, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4296875, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1087.34375, |
|
"epoch": 0.03963011889035667, |
|
"grad_norm": 0.6110666394233704, |
|
"kl": 0.00010585784912109375, |
|
"learning_rate": 3.9473684210526315e-07, |
|
"loss": 0.3665, |
|
"reward": 0.95703125, |
|
"reward_std": 0.1287429742515087, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1314.15625, |
|
"epoch": 0.04227212681638045, |
|
"grad_norm": 0.5642758011817932, |
|
"kl": 0.00013065338134765625, |
|
"learning_rate": 4.2105263157894733e-07, |
|
"loss": 0.4046, |
|
"reward": 0.90625, |
|
"reward_std": 0.13578036427497864, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.40625, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1289.359375, |
|
"epoch": 0.04491413474240423, |
|
"grad_norm": 0.4779168963432312, |
|
"kl": 8.845329284667969e-05, |
|
"learning_rate": 4.4736842105263156e-07, |
|
"loss": 0.2965, |
|
"reward": 1.421875, |
|
"reward_std": 0.12279411032795906, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.421875, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1349.9375, |
|
"epoch": 0.047556142668428, |
|
"grad_norm": 0.4716605842113495, |
|
"kl": 0.00012004375457763672, |
|
"learning_rate": 4.7368421052631574e-07, |
|
"loss": 0.3496, |
|
"reward": 0.66796875, |
|
"reward_std": 0.14581536501646042, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.41796875, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 775.328125, |
|
"epoch": 0.05019815059445178, |
|
"grad_norm": 0.5275957584381104, |
|
"kl": 9.936094284057617e-05, |
|
"learning_rate": 5e-07, |
|
"loss": 0.3465, |
|
"reward": 0.734375, |
|
"reward_std": 0.08240052312612534, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1096.671875, |
|
"epoch": 0.052840158520475564, |
|
"grad_norm": 0.622590959072113, |
|
"kl": 0.00011599063873291016, |
|
"learning_rate": 5.263157894736842e-07, |
|
"loss": 0.3991, |
|
"reward": 0.95703125, |
|
"reward_std": 0.09287451207637787, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1145.40625, |
|
"epoch": 0.05548216644649934, |
|
"grad_norm": 0.5628076195716858, |
|
"kl": 7.984042167663574e-05, |
|
"learning_rate": 5.526315789473684e-07, |
|
"loss": 0.3009, |
|
"reward": 0.7109375, |
|
"reward_std": 0.111817117780447, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 829.109375, |
|
"epoch": 0.05812417437252312, |
|
"grad_norm": 0.5253135561943054, |
|
"kl": 0.0001367330551147461, |
|
"learning_rate": 5.789473684210526e-07, |
|
"loss": 0.243, |
|
"reward": 1.4921875, |
|
"reward_std": 0.1498083807528019, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1060.53125, |
|
"epoch": 0.0607661822985469, |
|
"grad_norm": 0.624118983745575, |
|
"kl": 7.021427154541016e-05, |
|
"learning_rate": 6.052631578947368e-07, |
|
"loss": 0.4002, |
|
"reward": 1.21484375, |
|
"reward_std": 0.1456764042377472, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46484375, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 976.578125, |
|
"epoch": 0.06340819022457067, |
|
"grad_norm": 0.46764305233955383, |
|
"kl": 0.0001266002655029297, |
|
"learning_rate": 6.31578947368421e-07, |
|
"loss": 0.2928, |
|
"reward": 1.20703125, |
|
"reward_std": 0.096083864569664, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1207.015625, |
|
"epoch": 0.06605019815059446, |
|
"grad_norm": 0.39954128861427307, |
|
"kl": 0.00010007619857788086, |
|
"learning_rate": 6.578947368421053e-07, |
|
"loss": 0.1622, |
|
"reward": 0.953125, |
|
"reward_std": 0.15208648890256882, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.453125, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 783.921875, |
|
"epoch": 0.06869220607661823, |
|
"grad_norm": 0.4758118689060211, |
|
"kl": 8.118152618408203e-05, |
|
"learning_rate": 6.842105263157895e-07, |
|
"loss": 0.2011, |
|
"reward": 0.96875, |
|
"reward_std": 0.07889671996235847, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46875, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 920.515625, |
|
"epoch": 0.071334214002642, |
|
"grad_norm": 0.7195703387260437, |
|
"kl": 9.21487808227539e-05, |
|
"learning_rate": 7.105263157894736e-07, |
|
"loss": 0.2896, |
|
"reward": 0.984375, |
|
"reward_std": 0.10958803817629814, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1289.40625, |
|
"epoch": 0.07397622192866579, |
|
"grad_norm": 0.4253327548503876, |
|
"kl": 9.363889694213867e-05, |
|
"learning_rate": 7.368421052631578e-07, |
|
"loss": 0.0989, |
|
"reward": 0.9375, |
|
"reward_std": 0.1678653284907341, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1002.25, |
|
"epoch": 0.07661822985468957, |
|
"grad_norm": 0.7329438924789429, |
|
"kl": 0.0001462697982788086, |
|
"learning_rate": 7.631578947368421e-07, |
|
"loss": 0.4594, |
|
"reward": 0.95703125, |
|
"reward_std": 0.11983717978000641, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1205.53125, |
|
"epoch": 0.07926023778071334, |
|
"grad_norm": 0.7603439092636108, |
|
"kl": 0.00011014938354492188, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.4604, |
|
"reward": 0.9375, |
|
"reward_std": 0.1396191380918026, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1225.0625, |
|
"epoch": 0.08190224570673713, |
|
"grad_norm": 0.586107075214386, |
|
"kl": 0.0001385211944580078, |
|
"learning_rate": 8.157894736842105e-07, |
|
"loss": 0.2906, |
|
"reward": 0.7109375, |
|
"reward_std": 0.15029004588723183, |
|
"rewards/accuracy_reward": 0.265625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4453125, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1195.609375, |
|
"epoch": 0.0845442536327609, |
|
"grad_norm": 0.5367782711982727, |
|
"kl": 0.00018310546875, |
|
"learning_rate": 8.421052631578947e-07, |
|
"loss": 0.3054, |
|
"reward": 1.18359375, |
|
"reward_std": 0.1250832974910736, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.43359375, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1193.53125, |
|
"epoch": 0.08718626155878467, |
|
"grad_norm": 0.6531537771224976, |
|
"kl": 0.0001990795135498047, |
|
"learning_rate": 8.684210526315789e-07, |
|
"loss": 0.382, |
|
"reward": 0.93359375, |
|
"reward_std": 0.10596734657883644, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.43359375, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 975.796875, |
|
"epoch": 0.08982826948480846, |
|
"grad_norm": 0.7079041004180908, |
|
"kl": 0.0002675056457519531, |
|
"learning_rate": 8.947368421052631e-07, |
|
"loss": 0.3162, |
|
"reward": 0.9921875, |
|
"reward_std": 0.11211910098791122, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1131.34375, |
|
"epoch": 0.09247027741083223, |
|
"grad_norm": 0.5116021037101746, |
|
"kl": 0.0003204345703125, |
|
"learning_rate": 9.210526315789473e-07, |
|
"loss": 0.3366, |
|
"reward": 1.19140625, |
|
"reward_std": 0.14293401315808296, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.44140625, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1189.828125, |
|
"epoch": 0.095112285336856, |
|
"grad_norm": 0.5107906460762024, |
|
"kl": 0.0003094673156738281, |
|
"learning_rate": 9.473684210526315e-07, |
|
"loss": 0.328, |
|
"reward": 1.20703125, |
|
"reward_std": 0.15370117127895355, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1482.453125, |
|
"epoch": 0.0977542932628798, |
|
"grad_norm": 0.46826329827308655, |
|
"kl": 0.0004634857177734375, |
|
"learning_rate": 9.736842105263158e-07, |
|
"loss": 0.2712, |
|
"reward": 1.16015625, |
|
"reward_std": 0.1653159111738205, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.41015625, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1108.578125, |
|
"epoch": 0.10039630118890357, |
|
"grad_norm": 0.5141110420227051, |
|
"kl": 0.0006732940673828125, |
|
"learning_rate": 1e-06, |
|
"loss": 0.1843, |
|
"reward": 0.97265625, |
|
"reward_std": 0.11588806286454201, |
|
"rewards/accuracy_reward": 0.515625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1196.65625, |
|
"epoch": 0.10303830911492734, |
|
"grad_norm": 0.5530170202255249, |
|
"kl": 0.000946044921875, |
|
"learning_rate": 9.999807902665155e-07, |
|
"loss": 0.2593, |
|
"reward": 0.9609375, |
|
"reward_std": 0.1273726001381874, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 822.328125, |
|
"epoch": 0.10568031704095113, |
|
"grad_norm": 0.6078239679336548, |
|
"kl": 0.001224517822265625, |
|
"learning_rate": 9.999231627061236e-07, |
|
"loss": 0.2837, |
|
"reward": 0.9921875, |
|
"reward_std": 0.10058118030428886, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 995.421875, |
|
"epoch": 0.1083223249669749, |
|
"grad_norm": 0.6204021573066711, |
|
"kl": 0.001720428466796875, |
|
"learning_rate": 9.998271222388693e-07, |
|
"loss": 0.4368, |
|
"reward": 1.2265625, |
|
"reward_std": 0.13393215090036392, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4765625, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1254.515625, |
|
"epoch": 0.11096433289299867, |
|
"grad_norm": 0.6290051937103271, |
|
"kl": 0.0020294189453125, |
|
"learning_rate": 9.996926770643603e-07, |
|
"loss": 0.3358, |
|
"reward": 0.94921875, |
|
"reward_std": 0.13193362578749657, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.44921875, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 885.078125, |
|
"epoch": 0.11360634081902246, |
|
"grad_norm": 0.38145869970321655, |
|
"kl": 0.0020084381103515625, |
|
"learning_rate": 9.995198386610676e-07, |
|
"loss": 0.1421, |
|
"reward": 1.2421875, |
|
"reward_std": 0.09872931987047195, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1162.296875, |
|
"epoch": 0.11624834874504623, |
|
"grad_norm": 0.5801534056663513, |
|
"kl": 0.00255584716796875, |
|
"learning_rate": 9.993086217853452e-07, |
|
"loss": 0.3938, |
|
"reward": 0.9375, |
|
"reward_std": 0.12491972371935844, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 946.984375, |
|
"epoch": 0.11889035667107001, |
|
"grad_norm": 0.7080899477005005, |
|
"kl": 0.00287628173828125, |
|
"learning_rate": 9.990590444701706e-07, |
|
"loss": 0.3176, |
|
"reward": 0.71484375, |
|
"reward_std": 0.07072163559496403, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46484375, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1258.78125, |
|
"epoch": 0.1215323645970938, |
|
"grad_norm": 0.6584539413452148, |
|
"kl": 0.00337982177734375, |
|
"learning_rate": 9.987711280236046e-07, |
|
"loss": 0.3364, |
|
"reward": 0.9296875, |
|
"reward_std": 0.10684756934642792, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4296875, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1006.984375, |
|
"epoch": 0.12417437252311757, |
|
"grad_norm": 0.5412375926971436, |
|
"kl": 0.003643035888671875, |
|
"learning_rate": 9.984448970269725e-07, |
|
"loss": 0.2438, |
|
"reward": 1.25390625, |
|
"reward_std": 0.16918476670980453, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.50390625, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1165.140625, |
|
"epoch": 0.12681638044914134, |
|
"grad_norm": 0.5502119064331055, |
|
"kl": 0.00435638427734375, |
|
"learning_rate": 9.980803793327655e-07, |
|
"loss": 0.329, |
|
"reward": 0.73046875, |
|
"reward_std": 0.17235729470849037, |
|
"rewards/accuracy_reward": 0.265625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46484375, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1094.59375, |
|
"epoch": 0.12945838837516513, |
|
"grad_norm": 0.6746593713760376, |
|
"kl": 0.0046234130859375, |
|
"learning_rate": 9.976776060622625e-07, |
|
"loss": 0.2585, |
|
"reward": 0.68359375, |
|
"reward_std": 0.11046826094388962, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.43359375, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 894.875, |
|
"epoch": 0.13210039630118892, |
|
"grad_norm": 0.6030331254005432, |
|
"kl": 0.0045623779296875, |
|
"learning_rate": 9.972366116028733e-07, |
|
"loss": 0.1373, |
|
"reward": 1.2265625, |
|
"reward_std": 0.11612267419695854, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4765625, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 660.078125, |
|
"epoch": 0.13474240422721268, |
|
"grad_norm": 0.7342778444290161, |
|
"kl": 0.00536346435546875, |
|
"learning_rate": 9.96757433605202e-07, |
|
"loss": 0.2687, |
|
"reward": 1.26171875, |
|
"reward_std": 0.11859130859375, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1063.71875, |
|
"epoch": 0.13738441215323646, |
|
"grad_norm": 0.7268034219741821, |
|
"kl": 0.00653076171875, |
|
"learning_rate": 9.962401129798343e-07, |
|
"loss": 0.3436, |
|
"reward": 0.98046875, |
|
"reward_std": 0.15140536800026894, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48046875, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1265.96875, |
|
"epoch": 0.14002642007926025, |
|
"grad_norm": 0.7652710676193237, |
|
"kl": 0.00766754150390625, |
|
"learning_rate": 9.956846938938422e-07, |
|
"loss": 0.4375, |
|
"reward": 0.91015625, |
|
"reward_std": 0.1307620257139206, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.41015625, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1342.03125, |
|
"epoch": 0.142668428005284, |
|
"grad_norm": 0.6607176065444946, |
|
"kl": 0.0090179443359375, |
|
"learning_rate": 9.950912237670157e-07, |
|
"loss": 0.3436, |
|
"reward": 0.90234375, |
|
"reward_std": 0.1162625178694725, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.40234375, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1300.421875, |
|
"epoch": 0.1453104359313078, |
|
"grad_norm": 0.6878055930137634, |
|
"kl": 0.01092529296875, |
|
"learning_rate": 9.944597532678119e-07, |
|
"loss": 0.3859, |
|
"reward": 1.1640625, |
|
"reward_std": 0.1533336602151394, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4140625, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1351.71875, |
|
"epoch": 0.14795244385733158, |
|
"grad_norm": 0.6881883144378662, |
|
"kl": 0.01397705078125, |
|
"learning_rate": 9.93790336309031e-07, |
|
"loss": 0.3671, |
|
"reward": 0.92578125, |
|
"reward_std": 0.15761961415410042, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.42578125, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 710.25, |
|
"epoch": 0.15059445178335534, |
|
"grad_norm": 0.5193164348602295, |
|
"kl": 0.0154571533203125, |
|
"learning_rate": 9.930830300432126e-07, |
|
"loss": 0.1832, |
|
"reward": 1.01953125, |
|
"reward_std": 0.11765347048640251, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1319.109375, |
|
"epoch": 0.15323645970937913, |
|
"grad_norm": 0.6145569086074829, |
|
"kl": 0.0148468017578125, |
|
"learning_rate": 9.923378948577558e-07, |
|
"loss": 0.3036, |
|
"reward": 0.9375, |
|
"reward_std": 0.1474018730223179, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4375, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1229.484375, |
|
"epoch": 0.15587846763540292, |
|
"grad_norm": 0.6062135100364685, |
|
"kl": 0.0187530517578125, |
|
"learning_rate": 9.915549943697644e-07, |
|
"loss": 0.3039, |
|
"reward": 0.92578125, |
|
"reward_std": 0.12412451207637787, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.42578125, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1122.96875, |
|
"epoch": 0.15852047556142668, |
|
"grad_norm": 0.7750731110572815, |
|
"kl": 0.019989013671875, |
|
"learning_rate": 9.907343954206146e-07, |
|
"loss": 0.4269, |
|
"reward": 0.4609375, |
|
"reward_std": 0.15149712190032005, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1289.796875, |
|
"epoch": 0.16116248348745046, |
|
"grad_norm": 0.4260408282279968, |
|
"kl": 0.023284912109375, |
|
"learning_rate": 9.898761680702495e-07, |
|
"loss": 0.2105, |
|
"reward": 0.66015625, |
|
"reward_std": 0.10409127548336983, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.41015625, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1395.359375, |
|
"epoch": 0.16380449141347425, |
|
"grad_norm": 0.4302825629711151, |
|
"kl": 0.022216796875, |
|
"learning_rate": 9.889803855911965e-07, |
|
"loss": 0.2882, |
|
"reward": 0.69140625, |
|
"reward_std": 0.17329547554254532, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.44140625, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1214.203125, |
|
"epoch": 0.166446499339498, |
|
"grad_norm": 0.5709892511367798, |
|
"kl": 0.025421142578125, |
|
"learning_rate": 9.880471244623118e-07, |
|
"loss": 0.2752, |
|
"reward": 0.96484375, |
|
"reward_std": 0.16381771862506866, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46484375, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1456.640625, |
|
"epoch": 0.1690885072655218, |
|
"grad_norm": 0.4366983473300934, |
|
"kl": 0.03094482421875, |
|
"learning_rate": 9.87076464362251e-07, |
|
"loss": 0.1409, |
|
"reward": 1.21484375, |
|
"reward_std": 0.1545065976679325, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46484375, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 907.890625, |
|
"epoch": 0.17173051519154559, |
|
"grad_norm": 0.5789319276809692, |
|
"kl": 0.0296630859375, |
|
"learning_rate": 9.860684881626674e-07, |
|
"loss": 0.223, |
|
"reward": 1.0234375, |
|
"reward_std": 0.18188364803791046, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1124.03125, |
|
"epoch": 0.17437252311756934, |
|
"grad_norm": 0.8789018988609314, |
|
"kl": 0.03033447265625, |
|
"learning_rate": 9.850232819211343e-07, |
|
"loss": -0.0662, |
|
"reward": 0.9609375, |
|
"reward_std": 0.16317331418395042, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1024.5, |
|
"epoch": 0.17701453104359313, |
|
"grad_norm": 0.7724674344062805, |
|
"kl": 0.03656005859375, |
|
"learning_rate": 9.839409348738e-07, |
|
"loss": 0.2921, |
|
"reward": 1.21875, |
|
"reward_std": 0.12279859185218811, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46875, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1002.6875, |
|
"epoch": 0.17965653896961692, |
|
"grad_norm": 1.0319114923477173, |
|
"kl": 0.043212890625, |
|
"learning_rate": 9.828215394277686e-07, |
|
"loss": 0.3121, |
|
"reward": 0.97265625, |
|
"reward_std": 0.13220234587788582, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.47265625, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1255.0625, |
|
"epoch": 0.18229854689564068, |
|
"grad_norm": 0.7915776371955872, |
|
"kl": 0.041290283203125, |
|
"learning_rate": 9.816651911532093e-07, |
|
"loss": 0.3672, |
|
"reward": 0.93359375, |
|
"reward_std": 0.16574888676404953, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.43359375, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1128.546875, |
|
"epoch": 0.18494055482166447, |
|
"grad_norm": 0.577376127243042, |
|
"kl": 0.040679931640625, |
|
"learning_rate": 9.804719887751984e-07, |
|
"loss": 0.1898, |
|
"reward": 1.0078125, |
|
"reward_std": 0.17545727640390396, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1146.234375, |
|
"epoch": 0.18758256274768825, |
|
"grad_norm": 0.5707401633262634, |
|
"kl": 0.034698486328125, |
|
"learning_rate": 9.792420341652901e-07, |
|
"loss": 0.269, |
|
"reward": 1.1796875, |
|
"reward_std": 0.11014671996235847, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4296875, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1104.234375, |
|
"epoch": 0.190224570673712, |
|
"grad_norm": 0.5689163208007812, |
|
"kl": 0.0369873046875, |
|
"learning_rate": 9.779754323328192e-07, |
|
"loss": 0.3013, |
|
"reward": 0.73046875, |
|
"reward_std": 0.1631980687379837, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48046875, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1476.96875, |
|
"epoch": 0.1928665785997358, |
|
"grad_norm": 0.5846036672592163, |
|
"kl": 0.033660888671875, |
|
"learning_rate": 9.766722914159345e-07, |
|
"loss": 0.2798, |
|
"reward": 0.8984375, |
|
"reward_std": 0.1427699662744999, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3984375, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1340.578125, |
|
"epoch": 0.1955085865257596, |
|
"grad_norm": 0.4723777174949646, |
|
"kl": 0.035400390625, |
|
"learning_rate": 9.753327226723687e-07, |
|
"loss": 0.2281, |
|
"reward": 0.64453125, |
|
"reward_std": 0.09241959825158119, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.39453125, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1429.40625, |
|
"epoch": 0.19815059445178335, |
|
"grad_norm": 0.6316815614700317, |
|
"kl": 0.03790283203125, |
|
"learning_rate": 9.73956840469937e-07, |
|
"loss": 0.2594, |
|
"reward": 1.1640625, |
|
"reward_std": 0.14494511112570763, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4140625, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1395.609375, |
|
"epoch": 0.20079260237780713, |
|
"grad_norm": 0.4536829888820648, |
|
"kl": 0.036865234375, |
|
"learning_rate": 9.725447622767754e-07, |
|
"loss": 0.257, |
|
"reward": 1.24609375, |
|
"reward_std": 0.24476346373558044, |
|
"rewards/accuracy_reward": 0.765625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48046875, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1369.703125, |
|
"epoch": 0.20343461030383092, |
|
"grad_norm": 0.519792914390564, |
|
"kl": 0.04010009765625, |
|
"learning_rate": 9.710966086513085e-07, |
|
"loss": 0.2693, |
|
"reward": 0.93359375, |
|
"reward_std": 0.15936565026640892, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.43359375, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1090.53125, |
|
"epoch": 0.20607661822985468, |
|
"grad_norm": 0.7418442368507385, |
|
"kl": 0.04974365234375, |
|
"learning_rate": 9.6961250323196e-07, |
|
"loss": 0.3581, |
|
"reward": 1.203125, |
|
"reward_std": 0.14408493414521217, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.453125, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1075.609375, |
|
"epoch": 0.20871862615587847, |
|
"grad_norm": 0.4650673270225525, |
|
"kl": 0.046630859375, |
|
"learning_rate": 9.680925727265944e-07, |
|
"loss": 0.1385, |
|
"reward": 0.984375, |
|
"reward_std": 0.13037987425923347, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1463.03125, |
|
"epoch": 0.21136063408190225, |
|
"grad_norm": 0.44249987602233887, |
|
"kl": 0.047119140625, |
|
"learning_rate": 9.665369469017002e-07, |
|
"loss": 0.1594, |
|
"reward": 0.8984375, |
|
"reward_std": 0.16113372519612312, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3984375, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1128.3125, |
|
"epoch": 0.21400264200792601, |
|
"grad_norm": 0.543846070766449, |
|
"kl": 0.05157470703125, |
|
"learning_rate": 9.649457585713108e-07, |
|
"loss": 0.2237, |
|
"reward": 1.234375, |
|
"reward_std": 0.1662597917020321, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 960.765625, |
|
"epoch": 0.2166446499339498, |
|
"grad_norm": 0.7787006497383118, |
|
"kl": 0.0552978515625, |
|
"learning_rate": 9.633191435856653e-07, |
|
"loss": 0.3572, |
|
"reward": 1.2109375, |
|
"reward_std": 0.12929406948387623, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1094.390625, |
|
"epoch": 0.2192866578599736, |
|
"grad_norm": 0.9358471632003784, |
|
"kl": 0.060302734375, |
|
"learning_rate": 9.616572408196093e-07, |
|
"loss": 0.3621, |
|
"reward": 0.73046875, |
|
"reward_std": 0.18469755724072456, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48046875, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 868.265625, |
|
"epoch": 0.22192866578599735, |
|
"grad_norm": 1.0493205785751343, |
|
"kl": 0.06304931640625, |
|
"learning_rate": 9.599601921607397e-07, |
|
"loss": 0.3486, |
|
"reward": 0.5078125, |
|
"reward_std": 0.16107311472296715, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1090.9375, |
|
"epoch": 0.22457067371202113, |
|
"grad_norm": 0.9199777245521545, |
|
"kl": 0.06231689453125, |
|
"learning_rate": 9.582281424972892e-07, |
|
"loss": 0.3608, |
|
"reward": 0.96484375, |
|
"reward_std": 0.129608154296875, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46484375, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1144.140625, |
|
"epoch": 0.22721268163804492, |
|
"grad_norm": 0.7876753807067871, |
|
"kl": 0.067138671875, |
|
"learning_rate": 9.56461239705758e-07, |
|
"loss": 0.2158, |
|
"reward": 0.44921875, |
|
"reward_std": 0.11367761343717575, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.44921875, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1346.328125, |
|
"epoch": 0.22985468956406868, |
|
"grad_norm": 0.8156364560127258, |
|
"kl": 0.06951904296875, |
|
"learning_rate": 9.546596346382864e-07, |
|
"loss": 0.2484, |
|
"reward": 0.92578125, |
|
"reward_std": 0.14216843992471695, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.42578125, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 906.40625, |
|
"epoch": 0.23249669749009247, |
|
"grad_norm": 0.6532436013221741, |
|
"kl": 0.083984375, |
|
"learning_rate": 9.528234811097781e-07, |
|
"loss": 0.1984, |
|
"reward": 1.24609375, |
|
"reward_std": 0.10012037679553032, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.49609375, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1103.65625, |
|
"epoch": 0.23513870541611626, |
|
"grad_norm": 0.6433841586112976, |
|
"kl": 0.0770263671875, |
|
"learning_rate": 9.509529358847654e-07, |
|
"loss": 0.1822, |
|
"reward": 0.70703125, |
|
"reward_std": 0.12630900368094444, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1273.546875, |
|
"epoch": 0.23778071334214002, |
|
"grad_norm": 1.185502529144287, |
|
"kl": 0.106201171875, |
|
"learning_rate": 9.490481586640278e-07, |
|
"loss": 0.3498, |
|
"reward": 0.91796875, |
|
"reward_std": 0.14778802916407585, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.41796875, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1223.328125, |
|
"epoch": 0.2404227212681638, |
|
"grad_norm": 0.6358450055122375, |
|
"kl": 0.1009521484375, |
|
"learning_rate": 9.47109312070955e-07, |
|
"loss": 0.1773, |
|
"reward": 0.74609375, |
|
"reward_std": 0.18448476120829582, |
|
"rewards/accuracy_reward": 0.265625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48046875, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 700.5625, |
|
"epoch": 0.2430647291941876, |
|
"grad_norm": 0.750359058380127, |
|
"kl": 0.1322021484375, |
|
"learning_rate": 9.45136561637664e-07, |
|
"loss": 0.1891, |
|
"reward": 1.046875, |
|
"reward_std": 0.14496402069926262, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 863.90625, |
|
"epoch": 0.24570673712021135, |
|
"grad_norm": 0.557322084903717, |
|
"kl": 0.1099853515625, |
|
"learning_rate": 9.431300757908663e-07, |
|
"loss": 0.1089, |
|
"reward": 1.30078125, |
|
"reward_std": 0.15019455552101135, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 970.015625, |
|
"epoch": 0.24834874504623514, |
|
"grad_norm": 0.731271505355835, |
|
"kl": 0.12158203125, |
|
"learning_rate": 9.410900258374876e-07, |
|
"loss": 0.1692, |
|
"reward": 0.76953125, |
|
"reward_std": 0.17832617834210396, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 924.6875, |
|
"epoch": 0.2509907529722589, |
|
"grad_norm": 1.327541708946228, |
|
"kl": 0.14990234375, |
|
"learning_rate": 9.390165859500435e-07, |
|
"loss": 0.2367, |
|
"reward": 0.5234375, |
|
"reward_std": 0.1663740910589695, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1177.421875, |
|
"epoch": 0.2536327608982827, |
|
"grad_norm": 1.7957454919815063, |
|
"kl": 0.165771484375, |
|
"learning_rate": 9.369099331517676e-07, |
|
"loss": 0.3655, |
|
"reward": 0.9453125, |
|
"reward_std": 0.17608627676963806, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4453125, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1124.71875, |
|
"epoch": 0.2562747688243065, |
|
"grad_norm": 1.353155493736267, |
|
"kl": 0.1519775390625, |
|
"learning_rate": 9.34770247301499e-07, |
|
"loss": 0.2683, |
|
"reward": 1.2109375, |
|
"reward_std": 0.11838950589299202, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4609375, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 710.703125, |
|
"epoch": 0.25891677675033026, |
|
"grad_norm": 1.821932077407837, |
|
"kl": 0.19970703125, |
|
"learning_rate": 9.325977110783263e-07, |
|
"loss": 0.1213, |
|
"reward": 1.52734375, |
|
"reward_std": 0.14770140498876572, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 910.125, |
|
"epoch": 0.261558784676354, |
|
"grad_norm": 0.8406642079353333, |
|
"kl": 0.185546875, |
|
"learning_rate": 9.30392509965991e-07, |
|
"loss": 0.1623, |
|
"reward": 1.015625, |
|
"reward_std": 0.1544700786471367, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.515625, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1078.859375, |
|
"epoch": 0.26420079260237783, |
|
"grad_norm": 1.6371651887893677, |
|
"kl": 0.225341796875, |
|
"learning_rate": 9.281548322370517e-07, |
|
"loss": 0.2703, |
|
"reward": 0.72265625, |
|
"reward_std": 0.14984130859375, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.47265625, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 532.15625, |
|
"epoch": 0.2668428005284016, |
|
"grad_norm": 2.1254074573516846, |
|
"kl": 0.246826171875, |
|
"learning_rate": 9.258848689368094e-07, |
|
"loss": 0.2214, |
|
"reward": 1.2578125, |
|
"reward_std": 0.10374833643436432, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 848.5625, |
|
"epoch": 0.26948480845442535, |
|
"grad_norm": 2.5907938480377197, |
|
"kl": 0.2958984375, |
|
"learning_rate": 9.235828138669978e-07, |
|
"loss": 0.3198, |
|
"reward": 1.01171875, |
|
"reward_std": 0.137377567589283, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1073.3125, |
|
"epoch": 0.27212681638044917, |
|
"grad_norm": 2.286487102508545, |
|
"kl": 0.2607421875, |
|
"learning_rate": 9.21248863569236e-07, |
|
"loss": 0.3082, |
|
"reward": 0.97265625, |
|
"reward_std": 0.15867146104574203, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.47265625, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 858.796875, |
|
"epoch": 0.2747688243064729, |
|
"grad_norm": 1.7667056322097778, |
|
"kl": 0.33837890625, |
|
"learning_rate": 9.188832173082495e-07, |
|
"loss": 0.2436, |
|
"reward": 0.71875, |
|
"reward_std": 0.10251419246196747, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46875, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1045.796875, |
|
"epoch": 0.2774108322324967, |
|
"grad_norm": 2.42461895942688, |
|
"kl": 0.40380859375, |
|
"learning_rate": 9.164860770548567e-07, |
|
"loss": 0.2974, |
|
"reward": 0.9921875, |
|
"reward_std": 0.16395077854394913, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 893.453125, |
|
"epoch": 0.2800528401585205, |
|
"grad_norm": 10.34216594696045, |
|
"kl": 0.474609375, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.294, |
|
"reward": 0.97265625, |
|
"reward_std": 0.1429976001381874, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.47265625, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1402.546875, |
|
"epoch": 0.28269484808454426, |
|
"grad_norm": 5.165650367736816, |
|
"kl": 0.5849609375, |
|
"learning_rate": 9.11598135880903e-07, |
|
"loss": 0.3739, |
|
"reward": 0.6484375, |
|
"reward_std": 0.16659503430128098, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.3984375, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 959.703125, |
|
"epoch": 0.285336856010568, |
|
"grad_norm": 5.434719562530518, |
|
"kl": 0.6767578125, |
|
"learning_rate": 9.091077522761078e-07, |
|
"loss": 0.421, |
|
"reward": 0.9765625, |
|
"reward_std": 0.13730589486658573, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4765625, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1104.828125, |
|
"epoch": 0.28797886393659183, |
|
"grad_norm": 1.7607016563415527, |
|
"kl": 0.40234375, |
|
"learning_rate": 9.065867092748082e-07, |
|
"loss": 0.205, |
|
"reward": 0.71875, |
|
"reward_std": 0.16618655994534492, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46875, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1261.84375, |
|
"epoch": 0.2906208718626156, |
|
"grad_norm": 3.3362314701080322, |
|
"kl": 0.57373046875, |
|
"learning_rate": 9.040352221150674e-07, |
|
"loss": 0.3039, |
|
"reward": 0.71875, |
|
"reward_std": 0.2016766332089901, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.46875, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 850.640625, |
|
"epoch": 0.29326287978863935, |
|
"grad_norm": 3.9499456882476807, |
|
"kl": 0.54296875, |
|
"learning_rate": 9.014535086341669e-07, |
|
"loss": 0.3804, |
|
"reward": 1.234375, |
|
"reward_std": 0.14762691780924797, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 871.765625, |
|
"epoch": 0.29590488771466317, |
|
"grad_norm": 4.223949432373047, |
|
"kl": 0.5234375, |
|
"learning_rate": 8.988417892500083e-07, |
|
"loss": 0.3621, |
|
"reward": 1.2734375, |
|
"reward_std": 0.18184370175004005, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 112 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 711.078125, |
|
"epoch": 0.2985468956406869, |
|
"grad_norm": 10.757521629333496, |
|
"kl": 0.53955078125, |
|
"learning_rate": 8.962002869422955e-07, |
|
"loss": 0.6943, |
|
"reward": 0.484375, |
|
"reward_std": 0.17551938444375992, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.484375, |
|
"step": 113 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 485.96875, |
|
"epoch": 0.3011889035667107, |
|
"grad_norm": 6.041623592376709, |
|
"kl": 0.59326171875, |
|
"learning_rate": 8.935292272334963e-07, |
|
"loss": 0.4734, |
|
"reward": 0.76953125, |
|
"reward_std": 0.13621540740132332, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 114 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 615.203125, |
|
"epoch": 0.3038309114927345, |
|
"grad_norm": 2.360245943069458, |
|
"kl": 0.60302734375, |
|
"learning_rate": 8.908288381695892e-07, |
|
"loss": 0.2661, |
|
"reward": 1.2578125, |
|
"reward_std": 0.1489735022187233, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 851.28125, |
|
"epoch": 0.30647291941875826, |
|
"grad_norm": 6.270340442657471, |
|
"kl": 0.8740234375, |
|
"learning_rate": 8.88099350300593e-07, |
|
"loss": 0.5072, |
|
"reward": 0.73046875, |
|
"reward_std": 0.15848717093467712, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48046875, |
|
"step": 116 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1406.65625, |
|
"epoch": 0.309114927344782, |
|
"grad_norm": 4.970353126525879, |
|
"kl": 1.427734375, |
|
"learning_rate": 8.853409966608831e-07, |
|
"loss": 0.3739, |
|
"reward": 0.65234375, |
|
"reward_std": 0.15436260029673576, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.40234375, |
|
"step": 117 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 682.796875, |
|
"epoch": 0.31175693527080584, |
|
"grad_norm": 11.649397850036621, |
|
"kl": 1.416015625, |
|
"learning_rate": 8.825540127492965e-07, |
|
"loss": 0.582, |
|
"reward": 1.2734375, |
|
"reward_std": 0.16201764903962612, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 118 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 346.234375, |
|
"epoch": 0.3143989431968296, |
|
"grad_norm": 6.038275241851807, |
|
"kl": 1.6640625, |
|
"learning_rate": 8.797386365090252e-07, |
|
"loss": 0.4335, |
|
"reward": 1.3046875, |
|
"reward_std": 0.16278167814016342, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 119 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 878.03125, |
|
"epoch": 0.31704095112285335, |
|
"grad_norm": 12.164133071899414, |
|
"kl": 2.13671875, |
|
"learning_rate": 8.768951083073009e-07, |
|
"loss": 0.8115, |
|
"reward": 0.9921875, |
|
"reward_std": 0.1910713165998459, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4921875, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 404.671875, |
|
"epoch": 0.31968295904887717, |
|
"grad_norm": 9.305420875549316, |
|
"kl": 2.453125, |
|
"learning_rate": 8.740236709148745e-07, |
|
"loss": 0.6232, |
|
"reward": 1.29296875, |
|
"reward_std": 0.1861564740538597, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.54296875, |
|
"step": 121 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 390.578125, |
|
"epoch": 0.32232496697490093, |
|
"grad_norm": 11.043706893920898, |
|
"kl": 2.4150390625, |
|
"learning_rate": 8.711245694852886e-07, |
|
"loss": 0.4605, |
|
"reward": 1.296875, |
|
"reward_std": 0.20820768922567368, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 122 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 501.625, |
|
"epoch": 0.3249669749009247, |
|
"grad_norm": 10.729813575744629, |
|
"kl": 2.490234375, |
|
"learning_rate": 8.681980515339463e-07, |
|
"loss": 0.6364, |
|
"reward": 0.8359375, |
|
"reward_std": 0.23206235468387604, |
|
"rewards/accuracy_reward": 0.265625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 123 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 466.9375, |
|
"epoch": 0.3276089828269485, |
|
"grad_norm": 7.306431770324707, |
|
"kl": 2.515625, |
|
"learning_rate": 8.652443669169809e-07, |
|
"loss": 0.5031, |
|
"reward": 0.5625, |
|
"reward_std": 0.18624207936227322, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 124 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 435.640625, |
|
"epoch": 0.33025099075297226, |
|
"grad_norm": 9.731188774108887, |
|
"kl": 3.28515625, |
|
"learning_rate": 8.622637678099224e-07, |
|
"loss": 0.7344, |
|
"reward": 1.01171875, |
|
"reward_std": 0.16986817121505737, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 291.671875, |
|
"epoch": 0.332892998678996, |
|
"grad_norm": 11.137627601623535, |
|
"kl": 3.392578125, |
|
"learning_rate": 8.592565086861681e-07, |
|
"loss": 0.3762, |
|
"reward": 1.01953125, |
|
"reward_std": 0.1285141110420227, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 126 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 543.8125, |
|
"epoch": 0.33553500660501984, |
|
"grad_norm": 16.820133209228516, |
|
"kl": 3.1875, |
|
"learning_rate": 8.562228462952576e-07, |
|
"loss": 0.2899, |
|
"reward": 1.28125, |
|
"reward_std": 0.1833672672510147, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53125, |
|
"step": 127 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 232.859375, |
|
"epoch": 0.3381770145310436, |
|
"grad_norm": 10.55738353729248, |
|
"kl": 2.62939453125, |
|
"learning_rate": 8.531630396409507e-07, |
|
"loss": 0.2709, |
|
"reward": 1.06640625, |
|
"reward_std": 0.12935607135295868, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 128 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 718.671875, |
|
"epoch": 0.34081902245706736, |
|
"grad_norm": 10.954379081726074, |
|
"kl": 3.91015625, |
|
"learning_rate": 8.500773499591156e-07, |
|
"loss": 0.3251, |
|
"reward": 0.5078125, |
|
"reward_std": 0.10781864821910858, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 129 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 489.109375, |
|
"epoch": 0.34346103038309117, |
|
"grad_norm": 10.081979751586914, |
|
"kl": 2.50390625, |
|
"learning_rate": 8.469660406954252e-07, |
|
"loss": 0.4498, |
|
"reward": 0.796875, |
|
"reward_std": 0.20939984917640686, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 377.484375, |
|
"epoch": 0.34610303830911493, |
|
"grad_norm": 4.734899520874023, |
|
"kl": 1.208984375, |
|
"learning_rate": 8.438293774828649e-07, |
|
"loss": 0.2461, |
|
"reward": 1.3046875, |
|
"reward_std": 0.16797470301389694, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 131 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 353.15625, |
|
"epoch": 0.3487450462351387, |
|
"grad_norm": 3.945875883102417, |
|
"kl": 1.7080078125, |
|
"learning_rate": 8.406676281190542e-07, |
|
"loss": 0.2267, |
|
"reward": 0.83984375, |
|
"reward_std": 0.172641359269619, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 132 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 632.921875, |
|
"epoch": 0.3513870541611625, |
|
"grad_norm": 26.704730987548828, |
|
"kl": 1.767578125, |
|
"learning_rate": 8.374810625433825e-07, |
|
"loss": 0.7894, |
|
"reward": 1.02734375, |
|
"reward_std": 0.21192153729498386, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 133 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 478.828125, |
|
"epoch": 0.35402906208718626, |
|
"grad_norm": 23.016502380371094, |
|
"kl": 1.65234375, |
|
"learning_rate": 8.342699528139628e-07, |
|
"loss": 0.5162, |
|
"reward": 1.015625, |
|
"reward_std": 0.1322025004774332, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.515625, |
|
"step": 134 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 613.46875, |
|
"epoch": 0.35667107001321, |
|
"grad_norm": 5.931519985198975, |
|
"kl": 2.02734375, |
|
"learning_rate": 8.310345730844047e-07, |
|
"loss": 0.4553, |
|
"reward": 1.3125, |
|
"reward_std": 0.21167393401265144, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.46875, |
|
"epoch": 0.35931307793923384, |
|
"grad_norm": 7.461983680725098, |
|
"kl": 1.9765625, |
|
"learning_rate": 8.277751995804067e-07, |
|
"loss": 0.3654, |
|
"reward": 1.0234375, |
|
"reward_std": 0.1544732078909874, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 136 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 439.890625, |
|
"epoch": 0.3619550858652576, |
|
"grad_norm": 3.8175482749938965, |
|
"kl": 2.041015625, |
|
"learning_rate": 8.244921105761755e-07, |
|
"loss": 0.3475, |
|
"reward": 1.07421875, |
|
"reward_std": 0.23262840881943703, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 137 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 442.796875, |
|
"epoch": 0.36459709379128136, |
|
"grad_norm": 11.061271667480469, |
|
"kl": 1.546875, |
|
"learning_rate": 8.211855863706654e-07, |
|
"loss": 0.5592, |
|
"reward": 1.2890625, |
|
"reward_std": 0.17124063521623611, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5390625, |
|
"step": 138 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 334.875, |
|
"epoch": 0.36723910171730517, |
|
"grad_norm": 12.917343139648438, |
|
"kl": 2.42578125, |
|
"learning_rate": 8.178559092636484e-07, |
|
"loss": 0.1005, |
|
"reward": 0.6015625, |
|
"reward_std": 0.1888568513095379, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 139 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 815.125, |
|
"epoch": 0.36988110964332893, |
|
"grad_norm": 4.946498394012451, |
|
"kl": 2.6484375, |
|
"learning_rate": 8.145033635316128e-07, |
|
"loss": 0.4205, |
|
"reward": 0.51171875, |
|
"reward_std": 0.19404659420251846, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 358.640625, |
|
"epoch": 0.3725231175693527, |
|
"grad_norm": 4.1423869132995605, |
|
"kl": 2.376953125, |
|
"learning_rate": 8.111282354034921e-07, |
|
"loss": 0.362, |
|
"reward": 1.0546875, |
|
"reward_std": 0.1854284517467022, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 141 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 220.828125, |
|
"epoch": 0.3751651254953765, |
|
"grad_norm": 14.8277006149292, |
|
"kl": 3.98828125, |
|
"learning_rate": 8.077308130362273e-07, |
|
"loss": 0.1853, |
|
"reward": 1.0390625, |
|
"reward_std": 0.12213464453816414, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5390625, |
|
"step": 142 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 880.515625, |
|
"epoch": 0.37780713342140027, |
|
"grad_norm": 4.2313103675842285, |
|
"kl": 2.3984375, |
|
"learning_rate": 8.043113864901663e-07, |
|
"loss": 0.4005, |
|
"reward": 1.20703125, |
|
"reward_std": 0.1507197804749012, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.45703125, |
|
"step": 143 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.265625, |
|
"epoch": 0.380449141347424, |
|
"grad_norm": 7.7739458084106445, |
|
"kl": 2.126953125, |
|
"learning_rate": 8.008702477042985e-07, |
|
"loss": 0.4939, |
|
"reward": 1.3203125, |
|
"reward_std": 0.20398560166358948, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 144 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 698.015625, |
|
"epoch": 0.38309114927344784, |
|
"grad_norm": 6.112682342529297, |
|
"kl": 2.115234375, |
|
"learning_rate": 7.974076904713301e-07, |
|
"loss": 0.4279, |
|
"reward": 0.73828125, |
|
"reward_std": 0.09649410098791122, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48828125, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 359.640625, |
|
"epoch": 0.3857331571994716, |
|
"grad_norm": 11.499645233154297, |
|
"kl": 1.916015625, |
|
"learning_rate": 7.939240104126022e-07, |
|
"loss": 0.4661, |
|
"reward": 1.04296875, |
|
"reward_std": 0.1618601270020008, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.54296875, |
|
"step": 146 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 364.5, |
|
"epoch": 0.38837516512549536, |
|
"grad_norm": 5.250813961029053, |
|
"kl": 2.73046875, |
|
"learning_rate": 7.904195049528497e-07, |
|
"loss": 0.4228, |
|
"reward": 1.09765625, |
|
"reward_std": 0.2164350003004074, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 147 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 624.75, |
|
"epoch": 0.3910171730515192, |
|
"grad_norm": 12.445371627807617, |
|
"kl": 1.9091796875, |
|
"learning_rate": 7.8689447329481e-07, |
|
"loss": 0.5554, |
|
"reward": 1.015625, |
|
"reward_std": 0.20019326359033585, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.515625, |
|
"step": 148 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 388.015625, |
|
"epoch": 0.39365918097754293, |
|
"grad_norm": 14.60313606262207, |
|
"kl": 3.12890625, |
|
"learning_rate": 7.833492163936773e-07, |
|
"loss": 0.2208, |
|
"reward": 1.01953125, |
|
"reward_std": 0.15205424278974533, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 149 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 278.0625, |
|
"epoch": 0.3963011889035667, |
|
"grad_norm": 9.847626686096191, |
|
"kl": 2.1591796875, |
|
"learning_rate": 7.797840369314081e-07, |
|
"loss": 0.5313, |
|
"reward": 0.5546875, |
|
"reward_std": 0.17377189174294472, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 413.171875, |
|
"epoch": 0.3989431968295905, |
|
"grad_norm": 10.47969913482666, |
|
"kl": 3.029296875, |
|
"learning_rate": 7.761992392908791e-07, |
|
"loss": 0.391, |
|
"reward": 0.78515625, |
|
"reward_std": 0.1711183786392212, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 151 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 256.0, |
|
"epoch": 0.40158520475561427, |
|
"grad_norm": 27.210330963134766, |
|
"kl": 3.328125, |
|
"learning_rate": 7.725951295299005e-07, |
|
"loss": 0.8581, |
|
"reward": 1.56640625, |
|
"reward_std": 0.18129342049360275, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 152 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 608.375, |
|
"epoch": 0.404227212681638, |
|
"grad_norm": 7.893120765686035, |
|
"kl": 3.9453125, |
|
"learning_rate": 7.689720153550853e-07, |
|
"loss": 0.5819, |
|
"reward": 0.73828125, |
|
"reward_std": 0.13392486423254013, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.48828125, |
|
"step": 153 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 431.484375, |
|
"epoch": 0.40686922060766184, |
|
"grad_norm": 6.096236705780029, |
|
"kl": 3.18359375, |
|
"learning_rate": 7.653302060955789e-07, |
|
"loss": 0.4258, |
|
"reward": 1.078125, |
|
"reward_std": 0.20535630360245705, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 154 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 403.328125, |
|
"epoch": 0.4095112285336856, |
|
"grad_norm": 9.526097297668457, |
|
"kl": 3.87890625, |
|
"learning_rate": 7.616700126766492e-07, |
|
"loss": 0.6043, |
|
"reward": 1.05078125, |
|
"reward_std": 0.15629850327968597, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 392.953125, |
|
"epoch": 0.41215323645970936, |
|
"grad_norm": 13.829514503479004, |
|
"kl": 4.109375, |
|
"learning_rate": 7.579917475931409e-07, |
|
"loss": 0.3873, |
|
"reward": 0.52734375, |
|
"reward_std": 0.18767033517360687, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 156 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 331.265625, |
|
"epoch": 0.4147952443857332, |
|
"grad_norm": 12.386381149291992, |
|
"kl": 3.4296875, |
|
"learning_rate": 7.54295724882796e-07, |
|
"loss": 0.7169, |
|
"reward": 1.328125, |
|
"reward_std": 0.2166232354938984, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 157 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 457.0, |
|
"epoch": 0.41743725231175693, |
|
"grad_norm": 7.208274841308594, |
|
"kl": 4.09375, |
|
"learning_rate": 7.505822600994423e-07, |
|
"loss": 0.6254, |
|
"reward": 1.28515625, |
|
"reward_std": 0.17519249208271503, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 158 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 385.28125, |
|
"epoch": 0.4200792602377807, |
|
"grad_norm": 10.335708618164062, |
|
"kl": 4.54296875, |
|
"learning_rate": 7.468516702860519e-07, |
|
"loss": 0.5237, |
|
"reward": 0.51953125, |
|
"reward_std": 0.18916139006614685, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 159 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 517.96875, |
|
"epoch": 0.4227212681638045, |
|
"grad_norm": 15.89622688293457, |
|
"kl": 3.72265625, |
|
"learning_rate": 7.43104273947674e-07, |
|
"loss": 0.3898, |
|
"reward": 1.01953125, |
|
"reward_std": 0.17299087904393673, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 307.828125, |
|
"epoch": 0.42536327608982827, |
|
"grad_norm": 8.838927268981934, |
|
"kl": 2.689453125, |
|
"learning_rate": 7.393403910242418e-07, |
|
"loss": 0.4323, |
|
"reward": 1.02734375, |
|
"reward_std": 0.13064508698880672, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 161 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 431.3125, |
|
"epoch": 0.42800528401585203, |
|
"grad_norm": 15.761492729187012, |
|
"kl": 2.98828125, |
|
"learning_rate": 7.355603428632565e-07, |
|
"loss": 0.23, |
|
"reward": 1.3671875, |
|
"reward_std": 0.22000113874673843, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 162 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.75, |
|
"epoch": 0.43064729194187584, |
|
"grad_norm": 14.52424144744873, |
|
"kl": 2.109375, |
|
"learning_rate": 7.317644521923526e-07, |
|
"loss": 0.5996, |
|
"reward": 0.7578125, |
|
"reward_std": 0.1417398639023304, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 163 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 364.53125, |
|
"epoch": 0.4332892998678996, |
|
"grad_norm": 12.958600044250488, |
|
"kl": 1.623046875, |
|
"learning_rate": 7.279530430917441e-07, |
|
"loss": 0.0741, |
|
"reward": 0.796875, |
|
"reward_std": 0.1477682925760746, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 164 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 416.53125, |
|
"epoch": 0.43593130779392336, |
|
"grad_norm": 9.96493911743164, |
|
"kl": 1.7197265625, |
|
"learning_rate": 7.241264409665554e-07, |
|
"loss": 0.441, |
|
"reward": 0.82421875, |
|
"reward_std": 0.21464627608656883, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.5625, |
|
"epoch": 0.4385733157199472, |
|
"grad_norm": 7.888613224029541, |
|
"kl": 1.72265625, |
|
"learning_rate": 7.202849725190397e-07, |
|
"loss": 0.3068, |
|
"reward": 1.0078125, |
|
"reward_std": 0.17024145647883415, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 166 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.1875, |
|
"epoch": 0.44121532364597094, |
|
"grad_norm": 8.195699691772461, |
|
"kl": 1.58642578125, |
|
"learning_rate": 7.16428965720686e-07, |
|
"loss": 0.3543, |
|
"reward": 0.8046875, |
|
"reward_std": 0.2195490226149559, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 167 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 262.46875, |
|
"epoch": 0.4438573315719947, |
|
"grad_norm": 11.303885459899902, |
|
"kl": 0.970703125, |
|
"learning_rate": 7.125587497842189e-07, |
|
"loss": 0.4021, |
|
"reward": 0.80078125, |
|
"reward_std": 0.1908670738339424, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 168 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 507.84375, |
|
"epoch": 0.4464993394980185, |
|
"grad_norm": 18.99937629699707, |
|
"kl": 1.3828125, |
|
"learning_rate": 7.086746551354895e-07, |
|
"loss": 0.5214, |
|
"reward": 0.76953125, |
|
"reward_std": 0.1896660476922989, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 169 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 481.09375, |
|
"epoch": 0.44914134742404227, |
|
"grad_norm": 16.57875633239746, |
|
"kl": 1.5625, |
|
"learning_rate": 7.047770133852676e-07, |
|
"loss": 0.4899, |
|
"reward": 1.0546875, |
|
"reward_std": 0.19582437723875046, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 434.171875, |
|
"epoch": 0.45178335535006603, |
|
"grad_norm": 6.7548298835754395, |
|
"kl": 1.451171875, |
|
"learning_rate": 7.008661573009273e-07, |
|
"loss": 0.3438, |
|
"reward": 1.30078125, |
|
"reward_std": 0.1738675981760025, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 171 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 398.4375, |
|
"epoch": 0.45442536327608984, |
|
"grad_norm": 11.90649127960205, |
|
"kl": 1.791015625, |
|
"learning_rate": 6.969424207780374e-07, |
|
"loss": 0.1403, |
|
"reward": 1.3515625, |
|
"reward_std": 0.2295953370630741, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 172 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 567.515625, |
|
"epoch": 0.4570673712021136, |
|
"grad_norm": 4.553245544433594, |
|
"kl": 2.3828125, |
|
"learning_rate": 6.930061388118557e-07, |
|
"loss": 0.4131, |
|
"reward": 1.05859375, |
|
"reward_std": 0.21736154332756996, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55859375, |
|
"step": 173 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 522.15625, |
|
"epoch": 0.45970937912813736, |
|
"grad_norm": 10.5054931640625, |
|
"kl": 2.76171875, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.2456, |
|
"reward": 0.76171875, |
|
"reward_std": 0.17176654934883118, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 174 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 760.015625, |
|
"epoch": 0.4623513870541612, |
|
"grad_norm": 12.109650611877441, |
|
"kl": 4.10546875, |
|
"learning_rate": 6.850972838573888e-07, |
|
"loss": 0.4345, |
|
"reward": 0.7578125, |
|
"reward_std": 0.17381427809596062, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 670.84375, |
|
"epoch": 0.46499339498018494, |
|
"grad_norm": 9.500724792480469, |
|
"kl": 3.11328125, |
|
"learning_rate": 6.811253861001961e-07, |
|
"loss": 0.448, |
|
"reward": 0.8125, |
|
"reward_std": 0.2038702666759491, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 176 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 668.578125, |
|
"epoch": 0.4676354029062087, |
|
"grad_norm": 3.1513185501098633, |
|
"kl": 2.83984375, |
|
"learning_rate": 6.771422933042477e-07, |
|
"loss": 0.4486, |
|
"reward": 0.7734375, |
|
"reward_std": 0.19701149314641953, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 177 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.640625, |
|
"epoch": 0.4702774108322325, |
|
"grad_norm": 3.928485631942749, |
|
"kl": 2.52734375, |
|
"learning_rate": 6.731483455324374e-07, |
|
"loss": 0.4601, |
|
"reward": 0.55078125, |
|
"reward_std": 0.1819697804749012, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 178 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.734375, |
|
"epoch": 0.47291941875825627, |
|
"grad_norm": 5.9308905601501465, |
|
"kl": 2.89453125, |
|
"learning_rate": 6.691438837744191e-07, |
|
"loss": 0.5959, |
|
"reward": 1.0859375, |
|
"reward_std": 0.24082761257886887, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 179 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 318.859375, |
|
"epoch": 0.47556142668428003, |
|
"grad_norm": 8.880630493164062, |
|
"kl": 2.07373046875, |
|
"learning_rate": 6.651292499174959e-07, |
|
"loss": 0.2224, |
|
"reward": 1.0703125, |
|
"reward_std": 0.18467539176344872, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 504.203125, |
|
"epoch": 0.47820343461030385, |
|
"grad_norm": 7.29809045791626, |
|
"kl": 2.671875, |
|
"learning_rate": 6.611047867174298e-07, |
|
"loss": 0.5424, |
|
"reward": 0.796875, |
|
"reward_std": 0.19480633921921253, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 181 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 693.046875, |
|
"epoch": 0.4808454425363276, |
|
"grad_norm": 7.5113844871521, |
|
"kl": 3.078125, |
|
"learning_rate": 6.570708377691783e-07, |
|
"loss": 0.6193, |
|
"reward": 1.5859375, |
|
"reward_std": 0.2526575177907944, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 182 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 504.484375, |
|
"epoch": 0.48348745046235136, |
|
"grad_norm": 8.909899711608887, |
|
"kl": 2.7265625, |
|
"learning_rate": 6.530277474775602e-07, |
|
"loss": 0.572, |
|
"reward": 1.31640625, |
|
"reward_std": 0.20270539075136185, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 183 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 451.421875, |
|
"epoch": 0.4861294583883752, |
|
"grad_norm": 20.32670021057129, |
|
"kl": 2.5546875, |
|
"learning_rate": 6.489758610278509e-07, |
|
"loss": 0.4425, |
|
"reward": 1.08203125, |
|
"reward_std": 0.21750707924365997, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 184 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 586.625, |
|
"epoch": 0.48877146631439894, |
|
"grad_norm": 6.589134693145752, |
|
"kl": 2.375, |
|
"learning_rate": 6.449155243563114e-07, |
|
"loss": 0.4211, |
|
"reward": 0.546875, |
|
"reward_std": 0.2208508811891079, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53125, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.640625, |
|
"epoch": 0.4914134742404227, |
|
"grad_norm": 9.064754486083984, |
|
"kl": 2.705078125, |
|
"learning_rate": 6.408470841206545e-07, |
|
"loss": 0.2999, |
|
"reward": 1.015625, |
|
"reward_std": 0.10510582849383354, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.515625, |
|
"step": 186 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 758.078125, |
|
"epoch": 0.4940554821664465, |
|
"grad_norm": 14.509212493896484, |
|
"kl": 3.865234375, |
|
"learning_rate": 6.367708876704476e-07, |
|
"loss": 0.494, |
|
"reward": 1.02734375, |
|
"reward_std": 0.20098446309566498, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 187 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 387.09375, |
|
"epoch": 0.4966974900924703, |
|
"grad_norm": 14.154923439025879, |
|
"kl": 2.201171875, |
|
"learning_rate": 6.326872830174566e-07, |
|
"loss": 0.1712, |
|
"reward": 1.0859375, |
|
"reward_std": 0.19368236511945724, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 188 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 423.234375, |
|
"epoch": 0.49933949801849403, |
|
"grad_norm": 17.86855125427246, |
|
"kl": 2.376953125, |
|
"learning_rate": 6.285966188059355e-07, |
|
"loss": 0.6533, |
|
"reward": 1.09375, |
|
"reward_std": 0.2263101488351822, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 189 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 512.125, |
|
"epoch": 0.5019815059445178, |
|
"grad_norm": 8.82755184173584, |
|
"kl": 3.04296875, |
|
"learning_rate": 6.244992442828585e-07, |
|
"loss": 0.3686, |
|
"reward": 0.7734375, |
|
"reward_std": 0.1519293300807476, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.03125, |
|
"epoch": 0.5046235138705416, |
|
"grad_norm": 15.707466125488281, |
|
"kl": 3.029296875, |
|
"learning_rate": 6.203955092681039e-07, |
|
"loss": 0.3194, |
|
"reward": 1.0703125, |
|
"reward_std": 0.1986095793545246, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 191 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 768.546875, |
|
"epoch": 0.5072655217965654, |
|
"grad_norm": 11.438809394836426, |
|
"kl": 2.88671875, |
|
"learning_rate": 6.162857641245869e-07, |
|
"loss": 0.6017, |
|
"reward": 1.28125, |
|
"reward_std": 0.21250617876648903, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53125, |
|
"step": 192 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 408.859375, |
|
"epoch": 0.5099075297225891, |
|
"grad_norm": 5.250596523284912, |
|
"kl": 1.41015625, |
|
"learning_rate": 6.12170359728347e-07, |
|
"loss": 0.2562, |
|
"reward": 1.33203125, |
|
"reward_std": 0.20339645817875862, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 193 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.921875, |
|
"epoch": 0.512549537648613, |
|
"grad_norm": 8.758655548095703, |
|
"kl": 1.962890625, |
|
"learning_rate": 6.080496474385916e-07, |
|
"loss": 0.34, |
|
"reward": 0.79296875, |
|
"reward_std": 0.19175675138831139, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.54296875, |
|
"step": 194 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 426.734375, |
|
"epoch": 0.5151915455746368, |
|
"grad_norm": 13.022716522216797, |
|
"kl": 1.361328125, |
|
"learning_rate": 6.039239790676974e-07, |
|
"loss": 0.49, |
|
"reward": 1.1484375, |
|
"reward_std": 0.2307521291077137, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6484375, |
|
"step": 195 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.953125, |
|
"epoch": 0.5178335535006605, |
|
"grad_norm": 14.834174156188965, |
|
"kl": 2.318359375, |
|
"learning_rate": 5.997937068511754e-07, |
|
"loss": 0.1528, |
|
"reward": 1.06640625, |
|
"reward_std": 0.14010578021407127, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 196 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 568.75, |
|
"epoch": 0.5204755614266843, |
|
"grad_norm": 10.123536109924316, |
|
"kl": 2.12109375, |
|
"learning_rate": 5.956591834175964e-07, |
|
"loss": 0.5013, |
|
"reward": 1.31640625, |
|
"reward_std": 0.21957488358020782, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 197 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 658.484375, |
|
"epoch": 0.523117569352708, |
|
"grad_norm": 6.424520015716553, |
|
"kl": 3.1796875, |
|
"learning_rate": 5.915207617584858e-07, |
|
"loss": 0.4787, |
|
"reward": 1.3125, |
|
"reward_std": 0.22040452808141708, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 198 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.515625, |
|
"epoch": 0.5257595772787318, |
|
"grad_norm": 5.053133010864258, |
|
"kl": 2.666015625, |
|
"learning_rate": 5.873787951981868e-07, |
|
"loss": 0.4661, |
|
"reward": 0.75390625, |
|
"reward_std": 0.17793777957558632, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.50390625, |
|
"step": 199 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 339.03125, |
|
"epoch": 0.5284015852047557, |
|
"grad_norm": 4.2198944091796875, |
|
"kl": 1.740234375, |
|
"learning_rate": 5.832336373636933e-07, |
|
"loss": 0.3366, |
|
"reward": 1.28515625, |
|
"reward_std": 0.17389780096709728, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 468.40625, |
|
"epoch": 0.5310435931307794, |
|
"grad_norm": 2.89648175239563, |
|
"kl": 1.6396484375, |
|
"learning_rate": 5.790856421544598e-07, |
|
"loss": 0.3048, |
|
"reward": 1.5859375, |
|
"reward_std": 0.19600137695670128, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 201 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 475.765625, |
|
"epoch": 0.5336856010568032, |
|
"grad_norm": 6.781806468963623, |
|
"kl": 2.189453125, |
|
"learning_rate": 5.749351637121865e-07, |
|
"loss": 0.3492, |
|
"reward": 0.828125, |
|
"reward_std": 0.20571819692850113, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 202 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 503.40625, |
|
"epoch": 0.5363276089828269, |
|
"grad_norm": 3.5012331008911133, |
|
"kl": 2.72265625, |
|
"learning_rate": 5.707825563905828e-07, |
|
"loss": 0.4152, |
|
"reward": 1.30078125, |
|
"reward_std": 0.17533257603645325, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 203 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 374.390625, |
|
"epoch": 0.5389696169088507, |
|
"grad_norm": 16.517194747924805, |
|
"kl": 1.6416015625, |
|
"learning_rate": 5.666281747251153e-07, |
|
"loss": 0.4345, |
|
"reward": 1.2890625, |
|
"reward_std": 0.18729007616639137, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5390625, |
|
"step": 204 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 344.546875, |
|
"epoch": 0.5416116248348745, |
|
"grad_norm": 4.214947700500488, |
|
"kl": 1.6279296875, |
|
"learning_rate": 5.624723734027373e-07, |
|
"loss": 0.3469, |
|
"reward": 1.01171875, |
|
"reward_std": 0.1350011769682169, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 640.53125, |
|
"epoch": 0.5442536327608983, |
|
"grad_norm": 4.432642936706543, |
|
"kl": 2.634765625, |
|
"learning_rate": 5.583155072316085e-07, |
|
"loss": 0.3449, |
|
"reward": 1.01953125, |
|
"reward_std": 0.14237725362181664, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 206 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 346.4375, |
|
"epoch": 0.5468956406869221, |
|
"grad_norm": 6.426868915557861, |
|
"kl": 2.21875, |
|
"learning_rate": 5.541579311108009e-07, |
|
"loss": 0.4081, |
|
"reward": 1.33203125, |
|
"reward_std": 0.20600395277142525, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 207 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.890625, |
|
"epoch": 0.5495376486129459, |
|
"grad_norm": 9.497568130493164, |
|
"kl": 2.8671875, |
|
"learning_rate": 5.5e-07, |
|
"loss": 0.594, |
|
"reward": 1.0390625, |
|
"reward_std": 0.2189657799899578, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5390625, |
|
"step": 208 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 587.703125, |
|
"epoch": 0.5521796565389696, |
|
"grad_norm": 2.5981221199035645, |
|
"kl": 2.576171875, |
|
"learning_rate": 5.458420688891992e-07, |
|
"loss": 0.3634, |
|
"reward": 1.34765625, |
|
"reward_std": 0.2173020839691162, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 209 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 251.59375, |
|
"epoch": 0.5548216644649934, |
|
"grad_norm": 12.541109085083008, |
|
"kl": 1.94140625, |
|
"learning_rate": 5.416844927683916e-07, |
|
"loss": 0.482, |
|
"reward": 1.33984375, |
|
"reward_std": 0.22426774725317955, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 353.359375, |
|
"epoch": 0.5574636723910171, |
|
"grad_norm": 21.176788330078125, |
|
"kl": 2.33203125, |
|
"learning_rate": 5.375276265972627e-07, |
|
"loss": 0.2879, |
|
"reward": 1.05078125, |
|
"reward_std": 0.18691154941916466, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 211 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 407.1875, |
|
"epoch": 0.560105680317041, |
|
"grad_norm": 4.283320903778076, |
|
"kl": 2.701171875, |
|
"learning_rate": 5.333718252748849e-07, |
|
"loss": 0.3272, |
|
"reward": 1.5546875, |
|
"reward_std": 0.1786573100835085, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 212 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 391.40625, |
|
"epoch": 0.5627476882430648, |
|
"grad_norm": 7.3552470207214355, |
|
"kl": 2.76953125, |
|
"learning_rate": 5.292174436094172e-07, |
|
"loss": 0.4091, |
|
"reward": 1.05859375, |
|
"reward_std": 0.19953873381018639, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55859375, |
|
"step": 213 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.421875, |
|
"epoch": 0.5653896961690885, |
|
"grad_norm": 7.531975746154785, |
|
"kl": 4.07421875, |
|
"learning_rate": 5.250648362878135e-07, |
|
"loss": 0.6474, |
|
"reward": 1.3359375, |
|
"reward_std": 0.22002986446022987, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 214 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 442.046875, |
|
"epoch": 0.5680317040951123, |
|
"grad_norm": 9.658491134643555, |
|
"kl": 2.96875, |
|
"learning_rate": 5.209143578455401e-07, |
|
"loss": 0.3931, |
|
"reward": 1.31640625, |
|
"reward_std": 0.21046040952205658, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 323.140625, |
|
"epoch": 0.570673712021136, |
|
"grad_norm": 16.756044387817383, |
|
"kl": 2.46484375, |
|
"learning_rate": 5.167663626363066e-07, |
|
"loss": 0.1497, |
|
"reward": 1.328125, |
|
"reward_std": 0.19799000024795532, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 216 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 485.28125, |
|
"epoch": 0.5733157199471598, |
|
"grad_norm": 14.802947998046875, |
|
"kl": 2.94921875, |
|
"learning_rate": 5.126212048018133e-07, |
|
"loss": 0.3226, |
|
"reward": 0.5546875, |
|
"reward_std": 0.17373281717300415, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 217 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 553.296875, |
|
"epoch": 0.5759577278731837, |
|
"grad_norm": 6.547313213348389, |
|
"kl": 3.35546875, |
|
"learning_rate": 5.084792382415141e-07, |
|
"loss": 0.7209, |
|
"reward": 0.5703125, |
|
"reward_std": 0.20446551591157913, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 218 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.078125, |
|
"epoch": 0.5785997357992074, |
|
"grad_norm": 7.502042293548584, |
|
"kl": 2.875, |
|
"learning_rate": 5.043408165824037e-07, |
|
"loss": 0.522, |
|
"reward": 1.07421875, |
|
"reward_std": 0.2559613697230816, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 219 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 291.125, |
|
"epoch": 0.5812417437252312, |
|
"grad_norm": 9.088134765625, |
|
"kl": 1.806640625, |
|
"learning_rate": 5.002062931488247e-07, |
|
"loss": 0.5338, |
|
"reward": 0.8046875, |
|
"reward_std": 0.18990932404994965, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 533.640625, |
|
"epoch": 0.583883751651255, |
|
"grad_norm": 11.220687866210938, |
|
"kl": 2.3984375, |
|
"learning_rate": 4.960760209323026e-07, |
|
"loss": 0.6041, |
|
"reward": 0.5234375, |
|
"reward_std": 0.19436774030327797, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 221 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.890625, |
|
"epoch": 0.5865257595772787, |
|
"grad_norm": 6.296652317047119, |
|
"kl": 3.07421875, |
|
"learning_rate": 4.919503525614086e-07, |
|
"loss": 0.5521, |
|
"reward": 0.76953125, |
|
"reward_std": 0.18084516376256943, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51953125, |
|
"step": 222 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 265.609375, |
|
"epoch": 0.5891677675033025, |
|
"grad_norm": 3.475614309310913, |
|
"kl": 1.50390625, |
|
"learning_rate": 4.878296402716531e-07, |
|
"loss": 0.2643, |
|
"reward": 1.38671875, |
|
"reward_std": 0.20747815072536469, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.63671875, |
|
"step": 223 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 714.1875, |
|
"epoch": 0.5918097754293263, |
|
"grad_norm": 6.395312786102295, |
|
"kl": 3.357421875, |
|
"learning_rate": 4.837142358754131e-07, |
|
"loss": 0.6176, |
|
"reward": 1.2734375, |
|
"reward_std": 0.21194355189800262, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 224 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 414.859375, |
|
"epoch": 0.5944517833553501, |
|
"grad_norm": 6.891757488250732, |
|
"kl": 2.8984375, |
|
"learning_rate": 4.79604490731896e-07, |
|
"loss": 0.42, |
|
"reward": 1.06640625, |
|
"reward_std": 0.2256414033472538, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.984375, |
|
"epoch": 0.5970937912813739, |
|
"grad_norm": 7.385695934295654, |
|
"kl": 3.4140625, |
|
"learning_rate": 4.755007557171414e-07, |
|
"loss": 0.6208, |
|
"reward": 1.05078125, |
|
"reward_std": 0.19489648565649986, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 226 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 346.296875, |
|
"epoch": 0.5997357992073976, |
|
"grad_norm": 5.268566608428955, |
|
"kl": 2.427734375, |
|
"learning_rate": 4.7140338119406455e-07, |
|
"loss": 0.3306, |
|
"reward": 1.109375, |
|
"reward_std": 0.22719038277864456, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 227 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.625, |
|
"epoch": 0.6023778071334214, |
|
"grad_norm": 11.538866996765137, |
|
"kl": 2.423828125, |
|
"learning_rate": 4.6731271698254326e-07, |
|
"loss": 0.664, |
|
"reward": 1.109375, |
|
"reward_std": 0.21347813308238983, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 228 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 398.4375, |
|
"epoch": 0.6050198150594451, |
|
"grad_norm": 10.027405738830566, |
|
"kl": 2.166015625, |
|
"learning_rate": 4.632291123295524e-07, |
|
"loss": 0.3504, |
|
"reward": 1.3125, |
|
"reward_std": 0.2073436863720417, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 229 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 778.328125, |
|
"epoch": 0.607661822985469, |
|
"grad_norm": 8.903005599975586, |
|
"kl": 4.5234375, |
|
"learning_rate": 4.5915291587934547e-07, |
|
"loss": 0.6184, |
|
"reward": 1.0234375, |
|
"reward_std": 0.21458512544631958, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.296875, |
|
"epoch": 0.6103038309114928, |
|
"grad_norm": 19.55433464050293, |
|
"kl": 4.23046875, |
|
"learning_rate": 4.5508447564368856e-07, |
|
"loss": 0.6321, |
|
"reward": 1.33984375, |
|
"reward_std": 0.22301983460783958, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 231 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 686.265625, |
|
"epoch": 0.6129458388375165, |
|
"grad_norm": 6.192388534545898, |
|
"kl": 3.7890625, |
|
"learning_rate": 4.510241389721493e-07, |
|
"loss": 0.5918, |
|
"reward": 1.5859375, |
|
"reward_std": 0.2616988569498062, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 232 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 540.578125, |
|
"epoch": 0.6155878467635403, |
|
"grad_norm": 7.43271017074585, |
|
"kl": 3.13671875, |
|
"learning_rate": 4.4697225252243976e-07, |
|
"loss": 0.6237, |
|
"reward": 1.3515625, |
|
"reward_std": 0.24065708369016647, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 233 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.265625, |
|
"epoch": 0.618229854689564, |
|
"grad_norm": 7.898358345031738, |
|
"kl": 2.81640625, |
|
"learning_rate": 4.4292916223082165e-07, |
|
"loss": 0.5285, |
|
"reward": 1.3046875, |
|
"reward_std": 0.2356991246342659, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 234 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 493.28125, |
|
"epoch": 0.6208718626155878, |
|
"grad_norm": 10.038056373596191, |
|
"kl": 2.90234375, |
|
"learning_rate": 4.388952132825701e-07, |
|
"loss": 0.2489, |
|
"reward": 1.140625, |
|
"reward_std": 0.2295135334134102, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.640625, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 431.984375, |
|
"epoch": 0.6235138705416117, |
|
"grad_norm": 4.178317546844482, |
|
"kl": 2.68359375, |
|
"learning_rate": 4.3487075008250397e-07, |
|
"loss": 0.4859, |
|
"reward": 0.79296875, |
|
"reward_std": 0.2021397091448307, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.54296875, |
|
"step": 236 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 698.25, |
|
"epoch": 0.6261558784676354, |
|
"grad_norm": 7.887820243835449, |
|
"kl": 4.2421875, |
|
"learning_rate": 4.3085611622558084e-07, |
|
"loss": 0.6169, |
|
"reward": 1.28125, |
|
"reward_std": 0.21125948429107666, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53125, |
|
"step": 237 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.328125, |
|
"epoch": 0.6287978863936592, |
|
"grad_norm": 5.685881614685059, |
|
"kl": 2.59375, |
|
"learning_rate": 4.268516544675628e-07, |
|
"loss": 0.3334, |
|
"reward": 1.0625, |
|
"reward_std": 0.20200148969888687, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 238 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 414.75, |
|
"epoch": 0.631439894319683, |
|
"grad_norm": 11.868870735168457, |
|
"kl": 2.0859375, |
|
"learning_rate": 4.228577066957522e-07, |
|
"loss": 0.1258, |
|
"reward": 1.3671875, |
|
"reward_std": 0.22833861783146858, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 239 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 599.859375, |
|
"epoch": 0.6340819022457067, |
|
"grad_norm": 5.297094345092773, |
|
"kl": 2.8125, |
|
"learning_rate": 4.1887461389980394e-07, |
|
"loss": 0.3444, |
|
"reward": 1.046875, |
|
"reward_std": 0.22738776728510857, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 460.609375, |
|
"epoch": 0.6367239101717305, |
|
"grad_norm": 9.069931983947754, |
|
"kl": 2.166015625, |
|
"learning_rate": 4.149027161426113e-07, |
|
"loss": 0.5227, |
|
"reward": 1.34375, |
|
"reward_std": 0.21560321748256683, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 241 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 753.9375, |
|
"epoch": 0.6393659180977543, |
|
"grad_norm": 3.11356258392334, |
|
"kl": 2.849609375, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.4795, |
|
"reward": 1.046875, |
|
"reward_std": 0.20162740349769592, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 242 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 626.859375, |
|
"epoch": 0.6420079260237781, |
|
"grad_norm": 4.849280834197998, |
|
"kl": 2.39453125, |
|
"learning_rate": 4.069938611881443e-07, |
|
"loss": 0.5037, |
|
"reward": 0.796875, |
|
"reward_std": 0.18199804052710533, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 243 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 552.296875, |
|
"epoch": 0.6446499339498019, |
|
"grad_norm": 5.1860456466674805, |
|
"kl": 2.4404296875, |
|
"learning_rate": 4.030575792219626e-07, |
|
"loss": 0.3665, |
|
"reward": 1.296875, |
|
"reward_std": 0.1943066604435444, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 244 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 638.5625, |
|
"epoch": 0.6472919418758256, |
|
"grad_norm": 9.586490631103516, |
|
"kl": 2.599609375, |
|
"learning_rate": 3.9913384269907293e-07, |
|
"loss": 0.2958, |
|
"reward": 1.33203125, |
|
"reward_std": 0.22680200263857841, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 608.640625, |
|
"epoch": 0.6499339498018494, |
|
"grad_norm": 7.131601810455322, |
|
"kl": 2.166015625, |
|
"learning_rate": 3.952229866147323e-07, |
|
"loss": 0.2385, |
|
"reward": 1.375, |
|
"reward_std": 0.2418774701654911, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.625, |
|
"step": 246 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 661.015625, |
|
"epoch": 0.6525759577278731, |
|
"grad_norm": 5.848790645599365, |
|
"kl": 2.306640625, |
|
"learning_rate": 3.913253448645103e-07, |
|
"loss": 0.4711, |
|
"reward": 1.08203125, |
|
"reward_std": 0.22584940120577812, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 247 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 531.96875, |
|
"epoch": 0.655217965653897, |
|
"grad_norm": 5.778437614440918, |
|
"kl": 1.859375, |
|
"learning_rate": 3.8744125021578123e-07, |
|
"loss": 0.3466, |
|
"reward": 1.2734375, |
|
"reward_std": 0.1622530035674572, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 248 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 525.40625, |
|
"epoch": 0.6578599735799208, |
|
"grad_norm": 3.1933047771453857, |
|
"kl": 1.833984375, |
|
"learning_rate": 3.835710342793139e-07, |
|
"loss": 0.2862, |
|
"reward": 1.30078125, |
|
"reward_std": 0.15551739931106567, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 249 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 480.75, |
|
"epoch": 0.6605019815059445, |
|
"grad_norm": 8.949792861938477, |
|
"kl": 1.197265625, |
|
"learning_rate": 3.797150274809604e-07, |
|
"loss": 0.326, |
|
"reward": 1.3359375, |
|
"reward_std": 0.2217497080564499, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 428.203125, |
|
"epoch": 0.6631439894319683, |
|
"grad_norm": 3.1499345302581787, |
|
"kl": 1.2763671875, |
|
"learning_rate": 3.7587355903344466e-07, |
|
"loss": 0.1597, |
|
"reward": 0.875, |
|
"reward_std": 0.21982388943433762, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.625, |
|
"step": 251 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 369.6875, |
|
"epoch": 0.665785997357992, |
|
"grad_norm": 4.168592929840088, |
|
"kl": 1.3583984375, |
|
"learning_rate": 3.7204695690825593e-07, |
|
"loss": 0.1939, |
|
"reward": 1.28125, |
|
"reward_std": 0.1477414984256029, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53125, |
|
"step": 252 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 544.015625, |
|
"epoch": 0.6684280052840158, |
|
"grad_norm": 7.520803451538086, |
|
"kl": 1.921875, |
|
"learning_rate": 3.682355478076473e-07, |
|
"loss": 0.2638, |
|
"reward": 0.82421875, |
|
"reward_std": 0.2656807042658329, |
|
"rewards/accuracy_reward": 0.265625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55859375, |
|
"step": 253 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 561.34375, |
|
"epoch": 0.6710700132100397, |
|
"grad_norm": 6.172038555145264, |
|
"kl": 2.318359375, |
|
"learning_rate": 3.6443965713674354e-07, |
|
"loss": 0.3545, |
|
"reward": 1.02734375, |
|
"reward_std": 0.19002593867480755, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 254 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.8125, |
|
"epoch": 0.6737120211360634, |
|
"grad_norm": 6.321176528930664, |
|
"kl": 1.609375, |
|
"learning_rate": 3.606596089757583e-07, |
|
"loss": 0.3466, |
|
"reward": 1.58984375, |
|
"reward_std": 0.2514568492770195, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 307.46875, |
|
"epoch": 0.6763540290620872, |
|
"grad_norm": 4.846172332763672, |
|
"kl": 1.09765625, |
|
"learning_rate": 3.5689572605232597e-07, |
|
"loss": 0.2335, |
|
"reward": 1.3359375, |
|
"reward_std": 0.20273161679506302, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 256 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 468.5, |
|
"epoch": 0.678996036988111, |
|
"grad_norm": 12.14126968383789, |
|
"kl": 1.138671875, |
|
"learning_rate": 3.531483297139481e-07, |
|
"loss": 0.1721, |
|
"reward": 0.80078125, |
|
"reward_std": 0.1630447916686535, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 257 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 565.203125, |
|
"epoch": 0.6816380449141347, |
|
"grad_norm": 3.9592182636260986, |
|
"kl": 1.837890625, |
|
"learning_rate": 3.4941773990055777e-07, |
|
"loss": 0.2977, |
|
"reward": 1.10546875, |
|
"reward_std": 0.25015248730778694, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.60546875, |
|
"step": 258 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 480.484375, |
|
"epoch": 0.6842800528401585, |
|
"grad_norm": 9.579623222351074, |
|
"kl": 1.62109375, |
|
"learning_rate": 3.45704275117204e-07, |
|
"loss": 0.4312, |
|
"reward": 1.08203125, |
|
"reward_std": 0.24054544791579247, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 259 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.796875, |
|
"epoch": 0.6869220607661823, |
|
"grad_norm": 4.918056964874268, |
|
"kl": 1.14306640625, |
|
"learning_rate": 3.4200825240685914e-07, |
|
"loss": 0.1878, |
|
"reward": 1.1015625, |
|
"reward_std": 0.22064152732491493, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 515.96875, |
|
"epoch": 0.6895640686922061, |
|
"grad_norm": 11.338505744934082, |
|
"kl": 1.765625, |
|
"learning_rate": 3.3832998732335085e-07, |
|
"loss": 0.4868, |
|
"reward": 1.0859375, |
|
"reward_std": 0.21507646515965462, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 261 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.875, |
|
"epoch": 0.6922060766182299, |
|
"grad_norm": 10.862038612365723, |
|
"kl": 2.357421875, |
|
"learning_rate": 3.346697939044211e-07, |
|
"loss": 0.6303, |
|
"reward": 0.77734375, |
|
"reward_std": 0.20420579984784126, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.52734375, |
|
"step": 262 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.796875, |
|
"epoch": 0.6948480845442536, |
|
"grad_norm": 7.440125465393066, |
|
"kl": 2.716796875, |
|
"learning_rate": 3.310279846449147e-07, |
|
"loss": 0.5692, |
|
"reward": 0.83203125, |
|
"reward_std": 0.2302125133574009, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 263 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 490.625, |
|
"epoch": 0.6974900924702774, |
|
"grad_norm": 11.042434692382812, |
|
"kl": 1.890625, |
|
"learning_rate": 3.2740487047009954e-07, |
|
"loss": 0.575, |
|
"reward": 0.8203125, |
|
"reward_std": 0.21583595871925354, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 264 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.375, |
|
"epoch": 0.7001321003963011, |
|
"grad_norm": 9.307427406311035, |
|
"kl": 1.8515625, |
|
"learning_rate": 3.23800760709121e-07, |
|
"loss": 0.2549, |
|
"reward": 1.0625, |
|
"reward_std": 0.19687864929437637, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.34375, |
|
"epoch": 0.702774108322325, |
|
"grad_norm": 4.253864765167236, |
|
"kl": 2.693359375, |
|
"learning_rate": 3.2021596306859195e-07, |
|
"loss": 0.4737, |
|
"reward": 0.8125, |
|
"reward_std": 0.1992315910756588, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 266 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 648.3125, |
|
"epoch": 0.7054161162483488, |
|
"grad_norm": 7.490243911743164, |
|
"kl": 3.2275390625, |
|
"learning_rate": 3.1665078360632254e-07, |
|
"loss": 0.377, |
|
"reward": 1.078125, |
|
"reward_std": 0.22863000631332397, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 267 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 489.296875, |
|
"epoch": 0.7080581241743725, |
|
"grad_norm": 4.917722702026367, |
|
"kl": 2.056640625, |
|
"learning_rate": 3.1310552670518987e-07, |
|
"loss": 0.3075, |
|
"reward": 1.12109375, |
|
"reward_std": 0.23855430632829666, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.62109375, |
|
"step": 268 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 450.921875, |
|
"epoch": 0.7107001321003963, |
|
"grad_norm": 3.3728554248809814, |
|
"kl": 2.087890625, |
|
"learning_rate": 3.0958049504715024e-07, |
|
"loss": 0.3534, |
|
"reward": 1.07421875, |
|
"reward_std": 0.20587731152772903, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 269 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.140625, |
|
"epoch": 0.71334214002642, |
|
"grad_norm": 6.581082344055176, |
|
"kl": 2.974609375, |
|
"learning_rate": 3.0607598958739777e-07, |
|
"loss": 0.3513, |
|
"reward": 1.08203125, |
|
"reward_std": 0.21218016743659973, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 666.0625, |
|
"epoch": 0.7159841479524438, |
|
"grad_norm": 3.782729387283325, |
|
"kl": 3.47265625, |
|
"learning_rate": 3.0259230952866976e-07, |
|
"loss": 0.5161, |
|
"reward": 0.8515625, |
|
"reward_std": 0.266521442681551, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 271 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.734375, |
|
"epoch": 0.7186261558784677, |
|
"grad_norm": 12.191798210144043, |
|
"kl": 2.857421875, |
|
"learning_rate": 2.991297522957015e-07, |
|
"loss": 0.257, |
|
"reward": 1.05859375, |
|
"reward_std": 0.1889869049191475, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55859375, |
|
"step": 272 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.515625, |
|
"epoch": 0.7212681638044914, |
|
"grad_norm": 5.739687442779541, |
|
"kl": 2.828125, |
|
"learning_rate": 2.9568861350983365e-07, |
|
"loss": 0.3424, |
|
"reward": 0.578125, |
|
"reward_std": 0.20889347046613693, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 273 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 606.921875, |
|
"epoch": 0.7239101717305152, |
|
"grad_norm": 8.41596794128418, |
|
"kl": 2.6015625, |
|
"learning_rate": 2.922691869637727e-07, |
|
"loss": 0.2616, |
|
"reward": 1.1171875, |
|
"reward_std": 0.24007226526737213, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 274 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 491.59375, |
|
"epoch": 0.726552179656539, |
|
"grad_norm": 4.1023335456848145, |
|
"kl": 1.966796875, |
|
"learning_rate": 2.88871764596508e-07, |
|
"loss": 0.2751, |
|
"reward": 1.3515625, |
|
"reward_std": 0.2043364755809307, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 565.84375, |
|
"epoch": 0.7291941875825627, |
|
"grad_norm": 5.3786540031433105, |
|
"kl": 2.720703125, |
|
"learning_rate": 2.854966364683872e-07, |
|
"loss": 0.3457, |
|
"reward": 0.828125, |
|
"reward_std": 0.20211807265877724, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 276 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 663.78125, |
|
"epoch": 0.7318361955085865, |
|
"grad_norm": 4.460934638977051, |
|
"kl": 3.201171875, |
|
"learning_rate": 2.821440907363516e-07, |
|
"loss": 0.4525, |
|
"reward": 0.8203125, |
|
"reward_std": 0.23223434761166573, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 277 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.265625, |
|
"epoch": 0.7344782034346103, |
|
"grad_norm": 16.07205581665039, |
|
"kl": 2.826171875, |
|
"learning_rate": 2.7881441362933464e-07, |
|
"loss": 0.334, |
|
"reward": 1.0625, |
|
"reward_std": 0.19014282897114754, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 278 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 676.0625, |
|
"epoch": 0.7371202113606341, |
|
"grad_norm": 11.935088157653809, |
|
"kl": 2.81640625, |
|
"learning_rate": 2.755078894238245e-07, |
|
"loss": 0.23, |
|
"reward": 0.78515625, |
|
"reward_std": 0.20001451671123505, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 279 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 603.546875, |
|
"epoch": 0.7397622192866579, |
|
"grad_norm": 9.738125801086426, |
|
"kl": 2.033203125, |
|
"learning_rate": 2.722248004195932e-07, |
|
"loss": 0.2735, |
|
"reward": 1.09375, |
|
"reward_std": 0.20607677102088928, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 732.6875, |
|
"epoch": 0.7424042272126816, |
|
"grad_norm": 7.031618118286133, |
|
"kl": 2.41015625, |
|
"learning_rate": 2.689654269155955e-07, |
|
"loss": 0.2994, |
|
"reward": 0.82421875, |
|
"reward_std": 0.20312216132879257, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 281 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.875, |
|
"epoch": 0.7450462351387054, |
|
"grad_norm": 5.801688194274902, |
|
"kl": 1.40234375, |
|
"learning_rate": 2.657300471860372e-07, |
|
"loss": 0.2932, |
|
"reward": 1.05078125, |
|
"reward_std": 0.20492718927562237, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55078125, |
|
"step": 282 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 721.109375, |
|
"epoch": 0.7476882430647291, |
|
"grad_norm": 11.897012710571289, |
|
"kl": 2.43359375, |
|
"learning_rate": 2.625189374566175e-07, |
|
"loss": 0.5936, |
|
"reward": 0.7578125, |
|
"reward_std": 0.15211578272283077, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 283 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 641.53125, |
|
"epoch": 0.750330250990753, |
|
"grad_norm": 5.453853130340576, |
|
"kl": 1.376953125, |
|
"learning_rate": 2.593323718809458e-07, |
|
"loss": 0.3039, |
|
"reward": 1.3671875, |
|
"reward_std": 0.2303219847381115, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 284 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 596.28125, |
|
"epoch": 0.7529722589167768, |
|
"grad_norm": 5.665752410888672, |
|
"kl": 1.35546875, |
|
"learning_rate": 2.561706225171352e-07, |
|
"loss": 0.3616, |
|
"reward": 1.04296875, |
|
"reward_std": 0.17159553244709969, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.54296875, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 536.78125, |
|
"epoch": 0.7556142668428005, |
|
"grad_norm": 3.726806879043579, |
|
"kl": 1.5693359375, |
|
"learning_rate": 2.5303395930457494e-07, |
|
"loss": 0.2881, |
|
"reward": 1.3203125, |
|
"reward_std": 0.2022528052330017, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 286 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 495.984375, |
|
"epoch": 0.7582562747688243, |
|
"grad_norm": 3.6658847332000732, |
|
"kl": 1.1884765625, |
|
"learning_rate": 2.499226500408845e-07, |
|
"loss": 0.1181, |
|
"reward": 1.1171875, |
|
"reward_std": 0.1793758161365986, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 287 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 859.984375, |
|
"epoch": 0.760898282694848, |
|
"grad_norm": 4.845893383026123, |
|
"kl": 2.955078125, |
|
"learning_rate": 2.4683696035904926e-07, |
|
"loss": 0.4852, |
|
"reward": 1.0078125, |
|
"reward_std": 0.1604960411787033, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5078125, |
|
"step": 288 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 440.8125, |
|
"epoch": 0.7635402906208718, |
|
"grad_norm": 2.4910755157470703, |
|
"kl": 0.85302734375, |
|
"learning_rate": 2.437771537047423e-07, |
|
"loss": 0.3161, |
|
"reward": 1.07421875, |
|
"reward_std": 0.2174788936972618, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 289 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 669.734375, |
|
"epoch": 0.7661822985468957, |
|
"grad_norm": 4.620151519775391, |
|
"kl": 1.90234375, |
|
"learning_rate": 2.407434913138318e-07, |
|
"loss": 0.3675, |
|
"reward": 0.5859375, |
|
"reward_std": 0.22324015572667122, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 531.953125, |
|
"epoch": 0.7688243064729194, |
|
"grad_norm": 11.40556526184082, |
|
"kl": 1.4501953125, |
|
"learning_rate": 2.377362321900777e-07, |
|
"loss": 0.0233, |
|
"reward": 1.36328125, |
|
"reward_std": 0.21594615280628204, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 291 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 696.34375, |
|
"epoch": 0.7714663143989432, |
|
"grad_norm": 3.5709707736968994, |
|
"kl": 1.853515625, |
|
"learning_rate": 2.3475563308301908e-07, |
|
"loss": 0.2536, |
|
"reward": 0.84765625, |
|
"reward_std": 0.20635812729597092, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 292 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.40625, |
|
"epoch": 0.774108322324967, |
|
"grad_norm": 6.535892486572266, |
|
"kl": 1.3740234375, |
|
"learning_rate": 2.3180194846605364e-07, |
|
"loss": 0.1969, |
|
"reward": 1.1171875, |
|
"reward_std": 0.23528173938393593, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 293 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 703.734375, |
|
"epoch": 0.7767503302509907, |
|
"grad_norm": 6.631422996520996, |
|
"kl": 2.017578125, |
|
"learning_rate": 2.288754305147115e-07, |
|
"loss": 0.3918, |
|
"reward": 1.296875, |
|
"reward_std": 0.20271231979131699, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 294 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.125, |
|
"epoch": 0.7793923381770145, |
|
"grad_norm": 5.805858612060547, |
|
"kl": 1.369140625, |
|
"learning_rate": 2.259763290851255e-07, |
|
"loss": 0.3276, |
|
"reward": 1.0625, |
|
"reward_std": 0.18768509849905968, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 295 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 776.859375, |
|
"epoch": 0.7820343461030383, |
|
"grad_norm": 6.1796135902404785, |
|
"kl": 2.36328125, |
|
"learning_rate": 2.231048916926992e-07, |
|
"loss": 0.2911, |
|
"reward": 1.3203125, |
|
"reward_std": 0.2180866338312626, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 296 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.90625, |
|
"epoch": 0.7846763540290621, |
|
"grad_norm": 4.840709686279297, |
|
"kl": 1.15234375, |
|
"learning_rate": 2.2026136349097495e-07, |
|
"loss": 0.2601, |
|
"reward": 0.86328125, |
|
"reward_std": 0.21641594916582108, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 297 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 626.34375, |
|
"epoch": 0.7873183619550859, |
|
"grad_norm": 4.876105308532715, |
|
"kl": 2.0615234375, |
|
"learning_rate": 2.1744598725070347e-07, |
|
"loss": 0.403, |
|
"reward": 1.28515625, |
|
"reward_std": 0.17794826440513134, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 298 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.78125, |
|
"epoch": 0.7899603698811096, |
|
"grad_norm": 5.7457451820373535, |
|
"kl": 1.310546875, |
|
"learning_rate": 2.146590033391168e-07, |
|
"loss": 0.259, |
|
"reward": 1.32421875, |
|
"reward_std": 0.20343545079231262, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 299 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 666.5625, |
|
"epoch": 0.7926023778071334, |
|
"grad_norm": 4.766579627990723, |
|
"kl": 1.6201171875, |
|
"learning_rate": 2.11900649699407e-07, |
|
"loss": 0.1752, |
|
"reward": 1.109375, |
|
"reward_std": 0.2358247935771942, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 631.734375, |
|
"epoch": 0.7952443857331571, |
|
"grad_norm": 3.2293262481689453, |
|
"kl": 1.62890625, |
|
"learning_rate": 2.0917116183041074e-07, |
|
"loss": 0.2575, |
|
"reward": 1.33984375, |
|
"reward_std": 0.22996815666556358, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 301 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 740.484375, |
|
"epoch": 0.797886393659181, |
|
"grad_norm": 3.1481125354766846, |
|
"kl": 2.294921875, |
|
"learning_rate": 2.0647077276650366e-07, |
|
"loss": 0.3915, |
|
"reward": 0.828125, |
|
"reward_std": 0.22289753332734108, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 302 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 472.625, |
|
"epoch": 0.8005284015852048, |
|
"grad_norm": 14.101240158081055, |
|
"kl": 1.4130859375, |
|
"learning_rate": 2.037997130577045e-07, |
|
"loss": 0.5247, |
|
"reward": 0.86328125, |
|
"reward_std": 0.24362235516309738, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 303 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.9375, |
|
"epoch": 0.8031704095112285, |
|
"grad_norm": 2.720280885696411, |
|
"kl": 1.720703125, |
|
"learning_rate": 2.0115821074999156e-07, |
|
"loss": 0.2849, |
|
"reward": 1.3359375, |
|
"reward_std": 0.21295345574617386, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 304 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 592.1875, |
|
"epoch": 0.8058124174372523, |
|
"grad_norm": 4.275804042816162, |
|
"kl": 1.8828125, |
|
"learning_rate": 1.9854649136583307e-07, |
|
"loss": 0.3054, |
|
"reward": 1.09765625, |
|
"reward_std": 0.222886573523283, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 468.53125, |
|
"epoch": 0.808454425363276, |
|
"grad_norm": 5.911637306213379, |
|
"kl": 1.4951171875, |
|
"learning_rate": 1.9596477788493254e-07, |
|
"loss": 0.2116, |
|
"reward": 1.109375, |
|
"reward_std": 0.2025398500263691, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 306 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 548.859375, |
|
"epoch": 0.8110964332892999, |
|
"grad_norm": 5.387912273406982, |
|
"kl": 1.599609375, |
|
"learning_rate": 1.9341329072519176e-07, |
|
"loss": 0.351, |
|
"reward": 0.6171875, |
|
"reward_std": 0.22198385372757912, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 307 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.5625, |
|
"epoch": 0.8137384412153237, |
|
"grad_norm": 5.202173709869385, |
|
"kl": 1.78125, |
|
"learning_rate": 1.9089224772389223e-07, |
|
"loss": 0.3517, |
|
"reward": 1.09375, |
|
"reward_std": 0.23804370686411858, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 308 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 652.28125, |
|
"epoch": 0.8163804491413474, |
|
"grad_norm": 4.832318305969238, |
|
"kl": 1.6396484375, |
|
"learning_rate": 1.884018641190968e-07, |
|
"loss": 0.2776, |
|
"reward": 1.69921875, |
|
"reward_std": 0.27570171654224396, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.69921875, |
|
"step": 309 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 653.1875, |
|
"epoch": 0.8190224570673712, |
|
"grad_norm": 5.5447211265563965, |
|
"kl": 2.037109375, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.247, |
|
"reward": 1.046875, |
|
"reward_std": 0.21413858234882355, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 555.765625, |
|
"epoch": 0.821664464993395, |
|
"grad_norm": 10.55873966217041, |
|
"kl": 2.12109375, |
|
"learning_rate": 1.8351392294514326e-07, |
|
"loss": 0.4554, |
|
"reward": 1.2890625, |
|
"reward_std": 0.15378709696233273, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5390625, |
|
"step": 311 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 454.5625, |
|
"epoch": 0.8243064729194187, |
|
"grad_norm": 2.300844669342041, |
|
"kl": 1.0029296875, |
|
"learning_rate": 1.8111678269175055e-07, |
|
"loss": 0.1514, |
|
"reward": 1.11328125, |
|
"reward_std": 0.2071386780589819, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 312 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 672.796875, |
|
"epoch": 0.8269484808454426, |
|
"grad_norm": 5.112921237945557, |
|
"kl": 2.4970703125, |
|
"learning_rate": 1.78751136430764e-07, |
|
"loss": 0.4767, |
|
"reward": 1.078125, |
|
"reward_std": 0.20955145359039307, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 313 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 444.484375, |
|
"epoch": 0.8295904887714664, |
|
"grad_norm": 4.7589569091796875, |
|
"kl": 1.42578125, |
|
"learning_rate": 1.7641718613300228e-07, |
|
"loss": 0.2688, |
|
"reward": 0.640625, |
|
"reward_std": 0.238662201911211, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.640625, |
|
"step": 314 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 814.015625, |
|
"epoch": 0.8322324966974901, |
|
"grad_norm": 10.08535385131836, |
|
"kl": 3.3828125, |
|
"learning_rate": 1.7411513106319058e-07, |
|
"loss": 0.3937, |
|
"reward": 0.78125, |
|
"reward_std": 0.20346562936902046, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53125, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 613.03125, |
|
"epoch": 0.8348745046235139, |
|
"grad_norm": 12.75075912475586, |
|
"kl": 2.302734375, |
|
"learning_rate": 1.7184516776294832e-07, |
|
"loss": 0.2161, |
|
"reward": 0.8828125, |
|
"reward_std": 0.26399971544742584, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.015625, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 316 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 529.59375, |
|
"epoch": 0.8375165125495376, |
|
"grad_norm": 9.653738975524902, |
|
"kl": 1.8046875, |
|
"learning_rate": 1.6960749003400892e-07, |
|
"loss": 0.1588, |
|
"reward": 0.84375, |
|
"reward_std": 0.16583861783146858, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 317 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 583.625, |
|
"epoch": 0.8401585204755614, |
|
"grad_norm": 4.075193405151367, |
|
"kl": 1.640625, |
|
"learning_rate": 1.674022889216737e-07, |
|
"loss": 0.1898, |
|
"reward": 1.3125, |
|
"reward_std": 0.1740352250635624, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 318 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 669.0625, |
|
"epoch": 0.8428005284015853, |
|
"grad_norm": 4.472336292266846, |
|
"kl": 2.494140625, |
|
"learning_rate": 1.6522975269850104e-07, |
|
"loss": 0.3193, |
|
"reward": 0.85546875, |
|
"reward_std": 0.21766092255711555, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.60546875, |
|
"step": 319 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 721.015625, |
|
"epoch": 0.845442536327609, |
|
"grad_norm": 6.250655174255371, |
|
"kl": 3.150390625, |
|
"learning_rate": 1.6309006684823239e-07, |
|
"loss": 0.5334, |
|
"reward": 1.0234375, |
|
"reward_std": 0.1688866000622511, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 675.921875, |
|
"epoch": 0.8480845442536328, |
|
"grad_norm": 1.8639191389083862, |
|
"kl": 2.427734375, |
|
"learning_rate": 1.6098341404995647e-07, |
|
"loss": 0.3932, |
|
"reward": 0.62890625, |
|
"reward_std": 0.24960599094629288, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.62890625, |
|
"step": 321 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 600.421875, |
|
"epoch": 0.8507265521796565, |
|
"grad_norm": 4.137293338775635, |
|
"kl": 2.146484375, |
|
"learning_rate": 1.5890997416251224e-07, |
|
"loss": 0.351, |
|
"reward": 1.04296875, |
|
"reward_std": 0.1972101591527462, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.54296875, |
|
"step": 322 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 565.4375, |
|
"epoch": 0.8533685601056803, |
|
"grad_norm": 10.063258171081543, |
|
"kl": 1.478515625, |
|
"learning_rate": 1.5686992420913372e-07, |
|
"loss": 0.0225, |
|
"reward": 0.86328125, |
|
"reward_std": 0.2034553661942482, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 323 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 647.328125, |
|
"epoch": 0.8560105680317041, |
|
"grad_norm": 9.994471549987793, |
|
"kl": 2.05859375, |
|
"learning_rate": 1.5486343836233595e-07, |
|
"loss": 0.2504, |
|
"reward": 1.328125, |
|
"reward_std": 0.21247531473636627, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 324 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 690.234375, |
|
"epoch": 0.8586525759577279, |
|
"grad_norm": 9.103864669799805, |
|
"kl": 2.4921875, |
|
"learning_rate": 1.5289068792904495e-07, |
|
"loss": 0.483, |
|
"reward": 0.82421875, |
|
"reward_std": 0.2072843722999096, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 732.96875, |
|
"epoch": 0.8612945838837517, |
|
"grad_norm": 7.12535285949707, |
|
"kl": 1.994140625, |
|
"learning_rate": 1.5095184133597217e-07, |
|
"loss": 0.4435, |
|
"reward": 1.08984375, |
|
"reward_std": 0.2667161263525486, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 326 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 402.9375, |
|
"epoch": 0.8639365918097754, |
|
"grad_norm": 12.984781265258789, |
|
"kl": 1.0556640625, |
|
"learning_rate": 1.4904706411523448e-07, |
|
"loss": 0.3994, |
|
"reward": 1.32421875, |
|
"reward_std": 0.18335551768541336, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 327 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 828.71875, |
|
"epoch": 0.8665785997357992, |
|
"grad_norm": 12.132417678833008, |
|
"kl": 1.8466796875, |
|
"learning_rate": 1.47176518890222e-07, |
|
"loss": 0.182, |
|
"reward": 1.0390625, |
|
"reward_std": 0.16892226040363312, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5390625, |
|
"step": 328 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 822.609375, |
|
"epoch": 0.869220607661823, |
|
"grad_norm": 4.648046493530273, |
|
"kl": 2.0146484375, |
|
"learning_rate": 1.453403653617135e-07, |
|
"loss": 0.4329, |
|
"reward": 0.796875, |
|
"reward_std": 0.20767118781805038, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.546875, |
|
"step": 329 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 590.765625, |
|
"epoch": 0.8718626155878467, |
|
"grad_norm": 3.794019937515259, |
|
"kl": 1.7001953125, |
|
"learning_rate": 1.4353876029424202e-07, |
|
"loss": 0.371, |
|
"reward": 1.09375, |
|
"reward_std": 0.216283418238163, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 712.96875, |
|
"epoch": 0.8745046235138706, |
|
"grad_norm": 5.229684352874756, |
|
"kl": 2.5732421875, |
|
"learning_rate": 1.4177185750271055e-07, |
|
"loss": 0.3925, |
|
"reward": 1.09375, |
|
"reward_std": 0.23571135476231575, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 331 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 646.328125, |
|
"epoch": 0.8771466314398944, |
|
"grad_norm": 5.142683506011963, |
|
"kl": 2.0380859375, |
|
"learning_rate": 1.400398078392602e-07, |
|
"loss": 0.4217, |
|
"reward": 0.828125, |
|
"reward_std": 0.2310670204460621, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 332 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.5, |
|
"epoch": 0.8797886393659181, |
|
"grad_norm": 9.393284797668457, |
|
"kl": 1.46875, |
|
"learning_rate": 1.3834275918039055e-07, |
|
"loss": 0.3297, |
|
"reward": 1.33984375, |
|
"reward_std": 0.18817520886659622, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 333 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.625, |
|
"epoch": 0.8824306472919419, |
|
"grad_norm": 6.900231838226318, |
|
"kl": 1.951171875, |
|
"learning_rate": 1.3668085641433462e-07, |
|
"loss": 0.2931, |
|
"reward": 0.86328125, |
|
"reward_std": 0.2518454007804394, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 334 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 430.796875, |
|
"epoch": 0.8850726552179656, |
|
"grad_norm": 9.600037574768066, |
|
"kl": 1.091796875, |
|
"learning_rate": 1.3505424142868897e-07, |
|
"loss": 0.3829, |
|
"reward": 1.41796875, |
|
"reward_std": 0.23616278544068336, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.66796875, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 631.875, |
|
"epoch": 0.8877146631439894, |
|
"grad_norm": 5.003634929656982, |
|
"kl": 1.6171875, |
|
"learning_rate": 1.334630530982997e-07, |
|
"loss": 0.2516, |
|
"reward": 1.3046875, |
|
"reward_std": 0.21555107831954956, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5546875, |
|
"step": 336 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 614.03125, |
|
"epoch": 0.8903566710700133, |
|
"grad_norm": 16.881690979003906, |
|
"kl": 1.8984375, |
|
"learning_rate": 1.319074272734056e-07, |
|
"loss": 0.0975, |
|
"reward": 1.08984375, |
|
"reward_std": 0.19282393157482147, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 337 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 666.0, |
|
"epoch": 0.892998678996037, |
|
"grad_norm": 5.620565414428711, |
|
"kl": 2.3154296875, |
|
"learning_rate": 1.303874967680399e-07, |
|
"loss": 0.2757, |
|
"reward": 1.62109375, |
|
"reward_std": 0.2326289601624012, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.62109375, |
|
"step": 338 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.203125, |
|
"epoch": 0.8956406869220608, |
|
"grad_norm": 5.114979267120361, |
|
"kl": 1.1298828125, |
|
"learning_rate": 1.289033913486914e-07, |
|
"loss": 0.1405, |
|
"reward": 1.0703125, |
|
"reward_std": 0.1810067780315876, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 339 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 713.8125, |
|
"epoch": 0.8982826948480845, |
|
"grad_norm": 3.9009175300598145, |
|
"kl": 2.587890625, |
|
"learning_rate": 1.2745523772322461e-07, |
|
"loss": 0.4324, |
|
"reward": 1.31640625, |
|
"reward_std": 0.1788315549492836, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 642.796875, |
|
"epoch": 0.9009247027741083, |
|
"grad_norm": 5.570927619934082, |
|
"kl": 1.9873046875, |
|
"learning_rate": 1.2604315953006266e-07, |
|
"loss": 0.34, |
|
"reward": 0.86328125, |
|
"reward_std": 0.24456297606229782, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 341 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 637.578125, |
|
"epoch": 0.9035667107001321, |
|
"grad_norm": 8.186066627502441, |
|
"kl": 1.923828125, |
|
"learning_rate": 1.2466727732763125e-07, |
|
"loss": 0.4781, |
|
"reward": 0.8671875, |
|
"reward_std": 0.23449090123176575, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6171875, |
|
"step": 342 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 775.796875, |
|
"epoch": 0.9062087186261559, |
|
"grad_norm": 5.553122043609619, |
|
"kl": 3.125, |
|
"learning_rate": 1.2332770858406538e-07, |
|
"loss": 0.5849, |
|
"reward": 0.78515625, |
|
"reward_std": 0.21501468122005463, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.53515625, |
|
"step": 343 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 445.453125, |
|
"epoch": 0.9088507265521797, |
|
"grad_norm": 4.708739757537842, |
|
"kl": 1.2822265625, |
|
"learning_rate": 1.220245676671809e-07, |
|
"loss": 0.1695, |
|
"reward": 1.078125, |
|
"reward_std": 0.15526169911026955, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.578125, |
|
"step": 344 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 752.78125, |
|
"epoch": 0.9114927344782034, |
|
"grad_norm": 3.9118199348449707, |
|
"kl": 1.9716796875, |
|
"learning_rate": 1.2075796583470984e-07, |
|
"loss": 0.3416, |
|
"reward": 1.06640625, |
|
"reward_std": 0.21211567521095276, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.375, |
|
"epoch": 0.9141347424042272, |
|
"grad_norm": 5.419198513031006, |
|
"kl": 2.326171875, |
|
"learning_rate": 1.1952801122480167e-07, |
|
"loss": 0.2937, |
|
"reward": 0.59765625, |
|
"reward_std": 0.2001628838479519, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 346 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 662.203125, |
|
"epoch": 0.916776750330251, |
|
"grad_norm": 10.185606002807617, |
|
"kl": 2.2119140625, |
|
"learning_rate": 1.183348088467908e-07, |
|
"loss": 0.2272, |
|
"reward": 1.01171875, |
|
"reward_std": 0.15968638472259045, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.51171875, |
|
"step": 347 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 476.359375, |
|
"epoch": 0.9194187582562747, |
|
"grad_norm": 5.287563323974609, |
|
"kl": 1.537109375, |
|
"learning_rate": 1.1717846057223143e-07, |
|
"loss": 0.1921, |
|
"reward": 0.60546875, |
|
"reward_std": 0.22014086320996284, |
|
"rewards/accuracy_reward": 0.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.60546875, |
|
"step": 348 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.1875, |
|
"epoch": 0.9220607661822986, |
|
"grad_norm": 4.420534133911133, |
|
"kl": 1.7568359375, |
|
"learning_rate": 1.1605906512619983e-07, |
|
"loss": 0.3432, |
|
"reward": 1.3515625, |
|
"reward_std": 0.23761418834328651, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 349 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 709.6875, |
|
"epoch": 0.9247027741083224, |
|
"grad_norm": 4.137857437133789, |
|
"kl": 2.36328125, |
|
"learning_rate": 1.1497671807886567e-07, |
|
"loss": 0.3999, |
|
"reward": 1.0703125, |
|
"reward_std": 0.19854220747947693, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 447.515625, |
|
"epoch": 0.9273447820343461, |
|
"grad_norm": 5.883572578430176, |
|
"kl": 1.359375, |
|
"learning_rate": 1.139315118373326e-07, |
|
"loss": 0.3009, |
|
"reward": 0.859375, |
|
"reward_std": 0.21957654133439064, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 351 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.09375, |
|
"epoch": 0.9299867899603699, |
|
"grad_norm": 9.422240257263184, |
|
"kl": 1.3564453125, |
|
"learning_rate": 1.1292353563774873e-07, |
|
"loss": 0.3162, |
|
"reward": 1.08984375, |
|
"reward_std": 0.22193554788827896, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 352 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 603.21875, |
|
"epoch": 0.9326287978863936, |
|
"grad_norm": 4.772337913513184, |
|
"kl": 2.2646484375, |
|
"learning_rate": 1.1195287553768821e-07, |
|
"loss": 0.2438, |
|
"reward": 0.62890625, |
|
"reward_std": 0.28237032890319824, |
|
"rewards/accuracy_reward": 0.015625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.61328125, |
|
"step": 353 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 714.09375, |
|
"epoch": 0.9352708058124174, |
|
"grad_norm": 9.603926658630371, |
|
"kl": 2.470703125, |
|
"learning_rate": 1.1101961440880352e-07, |
|
"loss": 0.3789, |
|
"reward": 1.05859375, |
|
"reward_std": 0.19248899817466736, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.55859375, |
|
"step": 354 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.40625, |
|
"epoch": 0.9379128137384413, |
|
"grad_norm": 16.06355857849121, |
|
"kl": 2.0009765625, |
|
"learning_rate": 1.1012383192975041e-07, |
|
"loss": 0.0823, |
|
"reward": 1.33203125, |
|
"reward_std": 0.18909762054681778, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.390625, |
|
"epoch": 0.940554821664465, |
|
"grad_norm": 3.9636921882629395, |
|
"kl": 1.8291015625, |
|
"learning_rate": 1.0926560457938536e-07, |
|
"loss": 0.2746, |
|
"reward": 1.3125, |
|
"reward_std": 0.2061732206493616, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5625, |
|
"step": 356 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 520.203125, |
|
"epoch": 0.9431968295904888, |
|
"grad_norm": 6.897830486297607, |
|
"kl": 1.431640625, |
|
"learning_rate": 1.084450056302357e-07, |
|
"loss": 0.1525, |
|
"reward": 0.83203125, |
|
"reward_std": 0.21859385818243027, |
|
"rewards/accuracy_reward": 0.265625, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 357 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 524.21875, |
|
"epoch": 0.9458388375165125, |
|
"grad_norm": 11.090557098388672, |
|
"kl": 1.40234375, |
|
"learning_rate": 1.0766210514224419e-07, |
|
"loss": 0.0591, |
|
"reward": 1.1328125, |
|
"reward_std": 0.23101669549942017, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6328125, |
|
"step": 358 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.65625, |
|
"epoch": 0.9484808454425363, |
|
"grad_norm": 13.82530689239502, |
|
"kl": 2.111328125, |
|
"learning_rate": 1.0691696995678738e-07, |
|
"loss": 0.2682, |
|
"reward": 1.109375, |
|
"reward_std": 0.22573107481002808, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 359 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 720.625, |
|
"epoch": 0.9511228533685601, |
|
"grad_norm": 6.005599021911621, |
|
"kl": 2.166015625, |
|
"learning_rate": 1.0620966369096884e-07, |
|
"loss": 0.3217, |
|
"reward": 1.34375, |
|
"reward_std": 0.2211884669959545, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59375, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 664.5625, |
|
"epoch": 0.9537648612945839, |
|
"grad_norm": 2.9504928588867188, |
|
"kl": 1.896484375, |
|
"learning_rate": 1.0554024673218806e-07, |
|
"loss": 0.3339, |
|
"reward": 1.31640625, |
|
"reward_std": 0.21037080883979797, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 361 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 719.90625, |
|
"epoch": 0.9564068692206077, |
|
"grad_norm": 3.942823886871338, |
|
"kl": 1.5712890625, |
|
"learning_rate": 1.0490877623298431e-07, |
|
"loss": 0.3399, |
|
"reward": 0.8515625, |
|
"reward_std": 0.23859936743974686, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6015625, |
|
"step": 362 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 497.296875, |
|
"epoch": 0.9590488771466315, |
|
"grad_norm": 11.69743824005127, |
|
"kl": 1.6708984375, |
|
"learning_rate": 1.0431530610615772e-07, |
|
"loss": 0.1801, |
|
"reward": 1.37109375, |
|
"reward_std": 0.20750074833631516, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.62109375, |
|
"step": 363 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 662.953125, |
|
"epoch": 0.9616908850726552, |
|
"grad_norm": 5.648345470428467, |
|
"kl": 2.005859375, |
|
"learning_rate": 1.0375988702016576e-07, |
|
"loss": 0.3905, |
|
"reward": 0.8203125, |
|
"reward_std": 0.21815017238259315, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 364 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 556.640625, |
|
"epoch": 0.964332892998679, |
|
"grad_norm": 3.6928138732910156, |
|
"kl": 1.544921875, |
|
"learning_rate": 1.0324256639479797e-07, |
|
"loss": 0.1847, |
|
"reward": 1.3359375, |
|
"reward_std": 0.2146303877234459, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5859375, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 528.46875, |
|
"epoch": 0.9669749009247027, |
|
"grad_norm": 4.1989336013793945, |
|
"kl": 1.3134765625, |
|
"learning_rate": 1.0276338839712688e-07, |
|
"loss": 0.2739, |
|
"reward": 0.859375, |
|
"reward_std": 0.212420754134655, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.609375, |
|
"step": 366 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 807.125, |
|
"epoch": 0.9696169088507266, |
|
"grad_norm": 5.855282306671143, |
|
"kl": 2.8173828125, |
|
"learning_rate": 1.023223939377375e-07, |
|
"loss": 0.3144, |
|
"reward": 0.83203125, |
|
"reward_std": 0.2185688391327858, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58203125, |
|
"step": 367 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 711.046875, |
|
"epoch": 0.9722589167767504, |
|
"grad_norm": 6.813151836395264, |
|
"kl": 1.77734375, |
|
"learning_rate": 1.0191962066723448e-07, |
|
"loss": 0.1714, |
|
"reward": 1.3203125, |
|
"reward_std": 0.18526797741651535, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5703125, |
|
"step": 368 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 835.46875, |
|
"epoch": 0.9749009247027741, |
|
"grad_norm": 4.6733317375183105, |
|
"kl": 2.62109375, |
|
"learning_rate": 1.0155510297302745e-07, |
|
"loss": 0.4741, |
|
"reward": 0.7265625, |
|
"reward_std": 0.1361106839030981, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.4765625, |
|
"step": 369 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 501.609375, |
|
"epoch": 0.9775429326287979, |
|
"grad_norm": 7.580297946929932, |
|
"kl": 1.306640625, |
|
"learning_rate": 1.0122887197639539e-07, |
|
"loss": 0.106, |
|
"reward": 0.8828125, |
|
"reward_std": 0.21267065405845642, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.6328125, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 730.546875, |
|
"epoch": 0.9801849405548216, |
|
"grad_norm": 2.7990424633026123, |
|
"kl": 1.625, |
|
"learning_rate": 1.0094095552982936e-07, |
|
"loss": 0.1954, |
|
"reward": 1.06640625, |
|
"reward_std": 0.15350224822759628, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 371 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 634.125, |
|
"epoch": 0.9828269484808454, |
|
"grad_norm": 5.10625696182251, |
|
"kl": 1.578125, |
|
"learning_rate": 1.0069137821465474e-07, |
|
"loss": 0.3279, |
|
"reward": 1.59765625, |
|
"reward_std": 0.24609044939279556, |
|
"rewards/accuracy_reward": 1.0, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 372 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 521.46875, |
|
"epoch": 0.9854689564068693, |
|
"grad_norm": 2.8827366828918457, |
|
"kl": 1.173828125, |
|
"learning_rate": 1.0048016133893242e-07, |
|
"loss": 0.2295, |
|
"reward": 0.81640625, |
|
"reward_std": 0.1789581961929798, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 373 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 528.515625, |
|
"epoch": 0.988110964332893, |
|
"grad_norm": 4.933093070983887, |
|
"kl": 1.3515625, |
|
"learning_rate": 1.0030732293563969e-07, |
|
"loss": 0.1593, |
|
"reward": 1.31640625, |
|
"reward_std": 0.18777159228920937, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.56640625, |
|
"step": 374 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 486.390625, |
|
"epoch": 0.9907529722589168, |
|
"grad_norm": 5.345139980316162, |
|
"kl": 1.306640625, |
|
"learning_rate": 1.0017287776113066e-07, |
|
"loss": 0.2942, |
|
"reward": 1.34765625, |
|
"reward_std": 0.23156387358903885, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.59765625, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 832.59375, |
|
"epoch": 0.9933949801849405, |
|
"grad_norm": 5.978093147277832, |
|
"kl": 2.80859375, |
|
"learning_rate": 1.0007683729387628e-07, |
|
"loss": 0.562, |
|
"reward": 0.7734375, |
|
"reward_std": 0.20706837996840477, |
|
"rewards/accuracy_reward": 0.25, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.5234375, |
|
"step": 376 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 601.359375, |
|
"epoch": 0.9960369881109643, |
|
"grad_norm": 4.996700763702393, |
|
"kl": 1.537109375, |
|
"learning_rate": 1.0001920973348446e-07, |
|
"loss": 0.3616, |
|
"reward": 1.33984375, |
|
"reward_std": 0.2210528589785099, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.58984375, |
|
"step": 377 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 651.0499954223633, |
|
"epoch": 0.9986789960369881, |
|
"grad_norm": 10.63793659210205, |
|
"kl": 1.486328125, |
|
"learning_rate": 1e-07, |
|
"loss": 0.16, |
|
"reward": 1.32421875, |
|
"reward_std": 0.1949087455868721, |
|
"rewards/accuracy_reward": 0.75, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.57421875, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9986789960369881, |
|
"step": 378, |
|
"total_flos": 0.0, |
|
"train_loss": 0.3501640140083889, |
|
"train_runtime": 20695.6892, |
|
"train_samples_per_second": 0.073, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 378, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|