{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9986789960369881, "eval_steps": 500, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1179.875, "epoch": 0.002642007926023778, "grad_norm": 0.4997229278087616, "kl": 0.0, "learning_rate": 2.6315789473684208e-08, "loss": 0.2467, "reward": 1.19921875, "reward_std": 0.13141997903585434, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1057.625, "epoch": 0.005284015852047556, "grad_norm": 0.5586327910423279, "kl": 0.0, "learning_rate": 5.2631578947368416e-08, "loss": 0.3641, "reward": 0.95703125, "reward_std": 0.12062124721705914, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1275.578125, "epoch": 0.007926023778071334, "grad_norm": 0.5091319680213928, "kl": 0.0001016855239868164, "learning_rate": 7.894736842105262e-08, "loss": 0.3625, "reward": 0.4140625, "reward_std": 0.13219169899821281, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 891.71875, "epoch": 0.010568031704095112, "grad_norm": 0.5754386782646179, "kl": 0.00015115737915039062, "learning_rate": 1.0526315789473683e-07, "loss": 0.3083, "reward": 0.99609375, "reward_std": 0.116029754281044, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 1166.125, "epoch": 0.013210039630118891, "grad_norm": 0.5114976763725281, "kl": 0.00011730194091796875, "learning_rate": 1.3157894736842104e-07, "loss": 0.33, "reward": 0.9296875, "reward_std": 0.11507641524076462, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1035.703125, "epoch": 0.015852047556142668, "grad_norm": 0.7084254026412964, "kl": 0.00015091896057128906, "learning_rate": 1.5789473684210525e-07, "loss": 0.3363, "reward": 0.7265625, "reward_std": 0.12440211698412895, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 977.3125, "epoch": 0.018494055482166448, "grad_norm": 0.3633577525615692, "kl": 9.310245513916016e-05, "learning_rate": 1.8421052631578946e-07, "loss": 0.2085, "reward": 1.0, "reward_std": 0.13400040566921234, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 889.03125, "epoch": 0.021136063408190225, "grad_norm": 0.6363146901130676, "kl": 0.0001055002212524414, "learning_rate": 2.1052631578947366e-07, "loss": 0.3436, "reward": 0.984375, "reward_std": 0.11146603152155876, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 1114.28125, "epoch": 0.023778071334214, "grad_norm": 0.6163086295127869, "kl": 0.00010448694229125977, "learning_rate": 2.3684210526315787e-07, "loss": 0.387, "reward": 0.45703125, "reward_std": 0.11941792443394661, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 1325.578125, "epoch": 0.026420079260237782, "grad_norm": 0.45183688402175903, "kl": 0.00015163421630859375, "learning_rate": 2.631578947368421e-07, "loss": 0.304, "reward": 0.91015625, "reward_std": 0.12797221168875694, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41015625, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1116.671875, "epoch": 0.02906208718626156, "grad_norm": 0.5506221055984497, "kl": 0.0001614093780517578, "learning_rate": 2.894736842105263e-07, "loss": 0.2958, "reward": 0.7109375, "reward_std": 0.1341523937880993, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 1107.546875, "epoch": 0.031704095112285335, "grad_norm": 0.423910528421402, "kl": 0.000125885009765625, "learning_rate": 3.157894736842105e-07, "loss": 0.2614, "reward": 0.9609375, "reward_std": 0.11495335027575493, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1075.453125, "epoch": 0.034346103038309116, "grad_norm": 0.6421769857406616, "kl": 0.0001609325408935547, "learning_rate": 3.4210526315789473e-07, "loss": 0.3804, "reward": 0.70703125, "reward_std": 0.11874673143029213, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 1325.046875, "epoch": 0.036988110964332896, "grad_norm": 0.5751165151596069, "kl": 0.00011897087097167969, "learning_rate": 3.684210526315789e-07, "loss": 0.3482, "reward": 0.9296875, "reward_std": 0.15341992676258087, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 1087.34375, "epoch": 0.03963011889035667, "grad_norm": 0.6110666394233704, "kl": 0.00010585784912109375, "learning_rate": 3.9473684210526315e-07, "loss": 0.3665, "reward": 0.95703125, "reward_std": 0.1287429742515087, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1314.15625, "epoch": 0.04227212681638045, "grad_norm": 0.5642758011817932, "kl": 0.00013065338134765625, "learning_rate": 4.2105263157894733e-07, "loss": 0.4046, "reward": 0.90625, "reward_std": 0.13578036427497864, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40625, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 1289.359375, "epoch": 0.04491413474240423, "grad_norm": 0.4779168963432312, "kl": 8.845329284667969e-05, "learning_rate": 4.4736842105263156e-07, "loss": 0.2965, "reward": 1.421875, "reward_std": 0.12279411032795906, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.421875, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1349.9375, "epoch": 0.047556142668428, "grad_norm": 0.4716605842113495, "kl": 0.00012004375457763672, "learning_rate": 4.7368421052631574e-07, "loss": 0.3496, "reward": 0.66796875, "reward_std": 0.14581536501646042, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41796875, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 775.328125, "epoch": 0.05019815059445178, "grad_norm": 0.5275957584381104, "kl": 9.936094284057617e-05, "learning_rate": 5e-07, "loss": 0.3465, "reward": 0.734375, "reward_std": 0.08240052312612534, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1096.671875, "epoch": 0.052840158520475564, "grad_norm": 0.622590959072113, "kl": 0.00011599063873291016, "learning_rate": 5.263157894736842e-07, "loss": 0.3991, "reward": 0.95703125, "reward_std": 0.09287451207637787, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1145.40625, "epoch": 0.05548216644649934, "grad_norm": 0.5628076195716858, "kl": 7.984042167663574e-05, "learning_rate": 5.526315789473684e-07, "loss": 0.3009, "reward": 0.7109375, "reward_std": 0.111817117780447, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 829.109375, "epoch": 0.05812417437252312, "grad_norm": 0.5253135561943054, "kl": 0.0001367330551147461, "learning_rate": 5.789473684210526e-07, "loss": 0.243, "reward": 1.4921875, "reward_std": 0.1498083807528019, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1060.53125, "epoch": 0.0607661822985469, "grad_norm": 0.624118983745575, "kl": 7.021427154541016e-05, "learning_rate": 6.052631578947368e-07, "loss": 0.4002, "reward": 1.21484375, "reward_std": 0.1456764042377472, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 976.578125, "epoch": 0.06340819022457067, "grad_norm": 0.46764305233955383, "kl": 0.0001266002655029297, "learning_rate": 6.31578947368421e-07, "loss": 0.2928, "reward": 1.20703125, "reward_std": 0.096083864569664, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 1207.015625, "epoch": 0.06605019815059446, "grad_norm": 0.39954128861427307, "kl": 0.00010007619857788086, "learning_rate": 6.578947368421053e-07, "loss": 0.1622, "reward": 0.953125, "reward_std": 0.15208648890256882, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 783.921875, "epoch": 0.06869220607661823, "grad_norm": 0.4758118689060211, "kl": 8.118152618408203e-05, "learning_rate": 6.842105263157895e-07, "loss": 0.2011, "reward": 0.96875, "reward_std": 0.07889671996235847, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 920.515625, "epoch": 0.071334214002642, "grad_norm": 0.7195703387260437, "kl": 9.21487808227539e-05, "learning_rate": 7.105263157894736e-07, "loss": 0.2896, "reward": 0.984375, "reward_std": 0.10958803817629814, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1289.40625, "epoch": 0.07397622192866579, "grad_norm": 0.4253327548503876, "kl": 9.363889694213867e-05, "learning_rate": 7.368421052631578e-07, "loss": 0.0989, "reward": 0.9375, "reward_std": 0.1678653284907341, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1002.25, "epoch": 0.07661822985468957, "grad_norm": 0.7329438924789429, "kl": 0.0001462697982788086, "learning_rate": 7.631578947368421e-07, "loss": 0.4594, "reward": 0.95703125, "reward_std": 0.11983717978000641, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 1205.53125, "epoch": 0.07926023778071334, "grad_norm": 0.7603439092636108, "kl": 0.00011014938354492188, "learning_rate": 7.894736842105263e-07, "loss": 0.4604, "reward": 0.9375, "reward_std": 0.1396191380918026, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1225.0625, "epoch": 0.08190224570673713, "grad_norm": 0.586107075214386, "kl": 0.0001385211944580078, "learning_rate": 8.157894736842105e-07, "loss": 0.2906, "reward": 0.7109375, "reward_std": 0.15029004588723183, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 1195.609375, "epoch": 0.0845442536327609, "grad_norm": 0.5367782711982727, "kl": 0.00018310546875, "learning_rate": 8.421052631578947e-07, "loss": 0.3054, "reward": 1.18359375, "reward_std": 0.1250832974910736, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1193.53125, "epoch": 0.08718626155878467, "grad_norm": 0.6531537771224976, "kl": 0.0001990795135498047, "learning_rate": 8.684210526315789e-07, "loss": 0.382, "reward": 0.93359375, "reward_std": 0.10596734657883644, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 975.796875, "epoch": 0.08982826948480846, "grad_norm": 0.7079041004180908, "kl": 0.0002675056457519531, "learning_rate": 8.947368421052631e-07, "loss": 0.3162, "reward": 0.9921875, "reward_std": 0.11211910098791122, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 1131.34375, "epoch": 0.09247027741083223, "grad_norm": 0.5116021037101746, "kl": 0.0003204345703125, "learning_rate": 9.210526315789473e-07, "loss": 0.3366, "reward": 1.19140625, "reward_std": 0.14293401315808296, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1189.828125, "epoch": 0.095112285336856, "grad_norm": 0.5107906460762024, "kl": 0.0003094673156738281, "learning_rate": 9.473684210526315e-07, "loss": 0.328, "reward": 1.20703125, "reward_std": 0.15370117127895355, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 1482.453125, "epoch": 0.0977542932628798, "grad_norm": 0.46826329827308655, "kl": 0.0004634857177734375, "learning_rate": 9.736842105263158e-07, "loss": 0.2712, "reward": 1.16015625, "reward_std": 0.1653159111738205, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41015625, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 1108.578125, "epoch": 0.10039630118890357, "grad_norm": 0.5141110420227051, "kl": 0.0006732940673828125, "learning_rate": 1e-06, "loss": 0.1843, "reward": 0.97265625, "reward_std": 0.11588806286454201, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 1196.65625, "epoch": 0.10303830911492734, "grad_norm": 0.5530170202255249, "kl": 0.000946044921875, "learning_rate": 9.999807902665155e-07, "loss": 0.2593, "reward": 0.9609375, "reward_std": 0.1273726001381874, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 822.328125, "epoch": 0.10568031704095113, "grad_norm": 0.6078239679336548, "kl": 0.001224517822265625, "learning_rate": 9.999231627061236e-07, "loss": 0.2837, "reward": 0.9921875, "reward_std": 0.10058118030428886, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 995.421875, "epoch": 0.1083223249669749, "grad_norm": 0.6204021573066711, "kl": 0.001720428466796875, "learning_rate": 9.998271222388693e-07, "loss": 0.4368, "reward": 1.2265625, "reward_std": 0.13393215090036392, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 1254.515625, "epoch": 0.11096433289299867, "grad_norm": 0.6290051937103271, "kl": 0.0020294189453125, "learning_rate": 9.996926770643603e-07, "loss": 0.3358, "reward": 0.94921875, "reward_std": 0.13193362578749657, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 885.078125, "epoch": 0.11360634081902246, "grad_norm": 0.38145869970321655, "kl": 0.0020084381103515625, "learning_rate": 9.995198386610676e-07, "loss": 0.1421, "reward": 1.2421875, "reward_std": 0.09872931987047195, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 1162.296875, "epoch": 0.11624834874504623, "grad_norm": 0.5801534056663513, "kl": 0.00255584716796875, "learning_rate": 9.993086217853452e-07, "loss": 0.3938, "reward": 0.9375, "reward_std": 0.12491972371935844, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 946.984375, "epoch": 0.11889035667107001, "grad_norm": 0.7080899477005005, "kl": 0.00287628173828125, "learning_rate": 9.990590444701706e-07, "loss": 0.3176, "reward": 0.71484375, "reward_std": 0.07072163559496403, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1258.78125, "epoch": 0.1215323645970938, "grad_norm": 0.6584539413452148, "kl": 0.00337982177734375, "learning_rate": 9.987711280236046e-07, "loss": 0.3364, "reward": 0.9296875, "reward_std": 0.10684756934642792, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 1006.984375, "epoch": 0.12417437252311757, "grad_norm": 0.5412375926971436, "kl": 0.003643035888671875, "learning_rate": 9.984448970269725e-07, "loss": 0.2438, "reward": 1.25390625, "reward_std": 0.16918476670980453, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.50390625, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1165.140625, "epoch": 0.12681638044914134, "grad_norm": 0.5502119064331055, "kl": 0.00435638427734375, "learning_rate": 9.980803793327655e-07, "loss": 0.329, "reward": 0.73046875, "reward_std": 0.17235729470849037, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 1094.59375, "epoch": 0.12945838837516513, "grad_norm": 0.6746593713760376, "kl": 0.0046234130859375, "learning_rate": 9.976776060622625e-07, "loss": 0.2585, "reward": 0.68359375, "reward_std": 0.11046826094388962, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 894.875, "epoch": 0.13210039630118892, "grad_norm": 0.6030331254005432, "kl": 0.0045623779296875, "learning_rate": 9.972366116028733e-07, "loss": 0.1373, "reward": 1.2265625, "reward_std": 0.11612267419695854, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 660.078125, "epoch": 0.13474240422721268, "grad_norm": 0.7342778444290161, "kl": 0.00536346435546875, "learning_rate": 9.96757433605202e-07, "loss": 0.2687, "reward": 1.26171875, "reward_std": 0.11859130859375, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1063.71875, "epoch": 0.13738441215323646, "grad_norm": 0.7268034219741821, "kl": 0.00653076171875, "learning_rate": 9.962401129798343e-07, "loss": 0.3436, "reward": 0.98046875, "reward_std": 0.15140536800026894, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 1265.96875, "epoch": 0.14002642007926025, "grad_norm": 0.7652710676193237, "kl": 0.00766754150390625, "learning_rate": 9.956846938938422e-07, "loss": 0.4375, "reward": 0.91015625, "reward_std": 0.1307620257139206, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41015625, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 1342.03125, "epoch": 0.142668428005284, "grad_norm": 0.6607176065444946, "kl": 0.0090179443359375, "learning_rate": 9.950912237670157e-07, "loss": 0.3436, "reward": 0.90234375, "reward_std": 0.1162625178694725, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40234375, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1300.421875, "epoch": 0.1453104359313078, "grad_norm": 0.6878055930137634, "kl": 0.01092529296875, "learning_rate": 9.944597532678119e-07, "loss": 0.3859, "reward": 1.1640625, "reward_std": 0.1533336602151394, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1351.71875, "epoch": 0.14795244385733158, "grad_norm": 0.6881883144378662, "kl": 0.01397705078125, "learning_rate": 9.93790336309031e-07, "loss": 0.3671, "reward": 0.92578125, "reward_std": 0.15761961415410042, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 710.25, "epoch": 0.15059445178335534, "grad_norm": 0.5193164348602295, "kl": 0.0154571533203125, "learning_rate": 9.930830300432126e-07, "loss": 0.1832, "reward": 1.01953125, "reward_std": 0.11765347048640251, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 1319.109375, "epoch": 0.15323645970937913, "grad_norm": 0.6145569086074829, "kl": 0.0148468017578125, "learning_rate": 9.923378948577558e-07, "loss": 0.3036, "reward": 0.9375, "reward_std": 0.1474018730223179, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 1229.484375, "epoch": 0.15587846763540292, "grad_norm": 0.6062135100364685, "kl": 0.0187530517578125, "learning_rate": 9.915549943697644e-07, "loss": 0.3039, "reward": 0.92578125, "reward_std": 0.12412451207637787, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 1122.96875, "epoch": 0.15852047556142668, "grad_norm": 0.7750731110572815, "kl": 0.019989013671875, "learning_rate": 9.907343954206146e-07, "loss": 0.4269, "reward": 0.4609375, "reward_std": 0.15149712190032005, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 1289.796875, "epoch": 0.16116248348745046, "grad_norm": 0.4260408282279968, "kl": 0.023284912109375, "learning_rate": 9.898761680702495e-07, "loss": 0.2105, "reward": 0.66015625, "reward_std": 0.10409127548336983, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41015625, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1395.359375, "epoch": 0.16380449141347425, "grad_norm": 0.4302825629711151, "kl": 0.022216796875, "learning_rate": 9.889803855911965e-07, "loss": 0.2882, "reward": 0.69140625, "reward_std": 0.17329547554254532, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44140625, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 1214.203125, "epoch": 0.166446499339498, "grad_norm": 0.5709892511367798, "kl": 0.025421142578125, "learning_rate": 9.880471244623118e-07, "loss": 0.2752, "reward": 0.96484375, "reward_std": 0.16381771862506866, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 1456.640625, "epoch": 0.1690885072655218, "grad_norm": 0.4366983473300934, "kl": 0.03094482421875, "learning_rate": 9.87076464362251e-07, "loss": 0.1409, "reward": 1.21484375, "reward_std": 0.1545065976679325, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 907.890625, "epoch": 0.17173051519154559, "grad_norm": 0.5789319276809692, "kl": 0.0296630859375, "learning_rate": 9.860684881626674e-07, "loss": 0.223, "reward": 1.0234375, "reward_std": 0.18188364803791046, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 1124.03125, "epoch": 0.17437252311756934, "grad_norm": 0.8789018988609314, "kl": 0.03033447265625, "learning_rate": 9.850232819211343e-07, "loss": -0.0662, "reward": 0.9609375, "reward_std": 0.16317331418395042, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1024.5, "epoch": 0.17701453104359313, "grad_norm": 0.7724674344062805, "kl": 0.03656005859375, "learning_rate": 9.839409348738e-07, "loss": 0.2921, "reward": 1.21875, "reward_std": 0.12279859185218811, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 1002.6875, "epoch": 0.17965653896961692, "grad_norm": 1.0319114923477173, "kl": 0.043212890625, "learning_rate": 9.828215394277686e-07, "loss": 0.3121, "reward": 0.97265625, "reward_std": 0.13220234587788582, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1255.0625, "epoch": 0.18229854689564068, "grad_norm": 0.7915776371955872, "kl": 0.041290283203125, "learning_rate": 9.816651911532093e-07, "loss": 0.3672, "reward": 0.93359375, "reward_std": 0.16574888676404953, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 1128.546875, "epoch": 0.18494055482166447, "grad_norm": 0.577376127243042, "kl": 0.040679931640625, "learning_rate": 9.804719887751984e-07, "loss": 0.1898, "reward": 1.0078125, "reward_std": 0.17545727640390396, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 1146.234375, "epoch": 0.18758256274768825, "grad_norm": 0.5707401633262634, "kl": 0.034698486328125, "learning_rate": 9.792420341652901e-07, "loss": 0.269, "reward": 1.1796875, "reward_std": 0.11014671996235847, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4296875, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1104.234375, "epoch": 0.190224570673712, "grad_norm": 0.5689163208007812, "kl": 0.0369873046875, "learning_rate": 9.779754323328192e-07, "loss": 0.3013, "reward": 0.73046875, "reward_std": 0.1631980687379837, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 1476.96875, "epoch": 0.1928665785997358, "grad_norm": 0.5846036672592163, "kl": 0.033660888671875, "learning_rate": 9.766722914159345e-07, "loss": 0.2798, "reward": 0.8984375, "reward_std": 0.1427699662744999, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 1340.578125, "epoch": 0.1955085865257596, "grad_norm": 0.4723777174949646, "kl": 0.035400390625, "learning_rate": 9.753327226723687e-07, "loss": 0.2281, "reward": 0.64453125, "reward_std": 0.09241959825158119, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.39453125, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1429.40625, "epoch": 0.19815059445178335, "grad_norm": 0.6316815614700317, "kl": 0.03790283203125, "learning_rate": 9.73956840469937e-07, "loss": 0.2594, "reward": 1.1640625, "reward_std": 0.14494511112570763, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4140625, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 1395.609375, "epoch": 0.20079260237780713, "grad_norm": 0.4536829888820648, "kl": 0.036865234375, "learning_rate": 9.725447622767754e-07, "loss": 0.257, "reward": 1.24609375, "reward_std": 0.24476346373558044, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 1369.703125, "epoch": 0.20343461030383092, "grad_norm": 0.519792914390564, "kl": 0.04010009765625, "learning_rate": 9.710966086513085e-07, "loss": 0.2693, "reward": 0.93359375, "reward_std": 0.15936565026640892, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.43359375, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1090.53125, "epoch": 0.20607661822985468, "grad_norm": 0.7418442368507385, "kl": 0.04974365234375, "learning_rate": 9.6961250323196e-07, "loss": 0.3581, "reward": 1.203125, "reward_std": 0.14408493414521217, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.453125, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1075.609375, "epoch": 0.20871862615587847, "grad_norm": 0.4650673270225525, "kl": 0.046630859375, "learning_rate": 9.680925727265944e-07, "loss": 0.1385, "reward": 0.984375, "reward_std": 0.13037987425923347, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 1463.03125, "epoch": 0.21136063408190225, "grad_norm": 0.44249987602233887, "kl": 0.047119140625, "learning_rate": 9.665369469017002e-07, "loss": 0.1594, "reward": 0.8984375, "reward_std": 0.16113372519612312, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 1128.3125, "epoch": 0.21400264200792601, "grad_norm": 0.543846070766449, "kl": 0.05157470703125, "learning_rate": 9.649457585713108e-07, "loss": 0.2237, "reward": 1.234375, "reward_std": 0.1662597917020321, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 960.765625, "epoch": 0.2166446499339498, "grad_norm": 0.7787006497383118, "kl": 0.0552978515625, "learning_rate": 9.633191435856653e-07, "loss": 0.3572, "reward": 1.2109375, "reward_std": 0.12929406948387623, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1094.390625, "epoch": 0.2192866578599736, "grad_norm": 0.9358471632003784, "kl": 0.060302734375, "learning_rate": 9.616572408196093e-07, "loss": 0.3621, "reward": 0.73046875, "reward_std": 0.18469755724072456, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 868.265625, "epoch": 0.22192866578599735, "grad_norm": 1.0493205785751343, "kl": 0.06304931640625, "learning_rate": 9.599601921607397e-07, "loss": 0.3486, "reward": 0.5078125, "reward_std": 0.16107311472296715, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1090.9375, "epoch": 0.22457067371202113, "grad_norm": 0.9199777245521545, "kl": 0.06231689453125, "learning_rate": 9.582281424972892e-07, "loss": 0.3608, "reward": 0.96484375, "reward_std": 0.129608154296875, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46484375, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 1144.140625, "epoch": 0.22721268163804492, "grad_norm": 0.7876753807067871, "kl": 0.067138671875, "learning_rate": 9.56461239705758e-07, "loss": 0.2158, "reward": 0.44921875, "reward_std": 0.11367761343717575, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.44921875, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 1346.328125, "epoch": 0.22985468956406868, "grad_norm": 0.8156364560127258, "kl": 0.06951904296875, "learning_rate": 9.546596346382864e-07, "loss": 0.2484, "reward": 0.92578125, "reward_std": 0.14216843992471695, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.42578125, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 906.40625, "epoch": 0.23249669749009247, "grad_norm": 0.6532436013221741, "kl": 0.083984375, "learning_rate": 9.528234811097781e-07, "loss": 0.1984, "reward": 1.24609375, "reward_std": 0.10012037679553032, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.49609375, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1103.65625, "epoch": 0.23513870541611626, "grad_norm": 0.6433841586112976, "kl": 0.0770263671875, "learning_rate": 9.509529358847654e-07, "loss": 0.1822, "reward": 0.70703125, "reward_std": 0.12630900368094444, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 1273.546875, "epoch": 0.23778071334214002, "grad_norm": 1.185502529144287, "kl": 0.106201171875, "learning_rate": 9.490481586640278e-07, "loss": 0.3498, "reward": 0.91796875, "reward_std": 0.14778802916407585, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.41796875, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 1223.328125, "epoch": 0.2404227212681638, "grad_norm": 0.6358450055122375, "kl": 0.1009521484375, "learning_rate": 9.47109312070955e-07, "loss": 0.1773, "reward": 0.74609375, "reward_std": 0.18448476120829582, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 700.5625, "epoch": 0.2430647291941876, "grad_norm": 0.750359058380127, "kl": 0.1322021484375, "learning_rate": 9.45136561637664e-07, "loss": 0.1891, "reward": 1.046875, "reward_std": 0.14496402069926262, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 863.90625, "epoch": 0.24570673712021135, "grad_norm": 0.557322084903717, "kl": 0.1099853515625, "learning_rate": 9.431300757908663e-07, "loss": 0.1089, "reward": 1.30078125, "reward_std": 0.15019455552101135, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 970.015625, "epoch": 0.24834874504623514, "grad_norm": 0.731271505355835, "kl": 0.12158203125, "learning_rate": 9.410900258374876e-07, "loss": 0.1692, "reward": 0.76953125, "reward_std": 0.17832617834210396, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 924.6875, "epoch": 0.2509907529722589, "grad_norm": 1.327541708946228, "kl": 0.14990234375, "learning_rate": 9.390165859500435e-07, "loss": 0.2367, "reward": 0.5234375, "reward_std": 0.1663740910589695, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 1177.421875, "epoch": 0.2536327608982827, "grad_norm": 1.7957454919815063, "kl": 0.165771484375, "learning_rate": 9.369099331517676e-07, "loss": 0.3655, "reward": 0.9453125, "reward_std": 0.17608627676963806, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4453125, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 1124.71875, "epoch": 0.2562747688243065, "grad_norm": 1.353155493736267, "kl": 0.1519775390625, "learning_rate": 9.34770247301499e-07, "loss": 0.2683, "reward": 1.2109375, "reward_std": 0.11838950589299202, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4609375, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 710.703125, "epoch": 0.25891677675033026, "grad_norm": 1.821932077407837, "kl": 0.19970703125, "learning_rate": 9.325977110783263e-07, "loss": 0.1213, "reward": 1.52734375, "reward_std": 0.14770140498876572, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 910.125, "epoch": 0.261558784676354, "grad_norm": 0.8406642079353333, "kl": 0.185546875, "learning_rate": 9.30392509965991e-07, "loss": 0.1623, "reward": 1.015625, "reward_std": 0.1544700786471367, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 1078.859375, "epoch": 0.26420079260237783, "grad_norm": 1.6371651887893677, "kl": 0.225341796875, "learning_rate": 9.281548322370517e-07, "loss": 0.2703, "reward": 0.72265625, "reward_std": 0.14984130859375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 532.15625, "epoch": 0.2668428005284016, "grad_norm": 2.1254074573516846, "kl": 0.246826171875, "learning_rate": 9.258848689368094e-07, "loss": 0.2214, "reward": 1.2578125, "reward_std": 0.10374833643436432, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 848.5625, "epoch": 0.26948480845442535, "grad_norm": 2.5907938480377197, "kl": 0.2958984375, "learning_rate": 9.235828138669978e-07, "loss": 0.3198, "reward": 1.01171875, "reward_std": 0.137377567589283, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 1073.3125, "epoch": 0.27212681638044917, "grad_norm": 2.286487102508545, "kl": 0.2607421875, "learning_rate": 9.21248863569236e-07, "loss": 0.3082, "reward": 0.97265625, "reward_std": 0.15867146104574203, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 858.796875, "epoch": 0.2747688243064729, "grad_norm": 1.7667056322097778, "kl": 0.33837890625, "learning_rate": 9.188832173082495e-07, "loss": 0.2436, "reward": 0.71875, "reward_std": 0.10251419246196747, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 1045.796875, "epoch": 0.2774108322324967, "grad_norm": 2.42461895942688, "kl": 0.40380859375, "learning_rate": 9.164860770548567e-07, "loss": 0.2974, "reward": 0.9921875, "reward_std": 0.16395077854394913, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 893.453125, "epoch": 0.2800528401585205, "grad_norm": 10.34216594696045, "kl": 0.474609375, "learning_rate": 9.140576474687263e-07, "loss": 0.294, "reward": 0.97265625, "reward_std": 0.1429976001381874, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.47265625, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 1402.546875, "epoch": 0.28269484808454426, "grad_norm": 5.165650367736816, "kl": 0.5849609375, "learning_rate": 9.11598135880903e-07, "loss": 0.3739, "reward": 0.6484375, "reward_std": 0.16659503430128098, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3984375, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 959.703125, "epoch": 0.285336856010568, "grad_norm": 5.434719562530518, "kl": 0.6767578125, "learning_rate": 9.091077522761078e-07, "loss": 0.421, "reward": 0.9765625, "reward_std": 0.13730589486658573, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 1104.828125, "epoch": 0.28797886393659183, "grad_norm": 1.7607016563415527, "kl": 0.40234375, "learning_rate": 9.065867092748082e-07, "loss": 0.205, "reward": 0.71875, "reward_std": 0.16618655994534492, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 1261.84375, "epoch": 0.2906208718626156, "grad_norm": 3.3362314701080322, "kl": 0.57373046875, "learning_rate": 9.040352221150674e-07, "loss": 0.3039, "reward": 0.71875, "reward_std": 0.2016766332089901, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.46875, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 850.640625, "epoch": 0.29326287978863935, "grad_norm": 3.9499456882476807, "kl": 0.54296875, "learning_rate": 9.014535086341669e-07, "loss": 0.3804, "reward": 1.234375, "reward_std": 0.14762691780924797, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 871.765625, "epoch": 0.29590488771466317, "grad_norm": 4.223949432373047, "kl": 0.5234375, "learning_rate": 8.988417892500083e-07, "loss": 0.3621, "reward": 1.2734375, "reward_std": 0.18184370175004005, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 711.078125, "epoch": 0.2985468956406869, "grad_norm": 10.757521629333496, "kl": 0.53955078125, "learning_rate": 8.962002869422955e-07, "loss": 0.6943, "reward": 0.484375, "reward_std": 0.17551938444375992, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.484375, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 485.96875, "epoch": 0.3011889035667107, "grad_norm": 6.041623592376709, "kl": 0.59326171875, "learning_rate": 8.935292272334963e-07, "loss": 0.4734, "reward": 0.76953125, "reward_std": 0.13621540740132332, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 615.203125, "epoch": 0.3038309114927345, "grad_norm": 2.360245943069458, "kl": 0.60302734375, "learning_rate": 8.908288381695892e-07, "loss": 0.2661, "reward": 1.2578125, "reward_std": 0.1489735022187233, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 851.28125, "epoch": 0.30647291941875826, "grad_norm": 6.270340442657471, "kl": 0.8740234375, "learning_rate": 8.88099350300593e-07, "loss": 0.5072, "reward": 0.73046875, "reward_std": 0.15848717093467712, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48046875, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 1406.65625, "epoch": 0.309114927344782, "grad_norm": 4.970353126525879, "kl": 1.427734375, "learning_rate": 8.853409966608831e-07, "loss": 0.3739, "reward": 0.65234375, "reward_std": 0.15436260029673576, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.40234375, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 682.796875, "epoch": 0.31175693527080584, "grad_norm": 11.649397850036621, "kl": 1.416015625, "learning_rate": 8.825540127492965e-07, "loss": 0.582, "reward": 1.2734375, "reward_std": 0.16201764903962612, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 346.234375, "epoch": 0.3143989431968296, "grad_norm": 6.038275241851807, "kl": 1.6640625, "learning_rate": 8.797386365090252e-07, "loss": 0.4335, "reward": 1.3046875, "reward_std": 0.16278167814016342, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 878.03125, "epoch": 0.31704095112285335, "grad_norm": 12.164133071899414, "kl": 2.13671875, "learning_rate": 8.768951083073009e-07, "loss": 0.8115, "reward": 0.9921875, "reward_std": 0.1910713165998459, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4921875, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 404.671875, "epoch": 0.31968295904887717, "grad_norm": 9.305420875549316, "kl": 2.453125, "learning_rate": 8.740236709148745e-07, "loss": 0.6232, "reward": 1.29296875, "reward_std": 0.1861564740538597, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 390.578125, "epoch": 0.32232496697490093, "grad_norm": 11.043706893920898, "kl": 2.4150390625, "learning_rate": 8.711245694852886e-07, "loss": 0.4605, "reward": 1.296875, "reward_std": 0.20820768922567368, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 501.625, "epoch": 0.3249669749009247, "grad_norm": 10.729813575744629, "kl": 2.490234375, "learning_rate": 8.681980515339463e-07, "loss": 0.6364, "reward": 0.8359375, "reward_std": 0.23206235468387604, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 466.9375, "epoch": 0.3276089828269485, "grad_norm": 7.306431770324707, "kl": 2.515625, "learning_rate": 8.652443669169809e-07, "loss": 0.5031, "reward": 0.5625, "reward_std": 0.18624207936227322, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 435.640625, "epoch": 0.33025099075297226, "grad_norm": 9.731188774108887, "kl": 3.28515625, "learning_rate": 8.622637678099224e-07, "loss": 0.7344, "reward": 1.01171875, "reward_std": 0.16986817121505737, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 291.671875, "epoch": 0.332892998678996, "grad_norm": 11.137627601623535, "kl": 3.392578125, "learning_rate": 8.592565086861681e-07, "loss": 0.3762, "reward": 1.01953125, "reward_std": 0.1285141110420227, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 543.8125, "epoch": 0.33553500660501984, "grad_norm": 16.820133209228516, "kl": 3.1875, "learning_rate": 8.562228462952576e-07, "loss": 0.2899, "reward": 1.28125, "reward_std": 0.1833672672510147, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 232.859375, "epoch": 0.3381770145310436, "grad_norm": 10.55738353729248, "kl": 2.62939453125, "learning_rate": 8.531630396409507e-07, "loss": 0.2709, "reward": 1.06640625, "reward_std": 0.12935607135295868, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 718.671875, "epoch": 0.34081902245706736, "grad_norm": 10.954379081726074, "kl": 3.91015625, "learning_rate": 8.500773499591156e-07, "loss": 0.3251, "reward": 0.5078125, "reward_std": 0.10781864821910858, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 489.109375, "epoch": 0.34346103038309117, "grad_norm": 10.081979751586914, "kl": 2.50390625, "learning_rate": 8.469660406954252e-07, "loss": 0.4498, "reward": 0.796875, "reward_std": 0.20939984917640686, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 377.484375, "epoch": 0.34610303830911493, "grad_norm": 4.734899520874023, "kl": 1.208984375, "learning_rate": 8.438293774828649e-07, "loss": 0.2461, "reward": 1.3046875, "reward_std": 0.16797470301389694, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 353.15625, "epoch": 0.3487450462351387, "grad_norm": 3.945875883102417, "kl": 1.7080078125, "learning_rate": 8.406676281190542e-07, "loss": 0.2267, "reward": 0.83984375, "reward_std": 0.172641359269619, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 632.921875, "epoch": 0.3513870541611625, "grad_norm": 26.704730987548828, "kl": 1.767578125, "learning_rate": 8.374810625433825e-07, "loss": 0.7894, "reward": 1.02734375, "reward_std": 0.21192153729498386, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 478.828125, "epoch": 0.35402906208718626, "grad_norm": 23.016502380371094, "kl": 1.65234375, "learning_rate": 8.342699528139628e-07, "loss": 0.5162, "reward": 1.015625, "reward_std": 0.1322025004774332, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 613.46875, "epoch": 0.35667107001321, "grad_norm": 5.931519985198975, "kl": 2.02734375, "learning_rate": 8.310345730844047e-07, "loss": 0.4553, "reward": 1.3125, "reward_std": 0.21167393401265144, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 500.46875, "epoch": 0.35931307793923384, "grad_norm": 7.461983680725098, "kl": 1.9765625, "learning_rate": 8.277751995804067e-07, "loss": 0.3654, "reward": 1.0234375, "reward_std": 0.1544732078909874, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 439.890625, "epoch": 0.3619550858652576, "grad_norm": 3.8175482749938965, "kl": 2.041015625, "learning_rate": 8.244921105761755e-07, "loss": 0.3475, "reward": 1.07421875, "reward_std": 0.23262840881943703, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 442.796875, "epoch": 0.36459709379128136, "grad_norm": 11.061271667480469, "kl": 1.546875, "learning_rate": 8.211855863706654e-07, "loss": 0.5592, "reward": 1.2890625, "reward_std": 0.17124063521623611, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 334.875, "epoch": 0.36723910171730517, "grad_norm": 12.917343139648438, "kl": 2.42578125, "learning_rate": 8.178559092636484e-07, "loss": 0.1005, "reward": 0.6015625, "reward_std": 0.1888568513095379, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 815.125, "epoch": 0.36988110964332893, "grad_norm": 4.946498394012451, "kl": 2.6484375, "learning_rate": 8.145033635316128e-07, "loss": 0.4205, "reward": 0.51171875, "reward_std": 0.19404659420251846, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 358.640625, "epoch": 0.3725231175693527, "grad_norm": 4.1423869132995605, "kl": 2.376953125, "learning_rate": 8.111282354034921e-07, "loss": 0.362, "reward": 1.0546875, "reward_std": 0.1854284517467022, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 220.828125, "epoch": 0.3751651254953765, "grad_norm": 14.8277006149292, "kl": 3.98828125, "learning_rate": 8.077308130362273e-07, "loss": 0.1853, "reward": 1.0390625, "reward_std": 0.12213464453816414, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 880.515625, "epoch": 0.37780713342140027, "grad_norm": 4.2313103675842285, "kl": 2.3984375, "learning_rate": 8.043113864901663e-07, "loss": 0.4005, "reward": 1.20703125, "reward_std": 0.1507197804749012, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.45703125, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 561.265625, "epoch": 0.380449141347424, "grad_norm": 7.7739458084106445, "kl": 2.126953125, "learning_rate": 8.008702477042985e-07, "loss": 0.4939, "reward": 1.3203125, "reward_std": 0.20398560166358948, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 698.015625, "epoch": 0.38309114927344784, "grad_norm": 6.112682342529297, "kl": 2.115234375, "learning_rate": 7.974076904713301e-07, "loss": 0.4279, "reward": 0.73828125, "reward_std": 0.09649410098791122, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 359.640625, "epoch": 0.3857331571994716, "grad_norm": 11.499645233154297, "kl": 1.916015625, "learning_rate": 7.939240104126022e-07, "loss": 0.4661, "reward": 1.04296875, "reward_std": 0.1618601270020008, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 364.5, "epoch": 0.38837516512549536, "grad_norm": 5.250813961029053, "kl": 2.73046875, "learning_rate": 7.904195049528497e-07, "loss": 0.4228, "reward": 1.09765625, "reward_std": 0.2164350003004074, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 624.75, "epoch": 0.3910171730515192, "grad_norm": 12.445371627807617, "kl": 1.9091796875, "learning_rate": 7.8689447329481e-07, "loss": 0.5554, "reward": 1.015625, "reward_std": 0.20019326359033585, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 388.015625, "epoch": 0.39365918097754293, "grad_norm": 14.60313606262207, "kl": 3.12890625, "learning_rate": 7.833492163936773e-07, "loss": 0.2208, "reward": 1.01953125, "reward_std": 0.15205424278974533, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 278.0625, "epoch": 0.3963011889035667, "grad_norm": 9.847626686096191, "kl": 2.1591796875, "learning_rate": 7.797840369314081e-07, "loss": 0.5313, "reward": 0.5546875, "reward_std": 0.17377189174294472, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 413.171875, "epoch": 0.3989431968295905, "grad_norm": 10.47969913482666, "kl": 3.029296875, "learning_rate": 7.761992392908791e-07, "loss": 0.391, "reward": 0.78515625, "reward_std": 0.1711183786392212, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 256.0, "epoch": 0.40158520475561427, "grad_norm": 27.210330963134766, "kl": 3.328125, "learning_rate": 7.725951295299005e-07, "loss": 0.8581, "reward": 1.56640625, "reward_std": 0.18129342049360275, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 608.375, "epoch": 0.404227212681638, "grad_norm": 7.893120765686035, "kl": 3.9453125, "learning_rate": 7.689720153550853e-07, "loss": 0.5819, "reward": 0.73828125, "reward_std": 0.13392486423254013, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.48828125, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 431.484375, "epoch": 0.40686922060766184, "grad_norm": 6.096236705780029, "kl": 3.18359375, "learning_rate": 7.653302060955789e-07, "loss": 0.4258, "reward": 1.078125, "reward_std": 0.20535630360245705, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 403.328125, "epoch": 0.4095112285336856, "grad_norm": 9.526097297668457, "kl": 3.87890625, "learning_rate": 7.616700126766492e-07, "loss": 0.6043, "reward": 1.05078125, "reward_std": 0.15629850327968597, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 392.953125, "epoch": 0.41215323645970936, "grad_norm": 13.829514503479004, "kl": 4.109375, "learning_rate": 7.579917475931409e-07, "loss": 0.3873, "reward": 0.52734375, "reward_std": 0.18767033517360687, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 331.265625, "epoch": 0.4147952443857332, "grad_norm": 12.386381149291992, "kl": 3.4296875, "learning_rate": 7.54295724882796e-07, "loss": 0.7169, "reward": 1.328125, "reward_std": 0.2166232354938984, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 457.0, "epoch": 0.41743725231175693, "grad_norm": 7.208274841308594, "kl": 4.09375, "learning_rate": 7.505822600994423e-07, "loss": 0.6254, "reward": 1.28515625, "reward_std": 0.17519249208271503, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 385.28125, "epoch": 0.4200792602377807, "grad_norm": 10.335708618164062, "kl": 4.54296875, "learning_rate": 7.468516702860519e-07, "loss": 0.5237, "reward": 0.51953125, "reward_std": 0.18916139006614685, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 517.96875, "epoch": 0.4227212681638045, "grad_norm": 15.89622688293457, "kl": 3.72265625, "learning_rate": 7.43104273947674e-07, "loss": 0.3898, "reward": 1.01953125, "reward_std": 0.17299087904393673, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 307.828125, "epoch": 0.42536327608982827, "grad_norm": 8.838927268981934, "kl": 2.689453125, "learning_rate": 7.393403910242418e-07, "loss": 0.4323, "reward": 1.02734375, "reward_std": 0.13064508698880672, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 431.3125, "epoch": 0.42800528401585203, "grad_norm": 15.761492729187012, "kl": 2.98828125, "learning_rate": 7.355603428632565e-07, "loss": 0.23, "reward": 1.3671875, "reward_std": 0.22000113874673843, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 582.75, "epoch": 0.43064729194187584, "grad_norm": 14.52424144744873, "kl": 2.109375, "learning_rate": 7.317644521923526e-07, "loss": 0.5996, "reward": 0.7578125, "reward_std": 0.1417398639023304, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 364.53125, "epoch": 0.4332892998678996, "grad_norm": 12.958600044250488, "kl": 1.623046875, "learning_rate": 7.279530430917441e-07, "loss": 0.0741, "reward": 0.796875, "reward_std": 0.1477682925760746, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 416.53125, "epoch": 0.43593130779392336, "grad_norm": 9.96493911743164, "kl": 1.7197265625, "learning_rate": 7.241264409665554e-07, "loss": 0.441, "reward": 0.82421875, "reward_std": 0.21464627608656883, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 582.5625, "epoch": 0.4385733157199472, "grad_norm": 7.888613224029541, "kl": 1.72265625, "learning_rate": 7.202849725190397e-07, "loss": 0.3068, "reward": 1.0078125, "reward_std": 0.17024145647883415, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 534.1875, "epoch": 0.44121532364597094, "grad_norm": 8.195699691772461, "kl": 1.58642578125, "learning_rate": 7.16428965720686e-07, "loss": 0.3543, "reward": 0.8046875, "reward_std": 0.2195490226149559, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 262.46875, "epoch": 0.4438573315719947, "grad_norm": 11.303885459899902, "kl": 0.970703125, "learning_rate": 7.125587497842189e-07, "loss": 0.4021, "reward": 0.80078125, "reward_std": 0.1908670738339424, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 507.84375, "epoch": 0.4464993394980185, "grad_norm": 18.99937629699707, "kl": 1.3828125, "learning_rate": 7.086746551354895e-07, "loss": 0.5214, "reward": 0.76953125, "reward_std": 0.1896660476922989, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 481.09375, "epoch": 0.44914134742404227, "grad_norm": 16.57875633239746, "kl": 1.5625, "learning_rate": 7.047770133852676e-07, "loss": 0.4899, "reward": 1.0546875, "reward_std": 0.19582437723875046, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 434.171875, "epoch": 0.45178335535006603, "grad_norm": 6.7548298835754395, "kl": 1.451171875, "learning_rate": 7.008661573009273e-07, "loss": 0.3438, "reward": 1.30078125, "reward_std": 0.1738675981760025, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 398.4375, "epoch": 0.45442536327608984, "grad_norm": 11.90649127960205, "kl": 1.791015625, "learning_rate": 6.969424207780374e-07, "loss": 0.1403, "reward": 1.3515625, "reward_std": 0.2295953370630741, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 567.515625, "epoch": 0.4570673712021136, "grad_norm": 4.553245544433594, "kl": 2.3828125, "learning_rate": 6.930061388118557e-07, "loss": 0.4131, "reward": 1.05859375, "reward_std": 0.21736154332756996, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 522.15625, "epoch": 0.45970937912813736, "grad_norm": 10.5054931640625, "kl": 2.76171875, "learning_rate": 6.890576474687263e-07, "loss": 0.2456, "reward": 0.76171875, "reward_std": 0.17176654934883118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 760.015625, "epoch": 0.4623513870541612, "grad_norm": 12.109650611877441, "kl": 4.10546875, "learning_rate": 6.850972838573888e-07, "loss": 0.4345, "reward": 0.7578125, "reward_std": 0.17381427809596062, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 670.84375, "epoch": 0.46499339498018494, "grad_norm": 9.500724792480469, "kl": 3.11328125, "learning_rate": 6.811253861001961e-07, "loss": 0.448, "reward": 0.8125, "reward_std": 0.2038702666759491, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 668.578125, "epoch": 0.4676354029062087, "grad_norm": 3.1513185501098633, "kl": 2.83984375, "learning_rate": 6.771422933042477e-07, "loss": 0.4486, "reward": 0.7734375, "reward_std": 0.19701149314641953, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 460.640625, "epoch": 0.4702774108322325, "grad_norm": 3.928485631942749, "kl": 2.52734375, "learning_rate": 6.731483455324374e-07, "loss": 0.4601, "reward": 0.55078125, "reward_std": 0.1819697804749012, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 500.734375, "epoch": 0.47291941875825627, "grad_norm": 5.9308905601501465, "kl": 2.89453125, "learning_rate": 6.691438837744191e-07, "loss": 0.5959, "reward": 1.0859375, "reward_std": 0.24082761257886887, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 318.859375, "epoch": 0.47556142668428003, "grad_norm": 8.880630493164062, "kl": 2.07373046875, "learning_rate": 6.651292499174959e-07, "loss": 0.2224, "reward": 1.0703125, "reward_std": 0.18467539176344872, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 504.203125, "epoch": 0.47820343461030385, "grad_norm": 7.29809045791626, "kl": 2.671875, "learning_rate": 6.611047867174298e-07, "loss": 0.5424, "reward": 0.796875, "reward_std": 0.19480633921921253, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 693.046875, "epoch": 0.4808454425363276, "grad_norm": 7.5113844871521, "kl": 3.078125, "learning_rate": 6.570708377691783e-07, "loss": 0.6193, "reward": 1.5859375, "reward_std": 0.2526575177907944, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 504.484375, "epoch": 0.48348745046235136, "grad_norm": 8.909899711608887, "kl": 2.7265625, "learning_rate": 6.530277474775602e-07, "loss": 0.572, "reward": 1.31640625, "reward_std": 0.20270539075136185, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 451.421875, "epoch": 0.4861294583883752, "grad_norm": 20.32670021057129, "kl": 2.5546875, "learning_rate": 6.489758610278509e-07, "loss": 0.4425, "reward": 1.08203125, "reward_std": 0.21750707924365997, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 586.625, "epoch": 0.48877146631439894, "grad_norm": 6.589134693145752, "kl": 2.375, "learning_rate": 6.449155243563114e-07, "loss": 0.4211, "reward": 0.546875, "reward_std": 0.2208508811891079, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 534.640625, "epoch": 0.4914134742404227, "grad_norm": 9.064754486083984, "kl": 2.705078125, "learning_rate": 6.408470841206545e-07, "loss": 0.2999, "reward": 1.015625, "reward_std": 0.10510582849383354, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.515625, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 758.078125, "epoch": 0.4940554821664465, "grad_norm": 14.509212493896484, "kl": 3.865234375, "learning_rate": 6.367708876704476e-07, "loss": 0.494, "reward": 1.02734375, "reward_std": 0.20098446309566498, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 387.09375, "epoch": 0.4966974900924703, "grad_norm": 14.154923439025879, "kl": 2.201171875, "learning_rate": 6.326872830174566e-07, "loss": 0.1712, "reward": 1.0859375, "reward_std": 0.19368236511945724, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 423.234375, "epoch": 0.49933949801849403, "grad_norm": 17.86855125427246, "kl": 2.376953125, "learning_rate": 6.285966188059355e-07, "loss": 0.6533, "reward": 1.09375, "reward_std": 0.2263101488351822, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 512.125, "epoch": 0.5019815059445178, "grad_norm": 8.82755184173584, "kl": 3.04296875, "learning_rate": 6.244992442828585e-07, "loss": 0.3686, "reward": 0.7734375, "reward_std": 0.1519293300807476, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 560.03125, "epoch": 0.5046235138705416, "grad_norm": 15.707466125488281, "kl": 3.029296875, "learning_rate": 6.203955092681039e-07, "loss": 0.3194, "reward": 1.0703125, "reward_std": 0.1986095793545246, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 768.546875, "epoch": 0.5072655217965654, "grad_norm": 11.438809394836426, "kl": 2.88671875, "learning_rate": 6.162857641245869e-07, "loss": 0.6017, "reward": 1.28125, "reward_std": 0.21250617876648903, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 408.859375, "epoch": 0.5099075297225891, "grad_norm": 5.250596523284912, "kl": 1.41015625, "learning_rate": 6.12170359728347e-07, "loss": 0.2562, "reward": 1.33203125, "reward_std": 0.20339645817875862, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 460.921875, "epoch": 0.512549537648613, "grad_norm": 8.758655548095703, "kl": 1.962890625, "learning_rate": 6.080496474385916e-07, "loss": 0.34, "reward": 0.79296875, "reward_std": 0.19175675138831139, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 426.734375, "epoch": 0.5151915455746368, "grad_norm": 13.022716522216797, "kl": 1.361328125, "learning_rate": 6.039239790676974e-07, "loss": 0.49, "reward": 1.1484375, "reward_std": 0.2307521291077137, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6484375, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 519.953125, "epoch": 0.5178335535006605, "grad_norm": 14.834174156188965, "kl": 2.318359375, "learning_rate": 5.997937068511754e-07, "loss": 0.1528, "reward": 1.06640625, "reward_std": 0.14010578021407127, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 568.75, "epoch": 0.5204755614266843, "grad_norm": 10.123536109924316, "kl": 2.12109375, "learning_rate": 5.956591834175964e-07, "loss": 0.5013, "reward": 1.31640625, "reward_std": 0.21957488358020782, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 658.484375, "epoch": 0.523117569352708, "grad_norm": 6.424520015716553, "kl": 3.1796875, "learning_rate": 5.915207617584858e-07, "loss": 0.4787, "reward": 1.3125, "reward_std": 0.22040452808141708, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 594.515625, "epoch": 0.5257595772787318, "grad_norm": 5.053133010864258, "kl": 2.666015625, "learning_rate": 5.873787951981868e-07, "loss": 0.4661, "reward": 0.75390625, "reward_std": 0.17793777957558632, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.50390625, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 339.03125, "epoch": 0.5284015852047557, "grad_norm": 4.2198944091796875, "kl": 1.740234375, "learning_rate": 5.832336373636933e-07, "loss": 0.3366, "reward": 1.28515625, "reward_std": 0.17389780096709728, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 468.40625, "epoch": 0.5310435931307794, "grad_norm": 2.89648175239563, "kl": 1.6396484375, "learning_rate": 5.790856421544598e-07, "loss": 0.3048, "reward": 1.5859375, "reward_std": 0.19600137695670128, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 475.765625, "epoch": 0.5336856010568032, "grad_norm": 6.781806468963623, "kl": 2.189453125, "learning_rate": 5.749351637121865e-07, "loss": 0.3492, "reward": 0.828125, "reward_std": 0.20571819692850113, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 503.40625, "epoch": 0.5363276089828269, "grad_norm": 3.5012331008911133, "kl": 2.72265625, "learning_rate": 5.707825563905828e-07, "loss": 0.4152, "reward": 1.30078125, "reward_std": 0.17533257603645325, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 374.390625, "epoch": 0.5389696169088507, "grad_norm": 16.517194747924805, "kl": 1.6416015625, "learning_rate": 5.666281747251153e-07, "loss": 0.4345, "reward": 1.2890625, "reward_std": 0.18729007616639137, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 344.546875, "epoch": 0.5416116248348745, "grad_norm": 4.214947700500488, "kl": 1.6279296875, "learning_rate": 5.624723734027373e-07, "loss": 0.3469, "reward": 1.01171875, "reward_std": 0.1350011769682169, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 640.53125, "epoch": 0.5442536327608983, "grad_norm": 4.432642936706543, "kl": 2.634765625, "learning_rate": 5.583155072316085e-07, "loss": 0.3449, "reward": 1.01953125, "reward_std": 0.14237725362181664, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 346.4375, "epoch": 0.5468956406869221, "grad_norm": 6.426868915557861, "kl": 2.21875, "learning_rate": 5.541579311108009e-07, "loss": 0.4081, "reward": 1.33203125, "reward_std": 0.20600395277142525, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 599.890625, "epoch": 0.5495376486129459, "grad_norm": 9.497568130493164, "kl": 2.8671875, "learning_rate": 5.5e-07, "loss": 0.594, "reward": 1.0390625, "reward_std": 0.2189657799899578, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 587.703125, "epoch": 0.5521796565389696, "grad_norm": 2.5981221199035645, "kl": 2.576171875, "learning_rate": 5.458420688891992e-07, "loss": 0.3634, "reward": 1.34765625, "reward_std": 0.2173020839691162, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 251.59375, "epoch": 0.5548216644649934, "grad_norm": 12.541109085083008, "kl": 1.94140625, "learning_rate": 5.416844927683916e-07, "loss": 0.482, "reward": 1.33984375, "reward_std": 0.22426774725317955, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 353.359375, "epoch": 0.5574636723910171, "grad_norm": 21.176788330078125, "kl": 2.33203125, "learning_rate": 5.375276265972627e-07, "loss": 0.2879, "reward": 1.05078125, "reward_std": 0.18691154941916466, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 407.1875, "epoch": 0.560105680317041, "grad_norm": 4.283320903778076, "kl": 2.701171875, "learning_rate": 5.333718252748849e-07, "loss": 0.3272, "reward": 1.5546875, "reward_std": 0.1786573100835085, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 391.40625, "epoch": 0.5627476882430648, "grad_norm": 7.3552470207214355, "kl": 2.76953125, "learning_rate": 5.292174436094172e-07, "loss": 0.4091, "reward": 1.05859375, "reward_std": 0.19953873381018639, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 599.421875, "epoch": 0.5653896961690885, "grad_norm": 7.531975746154785, "kl": 4.07421875, "learning_rate": 5.250648362878135e-07, "loss": 0.6474, "reward": 1.3359375, "reward_std": 0.22002986446022987, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 442.046875, "epoch": 0.5680317040951123, "grad_norm": 9.658491134643555, "kl": 2.96875, "learning_rate": 5.209143578455401e-07, "loss": 0.3931, "reward": 1.31640625, "reward_std": 0.21046040952205658, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 323.140625, "epoch": 0.570673712021136, "grad_norm": 16.756044387817383, "kl": 2.46484375, "learning_rate": 5.167663626363066e-07, "loss": 0.1497, "reward": 1.328125, "reward_std": 0.19799000024795532, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 485.28125, "epoch": 0.5733157199471598, "grad_norm": 14.802947998046875, "kl": 2.94921875, "learning_rate": 5.126212048018133e-07, "loss": 0.3226, "reward": 0.5546875, "reward_std": 0.17373281717300415, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 553.296875, "epoch": 0.5759577278731837, "grad_norm": 6.547313213348389, "kl": 3.35546875, "learning_rate": 5.084792382415141e-07, "loss": 0.7209, "reward": 0.5703125, "reward_std": 0.20446551591157913, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 580.078125, "epoch": 0.5785997357992074, "grad_norm": 7.502042293548584, "kl": 2.875, "learning_rate": 5.043408165824037e-07, "loss": 0.522, "reward": 1.07421875, "reward_std": 0.2559613697230816, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 291.125, "epoch": 0.5812417437252312, "grad_norm": 9.088134765625, "kl": 1.806640625, "learning_rate": 5.002062931488247e-07, "loss": 0.5338, "reward": 0.8046875, "reward_std": 0.18990932404994965, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 533.640625, "epoch": 0.583883751651255, "grad_norm": 11.220687866210938, "kl": 2.3984375, "learning_rate": 4.960760209323026e-07, "loss": 0.6041, "reward": 0.5234375, "reward_std": 0.19436774030327797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 612.890625, "epoch": 0.5865257595772787, "grad_norm": 6.296652317047119, "kl": 3.07421875, "learning_rate": 4.919503525614086e-07, "loss": 0.5521, "reward": 0.76953125, "reward_std": 0.18084516376256943, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51953125, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 265.609375, "epoch": 0.5891677675033025, "grad_norm": 3.475614309310913, "kl": 1.50390625, "learning_rate": 4.878296402716531e-07, "loss": 0.2643, "reward": 1.38671875, "reward_std": 0.20747815072536469, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.63671875, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 714.1875, "epoch": 0.5918097754293263, "grad_norm": 6.395312786102295, "kl": 3.357421875, "learning_rate": 4.837142358754131e-07, "loss": 0.6176, "reward": 1.2734375, "reward_std": 0.21194355189800262, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 414.859375, "epoch": 0.5944517833553501, "grad_norm": 6.891757488250732, "kl": 2.8984375, "learning_rate": 4.79604490731896e-07, "loss": 0.42, "reward": 1.06640625, "reward_std": 0.2256414033472538, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 581.984375, "epoch": 0.5970937912813739, "grad_norm": 7.385695934295654, "kl": 3.4140625, "learning_rate": 4.755007557171414e-07, "loss": 0.6208, "reward": 1.05078125, "reward_std": 0.19489648565649986, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 346.296875, "epoch": 0.5997357992073976, "grad_norm": 5.268566608428955, "kl": 2.427734375, "learning_rate": 4.7140338119406455e-07, "loss": 0.3306, "reward": 1.109375, "reward_std": 0.22719038277864456, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 454.625, "epoch": 0.6023778071334214, "grad_norm": 11.538866996765137, "kl": 2.423828125, "learning_rate": 4.6731271698254326e-07, "loss": 0.664, "reward": 1.109375, "reward_std": 0.21347813308238983, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 398.4375, "epoch": 0.6050198150594451, "grad_norm": 10.027405738830566, "kl": 2.166015625, "learning_rate": 4.632291123295524e-07, "loss": 0.3504, "reward": 1.3125, "reward_std": 0.2073436863720417, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 778.328125, "epoch": 0.607661822985469, "grad_norm": 8.903005599975586, "kl": 4.5234375, "learning_rate": 4.5915291587934547e-07, "loss": 0.6184, "reward": 1.0234375, "reward_std": 0.21458512544631958, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 496.296875, "epoch": 0.6103038309114928, "grad_norm": 19.55433464050293, "kl": 4.23046875, "learning_rate": 4.5508447564368856e-07, "loss": 0.6321, "reward": 1.33984375, "reward_std": 0.22301983460783958, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 686.265625, "epoch": 0.6129458388375165, "grad_norm": 6.192388534545898, "kl": 3.7890625, "learning_rate": 4.510241389721493e-07, "loss": 0.5918, "reward": 1.5859375, "reward_std": 0.2616988569498062, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 540.578125, "epoch": 0.6155878467635403, "grad_norm": 7.43271017074585, "kl": 3.13671875, "learning_rate": 4.4697225252243976e-07, "loss": 0.6237, "reward": 1.3515625, "reward_std": 0.24065708369016647, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 521.265625, "epoch": 0.618229854689564, "grad_norm": 7.898358345031738, "kl": 2.81640625, "learning_rate": 4.4292916223082165e-07, "loss": 0.5285, "reward": 1.3046875, "reward_std": 0.2356991246342659, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 493.28125, "epoch": 0.6208718626155878, "grad_norm": 10.038056373596191, "kl": 2.90234375, "learning_rate": 4.388952132825701e-07, "loss": 0.2489, "reward": 1.140625, "reward_std": 0.2295135334134102, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.640625, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 431.984375, "epoch": 0.6235138705416117, "grad_norm": 4.178317546844482, "kl": 2.68359375, "learning_rate": 4.3487075008250397e-07, "loss": 0.4859, "reward": 0.79296875, "reward_std": 0.2021397091448307, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 698.25, "epoch": 0.6261558784676354, "grad_norm": 7.887820243835449, "kl": 4.2421875, "learning_rate": 4.3085611622558084e-07, "loss": 0.6169, "reward": 1.28125, "reward_std": 0.21125948429107666, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 548.328125, "epoch": 0.6287978863936592, "grad_norm": 5.685881614685059, "kl": 2.59375, "learning_rate": 4.268516544675628e-07, "loss": 0.3334, "reward": 1.0625, "reward_std": 0.20200148969888687, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 414.75, "epoch": 0.631439894319683, "grad_norm": 11.868870735168457, "kl": 2.0859375, "learning_rate": 4.228577066957522e-07, "loss": 0.1258, "reward": 1.3671875, "reward_std": 0.22833861783146858, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 599.859375, "epoch": 0.6340819022457067, "grad_norm": 5.297094345092773, "kl": 2.8125, "learning_rate": 4.1887461389980394e-07, "loss": 0.3444, "reward": 1.046875, "reward_std": 0.22738776728510857, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 460.609375, "epoch": 0.6367239101717305, "grad_norm": 9.069931983947754, "kl": 2.166015625, "learning_rate": 4.149027161426113e-07, "loss": 0.5227, "reward": 1.34375, "reward_std": 0.21560321748256683, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 753.9375, "epoch": 0.6393659180977543, "grad_norm": 3.11356258392334, "kl": 2.849609375, "learning_rate": 4.1094235253127374e-07, "loss": 0.4795, "reward": 1.046875, "reward_std": 0.20162740349769592, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 626.859375, "epoch": 0.6420079260237781, "grad_norm": 4.849280834197998, "kl": 2.39453125, "learning_rate": 4.069938611881443e-07, "loss": 0.5037, "reward": 0.796875, "reward_std": 0.18199804052710533, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 552.296875, "epoch": 0.6446499339498019, "grad_norm": 5.1860456466674805, "kl": 2.4404296875, "learning_rate": 4.030575792219626e-07, "loss": 0.3665, "reward": 1.296875, "reward_std": 0.1943066604435444, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 638.5625, "epoch": 0.6472919418758256, "grad_norm": 9.586490631103516, "kl": 2.599609375, "learning_rate": 3.9913384269907293e-07, "loss": 0.2958, "reward": 1.33203125, "reward_std": 0.22680200263857841, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 608.640625, "epoch": 0.6499339498018494, "grad_norm": 7.131601810455322, "kl": 2.166015625, "learning_rate": 3.952229866147323e-07, "loss": 0.2385, "reward": 1.375, "reward_std": 0.2418774701654911, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.625, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 661.015625, "epoch": 0.6525759577278731, "grad_norm": 5.848790645599365, "kl": 2.306640625, "learning_rate": 3.913253448645103e-07, "loss": 0.4711, "reward": 1.08203125, "reward_std": 0.22584940120577812, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 531.96875, "epoch": 0.655217965653897, "grad_norm": 5.778437614440918, "kl": 1.859375, "learning_rate": 3.8744125021578123e-07, "loss": 0.3466, "reward": 1.2734375, "reward_std": 0.1622530035674572, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 525.40625, "epoch": 0.6578599735799208, "grad_norm": 3.1933047771453857, "kl": 1.833984375, "learning_rate": 3.835710342793139e-07, "loss": 0.2862, "reward": 1.30078125, "reward_std": 0.15551739931106567, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 480.75, "epoch": 0.6605019815059445, "grad_norm": 8.949792861938477, "kl": 1.197265625, "learning_rate": 3.797150274809604e-07, "loss": 0.326, "reward": 1.3359375, "reward_std": 0.2217497080564499, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 428.203125, "epoch": 0.6631439894319683, "grad_norm": 3.1499345302581787, "kl": 1.2763671875, "learning_rate": 3.7587355903344466e-07, "loss": 0.1597, "reward": 0.875, "reward_std": 0.21982388943433762, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.625, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 369.6875, "epoch": 0.665785997357992, "grad_norm": 4.168592929840088, "kl": 1.3583984375, "learning_rate": 3.7204695690825593e-07, "loss": 0.1939, "reward": 1.28125, "reward_std": 0.1477414984256029, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 544.015625, "epoch": 0.6684280052840158, "grad_norm": 7.520803451538086, "kl": 1.921875, "learning_rate": 3.682355478076473e-07, "loss": 0.2638, "reward": 0.82421875, "reward_std": 0.2656807042658329, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 561.34375, "epoch": 0.6710700132100397, "grad_norm": 6.172038555145264, "kl": 2.318359375, "learning_rate": 3.6443965713674354e-07, "loss": 0.3545, "reward": 1.02734375, "reward_std": 0.19002593867480755, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 521.8125, "epoch": 0.6737120211360634, "grad_norm": 6.321176528930664, "kl": 1.609375, "learning_rate": 3.606596089757583e-07, "loss": 0.3466, "reward": 1.58984375, "reward_std": 0.2514568492770195, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 307.46875, "epoch": 0.6763540290620872, "grad_norm": 4.846172332763672, "kl": 1.09765625, "learning_rate": 3.5689572605232597e-07, "loss": 0.2335, "reward": 1.3359375, "reward_std": 0.20273161679506302, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 468.5, "epoch": 0.678996036988111, "grad_norm": 12.14126968383789, "kl": 1.138671875, "learning_rate": 3.531483297139481e-07, "loss": 0.1721, "reward": 0.80078125, "reward_std": 0.1630447916686535, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 565.203125, "epoch": 0.6816380449141347, "grad_norm": 3.9592182636260986, "kl": 1.837890625, "learning_rate": 3.4941773990055777e-07, "loss": 0.2977, "reward": 1.10546875, "reward_std": 0.25015248730778694, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.60546875, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 480.484375, "epoch": 0.6842800528401585, "grad_norm": 9.579623222351074, "kl": 1.62109375, "learning_rate": 3.45704275117204e-07, "loss": 0.4312, "reward": 1.08203125, "reward_std": 0.24054544791579247, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 496.796875, "epoch": 0.6869220607661823, "grad_norm": 4.918056964874268, "kl": 1.14306640625, "learning_rate": 3.4200825240685914e-07, "loss": 0.1878, "reward": 1.1015625, "reward_std": 0.22064152732491493, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 515.96875, "epoch": 0.6895640686922061, "grad_norm": 11.338505744934082, "kl": 1.765625, "learning_rate": 3.3832998732335085e-07, "loss": 0.4868, "reward": 1.0859375, "reward_std": 0.21507646515965462, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 579.875, "epoch": 0.6922060766182299, "grad_norm": 10.862038612365723, "kl": 2.357421875, "learning_rate": 3.346697939044211e-07, "loss": 0.6303, "reward": 0.77734375, "reward_std": 0.20420579984784126, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.52734375, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 641.796875, "epoch": 0.6948480845442536, "grad_norm": 7.440125465393066, "kl": 2.716796875, "learning_rate": 3.310279846449147e-07, "loss": 0.5692, "reward": 0.83203125, "reward_std": 0.2302125133574009, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 490.625, "epoch": 0.6974900924702774, "grad_norm": 11.042434692382812, "kl": 1.890625, "learning_rate": 3.2740487047009954e-07, "loss": 0.575, "reward": 0.8203125, "reward_std": 0.21583595871925354, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 535.375, "epoch": 0.7001321003963011, "grad_norm": 9.307427406311035, "kl": 1.8515625, "learning_rate": 3.23800760709121e-07, "loss": 0.2549, "reward": 1.0625, "reward_std": 0.19687864929437637, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 573.34375, "epoch": 0.702774108322325, "grad_norm": 4.253864765167236, "kl": 2.693359375, "learning_rate": 3.2021596306859195e-07, "loss": 0.4737, "reward": 0.8125, "reward_std": 0.1992315910756588, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 648.3125, "epoch": 0.7054161162483488, "grad_norm": 7.490243911743164, "kl": 3.2275390625, "learning_rate": 3.1665078360632254e-07, "loss": 0.377, "reward": 1.078125, "reward_std": 0.22863000631332397, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 489.296875, "epoch": 0.7080581241743725, "grad_norm": 4.917722702026367, "kl": 2.056640625, "learning_rate": 3.1310552670518987e-07, "loss": 0.3075, "reward": 1.12109375, "reward_std": 0.23855430632829666, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62109375, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 450.921875, "epoch": 0.7107001321003963, "grad_norm": 3.3728554248809814, "kl": 2.087890625, "learning_rate": 3.0958049504715024e-07, "loss": 0.3534, "reward": 1.07421875, "reward_std": 0.20587731152772903, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 550.140625, "epoch": 0.71334214002642, "grad_norm": 6.581082344055176, "kl": 2.974609375, "learning_rate": 3.0607598958739777e-07, "loss": 0.3513, "reward": 1.08203125, "reward_std": 0.21218016743659973, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 666.0625, "epoch": 0.7159841479524438, "grad_norm": 3.782729387283325, "kl": 3.47265625, "learning_rate": 3.0259230952866976e-07, "loss": 0.5161, "reward": 0.8515625, "reward_std": 0.266521442681551, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 589.734375, "epoch": 0.7186261558784677, "grad_norm": 12.191798210144043, "kl": 2.857421875, "learning_rate": 2.991297522957015e-07, "loss": 0.257, "reward": 1.05859375, "reward_std": 0.1889869049191475, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 476.515625, "epoch": 0.7212681638044914, "grad_norm": 5.739687442779541, "kl": 2.828125, "learning_rate": 2.9568861350983365e-07, "loss": 0.3424, "reward": 0.578125, "reward_std": 0.20889347046613693, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 606.921875, "epoch": 0.7239101717305152, "grad_norm": 8.41596794128418, "kl": 2.6015625, "learning_rate": 2.922691869637727e-07, "loss": 0.2616, "reward": 1.1171875, "reward_std": 0.24007226526737213, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 491.59375, "epoch": 0.726552179656539, "grad_norm": 4.1023335456848145, "kl": 1.966796875, "learning_rate": 2.88871764596508e-07, "loss": 0.2751, "reward": 1.3515625, "reward_std": 0.2043364755809307, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 565.84375, "epoch": 0.7291941875825627, "grad_norm": 5.3786540031433105, "kl": 2.720703125, "learning_rate": 2.854966364683872e-07, "loss": 0.3457, "reward": 0.828125, "reward_std": 0.20211807265877724, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 663.78125, "epoch": 0.7318361955085865, "grad_norm": 4.460934638977051, "kl": 3.201171875, "learning_rate": 2.821440907363516e-07, "loss": 0.4525, "reward": 0.8203125, "reward_std": 0.23223434761166573, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 641.265625, "epoch": 0.7344782034346103, "grad_norm": 16.07205581665039, "kl": 2.826171875, "learning_rate": 2.7881441362933464e-07, "loss": 0.334, "reward": 1.0625, "reward_std": 0.19014282897114754, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 676.0625, "epoch": 0.7371202113606341, "grad_norm": 11.935088157653809, "kl": 2.81640625, "learning_rate": 2.755078894238245e-07, "loss": 0.23, "reward": 0.78515625, "reward_std": 0.20001451671123505, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 603.546875, "epoch": 0.7397622192866579, "grad_norm": 9.738125801086426, "kl": 2.033203125, "learning_rate": 2.722248004195932e-07, "loss": 0.2735, "reward": 1.09375, "reward_std": 0.20607677102088928, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 732.6875, "epoch": 0.7424042272126816, "grad_norm": 7.031618118286133, "kl": 2.41015625, "learning_rate": 2.689654269155955e-07, "loss": 0.2994, "reward": 0.82421875, "reward_std": 0.20312216132879257, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 578.875, "epoch": 0.7450462351387054, "grad_norm": 5.801688194274902, "kl": 1.40234375, "learning_rate": 2.657300471860372e-07, "loss": 0.2932, "reward": 1.05078125, "reward_std": 0.20492718927562237, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55078125, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 721.109375, "epoch": 0.7476882430647291, "grad_norm": 11.897012710571289, "kl": 2.43359375, "learning_rate": 2.625189374566175e-07, "loss": 0.5936, "reward": 0.7578125, "reward_std": 0.15211578272283077, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 641.53125, "epoch": 0.750330250990753, "grad_norm": 5.453853130340576, "kl": 1.376953125, "learning_rate": 2.593323718809458e-07, "loss": 0.3039, "reward": 1.3671875, "reward_std": 0.2303219847381115, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 596.28125, "epoch": 0.7529722589167768, "grad_norm": 5.665752410888672, "kl": 1.35546875, "learning_rate": 2.561706225171352e-07, "loss": 0.3616, "reward": 1.04296875, "reward_std": 0.17159553244709969, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 536.78125, "epoch": 0.7556142668428005, "grad_norm": 3.726806879043579, "kl": 1.5693359375, "learning_rate": 2.5303395930457494e-07, "loss": 0.2881, "reward": 1.3203125, "reward_std": 0.2022528052330017, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 495.984375, "epoch": 0.7582562747688243, "grad_norm": 3.6658847332000732, "kl": 1.1884765625, "learning_rate": 2.499226500408845e-07, "loss": 0.1181, "reward": 1.1171875, "reward_std": 0.1793758161365986, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 859.984375, "epoch": 0.760898282694848, "grad_norm": 4.845893383026123, "kl": 2.955078125, "learning_rate": 2.4683696035904926e-07, "loss": 0.4852, "reward": 1.0078125, "reward_std": 0.1604960411787033, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5078125, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 440.8125, "epoch": 0.7635402906208718, "grad_norm": 2.4910755157470703, "kl": 0.85302734375, "learning_rate": 2.437771537047423e-07, "loss": 0.3161, "reward": 1.07421875, "reward_std": 0.2174788936972618, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 669.734375, "epoch": 0.7661822985468957, "grad_norm": 4.620151519775391, "kl": 1.90234375, "learning_rate": 2.407434913138318e-07, "loss": 0.3675, "reward": 0.5859375, "reward_std": 0.22324015572667122, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 531.953125, "epoch": 0.7688243064729194, "grad_norm": 11.40556526184082, "kl": 1.4501953125, "learning_rate": 2.377362321900777e-07, "loss": 0.0233, "reward": 1.36328125, "reward_std": 0.21594615280628204, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 696.34375, "epoch": 0.7714663143989432, "grad_norm": 3.5709707736968994, "kl": 1.853515625, "learning_rate": 2.3475563308301908e-07, "loss": 0.2536, "reward": 0.84765625, "reward_std": 0.20635812729597092, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 604.40625, "epoch": 0.774108322324967, "grad_norm": 6.535892486572266, "kl": 1.3740234375, "learning_rate": 2.3180194846605364e-07, "loss": 0.1969, "reward": 1.1171875, "reward_std": 0.23528173938393593, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 703.734375, "epoch": 0.7767503302509907, "grad_norm": 6.631422996520996, "kl": 2.017578125, "learning_rate": 2.288754305147115e-07, "loss": 0.3918, "reward": 1.296875, "reward_std": 0.20271231979131699, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 550.125, "epoch": 0.7793923381770145, "grad_norm": 5.805858612060547, "kl": 1.369140625, "learning_rate": 2.259763290851255e-07, "loss": 0.3276, "reward": 1.0625, "reward_std": 0.18768509849905968, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 776.859375, "epoch": 0.7820343461030383, "grad_norm": 6.1796135902404785, "kl": 2.36328125, "learning_rate": 2.231048916926992e-07, "loss": 0.2911, "reward": 1.3203125, "reward_std": 0.2180866338312626, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 459.90625, "epoch": 0.7846763540290621, "grad_norm": 4.840709686279297, "kl": 1.15234375, "learning_rate": 2.2026136349097495e-07, "loss": 0.2601, "reward": 0.86328125, "reward_std": 0.21641594916582108, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 626.34375, "epoch": 0.7873183619550859, "grad_norm": 4.876105308532715, "kl": 2.0615234375, "learning_rate": 2.1744598725070347e-07, "loss": 0.403, "reward": 1.28515625, "reward_std": 0.17794826440513134, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 560.78125, "epoch": 0.7899603698811096, "grad_norm": 5.7457451820373535, "kl": 1.310546875, "learning_rate": 2.146590033391168e-07, "loss": 0.259, "reward": 1.32421875, "reward_std": 0.20343545079231262, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 666.5625, "epoch": 0.7926023778071334, "grad_norm": 4.766579627990723, "kl": 1.6201171875, "learning_rate": 2.11900649699407e-07, "loss": 0.1752, "reward": 1.109375, "reward_std": 0.2358247935771942, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 631.734375, "epoch": 0.7952443857331571, "grad_norm": 3.2293262481689453, "kl": 1.62890625, "learning_rate": 2.0917116183041074e-07, "loss": 0.2575, "reward": 1.33984375, "reward_std": 0.22996815666556358, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 740.484375, "epoch": 0.797886393659181, "grad_norm": 3.1481125354766846, "kl": 2.294921875, "learning_rate": 2.0647077276650366e-07, "loss": 0.3915, "reward": 0.828125, "reward_std": 0.22289753332734108, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 472.625, "epoch": 0.8005284015852048, "grad_norm": 14.101240158081055, "kl": 1.4130859375, "learning_rate": 2.037997130577045e-07, "loss": 0.5247, "reward": 0.86328125, "reward_std": 0.24362235516309738, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 579.9375, "epoch": 0.8031704095112285, "grad_norm": 2.720280885696411, "kl": 1.720703125, "learning_rate": 2.0115821074999156e-07, "loss": 0.2849, "reward": 1.3359375, "reward_std": 0.21295345574617386, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 592.1875, "epoch": 0.8058124174372523, "grad_norm": 4.275804042816162, "kl": 1.8828125, "learning_rate": 1.9854649136583307e-07, "loss": 0.3054, "reward": 1.09765625, "reward_std": 0.222886573523283, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 468.53125, "epoch": 0.808454425363276, "grad_norm": 5.911637306213379, "kl": 1.4951171875, "learning_rate": 1.9596477788493254e-07, "loss": 0.2116, "reward": 1.109375, "reward_std": 0.2025398500263691, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 548.859375, "epoch": 0.8110964332892999, "grad_norm": 5.387912273406982, "kl": 1.599609375, "learning_rate": 1.9341329072519176e-07, "loss": 0.351, "reward": 0.6171875, "reward_std": 0.22198385372757912, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 573.5625, "epoch": 0.8137384412153237, "grad_norm": 5.202173709869385, "kl": 1.78125, "learning_rate": 1.9089224772389223e-07, "loss": 0.3517, "reward": 1.09375, "reward_std": 0.23804370686411858, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 652.28125, "epoch": 0.8163804491413474, "grad_norm": 4.832318305969238, "kl": 1.6396484375, "learning_rate": 1.884018641190968e-07, "loss": 0.2776, "reward": 1.69921875, "reward_std": 0.27570171654224396, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.69921875, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 653.1875, "epoch": 0.8190224570673712, "grad_norm": 5.5447211265563965, "kl": 2.037109375, "learning_rate": 1.8594235253127372e-07, "loss": 0.247, "reward": 1.046875, "reward_std": 0.21413858234882355, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 555.765625, "epoch": 0.821664464993395, "grad_norm": 10.55873966217041, "kl": 2.12109375, "learning_rate": 1.8351392294514326e-07, "loss": 0.4554, "reward": 1.2890625, "reward_std": 0.15378709696233273, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 454.5625, "epoch": 0.8243064729194187, "grad_norm": 2.300844669342041, "kl": 1.0029296875, "learning_rate": 1.8111678269175055e-07, "loss": 0.1514, "reward": 1.11328125, "reward_std": 0.2071386780589819, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 672.796875, "epoch": 0.8269484808454426, "grad_norm": 5.112921237945557, "kl": 2.4970703125, "learning_rate": 1.78751136430764e-07, "loss": 0.4767, "reward": 1.078125, "reward_std": 0.20955145359039307, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 444.484375, "epoch": 0.8295904887714664, "grad_norm": 4.7589569091796875, "kl": 1.42578125, "learning_rate": 1.7641718613300228e-07, "loss": 0.2688, "reward": 0.640625, "reward_std": 0.238662201911211, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.640625, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 814.015625, "epoch": 0.8322324966974901, "grad_norm": 10.08535385131836, "kl": 3.3828125, "learning_rate": 1.7411513106319058e-07, "loss": 0.3937, "reward": 0.78125, "reward_std": 0.20346562936902046, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53125, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 613.03125, "epoch": 0.8348745046235139, "grad_norm": 12.75075912475586, "kl": 2.302734375, "learning_rate": 1.7184516776294832e-07, "loss": 0.2161, "reward": 0.8828125, "reward_std": 0.26399971544742584, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.015625, "rewards/tag_count_reward": 0.6171875, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 529.59375, "epoch": 0.8375165125495376, "grad_norm": 9.653738975524902, "kl": 1.8046875, "learning_rate": 1.6960749003400892e-07, "loss": 0.1588, "reward": 0.84375, "reward_std": 0.16583861783146858, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 583.625, "epoch": 0.8401585204755614, "grad_norm": 4.075193405151367, "kl": 1.640625, "learning_rate": 1.674022889216737e-07, "loss": 0.1898, "reward": 1.3125, "reward_std": 0.1740352250635624, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 669.0625, "epoch": 0.8428005284015853, "grad_norm": 4.472336292266846, "kl": 2.494140625, "learning_rate": 1.6522975269850104e-07, "loss": 0.3193, "reward": 0.85546875, "reward_std": 0.21766092255711555, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.60546875, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 721.015625, "epoch": 0.845442536327609, "grad_norm": 6.250655174255371, "kl": 3.150390625, "learning_rate": 1.6309006684823239e-07, "loss": 0.5334, "reward": 1.0234375, "reward_std": 0.1688866000622511, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 675.921875, "epoch": 0.8480845442536328, "grad_norm": 1.8639191389083862, "kl": 2.427734375, "learning_rate": 1.6098341404995647e-07, "loss": 0.3932, "reward": 0.62890625, "reward_std": 0.24960599094629288, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62890625, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 600.421875, "epoch": 0.8507265521796565, "grad_norm": 4.137293338775635, "kl": 2.146484375, "learning_rate": 1.5890997416251224e-07, "loss": 0.351, "reward": 1.04296875, "reward_std": 0.1972101591527462, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.54296875, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 565.4375, "epoch": 0.8533685601056803, "grad_norm": 10.063258171081543, "kl": 1.478515625, "learning_rate": 1.5686992420913372e-07, "loss": 0.0225, "reward": 0.86328125, "reward_std": 0.2034553661942482, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 647.328125, "epoch": 0.8560105680317041, "grad_norm": 9.994471549987793, "kl": 2.05859375, "learning_rate": 1.5486343836233595e-07, "loss": 0.2504, "reward": 1.328125, "reward_std": 0.21247531473636627, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 690.234375, "epoch": 0.8586525759577279, "grad_norm": 9.103864669799805, "kl": 2.4921875, "learning_rate": 1.5289068792904495e-07, "loss": 0.483, "reward": 0.82421875, "reward_std": 0.2072843722999096, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 732.96875, "epoch": 0.8612945838837517, "grad_norm": 7.12535285949707, "kl": 1.994140625, "learning_rate": 1.5095184133597217e-07, "loss": 0.4435, "reward": 1.08984375, "reward_std": 0.2667161263525486, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 402.9375, "epoch": 0.8639365918097754, "grad_norm": 12.984781265258789, "kl": 1.0556640625, "learning_rate": 1.4904706411523448e-07, "loss": 0.3994, "reward": 1.32421875, "reward_std": 0.18335551768541336, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 828.71875, "epoch": 0.8665785997357992, "grad_norm": 12.132417678833008, "kl": 1.8466796875, "learning_rate": 1.47176518890222e-07, "loss": 0.182, "reward": 1.0390625, "reward_std": 0.16892226040363312, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5390625, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 822.609375, "epoch": 0.869220607661823, "grad_norm": 4.648046493530273, "kl": 2.0146484375, "learning_rate": 1.453403653617135e-07, "loss": 0.4329, "reward": 0.796875, "reward_std": 0.20767118781805038, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.546875, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 590.765625, "epoch": 0.8718626155878467, "grad_norm": 3.794019937515259, "kl": 1.7001953125, "learning_rate": 1.4353876029424202e-07, "loss": 0.371, "reward": 1.09375, "reward_std": 0.216283418238163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 712.96875, "epoch": 0.8745046235138706, "grad_norm": 5.229684352874756, "kl": 2.5732421875, "learning_rate": 1.4177185750271055e-07, "loss": 0.3925, "reward": 1.09375, "reward_std": 0.23571135476231575, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 646.328125, "epoch": 0.8771466314398944, "grad_norm": 5.142683506011963, "kl": 2.0380859375, "learning_rate": 1.400398078392602e-07, "loss": 0.4217, "reward": 0.828125, "reward_std": 0.2310670204460621, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 580.5, "epoch": 0.8797886393659181, "grad_norm": 9.393284797668457, "kl": 1.46875, "learning_rate": 1.3834275918039055e-07, "loss": 0.3297, "reward": 1.33984375, "reward_std": 0.18817520886659622, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 584.625, "epoch": 0.8824306472919419, "grad_norm": 6.900231838226318, "kl": 1.951171875, "learning_rate": 1.3668085641433462e-07, "loss": 0.2931, "reward": 0.86328125, "reward_std": 0.2518454007804394, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 430.796875, "epoch": 0.8850726552179656, "grad_norm": 9.600037574768066, "kl": 1.091796875, "learning_rate": 1.3505424142868897e-07, "loss": 0.3829, "reward": 1.41796875, "reward_std": 0.23616278544068336, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.66796875, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 631.875, "epoch": 0.8877146631439894, "grad_norm": 5.003634929656982, "kl": 1.6171875, "learning_rate": 1.334630530982997e-07, "loss": 0.2516, "reward": 1.3046875, "reward_std": 0.21555107831954956, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5546875, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 614.03125, "epoch": 0.8903566710700133, "grad_norm": 16.881690979003906, "kl": 1.8984375, "learning_rate": 1.319074272734056e-07, "loss": 0.0975, "reward": 1.08984375, "reward_std": 0.19282393157482147, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 666.0, "epoch": 0.892998678996037, "grad_norm": 5.620565414428711, "kl": 2.3154296875, "learning_rate": 1.303874967680399e-07, "loss": 0.2757, "reward": 1.62109375, "reward_std": 0.2326289601624012, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62109375, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 476.203125, "epoch": 0.8956406869220608, "grad_norm": 5.114979267120361, "kl": 1.1298828125, "learning_rate": 1.289033913486914e-07, "loss": 0.1405, "reward": 1.0703125, "reward_std": 0.1810067780315876, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 713.8125, "epoch": 0.8982826948480845, "grad_norm": 3.9009175300598145, "kl": 2.587890625, "learning_rate": 1.2745523772322461e-07, "loss": 0.4324, "reward": 1.31640625, "reward_std": 0.1788315549492836, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 642.796875, "epoch": 0.9009247027741083, "grad_norm": 5.570927619934082, "kl": 1.9873046875, "learning_rate": 1.2604315953006266e-07, "loss": 0.34, "reward": 0.86328125, "reward_std": 0.24456297606229782, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 637.578125, "epoch": 0.9035667107001321, "grad_norm": 8.186066627502441, "kl": 1.923828125, "learning_rate": 1.2466727732763125e-07, "loss": 0.4781, "reward": 0.8671875, "reward_std": 0.23449090123176575, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6171875, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 775.796875, "epoch": 0.9062087186261559, "grad_norm": 5.553122043609619, "kl": 3.125, "learning_rate": 1.2332770858406538e-07, "loss": 0.5849, "reward": 0.78515625, "reward_std": 0.21501468122005463, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.53515625, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 445.453125, "epoch": 0.9088507265521797, "grad_norm": 4.708739757537842, "kl": 1.2822265625, "learning_rate": 1.220245676671809e-07, "loss": 0.1695, "reward": 1.078125, "reward_std": 0.15526169911026955, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.578125, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 752.78125, "epoch": 0.9114927344782034, "grad_norm": 3.9118199348449707, "kl": 1.9716796875, "learning_rate": 1.2075796583470984e-07, "loss": 0.3416, "reward": 1.06640625, "reward_std": 0.21211567521095276, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 651.375, "epoch": 0.9141347424042272, "grad_norm": 5.419198513031006, "kl": 2.326171875, "learning_rate": 1.1952801122480167e-07, "loss": 0.2937, "reward": 0.59765625, "reward_std": 0.2001628838479519, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 662.203125, "epoch": 0.916776750330251, "grad_norm": 10.185606002807617, "kl": 2.2119140625, "learning_rate": 1.183348088467908e-07, "loss": 0.2272, "reward": 1.01171875, "reward_std": 0.15968638472259045, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.51171875, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 476.359375, "epoch": 0.9194187582562747, "grad_norm": 5.287563323974609, "kl": 1.537109375, "learning_rate": 1.1717846057223143e-07, "loss": 0.1921, "reward": 0.60546875, "reward_std": 0.22014086320996284, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.60546875, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 593.1875, "epoch": 0.9220607661822986, "grad_norm": 4.420534133911133, "kl": 1.7568359375, "learning_rate": 1.1605906512619983e-07, "loss": 0.3432, "reward": 1.3515625, "reward_std": 0.23761418834328651, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 709.6875, "epoch": 0.9247027741083224, "grad_norm": 4.137857437133789, "kl": 2.36328125, "learning_rate": 1.1497671807886567e-07, "loss": 0.3999, "reward": 1.0703125, "reward_std": 0.19854220747947693, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 447.515625, "epoch": 0.9273447820343461, "grad_norm": 5.883572578430176, "kl": 1.359375, "learning_rate": 1.139315118373326e-07, "loss": 0.3009, "reward": 0.859375, "reward_std": 0.21957654133439064, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 535.09375, "epoch": 0.9299867899603699, "grad_norm": 9.422240257263184, "kl": 1.3564453125, "learning_rate": 1.1292353563774873e-07, "loss": 0.3162, "reward": 1.08984375, "reward_std": 0.22193554788827896, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 603.21875, "epoch": 0.9326287978863936, "grad_norm": 4.772337913513184, "kl": 2.2646484375, "learning_rate": 1.1195287553768821e-07, "loss": 0.2438, "reward": 0.62890625, "reward_std": 0.28237032890319824, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.61328125, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 714.09375, "epoch": 0.9352708058124174, "grad_norm": 9.603926658630371, "kl": 2.470703125, "learning_rate": 1.1101961440880352e-07, "loss": 0.3789, "reward": 1.05859375, "reward_std": 0.19248899817466736, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.55859375, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 628.40625, "epoch": 0.9379128137384413, "grad_norm": 16.06355857849121, "kl": 2.0009765625, "learning_rate": 1.1012383192975041e-07, "loss": 0.0823, "reward": 1.33203125, "reward_std": 0.18909762054681778, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 578.390625, "epoch": 0.940554821664465, "grad_norm": 3.9636921882629395, "kl": 1.8291015625, "learning_rate": 1.0926560457938536e-07, "loss": 0.2746, "reward": 1.3125, "reward_std": 0.2061732206493616, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5625, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 520.203125, "epoch": 0.9431968295904888, "grad_norm": 6.897830486297607, "kl": 1.431640625, "learning_rate": 1.084450056302357e-07, "loss": 0.1525, "reward": 0.83203125, "reward_std": 0.21859385818243027, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 524.21875, "epoch": 0.9458388375165125, "grad_norm": 11.090557098388672, "kl": 1.40234375, "learning_rate": 1.0766210514224419e-07, "loss": 0.0591, "reward": 1.1328125, "reward_std": 0.23101669549942017, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6328125, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 578.65625, "epoch": 0.9484808454425363, "grad_norm": 13.82530689239502, "kl": 2.111328125, "learning_rate": 1.0691696995678738e-07, "loss": 0.2682, "reward": 1.109375, "reward_std": 0.22573107481002808, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 720.625, "epoch": 0.9511228533685601, "grad_norm": 6.005599021911621, "kl": 2.166015625, "learning_rate": 1.0620966369096884e-07, "loss": 0.3217, "reward": 1.34375, "reward_std": 0.2211884669959545, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59375, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 664.5625, "epoch": 0.9537648612945839, "grad_norm": 2.9504928588867188, "kl": 1.896484375, "learning_rate": 1.0554024673218806e-07, "loss": 0.3339, "reward": 1.31640625, "reward_std": 0.21037080883979797, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 719.90625, "epoch": 0.9564068692206077, "grad_norm": 3.942823886871338, "kl": 1.5712890625, "learning_rate": 1.0490877623298431e-07, "loss": 0.3399, "reward": 0.8515625, "reward_std": 0.23859936743974686, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6015625, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 497.296875, "epoch": 0.9590488771466315, "grad_norm": 11.69743824005127, "kl": 1.6708984375, "learning_rate": 1.0431530610615772e-07, "loss": 0.1801, "reward": 1.37109375, "reward_std": 0.20750074833631516, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.62109375, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 662.953125, "epoch": 0.9616908850726552, "grad_norm": 5.648345470428467, "kl": 2.005859375, "learning_rate": 1.0375988702016576e-07, "loss": 0.3905, "reward": 0.8203125, "reward_std": 0.21815017238259315, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 556.640625, "epoch": 0.964332892998679, "grad_norm": 3.6928138732910156, "kl": 1.544921875, "learning_rate": 1.0324256639479797e-07, "loss": 0.1847, "reward": 1.3359375, "reward_std": 0.2146303877234459, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5859375, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 528.46875, "epoch": 0.9669749009247027, "grad_norm": 4.1989336013793945, "kl": 1.3134765625, "learning_rate": 1.0276338839712688e-07, "loss": 0.2739, "reward": 0.859375, "reward_std": 0.212420754134655, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.609375, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 807.125, "epoch": 0.9696169088507266, "grad_norm": 5.855282306671143, "kl": 2.8173828125, "learning_rate": 1.023223939377375e-07, "loss": 0.3144, "reward": 0.83203125, "reward_std": 0.2185688391327858, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58203125, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 711.046875, "epoch": 0.9722589167767504, "grad_norm": 6.813151836395264, "kl": 1.77734375, "learning_rate": 1.0191962066723448e-07, "loss": 0.1714, "reward": 1.3203125, "reward_std": 0.18526797741651535, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5703125, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 835.46875, "epoch": 0.9749009247027741, "grad_norm": 4.6733317375183105, "kl": 2.62109375, "learning_rate": 1.0155510297302745e-07, "loss": 0.4741, "reward": 0.7265625, "reward_std": 0.1361106839030981, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4765625, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 501.609375, "epoch": 0.9775429326287979, "grad_norm": 7.580297946929932, "kl": 1.306640625, "learning_rate": 1.0122887197639539e-07, "loss": 0.106, "reward": 0.8828125, "reward_std": 0.21267065405845642, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.6328125, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 730.546875, "epoch": 0.9801849405548216, "grad_norm": 2.7990424633026123, "kl": 1.625, "learning_rate": 1.0094095552982936e-07, "loss": 0.1954, "reward": 1.06640625, "reward_std": 0.15350224822759628, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 634.125, "epoch": 0.9828269484808454, "grad_norm": 5.10625696182251, "kl": 1.578125, "learning_rate": 1.0069137821465474e-07, "loss": 0.3279, "reward": 1.59765625, "reward_std": 0.24609044939279556, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 521.46875, "epoch": 0.9854689564068693, "grad_norm": 2.8827366828918457, "kl": 1.173828125, "learning_rate": 1.0048016133893242e-07, "loss": 0.2295, "reward": 0.81640625, "reward_std": 0.1789581961929798, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 528.515625, "epoch": 0.988110964332893, "grad_norm": 4.933093070983887, "kl": 1.3515625, "learning_rate": 1.0030732293563969e-07, "loss": 0.1593, "reward": 1.31640625, "reward_std": 0.18777159228920937, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.56640625, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 486.390625, "epoch": 0.9907529722589168, "grad_norm": 5.345139980316162, "kl": 1.306640625, "learning_rate": 1.0017287776113066e-07, "loss": 0.2942, "reward": 1.34765625, "reward_std": 0.23156387358903885, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.59765625, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 832.59375, "epoch": 0.9933949801849405, "grad_norm": 5.978093147277832, "kl": 2.80859375, "learning_rate": 1.0007683729387628e-07, "loss": 0.562, "reward": 0.7734375, "reward_std": 0.20706837996840477, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5234375, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 601.359375, "epoch": 0.9960369881109643, "grad_norm": 4.996700763702393, "kl": 1.537109375, "learning_rate": 1.0001920973348446e-07, "loss": 0.3616, "reward": 1.33984375, "reward_std": 0.2210528589785099, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.58984375, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 651.0499954223633, "epoch": 0.9986789960369881, "grad_norm": 10.63793659210205, "kl": 1.486328125, "learning_rate": 1e-07, "loss": 0.16, "reward": 1.32421875, "reward_std": 0.1949087455868721, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.57421875, "step": 378 }, { "epoch": 0.9986789960369881, "step": 378, "total_flos": 0.0, "train_loss": 0.3501640140083889, "train_runtime": 20695.6892, "train_samples_per_second": 0.073, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 378, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }