diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,34151 @@ +{ + "best_global_step": 16976, + "best_metric": 0.32866472005844116, + "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_boolq_1755555080/checkpoint-16976", + "epoch": 10.0, + "eval_steps": 1061, + "global_step": 21210, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0023573785950023575, + "grad_norm": 2.643651008605957, + "learning_rate": 9.42951438000943e-08, + "loss": 15.5105, + "num_input_tokens_seen": 5184, + "step": 5 + }, + { + "epoch": 0.004714757190004715, + "grad_norm": 2.630746603012085, + "learning_rate": 2.1216407355021216e-07, + "loss": 15.385, + "num_input_tokens_seen": 9600, + "step": 10 + }, + { + "epoch": 0.007072135785007072, + "grad_norm": 3.1535065174102783, + "learning_rate": 3.300330033003301e-07, + "loss": 15.3491, + "num_input_tokens_seen": 14336, + "step": 15 + }, + { + "epoch": 0.00942951438000943, + "grad_norm": 2.1575894355773926, + "learning_rate": 4.479019330504479e-07, + "loss": 15.1407, + "num_input_tokens_seen": 19680, + "step": 20 + }, + { + "epoch": 0.011786892975011787, + "grad_norm": 2.4391679763793945, + "learning_rate": 5.657708628005658e-07, + "loss": 15.1715, + "num_input_tokens_seen": 24576, + "step": 25 + }, + { + "epoch": 0.014144271570014143, + "grad_norm": 2.369990348815918, + "learning_rate": 6.836397925506837e-07, + "loss": 15.0071, + "num_input_tokens_seen": 29216, + "step": 30 + }, + { + "epoch": 0.0165016501650165, + "grad_norm": 2.353562593460083, + "learning_rate": 8.015087223008016e-07, + "loss": 15.0718, + "num_input_tokens_seen": 33440, + "step": 35 + }, + { + "epoch": 0.01885902876001886, + "grad_norm": 2.579127311706543, + "learning_rate": 9.193776520509195e-07, + "loss": 15.1379, + "num_input_tokens_seen": 38240, + "step": 40 + }, + { + "epoch": 0.021216407355021217, + "grad_norm": 2.370831251144409, + "learning_rate": 1.0372465818010372e-06, + "loss": 15.2496, + "num_input_tokens_seen": 43296, + "step": 45 + }, + { + "epoch": 0.023573785950023574, + "grad_norm": 2.210038423538208, + "learning_rate": 1.155115511551155e-06, + "loss": 15.329, + "num_input_tokens_seen": 47776, + "step": 50 + }, + { + "epoch": 0.02593116454502593, + "grad_norm": 2.240624189376831, + "learning_rate": 1.272984441301273e-06, + "loss": 15.0365, + "num_input_tokens_seen": 52544, + "step": 55 + }, + { + "epoch": 0.028288543140028287, + "grad_norm": 2.4277873039245605, + "learning_rate": 1.390853371051391e-06, + "loss": 15.141, + "num_input_tokens_seen": 58176, + "step": 60 + }, + { + "epoch": 0.030645921735030647, + "grad_norm": 2.630810022354126, + "learning_rate": 1.5087223008015088e-06, + "loss": 15.2853, + "num_input_tokens_seen": 62912, + "step": 65 + }, + { + "epoch": 0.033003300330033, + "grad_norm": 2.4146625995635986, + "learning_rate": 1.6265912305516266e-06, + "loss": 15.4079, + "num_input_tokens_seen": 68992, + "step": 70 + }, + { + "epoch": 0.03536067892503536, + "grad_norm": 2.4192922115325928, + "learning_rate": 1.7444601603017446e-06, + "loss": 15.0766, + "num_input_tokens_seen": 73312, + "step": 75 + }, + { + "epoch": 0.03771805752003772, + "grad_norm": 2.2752370834350586, + "learning_rate": 1.8623290900518624e-06, + "loss": 15.3757, + "num_input_tokens_seen": 78976, + "step": 80 + }, + { + "epoch": 0.040075436115040074, + "grad_norm": 2.2322397232055664, + "learning_rate": 1.9801980198019803e-06, + "loss": 15.2733, + "num_input_tokens_seen": 83712, + "step": 85 + }, + { + "epoch": 0.042432814710042434, + "grad_norm": 2.5928115844726562, + "learning_rate": 2.0980669495520983e-06, + "loss": 15.0274, + "num_input_tokens_seen": 87936, + "step": 90 + }, + { + "epoch": 0.04479019330504479, + "grad_norm": 2.6536717414855957, + "learning_rate": 2.215935879302216e-06, + "loss": 15.2136, + "num_input_tokens_seen": 92704, + "step": 95 + }, + { + "epoch": 0.04714757190004715, + "grad_norm": 2.3167200088500977, + "learning_rate": 2.333804809052334e-06, + "loss": 15.0459, + "num_input_tokens_seen": 97664, + "step": 100 + }, + { + "epoch": 0.04950495049504951, + "grad_norm": 2.429687023162842, + "learning_rate": 2.4516737388024515e-06, + "loss": 15.1184, + "num_input_tokens_seen": 102240, + "step": 105 + }, + { + "epoch": 0.05186232909005186, + "grad_norm": 2.775273561477661, + "learning_rate": 2.56954266855257e-06, + "loss": 14.7747, + "num_input_tokens_seen": 107296, + "step": 110 + }, + { + "epoch": 0.05421970768505422, + "grad_norm": 2.7603209018707275, + "learning_rate": 2.687411598302688e-06, + "loss": 15.2233, + "num_input_tokens_seen": 112992, + "step": 115 + }, + { + "epoch": 0.056577086280056574, + "grad_norm": 2.133662462234497, + "learning_rate": 2.8052805280528055e-06, + "loss": 14.8471, + "num_input_tokens_seen": 118464, + "step": 120 + }, + { + "epoch": 0.058934464875058934, + "grad_norm": 2.200395107269287, + "learning_rate": 2.9231494578029235e-06, + "loss": 15.1151, + "num_input_tokens_seen": 124608, + "step": 125 + }, + { + "epoch": 0.061291843470061294, + "grad_norm": 3.3914458751678467, + "learning_rate": 3.041018387553041e-06, + "loss": 14.984, + "num_input_tokens_seen": 131008, + "step": 130 + }, + { + "epoch": 0.06364922206506365, + "grad_norm": 2.3240926265716553, + "learning_rate": 3.1588873173031586e-06, + "loss": 14.9986, + "num_input_tokens_seen": 136480, + "step": 135 + }, + { + "epoch": 0.066006600660066, + "grad_norm": 2.5211684703826904, + "learning_rate": 3.2767562470532766e-06, + "loss": 15.0037, + "num_input_tokens_seen": 141280, + "step": 140 + }, + { + "epoch": 0.06836397925506836, + "grad_norm": 2.210487127304077, + "learning_rate": 3.3946251768033946e-06, + "loss": 15.049, + "num_input_tokens_seen": 146400, + "step": 145 + }, + { + "epoch": 0.07072135785007072, + "grad_norm": 2.115323543548584, + "learning_rate": 3.5124941065535126e-06, + "loss": 14.8276, + "num_input_tokens_seen": 151392, + "step": 150 + }, + { + "epoch": 0.07307873644507308, + "grad_norm": 2.35345458984375, + "learning_rate": 3.6303630363036306e-06, + "loss": 14.9377, + "num_input_tokens_seen": 156768, + "step": 155 + }, + { + "epoch": 0.07543611504007544, + "grad_norm": 2.377488613128662, + "learning_rate": 3.748231966053748e-06, + "loss": 14.8136, + "num_input_tokens_seen": 161216, + "step": 160 + }, + { + "epoch": 0.07779349363507779, + "grad_norm": 2.3532979488372803, + "learning_rate": 3.866100895803866e-06, + "loss": 14.8993, + "num_input_tokens_seen": 166528, + "step": 165 + }, + { + "epoch": 0.08015087223008015, + "grad_norm": 2.314832925796509, + "learning_rate": 3.983969825553984e-06, + "loss": 14.7946, + "num_input_tokens_seen": 172064, + "step": 170 + }, + { + "epoch": 0.08250825082508251, + "grad_norm": 2.190019130706787, + "learning_rate": 4.101838755304102e-06, + "loss": 14.8965, + "num_input_tokens_seen": 177280, + "step": 175 + }, + { + "epoch": 0.08486562942008487, + "grad_norm": 2.393362522125244, + "learning_rate": 4.2197076850542205e-06, + "loss": 14.8466, + "num_input_tokens_seen": 181440, + "step": 180 + }, + { + "epoch": 0.08722300801508723, + "grad_norm": 2.347012996673584, + "learning_rate": 4.337576614804337e-06, + "loss": 14.7427, + "num_input_tokens_seen": 186496, + "step": 185 + }, + { + "epoch": 0.08958038661008957, + "grad_norm": 2.7251482009887695, + "learning_rate": 4.455445544554456e-06, + "loss": 14.6577, + "num_input_tokens_seen": 190688, + "step": 190 + }, + { + "epoch": 0.09193776520509193, + "grad_norm": 2.2400524616241455, + "learning_rate": 4.573314474304573e-06, + "loss": 14.6831, + "num_input_tokens_seen": 195648, + "step": 195 + }, + { + "epoch": 0.0942951438000943, + "grad_norm": 2.300693988800049, + "learning_rate": 4.691183404054692e-06, + "loss": 14.624, + "num_input_tokens_seen": 200512, + "step": 200 + }, + { + "epoch": 0.09665252239509665, + "grad_norm": 2.371460437774658, + "learning_rate": 4.809052333804809e-06, + "loss": 14.7886, + "num_input_tokens_seen": 205856, + "step": 205 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 2.434375762939453, + "learning_rate": 4.926921263554928e-06, + "loss": 14.5351, + "num_input_tokens_seen": 210784, + "step": 210 + }, + { + "epoch": 0.10136727958510136, + "grad_norm": 2.3240387439727783, + "learning_rate": 5.044790193305045e-06, + "loss": 14.4426, + "num_input_tokens_seen": 215360, + "step": 215 + }, + { + "epoch": 0.10372465818010372, + "grad_norm": 2.283273696899414, + "learning_rate": 5.162659123055163e-06, + "loss": 14.5348, + "num_input_tokens_seen": 220064, + "step": 220 + }, + { + "epoch": 0.10608203677510608, + "grad_norm": 2.428091526031494, + "learning_rate": 5.280528052805281e-06, + "loss": 14.3829, + "num_input_tokens_seen": 224352, + "step": 225 + }, + { + "epoch": 0.10843941537010844, + "grad_norm": 2.400937080383301, + "learning_rate": 5.398396982555399e-06, + "loss": 14.3783, + "num_input_tokens_seen": 229088, + "step": 230 + }, + { + "epoch": 0.1107967939651108, + "grad_norm": 2.460219144821167, + "learning_rate": 5.516265912305517e-06, + "loss": 14.2022, + "num_input_tokens_seen": 233824, + "step": 235 + }, + { + "epoch": 0.11315417256011315, + "grad_norm": 2.399306058883667, + "learning_rate": 5.634134842055634e-06, + "loss": 14.5094, + "num_input_tokens_seen": 238816, + "step": 240 + }, + { + "epoch": 0.11551155115511551, + "grad_norm": 2.666745901107788, + "learning_rate": 5.752003771805752e-06, + "loss": 14.3346, + "num_input_tokens_seen": 243328, + "step": 245 + }, + { + "epoch": 0.11786892975011787, + "grad_norm": 2.136908769607544, + "learning_rate": 5.86987270155587e-06, + "loss": 14.3158, + "num_input_tokens_seen": 248896, + "step": 250 + }, + { + "epoch": 0.12022630834512023, + "grad_norm": 2.224700689315796, + "learning_rate": 5.987741631305988e-06, + "loss": 14.1005, + "num_input_tokens_seen": 253440, + "step": 255 + }, + { + "epoch": 0.12258368694012259, + "grad_norm": 2.267634868621826, + "learning_rate": 6.105610561056106e-06, + "loss": 14.1072, + "num_input_tokens_seen": 258208, + "step": 260 + }, + { + "epoch": 0.12494106553512493, + "grad_norm": 2.246155023574829, + "learning_rate": 6.2234794908062235e-06, + "loss": 14.2652, + "num_input_tokens_seen": 262592, + "step": 265 + }, + { + "epoch": 0.1272984441301273, + "grad_norm": 2.1847457885742188, + "learning_rate": 6.341348420556342e-06, + "loss": 13.9799, + "num_input_tokens_seen": 267648, + "step": 270 + }, + { + "epoch": 0.12965582272512965, + "grad_norm": 2.4558422565460205, + "learning_rate": 6.4592173503064595e-06, + "loss": 14.0429, + "num_input_tokens_seen": 272384, + "step": 275 + }, + { + "epoch": 0.132013201320132, + "grad_norm": 2.427628755569458, + "learning_rate": 6.577086280056577e-06, + "loss": 13.8459, + "num_input_tokens_seen": 278464, + "step": 280 + }, + { + "epoch": 0.13437057991513437, + "grad_norm": 2.4750702381134033, + "learning_rate": 6.6949552098066954e-06, + "loss": 13.7397, + "num_input_tokens_seen": 282432, + "step": 285 + }, + { + "epoch": 0.13672795851013672, + "grad_norm": 2.3747713565826416, + "learning_rate": 6.812824139556813e-06, + "loss": 13.8468, + "num_input_tokens_seen": 287744, + "step": 290 + }, + { + "epoch": 0.1390853371051391, + "grad_norm": 2.435499429702759, + "learning_rate": 6.9306930693069314e-06, + "loss": 13.8202, + "num_input_tokens_seen": 291840, + "step": 295 + }, + { + "epoch": 0.14144271570014144, + "grad_norm": 2.2570104598999023, + "learning_rate": 7.048561999057049e-06, + "loss": 14.2269, + "num_input_tokens_seen": 296928, + "step": 300 + }, + { + "epoch": 0.1438000942951438, + "grad_norm": 2.300691604614258, + "learning_rate": 7.1664309288071666e-06, + "loss": 13.7237, + "num_input_tokens_seen": 302272, + "step": 305 + }, + { + "epoch": 0.14615747289014616, + "grad_norm": 2.092979907989502, + "learning_rate": 7.284299858557285e-06, + "loss": 13.7469, + "num_input_tokens_seen": 306688, + "step": 310 + }, + { + "epoch": 0.1485148514851485, + "grad_norm": 2.1555418968200684, + "learning_rate": 7.4021687883074026e-06, + "loss": 13.8634, + "num_input_tokens_seen": 310528, + "step": 315 + }, + { + "epoch": 0.15087223008015088, + "grad_norm": 2.420609474182129, + "learning_rate": 7.520037718057521e-06, + "loss": 13.8487, + "num_input_tokens_seen": 314912, + "step": 320 + }, + { + "epoch": 0.15322960867515323, + "grad_norm": 2.253328323364258, + "learning_rate": 7.637906647807639e-06, + "loss": 13.8084, + "num_input_tokens_seen": 319712, + "step": 325 + }, + { + "epoch": 0.15558698727015557, + "grad_norm": 2.200246572494507, + "learning_rate": 7.755775577557756e-06, + "loss": 13.4273, + "num_input_tokens_seen": 324704, + "step": 330 + }, + { + "epoch": 0.15794436586515795, + "grad_norm": 2.197524070739746, + "learning_rate": 7.873644507307874e-06, + "loss": 13.525, + "num_input_tokens_seen": 330112, + "step": 335 + }, + { + "epoch": 0.1603017444601603, + "grad_norm": 2.130683422088623, + "learning_rate": 7.991513437057991e-06, + "loss": 13.3198, + "num_input_tokens_seen": 337984, + "step": 340 + }, + { + "epoch": 0.16265912305516267, + "grad_norm": 2.399686098098755, + "learning_rate": 8.10938236680811e-06, + "loss": 13.3456, + "num_input_tokens_seen": 343136, + "step": 345 + }, + { + "epoch": 0.16501650165016502, + "grad_norm": 2.412811756134033, + "learning_rate": 8.227251296558228e-06, + "loss": 13.4426, + "num_input_tokens_seen": 348992, + "step": 350 + }, + { + "epoch": 0.16737388024516736, + "grad_norm": 2.2952287197113037, + "learning_rate": 8.345120226308346e-06, + "loss": 13.2101, + "num_input_tokens_seen": 353504, + "step": 355 + }, + { + "epoch": 0.16973125884016974, + "grad_norm": 2.624142646789551, + "learning_rate": 8.462989156058463e-06, + "loss": 13.2058, + "num_input_tokens_seen": 359104, + "step": 360 + }, + { + "epoch": 0.17208863743517208, + "grad_norm": 2.168287992477417, + "learning_rate": 8.58085808580858e-06, + "loss": 13.0201, + "num_input_tokens_seen": 363744, + "step": 365 + }, + { + "epoch": 0.17444601603017446, + "grad_norm": 2.1613357067108154, + "learning_rate": 8.6987270155587e-06, + "loss": 12.9421, + "num_input_tokens_seen": 368864, + "step": 370 + }, + { + "epoch": 0.1768033946251768, + "grad_norm": 2.101402521133423, + "learning_rate": 8.816595945308818e-06, + "loss": 13.2214, + "num_input_tokens_seen": 373824, + "step": 375 + }, + { + "epoch": 0.17916077322017915, + "grad_norm": 2.175086259841919, + "learning_rate": 8.934464875058935e-06, + "loss": 12.9887, + "num_input_tokens_seen": 378432, + "step": 380 + }, + { + "epoch": 0.18151815181518152, + "grad_norm": 2.290900945663452, + "learning_rate": 9.052333804809053e-06, + "loss": 12.9238, + "num_input_tokens_seen": 383552, + "step": 385 + }, + { + "epoch": 0.18387553041018387, + "grad_norm": 2.4292938709259033, + "learning_rate": 9.17020273455917e-06, + "loss": 12.9689, + "num_input_tokens_seen": 388768, + "step": 390 + }, + { + "epoch": 0.18623290900518624, + "grad_norm": 2.113936424255371, + "learning_rate": 9.28807166430929e-06, + "loss": 12.7957, + "num_input_tokens_seen": 393120, + "step": 395 + }, + { + "epoch": 0.1885902876001886, + "grad_norm": 2.426271438598633, + "learning_rate": 9.405940594059407e-06, + "loss": 12.7672, + "num_input_tokens_seen": 397856, + "step": 400 + }, + { + "epoch": 0.19094766619519093, + "grad_norm": 2.3151450157165527, + "learning_rate": 9.523809523809523e-06, + "loss": 12.4349, + "num_input_tokens_seen": 402848, + "step": 405 + }, + { + "epoch": 0.1933050447901933, + "grad_norm": 2.3244853019714355, + "learning_rate": 9.641678453559642e-06, + "loss": 12.599, + "num_input_tokens_seen": 407936, + "step": 410 + }, + { + "epoch": 0.19566242338519566, + "grad_norm": 2.133065700531006, + "learning_rate": 9.75954738330976e-06, + "loss": 12.2661, + "num_input_tokens_seen": 413024, + "step": 415 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 2.289931297302246, + "learning_rate": 9.87741631305988e-06, + "loss": 12.2521, + "num_input_tokens_seen": 418720, + "step": 420 + }, + { + "epoch": 0.20037718057520038, + "grad_norm": 2.15366792678833, + "learning_rate": 9.995285242809995e-06, + "loss": 12.4414, + "num_input_tokens_seen": 425440, + "step": 425 + }, + { + "epoch": 0.20273455917020272, + "grad_norm": 2.0883822441101074, + "learning_rate": 1.0113154172560113e-05, + "loss": 12.2757, + "num_input_tokens_seen": 430464, + "step": 430 + }, + { + "epoch": 0.2050919377652051, + "grad_norm": 2.1710004806518555, + "learning_rate": 1.0231023102310232e-05, + "loss": 12.2219, + "num_input_tokens_seen": 435776, + "step": 435 + }, + { + "epoch": 0.20744931636020744, + "grad_norm": 2.3987183570861816, + "learning_rate": 1.034889203206035e-05, + "loss": 12.6392, + "num_input_tokens_seen": 440352, + "step": 440 + }, + { + "epoch": 0.20980669495520982, + "grad_norm": 2.1219420433044434, + "learning_rate": 1.0466760961810467e-05, + "loss": 11.8193, + "num_input_tokens_seen": 444768, + "step": 445 + }, + { + "epoch": 0.21216407355021216, + "grad_norm": 2.136152744293213, + "learning_rate": 1.0584629891560585e-05, + "loss": 12.0273, + "num_input_tokens_seen": 450528, + "step": 450 + }, + { + "epoch": 0.2145214521452145, + "grad_norm": 2.274381160736084, + "learning_rate": 1.0702498821310702e-05, + "loss": 12.0953, + "num_input_tokens_seen": 454976, + "step": 455 + }, + { + "epoch": 0.21687883074021688, + "grad_norm": 2.2138774394989014, + "learning_rate": 1.0820367751060821e-05, + "loss": 11.869, + "num_input_tokens_seen": 459072, + "step": 460 + }, + { + "epoch": 0.21923620933521923, + "grad_norm": 2.2592530250549316, + "learning_rate": 1.0938236680810939e-05, + "loss": 12.0721, + "num_input_tokens_seen": 464320, + "step": 465 + }, + { + "epoch": 0.2215935879302216, + "grad_norm": 2.3082640171051025, + "learning_rate": 1.1056105610561057e-05, + "loss": 11.8343, + "num_input_tokens_seen": 469120, + "step": 470 + }, + { + "epoch": 0.22395096652522395, + "grad_norm": 2.112302303314209, + "learning_rate": 1.1173974540311174e-05, + "loss": 11.489, + "num_input_tokens_seen": 473920, + "step": 475 + }, + { + "epoch": 0.2263083451202263, + "grad_norm": 2.2736873626708984, + "learning_rate": 1.1291843470061292e-05, + "loss": 11.2041, + "num_input_tokens_seen": 477920, + "step": 480 + }, + { + "epoch": 0.22866572371522867, + "grad_norm": 2.2342586517333984, + "learning_rate": 1.1409712399811411e-05, + "loss": 11.6223, + "num_input_tokens_seen": 483040, + "step": 485 + }, + { + "epoch": 0.23102310231023102, + "grad_norm": 2.033524990081787, + "learning_rate": 1.1527581329561529e-05, + "loss": 11.2804, + "num_input_tokens_seen": 487872, + "step": 490 + }, + { + "epoch": 0.2333804809052334, + "grad_norm": 2.240748167037964, + "learning_rate": 1.1645450259311646e-05, + "loss": 11.478, + "num_input_tokens_seen": 492960, + "step": 495 + }, + { + "epoch": 0.23573785950023574, + "grad_norm": 2.256547212600708, + "learning_rate": 1.1763319189061764e-05, + "loss": 11.0732, + "num_input_tokens_seen": 498400, + "step": 500 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 2.0904388427734375, + "learning_rate": 1.1881188118811881e-05, + "loss": 11.1136, + "num_input_tokens_seen": 502784, + "step": 505 + }, + { + "epoch": 0.24045261669024046, + "grad_norm": 2.2460556030273438, + "learning_rate": 1.1999057048562e-05, + "loss": 11.0662, + "num_input_tokens_seen": 507776, + "step": 510 + }, + { + "epoch": 0.2428099952852428, + "grad_norm": 2.2506520748138428, + "learning_rate": 1.2116925978312118e-05, + "loss": 10.9356, + "num_input_tokens_seen": 512480, + "step": 515 + }, + { + "epoch": 0.24516737388024518, + "grad_norm": 2.071056365966797, + "learning_rate": 1.2234794908062236e-05, + "loss": 10.8913, + "num_input_tokens_seen": 517600, + "step": 520 + }, + { + "epoch": 0.24752475247524752, + "grad_norm": 2.0566811561584473, + "learning_rate": 1.2352663837812353e-05, + "loss": 10.664, + "num_input_tokens_seen": 523520, + "step": 525 + }, + { + "epoch": 0.24988213107024987, + "grad_norm": 2.092362642288208, + "learning_rate": 1.247053276756247e-05, + "loss": 10.6564, + "num_input_tokens_seen": 527584, + "step": 530 + }, + { + "epoch": 0.2522395096652522, + "grad_norm": 2.5252673625946045, + "learning_rate": 1.2588401697312588e-05, + "loss": 10.735, + "num_input_tokens_seen": 532736, + "step": 535 + }, + { + "epoch": 0.2545968882602546, + "grad_norm": 2.1862945556640625, + "learning_rate": 1.2706270627062708e-05, + "loss": 10.5884, + "num_input_tokens_seen": 537440, + "step": 540 + }, + { + "epoch": 0.25695426685525696, + "grad_norm": 2.2459464073181152, + "learning_rate": 1.2824139556812825e-05, + "loss": 10.505, + "num_input_tokens_seen": 543040, + "step": 545 + }, + { + "epoch": 0.2593116454502593, + "grad_norm": 2.3139092922210693, + "learning_rate": 1.2942008486562943e-05, + "loss": 10.4784, + "num_input_tokens_seen": 547520, + "step": 550 + }, + { + "epoch": 0.26166902404526166, + "grad_norm": 2.040045738220215, + "learning_rate": 1.305987741631306e-05, + "loss": 10.3584, + "num_input_tokens_seen": 552800, + "step": 555 + }, + { + "epoch": 0.264026402640264, + "grad_norm": 2.126457452774048, + "learning_rate": 1.3177746346063178e-05, + "loss": 10.4735, + "num_input_tokens_seen": 557376, + "step": 560 + }, + { + "epoch": 0.2663837812352664, + "grad_norm": 1.8756071329116821, + "learning_rate": 1.3295615275813297e-05, + "loss": 10.1024, + "num_input_tokens_seen": 563776, + "step": 565 + }, + { + "epoch": 0.26874115983026875, + "grad_norm": 2.1983907222747803, + "learning_rate": 1.3413484205563415e-05, + "loss": 9.9534, + "num_input_tokens_seen": 567712, + "step": 570 + }, + { + "epoch": 0.2710985384252711, + "grad_norm": 2.0713250637054443, + "learning_rate": 1.3531353135313532e-05, + "loss": 9.9451, + "num_input_tokens_seen": 572992, + "step": 575 + }, + { + "epoch": 0.27345591702027344, + "grad_norm": 2.521862268447876, + "learning_rate": 1.364922206506365e-05, + "loss": 10.0467, + "num_input_tokens_seen": 578816, + "step": 580 + }, + { + "epoch": 0.2758132956152758, + "grad_norm": 2.276503562927246, + "learning_rate": 1.3767090994813767e-05, + "loss": 9.7745, + "num_input_tokens_seen": 583552, + "step": 585 + }, + { + "epoch": 0.2781706742102782, + "grad_norm": 2.3050103187561035, + "learning_rate": 1.3884959924563887e-05, + "loss": 9.793, + "num_input_tokens_seen": 588864, + "step": 590 + }, + { + "epoch": 0.28052805280528054, + "grad_norm": 2.281316041946411, + "learning_rate": 1.4002828854314004e-05, + "loss": 9.5284, + "num_input_tokens_seen": 593248, + "step": 595 + }, + { + "epoch": 0.2828854314002829, + "grad_norm": 2.174571990966797, + "learning_rate": 1.4120697784064122e-05, + "loss": 9.707, + "num_input_tokens_seen": 599360, + "step": 600 + }, + { + "epoch": 0.28524280999528523, + "grad_norm": 2.06434965133667, + "learning_rate": 1.423856671381424e-05, + "loss": 9.6692, + "num_input_tokens_seen": 604352, + "step": 605 + }, + { + "epoch": 0.2876001885902876, + "grad_norm": 2.26080060005188, + "learning_rate": 1.4356435643564355e-05, + "loss": 9.2734, + "num_input_tokens_seen": 609952, + "step": 610 + }, + { + "epoch": 0.28995756718529, + "grad_norm": 2.514310359954834, + "learning_rate": 1.4474304573314476e-05, + "loss": 9.4453, + "num_input_tokens_seen": 614400, + "step": 615 + }, + { + "epoch": 0.2923149457802923, + "grad_norm": 2.150261878967285, + "learning_rate": 1.4592173503064594e-05, + "loss": 9.1623, + "num_input_tokens_seen": 619552, + "step": 620 + }, + { + "epoch": 0.29467232437529467, + "grad_norm": 1.999029278755188, + "learning_rate": 1.4710042432814711e-05, + "loss": 9.2835, + "num_input_tokens_seen": 624960, + "step": 625 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 2.210922956466675, + "learning_rate": 1.4827911362564827e-05, + "loss": 8.878, + "num_input_tokens_seen": 631520, + "step": 630 + }, + { + "epoch": 0.29938708156529936, + "grad_norm": 1.9600862264633179, + "learning_rate": 1.4945780292314945e-05, + "loss": 8.686, + "num_input_tokens_seen": 637568, + "step": 635 + }, + { + "epoch": 0.30174446016030176, + "grad_norm": 2.1410176753997803, + "learning_rate": 1.5063649222065066e-05, + "loss": 9.0203, + "num_input_tokens_seen": 642688, + "step": 640 + }, + { + "epoch": 0.3041018387553041, + "grad_norm": 2.1895792484283447, + "learning_rate": 1.5181518151815183e-05, + "loss": 8.3888, + "num_input_tokens_seen": 647392, + "step": 645 + }, + { + "epoch": 0.30645921735030646, + "grad_norm": 2.1009979248046875, + "learning_rate": 1.52993870815653e-05, + "loss": 8.3771, + "num_input_tokens_seen": 652800, + "step": 650 + }, + { + "epoch": 0.3088165959453088, + "grad_norm": 2.10996413230896, + "learning_rate": 1.5417256011315417e-05, + "loss": 8.6434, + "num_input_tokens_seen": 659328, + "step": 655 + }, + { + "epoch": 0.31117397454031115, + "grad_norm": 2.314079999923706, + "learning_rate": 1.5535124941065536e-05, + "loss": 8.2055, + "num_input_tokens_seen": 663488, + "step": 660 + }, + { + "epoch": 0.31353135313531355, + "grad_norm": 2.168898820877075, + "learning_rate": 1.5652993870815655e-05, + "loss": 8.3438, + "num_input_tokens_seen": 668192, + "step": 665 + }, + { + "epoch": 0.3158887317303159, + "grad_norm": 1.9399738311767578, + "learning_rate": 1.577086280056577e-05, + "loss": 8.4412, + "num_input_tokens_seen": 673568, + "step": 670 + }, + { + "epoch": 0.31824611032531824, + "grad_norm": 2.15460205078125, + "learning_rate": 1.588873173031589e-05, + "loss": 8.4649, + "num_input_tokens_seen": 681056, + "step": 675 + }, + { + "epoch": 0.3206034889203206, + "grad_norm": 2.0800178050994873, + "learning_rate": 1.6006600660066006e-05, + "loss": 7.976, + "num_input_tokens_seen": 685920, + "step": 680 + }, + { + "epoch": 0.32296086751532294, + "grad_norm": 1.919609785079956, + "learning_rate": 1.6124469589816126e-05, + "loss": 7.8989, + "num_input_tokens_seen": 690816, + "step": 685 + }, + { + "epoch": 0.32531824611032534, + "grad_norm": 1.9720211029052734, + "learning_rate": 1.6242338519566245e-05, + "loss": 7.7327, + "num_input_tokens_seen": 695648, + "step": 690 + }, + { + "epoch": 0.3276756247053277, + "grad_norm": 2.113879442214966, + "learning_rate": 1.636020744931636e-05, + "loss": 7.5978, + "num_input_tokens_seen": 700960, + "step": 695 + }, + { + "epoch": 0.33003300330033003, + "grad_norm": 1.831860899925232, + "learning_rate": 1.647807637906648e-05, + "loss": 7.6111, + "num_input_tokens_seen": 705856, + "step": 700 + }, + { + "epoch": 0.3323903818953324, + "grad_norm": 1.9531139135360718, + "learning_rate": 1.6595945308816596e-05, + "loss": 7.3174, + "num_input_tokens_seen": 710272, + "step": 705 + }, + { + "epoch": 0.3347477604903347, + "grad_norm": 1.6864482164382935, + "learning_rate": 1.6713814238566712e-05, + "loss": 7.1675, + "num_input_tokens_seen": 714816, + "step": 710 + }, + { + "epoch": 0.3371051390853371, + "grad_norm": 2.061335325241089, + "learning_rate": 1.6831683168316834e-05, + "loss": 7.448, + "num_input_tokens_seen": 720736, + "step": 715 + }, + { + "epoch": 0.33946251768033947, + "grad_norm": 1.8762348890304565, + "learning_rate": 1.694955209806695e-05, + "loss": 7.4959, + "num_input_tokens_seen": 726848, + "step": 720 + }, + { + "epoch": 0.3418198962753418, + "grad_norm": 2.4519174098968506, + "learning_rate": 1.706742102781707e-05, + "loss": 7.0165, + "num_input_tokens_seen": 732512, + "step": 725 + }, + { + "epoch": 0.34417727487034416, + "grad_norm": 2.054673194885254, + "learning_rate": 1.7185289957567185e-05, + "loss": 7.048, + "num_input_tokens_seen": 737952, + "step": 730 + }, + { + "epoch": 0.3465346534653465, + "grad_norm": 1.6277072429656982, + "learning_rate": 1.73031588873173e-05, + "loss": 6.8577, + "num_input_tokens_seen": 742752, + "step": 735 + }, + { + "epoch": 0.3488920320603489, + "grad_norm": 2.103745698928833, + "learning_rate": 1.7421027817067424e-05, + "loss": 6.7668, + "num_input_tokens_seen": 747968, + "step": 740 + }, + { + "epoch": 0.35124941065535126, + "grad_norm": 1.6316519975662231, + "learning_rate": 1.753889674681754e-05, + "loss": 6.5052, + "num_input_tokens_seen": 753632, + "step": 745 + }, + { + "epoch": 0.3536067892503536, + "grad_norm": 1.9552924633026123, + "learning_rate": 1.765676567656766e-05, + "loss": 6.575, + "num_input_tokens_seen": 759904, + "step": 750 + }, + { + "epoch": 0.35596416784535595, + "grad_norm": 1.7704380750656128, + "learning_rate": 1.7774634606317775e-05, + "loss": 6.8177, + "num_input_tokens_seen": 764960, + "step": 755 + }, + { + "epoch": 0.3583215464403583, + "grad_norm": 1.535231113433838, + "learning_rate": 1.789250353606789e-05, + "loss": 6.1887, + "num_input_tokens_seen": 769184, + "step": 760 + }, + { + "epoch": 0.3606789250353607, + "grad_norm": 1.5777666568756104, + "learning_rate": 1.8010372465818014e-05, + "loss": 6.2002, + "num_input_tokens_seen": 775104, + "step": 765 + }, + { + "epoch": 0.36303630363036304, + "grad_norm": 1.7237244844436646, + "learning_rate": 1.812824139556813e-05, + "loss": 5.9943, + "num_input_tokens_seen": 779744, + "step": 770 + }, + { + "epoch": 0.3653936822253654, + "grad_norm": 1.864912748336792, + "learning_rate": 1.8246110325318245e-05, + "loss": 6.3618, + "num_input_tokens_seen": 784832, + "step": 775 + }, + { + "epoch": 0.36775106082036774, + "grad_norm": 1.9157650470733643, + "learning_rate": 1.8363979255068365e-05, + "loss": 5.9592, + "num_input_tokens_seen": 789920, + "step": 780 + }, + { + "epoch": 0.3701084394153701, + "grad_norm": 1.4945064783096313, + "learning_rate": 1.848184818481848e-05, + "loss": 5.8319, + "num_input_tokens_seen": 795168, + "step": 785 + }, + { + "epoch": 0.3724658180103725, + "grad_norm": 1.660889744758606, + "learning_rate": 1.8599717114568603e-05, + "loss": 5.9526, + "num_input_tokens_seen": 800096, + "step": 790 + }, + { + "epoch": 0.37482319660537483, + "grad_norm": 1.6404544115066528, + "learning_rate": 1.871758604431872e-05, + "loss": 5.6223, + "num_input_tokens_seen": 805632, + "step": 795 + }, + { + "epoch": 0.3771805752003772, + "grad_norm": 1.7163115739822388, + "learning_rate": 1.8835454974068835e-05, + "loss": 5.4318, + "num_input_tokens_seen": 810688, + "step": 800 + }, + { + "epoch": 0.3795379537953795, + "grad_norm": 1.7385752201080322, + "learning_rate": 1.8953323903818954e-05, + "loss": 5.3429, + "num_input_tokens_seen": 815264, + "step": 805 + }, + { + "epoch": 0.38189533239038187, + "grad_norm": 1.393892765045166, + "learning_rate": 1.907119283356907e-05, + "loss": 5.3209, + "num_input_tokens_seen": 820320, + "step": 810 + }, + { + "epoch": 0.38425271098538427, + "grad_norm": 1.4846687316894531, + "learning_rate": 1.9189061763319193e-05, + "loss": 5.1477, + "num_input_tokens_seen": 824064, + "step": 815 + }, + { + "epoch": 0.3866100895803866, + "grad_norm": 1.7547835111618042, + "learning_rate": 1.930693069306931e-05, + "loss": 5.4069, + "num_input_tokens_seen": 829120, + "step": 820 + }, + { + "epoch": 0.38896746817538896, + "grad_norm": 1.4521467685699463, + "learning_rate": 1.9424799622819424e-05, + "loss": 5.109, + "num_input_tokens_seen": 834816, + "step": 825 + }, + { + "epoch": 0.3913248467703913, + "grad_norm": 1.5954492092132568, + "learning_rate": 1.9542668552569544e-05, + "loss": 5.1861, + "num_input_tokens_seen": 840256, + "step": 830 + }, + { + "epoch": 0.39368222536539366, + "grad_norm": 1.4067014455795288, + "learning_rate": 1.966053748231966e-05, + "loss": 4.8443, + "num_input_tokens_seen": 844864, + "step": 835 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 1.6726529598236084, + "learning_rate": 1.977840641206978e-05, + "loss": 4.8306, + "num_input_tokens_seen": 850304, + "step": 840 + }, + { + "epoch": 0.3983969825553984, + "grad_norm": 1.5784883499145508, + "learning_rate": 1.9896275341819898e-05, + "loss": 4.8807, + "num_input_tokens_seen": 855232, + "step": 845 + }, + { + "epoch": 0.40075436115040075, + "grad_norm": 1.5521334409713745, + "learning_rate": 2.0014144271570014e-05, + "loss": 4.6772, + "num_input_tokens_seen": 860128, + "step": 850 + }, + { + "epoch": 0.4031117397454031, + "grad_norm": 1.5741459131240845, + "learning_rate": 2.0132013201320133e-05, + "loss": 4.2879, + "num_input_tokens_seen": 864896, + "step": 855 + }, + { + "epoch": 0.40546911834040544, + "grad_norm": 1.461530327796936, + "learning_rate": 2.024988213107025e-05, + "loss": 4.3403, + "num_input_tokens_seen": 869408, + "step": 860 + }, + { + "epoch": 0.40782649693540785, + "grad_norm": 1.5467249155044556, + "learning_rate": 2.036775106082037e-05, + "loss": 4.441, + "num_input_tokens_seen": 874144, + "step": 865 + }, + { + "epoch": 0.4101838755304102, + "grad_norm": 1.651287317276001, + "learning_rate": 2.0485619990570488e-05, + "loss": 4.3287, + "num_input_tokens_seen": 879040, + "step": 870 + }, + { + "epoch": 0.41254125412541254, + "grad_norm": 1.5727077722549438, + "learning_rate": 2.0603488920320603e-05, + "loss": 4.204, + "num_input_tokens_seen": 883712, + "step": 875 + }, + { + "epoch": 0.4148986327204149, + "grad_norm": 1.654297113418579, + "learning_rate": 2.0721357850070723e-05, + "loss": 4.1457, + "num_input_tokens_seen": 888128, + "step": 880 + }, + { + "epoch": 0.41725601131541723, + "grad_norm": 1.4821985960006714, + "learning_rate": 2.083922677982084e-05, + "loss": 4.1677, + "num_input_tokens_seen": 894240, + "step": 885 + }, + { + "epoch": 0.41961338991041963, + "grad_norm": 1.6079695224761963, + "learning_rate": 2.0957095709570958e-05, + "loss": 4.0719, + "num_input_tokens_seen": 899456, + "step": 890 + }, + { + "epoch": 0.421970768505422, + "grad_norm": 1.6586661338806152, + "learning_rate": 2.1074964639321077e-05, + "loss": 3.8521, + "num_input_tokens_seen": 904672, + "step": 895 + }, + { + "epoch": 0.4243281471004243, + "grad_norm": 1.7778162956237793, + "learning_rate": 2.1192833569071193e-05, + "loss": 4.1942, + "num_input_tokens_seen": 913216, + "step": 900 + }, + { + "epoch": 0.42668552569542667, + "grad_norm": 1.610549807548523, + "learning_rate": 2.1310702498821312e-05, + "loss": 3.3761, + "num_input_tokens_seen": 917664, + "step": 905 + }, + { + "epoch": 0.429042904290429, + "grad_norm": 1.7250484228134155, + "learning_rate": 2.1428571428571428e-05, + "loss": 3.7723, + "num_input_tokens_seen": 923360, + "step": 910 + }, + { + "epoch": 0.4314002828854314, + "grad_norm": 1.3305034637451172, + "learning_rate": 2.1546440358321547e-05, + "loss": 3.5395, + "num_input_tokens_seen": 928608, + "step": 915 + }, + { + "epoch": 0.43375766148043376, + "grad_norm": 1.6605404615402222, + "learning_rate": 2.1664309288071667e-05, + "loss": 3.2632, + "num_input_tokens_seen": 933312, + "step": 920 + }, + { + "epoch": 0.4361150400754361, + "grad_norm": 1.3616340160369873, + "learning_rate": 2.1782178217821783e-05, + "loss": 3.1408, + "num_input_tokens_seen": 937824, + "step": 925 + }, + { + "epoch": 0.43847241867043846, + "grad_norm": 1.3496013879776, + "learning_rate": 2.1900047147571902e-05, + "loss": 3.0737, + "num_input_tokens_seen": 942016, + "step": 930 + }, + { + "epoch": 0.4408297972654408, + "grad_norm": 1.4894121885299683, + "learning_rate": 2.2017916077322018e-05, + "loss": 2.9443, + "num_input_tokens_seen": 946080, + "step": 935 + }, + { + "epoch": 0.4431871758604432, + "grad_norm": 1.4575815200805664, + "learning_rate": 2.2135785007072137e-05, + "loss": 3.1605, + "num_input_tokens_seen": 951168, + "step": 940 + }, + { + "epoch": 0.44554455445544555, + "grad_norm": 1.573561668395996, + "learning_rate": 2.2253653936822256e-05, + "loss": 2.9549, + "num_input_tokens_seen": 956992, + "step": 945 + }, + { + "epoch": 0.4479019330504479, + "grad_norm": 1.4182724952697754, + "learning_rate": 2.2371522866572372e-05, + "loss": 2.648, + "num_input_tokens_seen": 961248, + "step": 950 + }, + { + "epoch": 0.45025931164545024, + "grad_norm": 1.3640843629837036, + "learning_rate": 2.248939179632249e-05, + "loss": 2.3136, + "num_input_tokens_seen": 965376, + "step": 955 + }, + { + "epoch": 0.4526166902404526, + "grad_norm": 1.353670358657837, + "learning_rate": 2.2607260726072607e-05, + "loss": 2.3341, + "num_input_tokens_seen": 969664, + "step": 960 + }, + { + "epoch": 0.454974068835455, + "grad_norm": 1.2000300884246826, + "learning_rate": 2.2725129655822727e-05, + "loss": 2.4235, + "num_input_tokens_seen": 974016, + "step": 965 + }, + { + "epoch": 0.45733144743045734, + "grad_norm": 1.2228617668151855, + "learning_rate": 2.2842998585572846e-05, + "loss": 2.3301, + "num_input_tokens_seen": 979072, + "step": 970 + }, + { + "epoch": 0.4596888260254597, + "grad_norm": 1.7434426546096802, + "learning_rate": 2.296086751532296e-05, + "loss": 2.4078, + "num_input_tokens_seen": 984032, + "step": 975 + }, + { + "epoch": 0.46204620462046203, + "grad_norm": 1.6657531261444092, + "learning_rate": 2.3078736445073078e-05, + "loss": 2.1453, + "num_input_tokens_seen": 988544, + "step": 980 + }, + { + "epoch": 0.4644035832154644, + "grad_norm": 1.3883270025253296, + "learning_rate": 2.3196605374823197e-05, + "loss": 2.2037, + "num_input_tokens_seen": 993504, + "step": 985 + }, + { + "epoch": 0.4667609618104668, + "grad_norm": 1.7972182035446167, + "learning_rate": 2.3314474304573316e-05, + "loss": 2.3168, + "num_input_tokens_seen": 999104, + "step": 990 + }, + { + "epoch": 0.4691183404054691, + "grad_norm": 1.175042986869812, + "learning_rate": 2.3432343234323435e-05, + "loss": 2.294, + "num_input_tokens_seen": 1004864, + "step": 995 + }, + { + "epoch": 0.47147571900047147, + "grad_norm": 1.5251885652542114, + "learning_rate": 2.355021216407355e-05, + "loss": 1.9302, + "num_input_tokens_seen": 1009472, + "step": 1000 + }, + { + "epoch": 0.4738330975954738, + "grad_norm": 1.0408551692962646, + "learning_rate": 2.3668081093823667e-05, + "loss": 1.7054, + "num_input_tokens_seen": 1013536, + "step": 1005 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 1.1724919080734253, + "learning_rate": 2.3785950023573786e-05, + "loss": 1.641, + "num_input_tokens_seen": 1018144, + "step": 1010 + }, + { + "epoch": 0.47854785478547857, + "grad_norm": 1.1328731775283813, + "learning_rate": 2.3903818953323906e-05, + "loss": 1.9773, + "num_input_tokens_seen": 1022848, + "step": 1015 + }, + { + "epoch": 0.4809052333804809, + "grad_norm": 1.0744316577911377, + "learning_rate": 2.4021687883074025e-05, + "loss": 1.7947, + "num_input_tokens_seen": 1027424, + "step": 1020 + }, + { + "epoch": 0.48326261197548326, + "grad_norm": 1.124281883239746, + "learning_rate": 2.413955681282414e-05, + "loss": 1.57, + "num_input_tokens_seen": 1032768, + "step": 1025 + }, + { + "epoch": 0.4856199905704856, + "grad_norm": 1.1114492416381836, + "learning_rate": 2.4257425742574257e-05, + "loss": 1.4171, + "num_input_tokens_seen": 1037440, + "step": 1030 + }, + { + "epoch": 0.48797736916548795, + "grad_norm": 1.1220316886901855, + "learning_rate": 2.4375294672324376e-05, + "loss": 1.5649, + "num_input_tokens_seen": 1042400, + "step": 1035 + }, + { + "epoch": 0.49033474776049035, + "grad_norm": 1.3439549207687378, + "learning_rate": 2.4493163602074495e-05, + "loss": 1.6671, + "num_input_tokens_seen": 1048032, + "step": 1040 + }, + { + "epoch": 0.4926921263554927, + "grad_norm": 0.8767451047897339, + "learning_rate": 2.461103253182461e-05, + "loss": 1.4986, + "num_input_tokens_seen": 1053408, + "step": 1045 + }, + { + "epoch": 0.49504950495049505, + "grad_norm": 0.975526750087738, + "learning_rate": 2.472890146157473e-05, + "loss": 1.441, + "num_input_tokens_seen": 1059008, + "step": 1050 + }, + { + "epoch": 0.4974068835454974, + "grad_norm": 1.1596741676330566, + "learning_rate": 2.4846770391324846e-05, + "loss": 1.099, + "num_input_tokens_seen": 1063712, + "step": 1055 + }, + { + "epoch": 0.49976426214049974, + "grad_norm": 1.383148431777954, + "learning_rate": 2.4964639321074965e-05, + "loss": 1.2842, + "num_input_tokens_seen": 1068320, + "step": 1060 + }, + { + "epoch": 0.5002357378595003, + "eval_loss": 1.2705003023147583, + "eval_runtime": 25.571, + "eval_samples_per_second": 36.878, + "eval_steps_per_second": 9.229, + "num_input_tokens_seen": 1069568, + "step": 1061 + }, + { + "epoch": 0.5021216407355021, + "grad_norm": 1.1699161529541016, + "learning_rate": 2.508250825082508e-05, + "loss": 1.2618, + "num_input_tokens_seen": 1073280, + "step": 1065 + }, + { + "epoch": 0.5044790193305044, + "grad_norm": 1.1910182237625122, + "learning_rate": 2.52003771805752e-05, + "loss": 0.9999, + "num_input_tokens_seen": 1077184, + "step": 1070 + }, + { + "epoch": 0.5068363979255068, + "grad_norm": 1.0585192441940308, + "learning_rate": 2.5318246110325323e-05, + "loss": 1.1483, + "num_input_tokens_seen": 1081664, + "step": 1075 + }, + { + "epoch": 0.5091937765205092, + "grad_norm": 1.00127112865448, + "learning_rate": 2.5436115040075436e-05, + "loss": 1.2674, + "num_input_tokens_seen": 1086304, + "step": 1080 + }, + { + "epoch": 0.5115511551155115, + "grad_norm": 1.4407867193222046, + "learning_rate": 2.555398396982556e-05, + "loss": 1.0875, + "num_input_tokens_seen": 1091616, + "step": 1085 + }, + { + "epoch": 0.5139085337105139, + "grad_norm": 1.2129045724868774, + "learning_rate": 2.567185289957567e-05, + "loss": 0.9516, + "num_input_tokens_seen": 1095616, + "step": 1090 + }, + { + "epoch": 0.5162659123055162, + "grad_norm": 1.1375104188919067, + "learning_rate": 2.578972182932579e-05, + "loss": 1.341, + "num_input_tokens_seen": 1101632, + "step": 1095 + }, + { + "epoch": 0.5186232909005186, + "grad_norm": 0.9895179867744446, + "learning_rate": 2.5907590759075913e-05, + "loss": 0.8428, + "num_input_tokens_seen": 1106080, + "step": 1100 + }, + { + "epoch": 0.520980669495521, + "grad_norm": 1.0461146831512451, + "learning_rate": 2.6025459688826025e-05, + "loss": 1.0415, + "num_input_tokens_seen": 1111456, + "step": 1105 + }, + { + "epoch": 0.5233380480905233, + "grad_norm": 1.0543495416641235, + "learning_rate": 2.6143328618576145e-05, + "loss": 1.2755, + "num_input_tokens_seen": 1117632, + "step": 1110 + }, + { + "epoch": 0.5256954266855257, + "grad_norm": 0.7490912079811096, + "learning_rate": 2.626119754832626e-05, + "loss": 1.0616, + "num_input_tokens_seen": 1122496, + "step": 1115 + }, + { + "epoch": 0.528052805280528, + "grad_norm": 0.789945662021637, + "learning_rate": 2.637906647807638e-05, + "loss": 0.7002, + "num_input_tokens_seen": 1126176, + "step": 1120 + }, + { + "epoch": 0.5304101838755304, + "grad_norm": 0.6595125198364258, + "learning_rate": 2.6496935407826502e-05, + "loss": 1.1933, + "num_input_tokens_seen": 1133024, + "step": 1125 + }, + { + "epoch": 0.5327675624705328, + "grad_norm": 0.8349714875221252, + "learning_rate": 2.6614804337576615e-05, + "loss": 0.8047, + "num_input_tokens_seen": 1138112, + "step": 1130 + }, + { + "epoch": 0.5351249410655351, + "grad_norm": 0.8154754638671875, + "learning_rate": 2.6732673267326734e-05, + "loss": 1.1543, + "num_input_tokens_seen": 1144064, + "step": 1135 + }, + { + "epoch": 0.5374823196605375, + "grad_norm": 0.8555862307548523, + "learning_rate": 2.685054219707685e-05, + "loss": 1.0209, + "num_input_tokens_seen": 1148992, + "step": 1140 + }, + { + "epoch": 0.5398396982555398, + "grad_norm": 0.8837468028068542, + "learning_rate": 2.696841112682697e-05, + "loss": 1.2019, + "num_input_tokens_seen": 1154592, + "step": 1145 + }, + { + "epoch": 0.5421970768505422, + "grad_norm": 1.4224478006362915, + "learning_rate": 2.7086280056577092e-05, + "loss": 0.8929, + "num_input_tokens_seen": 1159936, + "step": 1150 + }, + { + "epoch": 0.5445544554455446, + "grad_norm": 0.934142529964447, + "learning_rate": 2.7204148986327204e-05, + "loss": 0.8432, + "num_input_tokens_seen": 1164448, + "step": 1155 + }, + { + "epoch": 0.5469118340405469, + "grad_norm": 0.7761560678482056, + "learning_rate": 2.7322017916077324e-05, + "loss": 0.7754, + "num_input_tokens_seen": 1168544, + "step": 1160 + }, + { + "epoch": 0.5492692126355493, + "grad_norm": 0.46116888523101807, + "learning_rate": 2.743988684582744e-05, + "loss": 0.7201, + "num_input_tokens_seen": 1172960, + "step": 1165 + }, + { + "epoch": 0.5516265912305516, + "grad_norm": 1.3311429023742676, + "learning_rate": 2.755775577557756e-05, + "loss": 0.9896, + "num_input_tokens_seen": 1178144, + "step": 1170 + }, + { + "epoch": 0.553983969825554, + "grad_norm": 0.7157586216926575, + "learning_rate": 2.7675624705327678e-05, + "loss": 0.8286, + "num_input_tokens_seen": 1184448, + "step": 1175 + }, + { + "epoch": 0.5563413484205564, + "grad_norm": 0.7437616586685181, + "learning_rate": 2.7793493635077794e-05, + "loss": 0.7075, + "num_input_tokens_seen": 1189472, + "step": 1180 + }, + { + "epoch": 0.5586987270155587, + "grad_norm": 0.8969188332557678, + "learning_rate": 2.7911362564827913e-05, + "loss": 0.7349, + "num_input_tokens_seen": 1195072, + "step": 1185 + }, + { + "epoch": 0.5610561056105611, + "grad_norm": 1.2597697973251343, + "learning_rate": 2.802923149457803e-05, + "loss": 0.8313, + "num_input_tokens_seen": 1198656, + "step": 1190 + }, + { + "epoch": 0.5634134842055634, + "grad_norm": 1.0998255014419556, + "learning_rate": 2.8147100424328148e-05, + "loss": 0.7063, + "num_input_tokens_seen": 1203616, + "step": 1195 + }, + { + "epoch": 0.5657708628005658, + "grad_norm": 0.6322990655899048, + "learning_rate": 2.8264969354078268e-05, + "loss": 0.6437, + "num_input_tokens_seen": 1208224, + "step": 1200 + }, + { + "epoch": 0.5681282413955682, + "grad_norm": 0.9333911538124084, + "learning_rate": 2.8382838283828383e-05, + "loss": 0.6274, + "num_input_tokens_seen": 1212608, + "step": 1205 + }, + { + "epoch": 0.5704856199905705, + "grad_norm": 1.0893452167510986, + "learning_rate": 2.8500707213578503e-05, + "loss": 0.8128, + "num_input_tokens_seen": 1218336, + "step": 1210 + }, + { + "epoch": 0.5728429985855729, + "grad_norm": 0.9871928691864014, + "learning_rate": 2.861857614332862e-05, + "loss": 0.5454, + "num_input_tokens_seen": 1223296, + "step": 1215 + }, + { + "epoch": 0.5752003771805752, + "grad_norm": 0.8280481696128845, + "learning_rate": 2.8736445073078738e-05, + "loss": 0.646, + "num_input_tokens_seen": 1228448, + "step": 1220 + }, + { + "epoch": 0.5775577557755776, + "grad_norm": 0.6413212418556213, + "learning_rate": 2.8854314002828857e-05, + "loss": 0.6466, + "num_input_tokens_seen": 1233024, + "step": 1225 + }, + { + "epoch": 0.57991513437058, + "grad_norm": 1.0654373168945312, + "learning_rate": 2.8972182932578973e-05, + "loss": 0.722, + "num_input_tokens_seen": 1238080, + "step": 1230 + }, + { + "epoch": 0.5822725129655822, + "grad_norm": 0.9163988828659058, + "learning_rate": 2.9090051862329092e-05, + "loss": 0.541, + "num_input_tokens_seen": 1243296, + "step": 1235 + }, + { + "epoch": 0.5846298915605846, + "grad_norm": 0.616413950920105, + "learning_rate": 2.9207920792079208e-05, + "loss": 0.5814, + "num_input_tokens_seen": 1248128, + "step": 1240 + }, + { + "epoch": 0.5869872701555869, + "grad_norm": 0.49895185232162476, + "learning_rate": 2.9325789721829327e-05, + "loss": 0.6448, + "num_input_tokens_seen": 1252864, + "step": 1245 + }, + { + "epoch": 0.5893446487505893, + "grad_norm": 1.412937045097351, + "learning_rate": 2.9443658651579447e-05, + "loss": 0.6968, + "num_input_tokens_seen": 1257856, + "step": 1250 + }, + { + "epoch": 0.5917020273455917, + "grad_norm": 0.9425813555717468, + "learning_rate": 2.9561527581329563e-05, + "loss": 0.5613, + "num_input_tokens_seen": 1262240, + "step": 1255 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.9182863831520081, + "learning_rate": 2.9679396511079682e-05, + "loss": 0.5669, + "num_input_tokens_seen": 1267392, + "step": 1260 + }, + { + "epoch": 0.5964167845355964, + "grad_norm": 0.9919418096542358, + "learning_rate": 2.9797265440829798e-05, + "loss": 0.536, + "num_input_tokens_seen": 1272064, + "step": 1265 + }, + { + "epoch": 0.5987741631305987, + "grad_norm": 0.5386390089988708, + "learning_rate": 2.9915134370579917e-05, + "loss": 0.4523, + "num_input_tokens_seen": 1275968, + "step": 1270 + }, + { + "epoch": 0.6011315417256011, + "grad_norm": 0.8925914764404297, + "learning_rate": 3.0033003300330036e-05, + "loss": 0.6364, + "num_input_tokens_seen": 1280096, + "step": 1275 + }, + { + "epoch": 0.6034889203206035, + "grad_norm": 0.6217484474182129, + "learning_rate": 3.0150872230080152e-05, + "loss": 0.5838, + "num_input_tokens_seen": 1284704, + "step": 1280 + }, + { + "epoch": 0.6058462989156058, + "grad_norm": 1.0053385496139526, + "learning_rate": 3.026874115983027e-05, + "loss": 0.5881, + "num_input_tokens_seen": 1289312, + "step": 1285 + }, + { + "epoch": 0.6082036775106082, + "grad_norm": 1.0940099954605103, + "learning_rate": 3.0386610089580387e-05, + "loss": 0.5811, + "num_input_tokens_seen": 1293440, + "step": 1290 + }, + { + "epoch": 0.6105610561056105, + "grad_norm": 1.0689424276351929, + "learning_rate": 3.0504479019330506e-05, + "loss": 0.5986, + "num_input_tokens_seen": 1297408, + "step": 1295 + }, + { + "epoch": 0.6129184347006129, + "grad_norm": 0.9674487113952637, + "learning_rate": 3.0622347949080626e-05, + "loss": 0.6229, + "num_input_tokens_seen": 1302304, + "step": 1300 + }, + { + "epoch": 0.6152758132956153, + "grad_norm": 0.8255386352539062, + "learning_rate": 3.074021687883074e-05, + "loss": 0.4849, + "num_input_tokens_seen": 1306368, + "step": 1305 + }, + { + "epoch": 0.6176331918906176, + "grad_norm": 0.9451804757118225, + "learning_rate": 3.0858085808580864e-05, + "loss": 0.6918, + "num_input_tokens_seen": 1311712, + "step": 1310 + }, + { + "epoch": 0.61999057048562, + "grad_norm": 0.9613333940505981, + "learning_rate": 3.097595473833098e-05, + "loss": 0.941, + "num_input_tokens_seen": 1320000, + "step": 1315 + }, + { + "epoch": 0.6223479490806223, + "grad_norm": 0.6606389284133911, + "learning_rate": 3.1093823668081096e-05, + "loss": 0.5183, + "num_input_tokens_seen": 1324448, + "step": 1320 + }, + { + "epoch": 0.6247053276756247, + "grad_norm": 0.601767361164093, + "learning_rate": 3.1211692597831215e-05, + "loss": 0.5746, + "num_input_tokens_seen": 1329088, + "step": 1325 + }, + { + "epoch": 0.6270627062706271, + "grad_norm": 0.9819926023483276, + "learning_rate": 3.132956152758133e-05, + "loss": 0.5466, + "num_input_tokens_seen": 1333920, + "step": 1330 + }, + { + "epoch": 0.6294200848656294, + "grad_norm": 0.46888047456741333, + "learning_rate": 3.1447430457331454e-05, + "loss": 0.5228, + "num_input_tokens_seen": 1338848, + "step": 1335 + }, + { + "epoch": 0.6317774634606318, + "grad_norm": 0.8047199249267578, + "learning_rate": 3.1565299387081566e-05, + "loss": 0.7855, + "num_input_tokens_seen": 1344864, + "step": 1340 + }, + { + "epoch": 0.6341348420556341, + "grad_norm": 0.6804221272468567, + "learning_rate": 3.1683168316831686e-05, + "loss": 0.5842, + "num_input_tokens_seen": 1350144, + "step": 1345 + }, + { + "epoch": 0.6364922206506365, + "grad_norm": 0.8770887851715088, + "learning_rate": 3.1801037246581805e-05, + "loss": 0.425, + "num_input_tokens_seen": 1354592, + "step": 1350 + }, + { + "epoch": 0.6388495992456389, + "grad_norm": 0.9010847806930542, + "learning_rate": 3.191890617633192e-05, + "loss": 0.4453, + "num_input_tokens_seen": 1358368, + "step": 1355 + }, + { + "epoch": 0.6412069778406412, + "grad_norm": 0.5401031970977783, + "learning_rate": 3.2036775106082037e-05, + "loss": 0.4601, + "num_input_tokens_seen": 1362848, + "step": 1360 + }, + { + "epoch": 0.6435643564356436, + "grad_norm": 0.761959969997406, + "learning_rate": 3.2154644035832156e-05, + "loss": 0.6022, + "num_input_tokens_seen": 1367456, + "step": 1365 + }, + { + "epoch": 0.6459217350306459, + "grad_norm": 0.6014624238014221, + "learning_rate": 3.2272512965582275e-05, + "loss": 0.6097, + "num_input_tokens_seen": 1373888, + "step": 1370 + }, + { + "epoch": 0.6482791136256483, + "grad_norm": 0.5386530756950378, + "learning_rate": 3.2390381895332394e-05, + "loss": 0.5167, + "num_input_tokens_seen": 1378944, + "step": 1375 + }, + { + "epoch": 0.6506364922206507, + "grad_norm": 0.6988528966903687, + "learning_rate": 3.250825082508251e-05, + "loss": 0.5247, + "num_input_tokens_seen": 1383328, + "step": 1380 + }, + { + "epoch": 0.652993870815653, + "grad_norm": 0.8911290764808655, + "learning_rate": 3.2626119754832626e-05, + "loss": 0.4614, + "num_input_tokens_seen": 1388928, + "step": 1385 + }, + { + "epoch": 0.6553512494106554, + "grad_norm": 1.4455665349960327, + "learning_rate": 3.2743988684582745e-05, + "loss": 0.8149, + "num_input_tokens_seen": 1395712, + "step": 1390 + }, + { + "epoch": 0.6577086280056577, + "grad_norm": 0.5311848521232605, + "learning_rate": 3.2861857614332865e-05, + "loss": 0.6481, + "num_input_tokens_seen": 1399936, + "step": 1395 + }, + { + "epoch": 0.6600660066006601, + "grad_norm": 0.8016121983528137, + "learning_rate": 3.2979726544082984e-05, + "loss": 0.4705, + "num_input_tokens_seen": 1404992, + "step": 1400 + }, + { + "epoch": 0.6624233851956625, + "grad_norm": 0.7688477039337158, + "learning_rate": 3.3097595473833096e-05, + "loss": 0.5246, + "num_input_tokens_seen": 1409568, + "step": 1405 + }, + { + "epoch": 0.6647807637906648, + "grad_norm": 0.6225636005401611, + "learning_rate": 3.3215464403583216e-05, + "loss": 0.5602, + "num_input_tokens_seen": 1415072, + "step": 1410 + }, + { + "epoch": 0.6671381423856672, + "grad_norm": 0.7140394449234009, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.4364, + "num_input_tokens_seen": 1419552, + "step": 1415 + }, + { + "epoch": 0.6694955209806694, + "grad_norm": 0.9447476863861084, + "learning_rate": 3.3451202263083454e-05, + "loss": 0.4511, + "num_input_tokens_seen": 1424544, + "step": 1420 + }, + { + "epoch": 0.6718528995756718, + "grad_norm": 0.8706852793693542, + "learning_rate": 3.3569071192833573e-05, + "loss": 0.4829, + "num_input_tokens_seen": 1429504, + "step": 1425 + }, + { + "epoch": 0.6742102781706742, + "grad_norm": 0.48921048641204834, + "learning_rate": 3.3686940122583686e-05, + "loss": 0.5222, + "num_input_tokens_seen": 1434336, + "step": 1430 + }, + { + "epoch": 0.6765676567656765, + "grad_norm": 0.6611127257347107, + "learning_rate": 3.3804809052333805e-05, + "loss": 0.436, + "num_input_tokens_seen": 1438496, + "step": 1435 + }, + { + "epoch": 0.6789250353606789, + "grad_norm": 0.9817191958427429, + "learning_rate": 3.3922677982083924e-05, + "loss": 0.543, + "num_input_tokens_seen": 1444032, + "step": 1440 + }, + { + "epoch": 0.6812824139556812, + "grad_norm": 0.9550262689590454, + "learning_rate": 3.4040546911834044e-05, + "loss": 0.4529, + "num_input_tokens_seen": 1448864, + "step": 1445 + }, + { + "epoch": 0.6836397925506836, + "grad_norm": 0.39085501432418823, + "learning_rate": 3.415841584158416e-05, + "loss": 0.3501, + "num_input_tokens_seen": 1453376, + "step": 1450 + }, + { + "epoch": 0.685997171145686, + "grad_norm": 1.0135102272033691, + "learning_rate": 3.4276284771334275e-05, + "loss": 0.3956, + "num_input_tokens_seen": 1457376, + "step": 1455 + }, + { + "epoch": 0.6883545497406883, + "grad_norm": 0.8863470554351807, + "learning_rate": 3.4394153701084395e-05, + "loss": 0.5146, + "num_input_tokens_seen": 1462848, + "step": 1460 + }, + { + "epoch": 0.6907119283356907, + "grad_norm": 0.9552745223045349, + "learning_rate": 3.4512022630834514e-05, + "loss": 0.4667, + "num_input_tokens_seen": 1467872, + "step": 1465 + }, + { + "epoch": 0.693069306930693, + "grad_norm": 0.6415888667106628, + "learning_rate": 3.462989156058463e-05, + "loss": 0.4659, + "num_input_tokens_seen": 1472096, + "step": 1470 + }, + { + "epoch": 0.6954266855256954, + "grad_norm": 0.6636183261871338, + "learning_rate": 3.474776049033475e-05, + "loss": 0.4166, + "num_input_tokens_seen": 1477632, + "step": 1475 + }, + { + "epoch": 0.6977840641206978, + "grad_norm": 0.7308278679847717, + "learning_rate": 3.4865629420084865e-05, + "loss": 0.495, + "num_input_tokens_seen": 1482944, + "step": 1480 + }, + { + "epoch": 0.7001414427157001, + "grad_norm": 1.1124413013458252, + "learning_rate": 3.4983498349834984e-05, + "loss": 0.4004, + "num_input_tokens_seen": 1486976, + "step": 1485 + }, + { + "epoch": 0.7024988213107025, + "grad_norm": 0.7150792479515076, + "learning_rate": 3.5101367279585104e-05, + "loss": 0.4447, + "num_input_tokens_seen": 1491168, + "step": 1490 + }, + { + "epoch": 0.7048561999057048, + "grad_norm": 1.1895619630813599, + "learning_rate": 3.521923620933522e-05, + "loss": 0.4553, + "num_input_tokens_seen": 1496000, + "step": 1495 + }, + { + "epoch": 0.7072135785007072, + "grad_norm": 0.4361288547515869, + "learning_rate": 3.533710513908534e-05, + "loss": 0.3796, + "num_input_tokens_seen": 1500064, + "step": 1500 + }, + { + "epoch": 0.7095709570957096, + "grad_norm": 0.45707017183303833, + "learning_rate": 3.5454974068835455e-05, + "loss": 0.5033, + "num_input_tokens_seen": 1505120, + "step": 1505 + }, + { + "epoch": 0.7119283356907119, + "grad_norm": 0.4852288067340851, + "learning_rate": 3.5572842998585574e-05, + "loss": 0.479, + "num_input_tokens_seen": 1510240, + "step": 1510 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.8621490001678467, + "learning_rate": 3.569071192833569e-05, + "loss": 0.4092, + "num_input_tokens_seen": 1515712, + "step": 1515 + }, + { + "epoch": 0.7166430928807166, + "grad_norm": 0.5832515954971313, + "learning_rate": 3.580858085808581e-05, + "loss": 0.3349, + "num_input_tokens_seen": 1520512, + "step": 1520 + }, + { + "epoch": 0.719000471475719, + "grad_norm": 0.710948646068573, + "learning_rate": 3.592644978783593e-05, + "loss": 0.349, + "num_input_tokens_seen": 1524288, + "step": 1525 + }, + { + "epoch": 0.7213578500707214, + "grad_norm": 0.4796910285949707, + "learning_rate": 3.6044318717586044e-05, + "loss": 0.3602, + "num_input_tokens_seen": 1528896, + "step": 1530 + }, + { + "epoch": 0.7237152286657237, + "grad_norm": 0.7159779071807861, + "learning_rate": 3.6162187647336163e-05, + "loss": 0.562, + "num_input_tokens_seen": 1534656, + "step": 1535 + }, + { + "epoch": 0.7260726072607261, + "grad_norm": 0.6666659116744995, + "learning_rate": 3.628005657708628e-05, + "loss": 0.4068, + "num_input_tokens_seen": 1538848, + "step": 1540 + }, + { + "epoch": 0.7284299858557284, + "grad_norm": 0.6590299010276794, + "learning_rate": 3.63979255068364e-05, + "loss": 0.4209, + "num_input_tokens_seen": 1543712, + "step": 1545 + }, + { + "epoch": 0.7307873644507308, + "grad_norm": 1.961840033531189, + "learning_rate": 3.651579443658652e-05, + "loss": 0.6791, + "num_input_tokens_seen": 1549152, + "step": 1550 + }, + { + "epoch": 0.7331447430457332, + "grad_norm": 0.9700015187263489, + "learning_rate": 3.6633663366336634e-05, + "loss": 0.5059, + "num_input_tokens_seen": 1554080, + "step": 1555 + }, + { + "epoch": 0.7355021216407355, + "grad_norm": 1.0457555055618286, + "learning_rate": 3.675153229608675e-05, + "loss": 0.4535, + "num_input_tokens_seen": 1559104, + "step": 1560 + }, + { + "epoch": 0.7378595002357379, + "grad_norm": 1.03368079662323, + "learning_rate": 3.686940122583687e-05, + "loss": 0.5257, + "num_input_tokens_seen": 1565152, + "step": 1565 + }, + { + "epoch": 0.7402168788307402, + "grad_norm": 1.3059360980987549, + "learning_rate": 3.698727015558699e-05, + "loss": 0.4772, + "num_input_tokens_seen": 1570688, + "step": 1570 + }, + { + "epoch": 0.7425742574257426, + "grad_norm": 1.4053372144699097, + "learning_rate": 3.710513908533711e-05, + "loss": 0.5422, + "num_input_tokens_seen": 1576256, + "step": 1575 + }, + { + "epoch": 0.744931636020745, + "grad_norm": 0.36198627948760986, + "learning_rate": 3.722300801508722e-05, + "loss": 0.3005, + "num_input_tokens_seen": 1581056, + "step": 1580 + }, + { + "epoch": 0.7472890146157473, + "grad_norm": 1.0840938091278076, + "learning_rate": 3.734087694483734e-05, + "loss": 0.4269, + "num_input_tokens_seen": 1586464, + "step": 1585 + }, + { + "epoch": 0.7496463932107497, + "grad_norm": 1.0667831897735596, + "learning_rate": 3.745874587458746e-05, + "loss": 0.5254, + "num_input_tokens_seen": 1591296, + "step": 1590 + }, + { + "epoch": 0.752003771805752, + "grad_norm": 0.8211997151374817, + "learning_rate": 3.757661480433758e-05, + "loss": 0.3166, + "num_input_tokens_seen": 1595488, + "step": 1595 + }, + { + "epoch": 0.7543611504007544, + "grad_norm": 0.8227137923240662, + "learning_rate": 3.76944837340877e-05, + "loss": 0.4338, + "num_input_tokens_seen": 1601312, + "step": 1600 + }, + { + "epoch": 0.7567185289957568, + "grad_norm": 0.6391598582267761, + "learning_rate": 3.781235266383781e-05, + "loss": 0.3677, + "num_input_tokens_seen": 1606336, + "step": 1605 + }, + { + "epoch": 0.759075907590759, + "grad_norm": 0.45895540714263916, + "learning_rate": 3.793022159358793e-05, + "loss": 0.3617, + "num_input_tokens_seen": 1611008, + "step": 1610 + }, + { + "epoch": 0.7614332861857614, + "grad_norm": 0.7864967584609985, + "learning_rate": 3.804809052333805e-05, + "loss": 0.3923, + "num_input_tokens_seen": 1615328, + "step": 1615 + }, + { + "epoch": 0.7637906647807637, + "grad_norm": 0.7314066290855408, + "learning_rate": 3.816595945308817e-05, + "loss": 0.3534, + "num_input_tokens_seen": 1620320, + "step": 1620 + }, + { + "epoch": 0.7661480433757661, + "grad_norm": 0.4684637188911438, + "learning_rate": 3.828382838283829e-05, + "loss": 0.3373, + "num_input_tokens_seen": 1625344, + "step": 1625 + }, + { + "epoch": 0.7685054219707685, + "grad_norm": 0.7766032814979553, + "learning_rate": 3.84016973125884e-05, + "loss": 0.4586, + "num_input_tokens_seen": 1630624, + "step": 1630 + }, + { + "epoch": 0.7708628005657708, + "grad_norm": 0.7184266448020935, + "learning_rate": 3.851956624233852e-05, + "loss": 0.3859, + "num_input_tokens_seen": 1635136, + "step": 1635 + }, + { + "epoch": 0.7732201791607732, + "grad_norm": 0.786317765712738, + "learning_rate": 3.8637435172088634e-05, + "loss": 0.5038, + "num_input_tokens_seen": 1640640, + "step": 1640 + }, + { + "epoch": 0.7755775577557755, + "grad_norm": 0.6445505619049072, + "learning_rate": 3.875530410183876e-05, + "loss": 0.6583, + "num_input_tokens_seen": 1647040, + "step": 1645 + }, + { + "epoch": 0.7779349363507779, + "grad_norm": 0.521615743637085, + "learning_rate": 3.887317303158888e-05, + "loss": 0.325, + "num_input_tokens_seen": 1651712, + "step": 1650 + }, + { + "epoch": 0.7802923149457803, + "grad_norm": 0.8840108513832092, + "learning_rate": 3.899104196133899e-05, + "loss": 0.4278, + "num_input_tokens_seen": 1657536, + "step": 1655 + }, + { + "epoch": 0.7826496935407826, + "grad_norm": 1.0164748430252075, + "learning_rate": 3.910891089108911e-05, + "loss": 0.4865, + "num_input_tokens_seen": 1663616, + "step": 1660 + }, + { + "epoch": 0.785007072135785, + "grad_norm": 1.1529144048690796, + "learning_rate": 3.9226779820839224e-05, + "loss": 0.4337, + "num_input_tokens_seen": 1669600, + "step": 1665 + }, + { + "epoch": 0.7873644507307873, + "grad_norm": 0.5490677356719971, + "learning_rate": 3.934464875058935e-05, + "loss": 0.4576, + "num_input_tokens_seen": 1674560, + "step": 1670 + }, + { + "epoch": 0.7897218293257897, + "grad_norm": 0.6520237922668457, + "learning_rate": 3.946251768033947e-05, + "loss": 0.3357, + "num_input_tokens_seen": 1679616, + "step": 1675 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.5118961334228516, + "learning_rate": 3.958038661008958e-05, + "loss": 0.3881, + "num_input_tokens_seen": 1684160, + "step": 1680 + }, + { + "epoch": 0.7944365865157944, + "grad_norm": 0.845289409160614, + "learning_rate": 3.96982555398397e-05, + "loss": 0.4556, + "num_input_tokens_seen": 1689280, + "step": 1685 + }, + { + "epoch": 0.7967939651107968, + "grad_norm": 0.828121542930603, + "learning_rate": 3.981612446958981e-05, + "loss": 0.4102, + "num_input_tokens_seen": 1694368, + "step": 1690 + }, + { + "epoch": 0.7991513437057991, + "grad_norm": 0.9508788585662842, + "learning_rate": 3.993399339933994e-05, + "loss": 0.4693, + "num_input_tokens_seen": 1699072, + "step": 1695 + }, + { + "epoch": 0.8015087223008015, + "grad_norm": 0.49903830885887146, + "learning_rate": 4.005186232909006e-05, + "loss": 0.35, + "num_input_tokens_seen": 1704352, + "step": 1700 + }, + { + "epoch": 0.8038661008958039, + "grad_norm": 0.5746772885322571, + "learning_rate": 4.016973125884017e-05, + "loss": 0.4036, + "num_input_tokens_seen": 1709088, + "step": 1705 + }, + { + "epoch": 0.8062234794908062, + "grad_norm": 0.5874239802360535, + "learning_rate": 4.028760018859029e-05, + "loss": 0.3623, + "num_input_tokens_seen": 1713600, + "step": 1710 + }, + { + "epoch": 0.8085808580858086, + "grad_norm": 1.579070806503296, + "learning_rate": 4.04054691183404e-05, + "loss": 0.5543, + "num_input_tokens_seen": 1720224, + "step": 1715 + }, + { + "epoch": 0.8109382366808109, + "grad_norm": 0.7166122198104858, + "learning_rate": 4.052333804809053e-05, + "loss": 0.476, + "num_input_tokens_seen": 1725280, + "step": 1720 + }, + { + "epoch": 0.8132956152758133, + "grad_norm": 0.6637470722198486, + "learning_rate": 4.064120697784065e-05, + "loss": 0.6709, + "num_input_tokens_seen": 1731904, + "step": 1725 + }, + { + "epoch": 0.8156529938708157, + "grad_norm": 0.8644849061965942, + "learning_rate": 4.075907590759076e-05, + "loss": 0.4011, + "num_input_tokens_seen": 1737056, + "step": 1730 + }, + { + "epoch": 0.818010372465818, + "grad_norm": 1.0403010845184326, + "learning_rate": 4.087694483734088e-05, + "loss": 0.3761, + "num_input_tokens_seen": 1741568, + "step": 1735 + }, + { + "epoch": 0.8203677510608204, + "grad_norm": 0.7636839151382446, + "learning_rate": 4.099481376709099e-05, + "loss": 0.405, + "num_input_tokens_seen": 1745984, + "step": 1740 + }, + { + "epoch": 0.8227251296558227, + "grad_norm": 0.8189226388931274, + "learning_rate": 4.111268269684112e-05, + "loss": 0.4275, + "num_input_tokens_seen": 1750688, + "step": 1745 + }, + { + "epoch": 0.8250825082508251, + "grad_norm": 0.5539513826370239, + "learning_rate": 4.123055162659124e-05, + "loss": 0.3917, + "num_input_tokens_seen": 1755648, + "step": 1750 + }, + { + "epoch": 0.8274398868458275, + "grad_norm": 0.971939206123352, + "learning_rate": 4.134842055634135e-05, + "loss": 0.467, + "num_input_tokens_seen": 1761408, + "step": 1755 + }, + { + "epoch": 0.8297972654408298, + "grad_norm": 0.7585169076919556, + "learning_rate": 4.146628948609147e-05, + "loss": 0.4178, + "num_input_tokens_seen": 1765984, + "step": 1760 + }, + { + "epoch": 0.8321546440358322, + "grad_norm": 0.7093639969825745, + "learning_rate": 4.158415841584158e-05, + "loss": 0.3716, + "num_input_tokens_seen": 1771040, + "step": 1765 + }, + { + "epoch": 0.8345120226308345, + "grad_norm": 0.5998044610023499, + "learning_rate": 4.17020273455917e-05, + "loss": 0.3785, + "num_input_tokens_seen": 1776288, + "step": 1770 + }, + { + "epoch": 0.8368694012258369, + "grad_norm": 0.8088096380233765, + "learning_rate": 4.181989627534183e-05, + "loss": 0.4213, + "num_input_tokens_seen": 1781632, + "step": 1775 + }, + { + "epoch": 0.8392267798208393, + "grad_norm": 0.6315543055534363, + "learning_rate": 4.193776520509194e-05, + "loss": 0.3943, + "num_input_tokens_seen": 1786048, + "step": 1780 + }, + { + "epoch": 0.8415841584158416, + "grad_norm": 0.534536600112915, + "learning_rate": 4.205563413484206e-05, + "loss": 0.4674, + "num_input_tokens_seen": 1790848, + "step": 1785 + }, + { + "epoch": 0.843941537010844, + "grad_norm": 0.6127022504806519, + "learning_rate": 4.217350306459217e-05, + "loss": 0.3699, + "num_input_tokens_seen": 1795680, + "step": 1790 + }, + { + "epoch": 0.8462989156058462, + "grad_norm": 1.0143624544143677, + "learning_rate": 4.229137199434229e-05, + "loss": 0.385, + "num_input_tokens_seen": 1800448, + "step": 1795 + }, + { + "epoch": 0.8486562942008486, + "grad_norm": 0.7667749524116516, + "learning_rate": 4.240924092409242e-05, + "loss": 0.4382, + "num_input_tokens_seen": 1805568, + "step": 1800 + }, + { + "epoch": 0.851013672795851, + "grad_norm": 0.7182868719100952, + "learning_rate": 4.252710985384253e-05, + "loss": 0.3418, + "num_input_tokens_seen": 1810080, + "step": 1805 + }, + { + "epoch": 0.8533710513908533, + "grad_norm": 0.6148192882537842, + "learning_rate": 4.264497878359265e-05, + "loss": 0.3367, + "num_input_tokens_seen": 1814400, + "step": 1810 + }, + { + "epoch": 0.8557284299858557, + "grad_norm": 0.6615179777145386, + "learning_rate": 4.276284771334276e-05, + "loss": 0.3963, + "num_input_tokens_seen": 1819296, + "step": 1815 + }, + { + "epoch": 0.858085808580858, + "grad_norm": 0.8268840909004211, + "learning_rate": 4.288071664309288e-05, + "loss": 0.3978, + "num_input_tokens_seen": 1824160, + "step": 1820 + }, + { + "epoch": 0.8604431871758604, + "grad_norm": 1.0768799781799316, + "learning_rate": 4.2998585572843006e-05, + "loss": 0.395, + "num_input_tokens_seen": 1830240, + "step": 1825 + }, + { + "epoch": 0.8628005657708628, + "grad_norm": 0.982930064201355, + "learning_rate": 4.311645450259312e-05, + "loss": 0.3795, + "num_input_tokens_seen": 1834752, + "step": 1830 + }, + { + "epoch": 0.8651579443658651, + "grad_norm": 0.7406169176101685, + "learning_rate": 4.323432343234324e-05, + "loss": 0.4403, + "num_input_tokens_seen": 1839936, + "step": 1835 + }, + { + "epoch": 0.8675153229608675, + "grad_norm": 0.38677406311035156, + "learning_rate": 4.335219236209335e-05, + "loss": 0.325, + "num_input_tokens_seen": 1844448, + "step": 1840 + }, + { + "epoch": 0.8698727015558698, + "grad_norm": 0.5065212845802307, + "learning_rate": 4.347006129184347e-05, + "loss": 0.3471, + "num_input_tokens_seen": 1848704, + "step": 1845 + }, + { + "epoch": 0.8722300801508722, + "grad_norm": 0.6621030569076538, + "learning_rate": 4.3587930221593596e-05, + "loss": 0.2985, + "num_input_tokens_seen": 1854144, + "step": 1850 + }, + { + "epoch": 0.8745874587458746, + "grad_norm": 0.7197255492210388, + "learning_rate": 4.370579915134371e-05, + "loss": 0.4728, + "num_input_tokens_seen": 1859232, + "step": 1855 + }, + { + "epoch": 0.8769448373408769, + "grad_norm": 0.630316972732544, + "learning_rate": 4.382366808109383e-05, + "loss": 0.3759, + "num_input_tokens_seen": 1864736, + "step": 1860 + }, + { + "epoch": 0.8793022159358793, + "grad_norm": 0.8806106448173523, + "learning_rate": 4.394153701084394e-05, + "loss": 0.39, + "num_input_tokens_seen": 1870336, + "step": 1865 + }, + { + "epoch": 0.8816595945308816, + "grad_norm": 0.46690720319747925, + "learning_rate": 4.405940594059406e-05, + "loss": 0.4695, + "num_input_tokens_seen": 1875872, + "step": 1870 + }, + { + "epoch": 0.884016973125884, + "grad_norm": 0.6827927231788635, + "learning_rate": 4.417727487034418e-05, + "loss": 0.4363, + "num_input_tokens_seen": 1880288, + "step": 1875 + }, + { + "epoch": 0.8863743517208864, + "grad_norm": 0.8727323412895203, + "learning_rate": 4.42951438000943e-05, + "loss": 0.4968, + "num_input_tokens_seen": 1885376, + "step": 1880 + }, + { + "epoch": 0.8887317303158887, + "grad_norm": 0.7675926089286804, + "learning_rate": 4.441301272984442e-05, + "loss": 0.3768, + "num_input_tokens_seen": 1890272, + "step": 1885 + }, + { + "epoch": 0.8910891089108911, + "grad_norm": 0.6321307420730591, + "learning_rate": 4.453088165959453e-05, + "loss": 0.3377, + "num_input_tokens_seen": 1894976, + "step": 1890 + }, + { + "epoch": 0.8934464875058934, + "grad_norm": 0.9417411088943481, + "learning_rate": 4.464875058934465e-05, + "loss": 0.4216, + "num_input_tokens_seen": 1900992, + "step": 1895 + }, + { + "epoch": 0.8958038661008958, + "grad_norm": 0.7948809862136841, + "learning_rate": 4.476661951909477e-05, + "loss": 0.4212, + "num_input_tokens_seen": 1905984, + "step": 1900 + }, + { + "epoch": 0.8981612446958982, + "grad_norm": 0.5846356749534607, + "learning_rate": 4.488448844884489e-05, + "loss": 0.3859, + "num_input_tokens_seen": 1910848, + "step": 1905 + }, + { + "epoch": 0.9005186232909005, + "grad_norm": 0.4934006929397583, + "learning_rate": 4.5002357378595007e-05, + "loss": 0.4846, + "num_input_tokens_seen": 1916448, + "step": 1910 + }, + { + "epoch": 0.9028760018859029, + "grad_norm": 0.548418402671814, + "learning_rate": 4.512022630834512e-05, + "loss": 0.3485, + "num_input_tokens_seen": 1921440, + "step": 1915 + }, + { + "epoch": 0.9052333804809052, + "grad_norm": 0.4255371689796448, + "learning_rate": 4.523809523809524e-05, + "loss": 0.3464, + "num_input_tokens_seen": 1925984, + "step": 1920 + }, + { + "epoch": 0.9075907590759076, + "grad_norm": 0.5660032629966736, + "learning_rate": 4.535596416784536e-05, + "loss": 0.3943, + "num_input_tokens_seen": 1932096, + "step": 1925 + }, + { + "epoch": 0.90994813767091, + "grad_norm": 0.3890734910964966, + "learning_rate": 4.547383309759548e-05, + "loss": 0.3481, + "num_input_tokens_seen": 1936640, + "step": 1930 + }, + { + "epoch": 0.9123055162659123, + "grad_norm": 0.44773417711257935, + "learning_rate": 4.5591702027345596e-05, + "loss": 0.3508, + "num_input_tokens_seen": 1941472, + "step": 1935 + }, + { + "epoch": 0.9146628948609147, + "grad_norm": 0.5673953890800476, + "learning_rate": 4.570957095709571e-05, + "loss": 0.3409, + "num_input_tokens_seen": 1946528, + "step": 1940 + }, + { + "epoch": 0.917020273455917, + "grad_norm": 0.6340048909187317, + "learning_rate": 4.582743988684583e-05, + "loss": 0.4899, + "num_input_tokens_seen": 1952352, + "step": 1945 + }, + { + "epoch": 0.9193776520509194, + "grad_norm": 0.511544406414032, + "learning_rate": 4.594530881659595e-05, + "loss": 0.3233, + "num_input_tokens_seen": 1957344, + "step": 1950 + }, + { + "epoch": 0.9217350306459218, + "grad_norm": 0.7124595046043396, + "learning_rate": 4.6063177746346066e-05, + "loss": 0.6171, + "num_input_tokens_seen": 1961824, + "step": 1955 + }, + { + "epoch": 0.9240924092409241, + "grad_norm": 0.5116147398948669, + "learning_rate": 4.6181046676096186e-05, + "loss": 0.3404, + "num_input_tokens_seen": 1966368, + "step": 1960 + }, + { + "epoch": 0.9264497878359265, + "grad_norm": 0.9519069194793701, + "learning_rate": 4.62989156058463e-05, + "loss": 0.4984, + "num_input_tokens_seen": 1973600, + "step": 1965 + }, + { + "epoch": 0.9288071664309288, + "grad_norm": 0.7634571194648743, + "learning_rate": 4.641678453559642e-05, + "loss": 0.3762, + "num_input_tokens_seen": 1978880, + "step": 1970 + }, + { + "epoch": 0.9311645450259312, + "grad_norm": 0.7683371305465698, + "learning_rate": 4.653465346534654e-05, + "loss": 0.4254, + "num_input_tokens_seen": 1984864, + "step": 1975 + }, + { + "epoch": 0.9335219236209336, + "grad_norm": 0.6550701856613159, + "learning_rate": 4.6652522395096656e-05, + "loss": 0.5014, + "num_input_tokens_seen": 1990592, + "step": 1980 + }, + { + "epoch": 0.9358793022159358, + "grad_norm": 0.4278097450733185, + "learning_rate": 4.6770391324846775e-05, + "loss": 0.3428, + "num_input_tokens_seen": 1996512, + "step": 1985 + }, + { + "epoch": 0.9382366808109383, + "grad_norm": 0.507910966873169, + "learning_rate": 4.688826025459689e-05, + "loss": 0.3909, + "num_input_tokens_seen": 2000960, + "step": 1990 + }, + { + "epoch": 0.9405940594059405, + "grad_norm": 0.5650092363357544, + "learning_rate": 4.700612918434701e-05, + "loss": 0.3099, + "num_input_tokens_seen": 2005056, + "step": 1995 + }, + { + "epoch": 0.9429514380009429, + "grad_norm": 0.9456562399864197, + "learning_rate": 4.7123998114097126e-05, + "loss": 0.4365, + "num_input_tokens_seen": 2009472, + "step": 2000 + }, + { + "epoch": 0.9453088165959453, + "grad_norm": 0.5485828518867493, + "learning_rate": 4.7241867043847245e-05, + "loss": 0.4227, + "num_input_tokens_seen": 2013888, + "step": 2005 + }, + { + "epoch": 0.9476661951909476, + "grad_norm": 0.6328248381614685, + "learning_rate": 4.7359735973597365e-05, + "loss": 0.4265, + "num_input_tokens_seen": 2019808, + "step": 2010 + }, + { + "epoch": 0.95002357378595, + "grad_norm": 0.7029588222503662, + "learning_rate": 4.747760490334748e-05, + "loss": 0.3925, + "num_input_tokens_seen": 2025440, + "step": 2015 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.8300467729568481, + "learning_rate": 4.7595473833097597e-05, + "loss": 0.3996, + "num_input_tokens_seen": 2030464, + "step": 2020 + }, + { + "epoch": 0.9547383309759547, + "grad_norm": 0.7641953825950623, + "learning_rate": 4.7713342762847716e-05, + "loss": 0.357, + "num_input_tokens_seen": 2035328, + "step": 2025 + }, + { + "epoch": 0.9570957095709571, + "grad_norm": 0.6328737735748291, + "learning_rate": 4.7831211692597835e-05, + "loss": 0.3375, + "num_input_tokens_seen": 2039616, + "step": 2030 + }, + { + "epoch": 0.9594530881659594, + "grad_norm": 0.6025834083557129, + "learning_rate": 4.7949080622347954e-05, + "loss": 0.3411, + "num_input_tokens_seen": 2045408, + "step": 2035 + }, + { + "epoch": 0.9618104667609618, + "grad_norm": 1.2114486694335938, + "learning_rate": 4.806694955209807e-05, + "loss": 0.3603, + "num_input_tokens_seen": 2050816, + "step": 2040 + }, + { + "epoch": 0.9641678453559641, + "grad_norm": 0.46354880928993225, + "learning_rate": 4.8184818481848186e-05, + "loss": 0.3533, + "num_input_tokens_seen": 2056128, + "step": 2045 + }, + { + "epoch": 0.9665252239509665, + "grad_norm": 0.4769867956638336, + "learning_rate": 4.8302687411598305e-05, + "loss": 0.4167, + "num_input_tokens_seen": 2061120, + "step": 2050 + }, + { + "epoch": 0.9688826025459689, + "grad_norm": 0.46283772587776184, + "learning_rate": 4.8420556341348425e-05, + "loss": 0.3588, + "num_input_tokens_seen": 2066400, + "step": 2055 + }, + { + "epoch": 0.9712399811409712, + "grad_norm": 0.9059154391288757, + "learning_rate": 4.8538425271098544e-05, + "loss": 0.3241, + "num_input_tokens_seen": 2071200, + "step": 2060 + }, + { + "epoch": 0.9735973597359736, + "grad_norm": 1.1071295738220215, + "learning_rate": 4.8656294200848656e-05, + "loss": 0.4347, + "num_input_tokens_seen": 2076320, + "step": 2065 + }, + { + "epoch": 0.9759547383309759, + "grad_norm": 0.6848787069320679, + "learning_rate": 4.8774163130598776e-05, + "loss": 0.4199, + "num_input_tokens_seen": 2082016, + "step": 2070 + }, + { + "epoch": 0.9783121169259783, + "grad_norm": 0.7899431586265564, + "learning_rate": 4.8892032060348895e-05, + "loss": 0.3971, + "num_input_tokens_seen": 2087136, + "step": 2075 + }, + { + "epoch": 0.9806694955209807, + "grad_norm": 0.7939406037330627, + "learning_rate": 4.9009900990099014e-05, + "loss": 0.4185, + "num_input_tokens_seen": 2092352, + "step": 2080 + }, + { + "epoch": 0.983026874115983, + "grad_norm": 0.7662653923034668, + "learning_rate": 4.9127769919849133e-05, + "loss": 0.3943, + "num_input_tokens_seen": 2096576, + "step": 2085 + }, + { + "epoch": 0.9853842527109854, + "grad_norm": 0.6294280290603638, + "learning_rate": 4.9245638849599246e-05, + "loss": 0.3597, + "num_input_tokens_seen": 2101472, + "step": 2090 + }, + { + "epoch": 0.9877416313059877, + "grad_norm": 0.5478662848472595, + "learning_rate": 4.9363507779349365e-05, + "loss": 0.3508, + "num_input_tokens_seen": 2106144, + "step": 2095 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 1.3307567834854126, + "learning_rate": 4.9481376709099484e-05, + "loss": 0.4336, + "num_input_tokens_seen": 2110880, + "step": 2100 + }, + { + "epoch": 0.9924563884959925, + "grad_norm": 0.7126138806343079, + "learning_rate": 4.9599245638849604e-05, + "loss": 0.3623, + "num_input_tokens_seen": 2115712, + "step": 2105 + }, + { + "epoch": 0.9948137670909948, + "grad_norm": 0.5800937414169312, + "learning_rate": 4.971711456859972e-05, + "loss": 0.3879, + "num_input_tokens_seen": 2120192, + "step": 2110 + }, + { + "epoch": 0.9971711456859972, + "grad_norm": 0.597841739654541, + "learning_rate": 4.9834983498349835e-05, + "loss": 0.385, + "num_input_tokens_seen": 2125728, + "step": 2115 + }, + { + "epoch": 0.9995285242809995, + "grad_norm": 0.6761749386787415, + "learning_rate": 4.9952852428099955e-05, + "loss": 0.3203, + "num_input_tokens_seen": 2130624, + "step": 2120 + }, + { + "epoch": 1.0004714757190005, + "eval_loss": 0.3713439702987671, + "eval_runtime": 25.5625, + "eval_samples_per_second": 36.89, + "eval_steps_per_second": 9.232, + "num_input_tokens_seen": 2133248, + "step": 2122 + }, + { + "epoch": 1.0018859028760019, + "grad_norm": 0.4754067063331604, + "learning_rate": 4.9999996952905647e-05, + "loss": 0.4306, + "num_input_tokens_seen": 2136160, + "step": 2125 + }, + { + "epoch": 1.0042432814710043, + "grad_norm": 0.8162623643875122, + "learning_rate": 4.999997833177614e-05, + "loss": 0.3788, + "num_input_tokens_seen": 2140320, + "step": 2130 + }, + { + "epoch": 1.0066006600660067, + "grad_norm": 0.653659999370575, + "learning_rate": 4.999994278235993e-05, + "loss": 0.3777, + "num_input_tokens_seen": 2145952, + "step": 2135 + }, + { + "epoch": 1.0089580386610089, + "grad_norm": 0.5008558034896851, + "learning_rate": 4.9999890304681087e-05, + "loss": 0.3609, + "num_input_tokens_seen": 2151936, + "step": 2140 + }, + { + "epoch": 1.0113154172560113, + "grad_norm": 0.4382736086845398, + "learning_rate": 4.999982089877514e-05, + "loss": 0.3768, + "num_input_tokens_seen": 2157824, + "step": 2145 + }, + { + "epoch": 1.0136727958510137, + "grad_norm": 0.5452284216880798, + "learning_rate": 4.999973456468909e-05, + "loss": 0.3195, + "num_input_tokens_seen": 2163744, + "step": 2150 + }, + { + "epoch": 1.016030174446016, + "grad_norm": 0.540254533290863, + "learning_rate": 4.9999631302481394e-05, + "loss": 0.307, + "num_input_tokens_seen": 2168608, + "step": 2155 + }, + { + "epoch": 1.0183875530410185, + "grad_norm": 0.38552770018577576, + "learning_rate": 4.999951111222198e-05, + "loss": 0.4314, + "num_input_tokens_seen": 2173312, + "step": 2160 + }, + { + "epoch": 1.0207449316360206, + "grad_norm": 0.8560072779655457, + "learning_rate": 4.999937399399221e-05, + "loss": 0.4226, + "num_input_tokens_seen": 2179008, + "step": 2165 + }, + { + "epoch": 1.023102310231023, + "grad_norm": 1.295783519744873, + "learning_rate": 4.999921994788496e-05, + "loss": 0.4396, + "num_input_tokens_seen": 2184320, + "step": 2170 + }, + { + "epoch": 1.0254596888260255, + "grad_norm": 0.3802526593208313, + "learning_rate": 4.9999048974004526e-05, + "loss": 0.3267, + "num_input_tokens_seen": 2188928, + "step": 2175 + }, + { + "epoch": 1.0278170674210279, + "grad_norm": 0.5793749094009399, + "learning_rate": 4.9998861072466684e-05, + "loss": 0.3437, + "num_input_tokens_seen": 2194016, + "step": 2180 + }, + { + "epoch": 1.0301744460160303, + "grad_norm": 1.5657917261123657, + "learning_rate": 4.9998656243398664e-05, + "loss": 0.3968, + "num_input_tokens_seen": 2197632, + "step": 2185 + }, + { + "epoch": 1.0325318246110324, + "grad_norm": 0.4864366948604584, + "learning_rate": 4.999843448693917e-05, + "loss": 0.3616, + "num_input_tokens_seen": 2203328, + "step": 2190 + }, + { + "epoch": 1.0348892032060348, + "grad_norm": 0.6371437907218933, + "learning_rate": 4.999819580323835e-05, + "loss": 0.4145, + "num_input_tokens_seen": 2207424, + "step": 2195 + }, + { + "epoch": 1.0372465818010372, + "grad_norm": 0.5469833612442017, + "learning_rate": 4.9997940192457826e-05, + "loss": 0.4028, + "num_input_tokens_seen": 2211616, + "step": 2200 + }, + { + "epoch": 1.0396039603960396, + "grad_norm": 0.5671599507331848, + "learning_rate": 4.9997667654770685e-05, + "loss": 0.416, + "num_input_tokens_seen": 2215776, + "step": 2205 + }, + { + "epoch": 1.041961338991042, + "grad_norm": 0.915004312992096, + "learning_rate": 4.9997378190361476e-05, + "loss": 0.3789, + "num_input_tokens_seen": 2220576, + "step": 2210 + }, + { + "epoch": 1.0443187175860442, + "grad_norm": 0.44941556453704834, + "learning_rate": 4.9997071799426196e-05, + "loss": 0.3149, + "num_input_tokens_seen": 2225024, + "step": 2215 + }, + { + "epoch": 1.0466760961810466, + "grad_norm": 0.5401982069015503, + "learning_rate": 4.9996748482172306e-05, + "loss": 0.358, + "num_input_tokens_seen": 2229888, + "step": 2220 + }, + { + "epoch": 1.049033474776049, + "grad_norm": 0.7395175099372864, + "learning_rate": 4.999640823881875e-05, + "loss": 0.3938, + "num_input_tokens_seen": 2234624, + "step": 2225 + }, + { + "epoch": 1.0513908533710514, + "grad_norm": 0.4630518853664398, + "learning_rate": 4.9996051069595906e-05, + "loss": 0.3864, + "num_input_tokens_seen": 2238656, + "step": 2230 + }, + { + "epoch": 1.0537482319660538, + "grad_norm": 0.6198683381080627, + "learning_rate": 4.999567697474563e-05, + "loss": 0.3598, + "num_input_tokens_seen": 2243552, + "step": 2235 + }, + { + "epoch": 1.056105610561056, + "grad_norm": 0.5433990955352783, + "learning_rate": 4.999528595452124e-05, + "loss": 0.3566, + "num_input_tokens_seen": 2249056, + "step": 2240 + }, + { + "epoch": 1.0584629891560584, + "grad_norm": 0.3676885962486267, + "learning_rate": 4.999487800918749e-05, + "loss": 0.2872, + "num_input_tokens_seen": 2254080, + "step": 2245 + }, + { + "epoch": 1.0608203677510608, + "grad_norm": 0.9657170176506042, + "learning_rate": 4.999445313902063e-05, + "loss": 0.3207, + "num_input_tokens_seen": 2259232, + "step": 2250 + }, + { + "epoch": 1.0631777463460632, + "grad_norm": 0.6619391441345215, + "learning_rate": 4.9994011344308355e-05, + "loss": 0.3365, + "num_input_tokens_seen": 2263680, + "step": 2255 + }, + { + "epoch": 1.0655351249410656, + "grad_norm": 0.6471905708312988, + "learning_rate": 4.99935526253498e-05, + "loss": 0.2966, + "num_input_tokens_seen": 2267904, + "step": 2260 + }, + { + "epoch": 1.0678925035360678, + "grad_norm": 0.33572089672088623, + "learning_rate": 4.999307698245559e-05, + "loss": 0.2862, + "num_input_tokens_seen": 2272992, + "step": 2265 + }, + { + "epoch": 1.0702498821310702, + "grad_norm": 0.36872416734695435, + "learning_rate": 4.9992584415947796e-05, + "loss": 0.2872, + "num_input_tokens_seen": 2276864, + "step": 2270 + }, + { + "epoch": 1.0726072607260726, + "grad_norm": 0.4661833345890045, + "learning_rate": 4.999207492615996e-05, + "loss": 0.4038, + "num_input_tokens_seen": 2282688, + "step": 2275 + }, + { + "epoch": 1.074964639321075, + "grad_norm": 0.721641480922699, + "learning_rate": 4.999154851343706e-05, + "loss": 0.4158, + "num_input_tokens_seen": 2286720, + "step": 2280 + }, + { + "epoch": 1.0773220179160774, + "grad_norm": 0.703184187412262, + "learning_rate": 4.9991005178135545e-05, + "loss": 0.3179, + "num_input_tokens_seen": 2291488, + "step": 2285 + }, + { + "epoch": 1.0796793965110796, + "grad_norm": 0.4073609709739685, + "learning_rate": 4.9990444920623334e-05, + "loss": 0.3539, + "num_input_tokens_seen": 2296736, + "step": 2290 + }, + { + "epoch": 1.082036775106082, + "grad_norm": 0.6142986416816711, + "learning_rate": 4.99898677412798e-05, + "loss": 0.3744, + "num_input_tokens_seen": 2301184, + "step": 2295 + }, + { + "epoch": 1.0843941537010844, + "grad_norm": 0.6926703453063965, + "learning_rate": 4.9989273640495766e-05, + "loss": 0.3732, + "num_input_tokens_seen": 2305536, + "step": 2300 + }, + { + "epoch": 1.0867515322960868, + "grad_norm": 0.4068440794944763, + "learning_rate": 4.998866261867351e-05, + "loss": 0.4447, + "num_input_tokens_seen": 2311584, + "step": 2305 + }, + { + "epoch": 1.0891089108910892, + "grad_norm": 0.41175734996795654, + "learning_rate": 4.998803467622677e-05, + "loss": 0.3628, + "num_input_tokens_seen": 2316192, + "step": 2310 + }, + { + "epoch": 1.0914662894860914, + "grad_norm": 0.7214474678039551, + "learning_rate": 4.998738981358076e-05, + "loss": 0.3396, + "num_input_tokens_seen": 2320256, + "step": 2315 + }, + { + "epoch": 1.0938236680810938, + "grad_norm": 0.6084800362586975, + "learning_rate": 4.998672803117214e-05, + "loss": 0.3712, + "num_input_tokens_seen": 2324384, + "step": 2320 + }, + { + "epoch": 1.0961810466760962, + "grad_norm": 0.5308446288108826, + "learning_rate": 4.9986049329449006e-05, + "loss": 0.2997, + "num_input_tokens_seen": 2328960, + "step": 2325 + }, + { + "epoch": 1.0985384252710986, + "grad_norm": 0.5622880458831787, + "learning_rate": 4.998535370887093e-05, + "loss": 0.3713, + "num_input_tokens_seen": 2334368, + "step": 2330 + }, + { + "epoch": 1.100895803866101, + "grad_norm": 0.6304563879966736, + "learning_rate": 4.9984641169908956e-05, + "loss": 0.3232, + "num_input_tokens_seen": 2339008, + "step": 2335 + }, + { + "epoch": 1.1032531824611032, + "grad_norm": 0.6377050280570984, + "learning_rate": 4.998391171304556e-05, + "loss": 0.3414, + "num_input_tokens_seen": 2343392, + "step": 2340 + }, + { + "epoch": 1.1056105610561056, + "grad_norm": 0.7724384665489197, + "learning_rate": 4.998316533877467e-05, + "loss": 0.4243, + "num_input_tokens_seen": 2349184, + "step": 2345 + }, + { + "epoch": 1.107967939651108, + "grad_norm": 0.463733434677124, + "learning_rate": 4.99824020476017e-05, + "loss": 0.3543, + "num_input_tokens_seen": 2353024, + "step": 2350 + }, + { + "epoch": 1.1103253182461104, + "grad_norm": 0.8349390029907227, + "learning_rate": 4.998162184004348e-05, + "loss": 0.3789, + "num_input_tokens_seen": 2357856, + "step": 2355 + }, + { + "epoch": 1.1126826968411128, + "grad_norm": 1.3780730962753296, + "learning_rate": 4.998082471662833e-05, + "loss": 0.6346, + "num_input_tokens_seen": 2366080, + "step": 2360 + }, + { + "epoch": 1.115040075436115, + "grad_norm": 0.7921567559242249, + "learning_rate": 4.998001067789599e-05, + "loss": 0.3667, + "num_input_tokens_seen": 2371424, + "step": 2365 + }, + { + "epoch": 1.1173974540311173, + "grad_norm": 0.7693462371826172, + "learning_rate": 4.997917972439769e-05, + "loss": 0.4452, + "num_input_tokens_seen": 2378112, + "step": 2370 + }, + { + "epoch": 1.1197548326261197, + "grad_norm": 1.160934567451477, + "learning_rate": 4.997833185669607e-05, + "loss": 0.4803, + "num_input_tokens_seen": 2383040, + "step": 2375 + }, + { + "epoch": 1.1221122112211221, + "grad_norm": 0.5763225555419922, + "learning_rate": 4.997746707536527e-05, + "loss": 0.3594, + "num_input_tokens_seen": 2387360, + "step": 2380 + }, + { + "epoch": 1.1244695898161245, + "grad_norm": 0.6073573231697083, + "learning_rate": 4.997658538099086e-05, + "loss": 0.4096, + "num_input_tokens_seen": 2392576, + "step": 2385 + }, + { + "epoch": 1.1268269684111267, + "grad_norm": 1.0481877326965332, + "learning_rate": 4.997568677416986e-05, + "loss": 0.3327, + "num_input_tokens_seen": 2397760, + "step": 2390 + }, + { + "epoch": 1.1291843470061291, + "grad_norm": 0.978972315788269, + "learning_rate": 4.997477125551073e-05, + "loss": 0.3375, + "num_input_tokens_seen": 2401792, + "step": 2395 + }, + { + "epoch": 1.1315417256011315, + "grad_norm": 0.3217482268810272, + "learning_rate": 4.997383882563343e-05, + "loss": 0.3226, + "num_input_tokens_seen": 2407680, + "step": 2400 + }, + { + "epoch": 1.133899104196134, + "grad_norm": 0.5573812127113342, + "learning_rate": 4.997288948516931e-05, + "loss": 0.3078, + "num_input_tokens_seen": 2412512, + "step": 2405 + }, + { + "epoch": 1.1362564827911363, + "grad_norm": 0.4608996510505676, + "learning_rate": 4.997192323476121e-05, + "loss": 0.3501, + "num_input_tokens_seen": 2417696, + "step": 2410 + }, + { + "epoch": 1.1386138613861387, + "grad_norm": 0.6298462152481079, + "learning_rate": 4.997094007506341e-05, + "loss": 0.351, + "num_input_tokens_seen": 2422304, + "step": 2415 + }, + { + "epoch": 1.140971239981141, + "grad_norm": 0.5126791596412659, + "learning_rate": 4.996994000674163e-05, + "loss": 0.373, + "num_input_tokens_seen": 2427552, + "step": 2420 + }, + { + "epoch": 1.1433286185761433, + "grad_norm": 0.3848946690559387, + "learning_rate": 4.996892303047306e-05, + "loss": 0.3926, + "num_input_tokens_seen": 2432544, + "step": 2425 + }, + { + "epoch": 1.1456859971711457, + "grad_norm": 0.45506325364112854, + "learning_rate": 4.996788914694632e-05, + "loss": 0.3776, + "num_input_tokens_seen": 2437248, + "step": 2430 + }, + { + "epoch": 1.1480433757661481, + "grad_norm": 0.5318833589553833, + "learning_rate": 4.996683835686149e-05, + "loss": 0.3055, + "num_input_tokens_seen": 2442912, + "step": 2435 + }, + { + "epoch": 1.1504007543611503, + "grad_norm": 0.3198949992656708, + "learning_rate": 4.996577066093009e-05, + "loss": 0.4149, + "num_input_tokens_seen": 2449184, + "step": 2440 + }, + { + "epoch": 1.1527581329561527, + "grad_norm": 0.6527677774429321, + "learning_rate": 4.996468605987509e-05, + "loss": 0.3659, + "num_input_tokens_seen": 2455008, + "step": 2445 + }, + { + "epoch": 1.155115511551155, + "grad_norm": 0.4217100739479065, + "learning_rate": 4.996358455443092e-05, + "loss": 0.3219, + "num_input_tokens_seen": 2459872, + "step": 2450 + }, + { + "epoch": 1.1574728901461575, + "grad_norm": 0.8086008429527283, + "learning_rate": 4.996246614534342e-05, + "loss": 0.3895, + "num_input_tokens_seen": 2466016, + "step": 2455 + }, + { + "epoch": 1.15983026874116, + "grad_norm": 0.571332573890686, + "learning_rate": 4.996133083336993e-05, + "loss": 0.3954, + "num_input_tokens_seen": 2471584, + "step": 2460 + }, + { + "epoch": 1.1621876473361623, + "grad_norm": 1.3517595529556274, + "learning_rate": 4.996017861927919e-05, + "loss": 0.4401, + "num_input_tokens_seen": 2478400, + "step": 2465 + }, + { + "epoch": 1.1645450259311645, + "grad_norm": 0.7774268984794617, + "learning_rate": 4.9959009503851394e-05, + "loss": 0.3176, + "num_input_tokens_seen": 2482752, + "step": 2470 + }, + { + "epoch": 1.166902404526167, + "grad_norm": 0.5184615254402161, + "learning_rate": 4.9957823487878205e-05, + "loss": 0.3417, + "num_input_tokens_seen": 2487584, + "step": 2475 + }, + { + "epoch": 1.1692597831211693, + "grad_norm": 0.6050660014152527, + "learning_rate": 4.9956620572162696e-05, + "loss": 0.3533, + "num_input_tokens_seen": 2491872, + "step": 2480 + }, + { + "epoch": 1.1716171617161717, + "grad_norm": 0.5211944580078125, + "learning_rate": 4.995540075751942e-05, + "loss": 0.256, + "num_input_tokens_seen": 2497056, + "step": 2485 + }, + { + "epoch": 1.1739745403111739, + "grad_norm": 0.7576819062232971, + "learning_rate": 4.995416404477433e-05, + "loss": 0.3542, + "num_input_tokens_seen": 2502880, + "step": 2490 + }, + { + "epoch": 1.1763319189061763, + "grad_norm": 0.6864145994186401, + "learning_rate": 4.995291043476486e-05, + "loss": 0.3249, + "num_input_tokens_seen": 2508480, + "step": 2495 + }, + { + "epoch": 1.1786892975011787, + "grad_norm": 0.9817536473274231, + "learning_rate": 4.995163992833986e-05, + "loss": 0.4497, + "num_input_tokens_seen": 2512960, + "step": 2500 + }, + { + "epoch": 1.181046676096181, + "grad_norm": 0.735061526298523, + "learning_rate": 4.9950352526359634e-05, + "loss": 0.448, + "num_input_tokens_seen": 2519456, + "step": 2505 + }, + { + "epoch": 1.1834040546911835, + "grad_norm": 0.912033200263977, + "learning_rate": 4.9949048229695925e-05, + "loss": 0.391, + "num_input_tokens_seen": 2525216, + "step": 2510 + }, + { + "epoch": 1.1857614332861859, + "grad_norm": 1.1792798042297363, + "learning_rate": 4.994772703923192e-05, + "loss": 0.3564, + "num_input_tokens_seen": 2530240, + "step": 2515 + }, + { + "epoch": 1.188118811881188, + "grad_norm": 0.7150898575782776, + "learning_rate": 4.994638895586222e-05, + "loss": 0.3366, + "num_input_tokens_seen": 2535552, + "step": 2520 + }, + { + "epoch": 1.1904761904761905, + "grad_norm": 0.4459812045097351, + "learning_rate": 4.99450339804929e-05, + "loss": 0.361, + "num_input_tokens_seen": 2540704, + "step": 2525 + }, + { + "epoch": 1.1928335690711929, + "grad_norm": 0.5798656344413757, + "learning_rate": 4.9943662114041464e-05, + "loss": 0.3682, + "num_input_tokens_seen": 2545376, + "step": 2530 + }, + { + "epoch": 1.1951909476661953, + "grad_norm": 0.7328293323516846, + "learning_rate": 4.994227335743682e-05, + "loss": 0.3661, + "num_input_tokens_seen": 2550592, + "step": 2535 + }, + { + "epoch": 1.1975483262611974, + "grad_norm": 0.544994056224823, + "learning_rate": 4.994086771161937e-05, + "loss": 0.3461, + "num_input_tokens_seen": 2554912, + "step": 2540 + }, + { + "epoch": 1.1999057048561999, + "grad_norm": 0.525545060634613, + "learning_rate": 4.99394451775409e-05, + "loss": 0.3823, + "num_input_tokens_seen": 2560064, + "step": 2545 + }, + { + "epoch": 1.2022630834512023, + "grad_norm": 0.4715658128261566, + "learning_rate": 4.9938005756164664e-05, + "loss": 0.3318, + "num_input_tokens_seen": 2565184, + "step": 2550 + }, + { + "epoch": 1.2046204620462047, + "grad_norm": 0.5904998779296875, + "learning_rate": 4.9936549448465334e-05, + "loss": 0.3534, + "num_input_tokens_seen": 2569696, + "step": 2555 + }, + { + "epoch": 1.206977840641207, + "grad_norm": 1.0814273357391357, + "learning_rate": 4.993507625542903e-05, + "loss": 0.3501, + "num_input_tokens_seen": 2574112, + "step": 2560 + }, + { + "epoch": 1.2093352192362095, + "grad_norm": 0.6115481853485107, + "learning_rate": 4.993358617805329e-05, + "loss": 0.3274, + "num_input_tokens_seen": 2578688, + "step": 2565 + }, + { + "epoch": 1.2116925978312116, + "grad_norm": 0.7028239965438843, + "learning_rate": 4.993207921734711e-05, + "loss": 0.3776, + "num_input_tokens_seen": 2583552, + "step": 2570 + }, + { + "epoch": 1.214049976426214, + "grad_norm": 0.6012017130851746, + "learning_rate": 4.993055537433087e-05, + "loss": 0.3727, + "num_input_tokens_seen": 2588384, + "step": 2575 + }, + { + "epoch": 1.2164073550212164, + "grad_norm": 0.5226420760154724, + "learning_rate": 4.992901465003644e-05, + "loss": 0.3177, + "num_input_tokens_seen": 2593504, + "step": 2580 + }, + { + "epoch": 1.2187647336162188, + "grad_norm": 0.8864946961402893, + "learning_rate": 4.9927457045507084e-05, + "loss": 0.3734, + "num_input_tokens_seen": 2598752, + "step": 2585 + }, + { + "epoch": 1.221122112211221, + "grad_norm": 0.5882484316825867, + "learning_rate": 4.9925882561797516e-05, + "loss": 0.3704, + "num_input_tokens_seen": 2604096, + "step": 2590 + }, + { + "epoch": 1.2234794908062234, + "grad_norm": 0.5059121251106262, + "learning_rate": 4.9924291199973855e-05, + "loss": 0.3294, + "num_input_tokens_seen": 2608960, + "step": 2595 + }, + { + "epoch": 1.2258368694012258, + "grad_norm": 0.5383749008178711, + "learning_rate": 4.992268296111367e-05, + "loss": 0.3125, + "num_input_tokens_seen": 2614528, + "step": 2600 + }, + { + "epoch": 1.2281942479962282, + "grad_norm": 0.6961892247200012, + "learning_rate": 4.992105784630595e-05, + "loss": 0.3033, + "num_input_tokens_seen": 2619712, + "step": 2605 + }, + { + "epoch": 1.2305516265912306, + "grad_norm": 0.4388941526412964, + "learning_rate": 4.991941585665111e-05, + "loss": 0.3188, + "num_input_tokens_seen": 2624096, + "step": 2610 + }, + { + "epoch": 1.232909005186233, + "grad_norm": 0.4958673417568207, + "learning_rate": 4.9917756993261e-05, + "loss": 0.3003, + "num_input_tokens_seen": 2628256, + "step": 2615 + }, + { + "epoch": 1.2352663837812352, + "grad_norm": 0.6410450339317322, + "learning_rate": 4.9916081257258884e-05, + "loss": 0.4523, + "num_input_tokens_seen": 2632448, + "step": 2620 + }, + { + "epoch": 1.2376237623762376, + "grad_norm": 0.32272690534591675, + "learning_rate": 4.991438864977946e-05, + "loss": 0.3064, + "num_input_tokens_seen": 2637696, + "step": 2625 + }, + { + "epoch": 1.23998114097124, + "grad_norm": 0.4612202048301697, + "learning_rate": 4.991267917196885e-05, + "loss": 0.3286, + "num_input_tokens_seen": 2642240, + "step": 2630 + }, + { + "epoch": 1.2423385195662424, + "grad_norm": 0.5317416787147522, + "learning_rate": 4.991095282498458e-05, + "loss": 0.3808, + "num_input_tokens_seen": 2647040, + "step": 2635 + }, + { + "epoch": 1.2446958981612446, + "grad_norm": 0.6379772424697876, + "learning_rate": 4.990920960999563e-05, + "loss": 0.2932, + "num_input_tokens_seen": 2652032, + "step": 2640 + }, + { + "epoch": 1.247053276756247, + "grad_norm": 0.5013633370399475, + "learning_rate": 4.990744952818239e-05, + "loss": 0.4331, + "num_input_tokens_seen": 2656768, + "step": 2645 + }, + { + "epoch": 1.2494106553512494, + "grad_norm": 1.0653992891311646, + "learning_rate": 4.990567258073665e-05, + "loss": 0.4415, + "num_input_tokens_seen": 2663008, + "step": 2650 + }, + { + "epoch": 1.2517680339462518, + "grad_norm": 0.4642782509326935, + "learning_rate": 4.9903878768861654e-05, + "loss": 0.3624, + "num_input_tokens_seen": 2668704, + "step": 2655 + }, + { + "epoch": 1.2541254125412542, + "grad_norm": 0.9342177510261536, + "learning_rate": 4.9902068093772046e-05, + "loss": 0.3766, + "num_input_tokens_seen": 2673056, + "step": 2660 + }, + { + "epoch": 1.2564827911362566, + "grad_norm": 0.443909227848053, + "learning_rate": 4.9900240556693895e-05, + "loss": 0.3701, + "num_input_tokens_seen": 2677760, + "step": 2665 + }, + { + "epoch": 1.2588401697312588, + "grad_norm": 0.7191041707992554, + "learning_rate": 4.989839615886468e-05, + "loss": 0.3928, + "num_input_tokens_seen": 2682560, + "step": 2670 + }, + { + "epoch": 1.2611975483262612, + "grad_norm": 1.0863206386566162, + "learning_rate": 4.9896534901533296e-05, + "loss": 0.3992, + "num_input_tokens_seen": 2688000, + "step": 2675 + }, + { + "epoch": 1.2635549269212636, + "grad_norm": 0.5972856879234314, + "learning_rate": 4.989465678596007e-05, + "loss": 0.3757, + "num_input_tokens_seen": 2692832, + "step": 2680 + }, + { + "epoch": 1.265912305516266, + "grad_norm": 0.48382487893104553, + "learning_rate": 4.9892761813416735e-05, + "loss": 0.3327, + "num_input_tokens_seen": 2698528, + "step": 2685 + }, + { + "epoch": 1.2682696841112682, + "grad_norm": 1.2289206981658936, + "learning_rate": 4.989084998518642e-05, + "loss": 0.429, + "num_input_tokens_seen": 2702336, + "step": 2690 + }, + { + "epoch": 1.2706270627062706, + "grad_norm": 0.7582857012748718, + "learning_rate": 4.9888921302563706e-05, + "loss": 0.3513, + "num_input_tokens_seen": 2707008, + "step": 2695 + }, + { + "epoch": 1.272984441301273, + "grad_norm": 0.7868213653564453, + "learning_rate": 4.988697576685456e-05, + "loss": 0.3746, + "num_input_tokens_seen": 2711392, + "step": 2700 + }, + { + "epoch": 1.2753418198962754, + "grad_norm": 0.5037279725074768, + "learning_rate": 4.9885013379376363e-05, + "loss": 0.3397, + "num_input_tokens_seen": 2716000, + "step": 2705 + }, + { + "epoch": 1.2776991984912778, + "grad_norm": 0.8365815877914429, + "learning_rate": 4.988303414145792e-05, + "loss": 0.3351, + "num_input_tokens_seen": 2720736, + "step": 2710 + }, + { + "epoch": 1.2800565770862802, + "grad_norm": 0.4260013699531555, + "learning_rate": 4.988103805443942e-05, + "loss": 0.3511, + "num_input_tokens_seen": 2725824, + "step": 2715 + }, + { + "epoch": 1.2824139556812824, + "grad_norm": 0.72172611951828, + "learning_rate": 4.98790251196725e-05, + "loss": 0.4168, + "num_input_tokens_seen": 2731168, + "step": 2720 + }, + { + "epoch": 1.2847713342762848, + "grad_norm": 0.6184137463569641, + "learning_rate": 4.987699533852015e-05, + "loss": 0.3312, + "num_input_tokens_seen": 2736128, + "step": 2725 + }, + { + "epoch": 1.2871287128712872, + "grad_norm": 0.4869082272052765, + "learning_rate": 4.987494871235683e-05, + "loss": 0.2857, + "num_input_tokens_seen": 2740256, + "step": 2730 + }, + { + "epoch": 1.2894860914662896, + "grad_norm": 0.5426555275917053, + "learning_rate": 4.987288524256836e-05, + "loss": 0.3352, + "num_input_tokens_seen": 2745728, + "step": 2735 + }, + { + "epoch": 1.2918434700612917, + "grad_norm": 0.6634846329689026, + "learning_rate": 4.9870804930551996e-05, + "loss": 0.4051, + "num_input_tokens_seen": 2750368, + "step": 2740 + }, + { + "epoch": 1.2942008486562941, + "grad_norm": 0.4338265657424927, + "learning_rate": 4.9868707777716363e-05, + "loss": 0.3967, + "num_input_tokens_seen": 2755520, + "step": 2745 + }, + { + "epoch": 1.2965582272512965, + "grad_norm": 0.4616822898387909, + "learning_rate": 4.986659378548153e-05, + "loss": 0.3134, + "num_input_tokens_seen": 2760192, + "step": 2750 + }, + { + "epoch": 1.298915605846299, + "grad_norm": 0.5989142656326294, + "learning_rate": 4.986446295527893e-05, + "loss": 0.4124, + "num_input_tokens_seen": 2765920, + "step": 2755 + }, + { + "epoch": 1.3012729844413014, + "grad_norm": 0.374980628490448, + "learning_rate": 4.986231528855144e-05, + "loss": 0.3716, + "num_input_tokens_seen": 2770624, + "step": 2760 + }, + { + "epoch": 1.3036303630363038, + "grad_norm": 0.6428484320640564, + "learning_rate": 4.98601507867533e-05, + "loss": 0.4521, + "num_input_tokens_seen": 2776960, + "step": 2765 + }, + { + "epoch": 1.305987741631306, + "grad_norm": 0.535014271736145, + "learning_rate": 4.9857969451350164e-05, + "loss": 0.3285, + "num_input_tokens_seen": 2782048, + "step": 2770 + }, + { + "epoch": 1.3083451202263083, + "grad_norm": 0.47030937671661377, + "learning_rate": 4.985577128381908e-05, + "loss": 0.3393, + "num_input_tokens_seen": 2787520, + "step": 2775 + }, + { + "epoch": 1.3107024988213107, + "grad_norm": 0.5649645924568176, + "learning_rate": 4.9853556285648505e-05, + "loss": 0.3552, + "num_input_tokens_seen": 2792128, + "step": 2780 + }, + { + "epoch": 1.3130598774163131, + "grad_norm": 0.7651804685592651, + "learning_rate": 4.985132445833829e-05, + "loss": 0.3827, + "num_input_tokens_seen": 2797920, + "step": 2785 + }, + { + "epoch": 1.3154172560113153, + "grad_norm": 0.8120128512382507, + "learning_rate": 4.984907580339966e-05, + "loss": 0.3552, + "num_input_tokens_seen": 2802368, + "step": 2790 + }, + { + "epoch": 1.3177746346063177, + "grad_norm": 0.480980783700943, + "learning_rate": 4.984681032235527e-05, + "loss": 0.2978, + "num_input_tokens_seen": 2806592, + "step": 2795 + }, + { + "epoch": 1.3201320132013201, + "grad_norm": 1.064394235610962, + "learning_rate": 4.9844528016739136e-05, + "loss": 0.41, + "num_input_tokens_seen": 2811648, + "step": 2800 + }, + { + "epoch": 1.3224893917963225, + "grad_norm": 0.6525077223777771, + "learning_rate": 4.984222888809668e-05, + "loss": 0.3625, + "num_input_tokens_seen": 2817088, + "step": 2805 + }, + { + "epoch": 1.324846770391325, + "grad_norm": 0.6033342480659485, + "learning_rate": 4.9839912937984726e-05, + "loss": 0.3209, + "num_input_tokens_seen": 2822240, + "step": 2810 + }, + { + "epoch": 1.3272041489863273, + "grad_norm": 0.3510502874851227, + "learning_rate": 4.9837580167971476e-05, + "loss": 0.3855, + "num_input_tokens_seen": 2826400, + "step": 2815 + }, + { + "epoch": 1.3295615275813295, + "grad_norm": 0.5077722668647766, + "learning_rate": 4.983523057963651e-05, + "loss": 0.38, + "num_input_tokens_seen": 2832032, + "step": 2820 + }, + { + "epoch": 1.331918906176332, + "grad_norm": 0.5386887192726135, + "learning_rate": 4.983286417457084e-05, + "loss": 0.4722, + "num_input_tokens_seen": 2840000, + "step": 2825 + }, + { + "epoch": 1.3342762847713343, + "grad_norm": 0.9026244282722473, + "learning_rate": 4.9830480954376804e-05, + "loss": 0.3445, + "num_input_tokens_seen": 2845312, + "step": 2830 + }, + { + "epoch": 1.3366336633663367, + "grad_norm": 0.6278881430625916, + "learning_rate": 4.982808092066816e-05, + "loss": 0.3883, + "num_input_tokens_seen": 2849984, + "step": 2835 + }, + { + "epoch": 1.338991041961339, + "grad_norm": 0.41701725125312805, + "learning_rate": 4.982566407507006e-05, + "loss": 0.3446, + "num_input_tokens_seen": 2854944, + "step": 2840 + }, + { + "epoch": 1.3413484205563413, + "grad_norm": 0.27371177077293396, + "learning_rate": 4.9823230419219025e-05, + "loss": 0.3073, + "num_input_tokens_seen": 2859776, + "step": 2845 + }, + { + "epoch": 1.3437057991513437, + "grad_norm": 0.45670416951179504, + "learning_rate": 4.982077995476296e-05, + "loss": 0.3575, + "num_input_tokens_seen": 2865472, + "step": 2850 + }, + { + "epoch": 1.346063177746346, + "grad_norm": 0.5852196216583252, + "learning_rate": 4.9818312683361154e-05, + "loss": 0.3041, + "num_input_tokens_seen": 2870560, + "step": 2855 + }, + { + "epoch": 1.3484205563413485, + "grad_norm": 0.9782708883285522, + "learning_rate": 4.981582860668427e-05, + "loss": 0.3581, + "num_input_tokens_seen": 2874592, + "step": 2860 + }, + { + "epoch": 1.350777934936351, + "grad_norm": 0.4004775285720825, + "learning_rate": 4.981332772641436e-05, + "loss": 0.3603, + "num_input_tokens_seen": 2879264, + "step": 2865 + }, + { + "epoch": 1.353135313531353, + "grad_norm": 0.4497714638710022, + "learning_rate": 4.981081004424485e-05, + "loss": 0.3828, + "num_input_tokens_seen": 2884416, + "step": 2870 + }, + { + "epoch": 1.3554926921263555, + "grad_norm": 0.6929923892021179, + "learning_rate": 4.9808275561880546e-05, + "loss": 0.4381, + "num_input_tokens_seen": 2889088, + "step": 2875 + }, + { + "epoch": 1.3578500707213579, + "grad_norm": 0.879804253578186, + "learning_rate": 4.9805724281037626e-05, + "loss": 0.31, + "num_input_tokens_seen": 2893088, + "step": 2880 + }, + { + "epoch": 1.3602074493163603, + "grad_norm": 0.5219035148620605, + "learning_rate": 4.980315620344364e-05, + "loss": 0.3865, + "num_input_tokens_seen": 2899776, + "step": 2885 + }, + { + "epoch": 1.3625648279113625, + "grad_norm": 0.9812784790992737, + "learning_rate": 4.980057133083751e-05, + "loss": 0.3712, + "num_input_tokens_seen": 2905536, + "step": 2890 + }, + { + "epoch": 1.3649222065063649, + "grad_norm": 0.6341279745101929, + "learning_rate": 4.979796966496956e-05, + "loss": 0.3522, + "num_input_tokens_seen": 2910240, + "step": 2895 + }, + { + "epoch": 1.3672795851013673, + "grad_norm": 0.35943740606307983, + "learning_rate": 4.979535120760143e-05, + "loss": 0.3773, + "num_input_tokens_seen": 2915040, + "step": 2900 + }, + { + "epoch": 1.3696369636963697, + "grad_norm": 0.8197608590126038, + "learning_rate": 4.9792715960506187e-05, + "loss": 0.3284, + "num_input_tokens_seen": 2919968, + "step": 2905 + }, + { + "epoch": 1.371994342291372, + "grad_norm": 1.1036365032196045, + "learning_rate": 4.979006392546823e-05, + "loss": 0.3743, + "num_input_tokens_seen": 2924992, + "step": 2910 + }, + { + "epoch": 1.3743517208863745, + "grad_norm": 0.7546456456184387, + "learning_rate": 4.978739510428334e-05, + "loss": 0.4058, + "num_input_tokens_seen": 2930912, + "step": 2915 + }, + { + "epoch": 1.3767090994813767, + "grad_norm": 0.7959119081497192, + "learning_rate": 4.978470949875865e-05, + "loss": 0.338, + "num_input_tokens_seen": 2935744, + "step": 2920 + }, + { + "epoch": 1.379066478076379, + "grad_norm": 0.7064768075942993, + "learning_rate": 4.978200711071269e-05, + "loss": 0.3412, + "num_input_tokens_seen": 2940256, + "step": 2925 + }, + { + "epoch": 1.3814238566713815, + "grad_norm": 0.4311988949775696, + "learning_rate": 4.977928794197532e-05, + "loss": 0.2939, + "num_input_tokens_seen": 2945696, + "step": 2930 + }, + { + "epoch": 1.3837812352663839, + "grad_norm": 0.6175945997238159, + "learning_rate": 4.977655199438778e-05, + "loss": 0.3532, + "num_input_tokens_seen": 2950304, + "step": 2935 + }, + { + "epoch": 1.386138613861386, + "grad_norm": 0.8201709985733032, + "learning_rate": 4.9773799269802665e-05, + "loss": 0.3766, + "num_input_tokens_seen": 2955200, + "step": 2940 + }, + { + "epoch": 1.3884959924563884, + "grad_norm": 0.5208750367164612, + "learning_rate": 4.9771029770083935e-05, + "loss": 0.3404, + "num_input_tokens_seen": 2960256, + "step": 2945 + }, + { + "epoch": 1.3908533710513908, + "grad_norm": 0.6798740029335022, + "learning_rate": 4.976824349710691e-05, + "loss": 0.3707, + "num_input_tokens_seen": 2964704, + "step": 2950 + }, + { + "epoch": 1.3932107496463932, + "grad_norm": 0.521921694278717, + "learning_rate": 4.976544045275826e-05, + "loss": 0.4205, + "num_input_tokens_seen": 2969312, + "step": 2955 + }, + { + "epoch": 1.3955681282413956, + "grad_norm": 0.48010194301605225, + "learning_rate": 4.976262063893602e-05, + "loss": 0.3698, + "num_input_tokens_seen": 2973728, + "step": 2960 + }, + { + "epoch": 1.397925506836398, + "grad_norm": 0.8291370868682861, + "learning_rate": 4.9759784057549574e-05, + "loss": 0.3768, + "num_input_tokens_seen": 2979424, + "step": 2965 + }, + { + "epoch": 1.4002828854314002, + "grad_norm": 0.8392614722251892, + "learning_rate": 4.9756930710519656e-05, + "loss": 0.3275, + "num_input_tokens_seen": 2984608, + "step": 2970 + }, + { + "epoch": 1.4026402640264026, + "grad_norm": 1.041524887084961, + "learning_rate": 4.9754060599778374e-05, + "loss": 0.3655, + "num_input_tokens_seen": 2989248, + "step": 2975 + }, + { + "epoch": 1.404997642621405, + "grad_norm": 0.39793065190315247, + "learning_rate": 4.975117372726916e-05, + "loss": 0.3553, + "num_input_tokens_seen": 2993248, + "step": 2980 + }, + { + "epoch": 1.4073550212164074, + "grad_norm": 0.44637101888656616, + "learning_rate": 4.9748270094946814e-05, + "loss": 0.2742, + "num_input_tokens_seen": 2997920, + "step": 2985 + }, + { + "epoch": 1.4097123998114096, + "grad_norm": 0.9214646220207214, + "learning_rate": 4.9745349704777464e-05, + "loss": 0.3755, + "num_input_tokens_seen": 3003776, + "step": 2990 + }, + { + "epoch": 1.412069778406412, + "grad_norm": 0.5480021238327026, + "learning_rate": 4.974241255873863e-05, + "loss": 0.2975, + "num_input_tokens_seen": 3008096, + "step": 2995 + }, + { + "epoch": 1.4144271570014144, + "grad_norm": 0.4284164011478424, + "learning_rate": 4.9739458658819115e-05, + "loss": 0.3127, + "num_input_tokens_seen": 3013152, + "step": 3000 + }, + { + "epoch": 1.4167845355964168, + "grad_norm": 0.554698646068573, + "learning_rate": 4.973648800701912e-05, + "loss": 0.289, + "num_input_tokens_seen": 3017088, + "step": 3005 + }, + { + "epoch": 1.4191419141914192, + "grad_norm": 0.5046913027763367, + "learning_rate": 4.9733500605350155e-05, + "loss": 0.3056, + "num_input_tokens_seen": 3021824, + "step": 3010 + }, + { + "epoch": 1.4214992927864216, + "grad_norm": 0.39606785774230957, + "learning_rate": 4.97304964558351e-05, + "loss": 0.3968, + "num_input_tokens_seen": 3026144, + "step": 3015 + }, + { + "epoch": 1.4238566713814238, + "grad_norm": 0.6000524163246155, + "learning_rate": 4.9727475560508155e-05, + "loss": 0.3876, + "num_input_tokens_seen": 3030784, + "step": 3020 + }, + { + "epoch": 1.4262140499764262, + "grad_norm": 0.6161635518074036, + "learning_rate": 4.9724437921414857e-05, + "loss": 0.2726, + "num_input_tokens_seen": 3035328, + "step": 3025 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.424553245306015, + "learning_rate": 4.97213835406121e-05, + "loss": 0.3485, + "num_input_tokens_seen": 3040224, + "step": 3030 + }, + { + "epoch": 1.430928807166431, + "grad_norm": 0.715301513671875, + "learning_rate": 4.97183124201681e-05, + "loss": 0.3104, + "num_input_tokens_seen": 3045344, + "step": 3035 + }, + { + "epoch": 1.4332861857614332, + "grad_norm": 0.5115458369255066, + "learning_rate": 4.97152245621624e-05, + "loss": 0.4012, + "num_input_tokens_seen": 3049376, + "step": 3040 + }, + { + "epoch": 1.4356435643564356, + "grad_norm": 0.5292837619781494, + "learning_rate": 4.971211996868591e-05, + "loss": 0.2339, + "num_input_tokens_seen": 3054944, + "step": 3045 + }, + { + "epoch": 1.438000942951438, + "grad_norm": 0.6727907657623291, + "learning_rate": 4.970899864184083e-05, + "loss": 0.3411, + "num_input_tokens_seen": 3060672, + "step": 3050 + }, + { + "epoch": 1.4403583215464404, + "grad_norm": 0.4637244641780853, + "learning_rate": 4.9705860583740716e-05, + "loss": 0.2999, + "num_input_tokens_seen": 3065056, + "step": 3055 + }, + { + "epoch": 1.4427157001414428, + "grad_norm": 1.1683905124664307, + "learning_rate": 4.970270579651045e-05, + "loss": 0.4146, + "num_input_tokens_seen": 3069536, + "step": 3060 + }, + { + "epoch": 1.4450730787364452, + "grad_norm": 0.41963785886764526, + "learning_rate": 4.969953428228624e-05, + "loss": 0.4546, + "num_input_tokens_seen": 3074560, + "step": 3065 + }, + { + "epoch": 1.4474304573314474, + "grad_norm": 0.7191119194030762, + "learning_rate": 4.969634604321563e-05, + "loss": 0.3744, + "num_input_tokens_seen": 3078688, + "step": 3070 + }, + { + "epoch": 1.4497878359264498, + "grad_norm": 0.5890703797340393, + "learning_rate": 4.969314108145745e-05, + "loss": 0.3331, + "num_input_tokens_seen": 3083552, + "step": 3075 + }, + { + "epoch": 1.4521452145214522, + "grad_norm": 0.6282918453216553, + "learning_rate": 4.968991939918191e-05, + "loss": 0.433, + "num_input_tokens_seen": 3088352, + "step": 3080 + }, + { + "epoch": 1.4545025931164546, + "grad_norm": 0.8296025395393372, + "learning_rate": 4.9686680998570494e-05, + "loss": 0.354, + "num_input_tokens_seen": 3092960, + "step": 3085 + }, + { + "epoch": 1.4568599717114568, + "grad_norm": 0.8617197871208191, + "learning_rate": 4.968342588181605e-05, + "loss": 0.355, + "num_input_tokens_seen": 3098592, + "step": 3090 + }, + { + "epoch": 1.4592173503064592, + "grad_norm": 0.4949839413166046, + "learning_rate": 4.9680154051122694e-05, + "loss": 0.3233, + "num_input_tokens_seen": 3103840, + "step": 3095 + }, + { + "epoch": 1.4615747289014616, + "grad_norm": 0.7089719772338867, + "learning_rate": 4.9676865508705914e-05, + "loss": 0.2983, + "num_input_tokens_seen": 3109312, + "step": 3100 + }, + { + "epoch": 1.463932107496464, + "grad_norm": 0.4879027009010315, + "learning_rate": 4.9673560256792474e-05, + "loss": 0.3959, + "num_input_tokens_seen": 3113984, + "step": 3105 + }, + { + "epoch": 1.4662894860914664, + "grad_norm": 0.5505763292312622, + "learning_rate": 4.967023829762047e-05, + "loss": 0.3535, + "num_input_tokens_seen": 3118400, + "step": 3110 + }, + { + "epoch": 1.4686468646864688, + "grad_norm": 0.60032057762146, + "learning_rate": 4.96668996334393e-05, + "loss": 0.3539, + "num_input_tokens_seen": 3123488, + "step": 3115 + }, + { + "epoch": 1.471004243281471, + "grad_norm": 0.3244975507259369, + "learning_rate": 4.9663544266509684e-05, + "loss": 0.3693, + "num_input_tokens_seen": 3128256, + "step": 3120 + }, + { + "epoch": 1.4733616218764733, + "grad_norm": 0.3905419409275055, + "learning_rate": 4.966017219910366e-05, + "loss": 0.3353, + "num_input_tokens_seen": 3133088, + "step": 3125 + }, + { + "epoch": 1.4757190004714758, + "grad_norm": 0.5464195013046265, + "learning_rate": 4.965678343350455e-05, + "loss": 0.4138, + "num_input_tokens_seen": 3138336, + "step": 3130 + }, + { + "epoch": 1.4780763790664782, + "grad_norm": 0.6046174764633179, + "learning_rate": 4.965337797200699e-05, + "loss": 0.3385, + "num_input_tokens_seen": 3142912, + "step": 3135 + }, + { + "epoch": 1.4804337576614803, + "grad_norm": 0.32217124104499817, + "learning_rate": 4.9649955816916946e-05, + "loss": 0.3631, + "num_input_tokens_seen": 3148256, + "step": 3140 + }, + { + "epoch": 1.4827911362564827, + "grad_norm": 1.1970844268798828, + "learning_rate": 4.964651697055165e-05, + "loss": 0.3749, + "num_input_tokens_seen": 3152800, + "step": 3145 + }, + { + "epoch": 1.4851485148514851, + "grad_norm": 0.7218926548957825, + "learning_rate": 4.9643061435239666e-05, + "loss": 0.3436, + "num_input_tokens_seen": 3158432, + "step": 3150 + }, + { + "epoch": 1.4875058934464875, + "grad_norm": 0.6722522974014282, + "learning_rate": 4.963958921332086e-05, + "loss": 0.2946, + "num_input_tokens_seen": 3162176, + "step": 3155 + }, + { + "epoch": 1.48986327204149, + "grad_norm": 0.5346218943595886, + "learning_rate": 4.9636100307146355e-05, + "loss": 0.3489, + "num_input_tokens_seen": 3167008, + "step": 3160 + }, + { + "epoch": 1.4922206506364923, + "grad_norm": 0.5754990577697754, + "learning_rate": 4.963259471907862e-05, + "loss": 0.4281, + "num_input_tokens_seen": 3173056, + "step": 3165 + }, + { + "epoch": 1.4945780292314945, + "grad_norm": 0.5484002828598022, + "learning_rate": 4.9629072451491396e-05, + "loss": 0.3414, + "num_input_tokens_seen": 3178624, + "step": 3170 + }, + { + "epoch": 1.496935407826497, + "grad_norm": 0.30876779556274414, + "learning_rate": 4.962553350676973e-05, + "loss": 0.4206, + "num_input_tokens_seen": 3183744, + "step": 3175 + }, + { + "epoch": 1.4992927864214993, + "grad_norm": 0.6147398352622986, + "learning_rate": 4.9621977887309944e-05, + "loss": 0.3396, + "num_input_tokens_seen": 3190240, + "step": 3180 + }, + { + "epoch": 1.5007072135785007, + "eval_loss": 0.3412669003009796, + "eval_runtime": 25.5608, + "eval_samples_per_second": 36.892, + "eval_steps_per_second": 9.233, + "num_input_tokens_seen": 3194016, + "step": 3183 + }, + { + "epoch": 1.5016501650165015, + "grad_norm": 0.6041547656059265, + "learning_rate": 4.961840559551967e-05, + "loss": 0.3075, + "num_input_tokens_seen": 3196864, + "step": 3185 + }, + { + "epoch": 1.504007543611504, + "grad_norm": 1.193408727645874, + "learning_rate": 4.961481663381782e-05, + "loss": 0.3473, + "num_input_tokens_seen": 3202144, + "step": 3190 + }, + { + "epoch": 1.5063649222065063, + "grad_norm": 1.3551757335662842, + "learning_rate": 4.961121100463459e-05, + "loss": 0.4373, + "num_input_tokens_seen": 3207680, + "step": 3195 + }, + { + "epoch": 1.5087223008015087, + "grad_norm": 0.4780251383781433, + "learning_rate": 4.960758871041148e-05, + "loss": 0.3469, + "num_input_tokens_seen": 3212864, + "step": 3200 + }, + { + "epoch": 1.511079679396511, + "grad_norm": 0.560552179813385, + "learning_rate": 4.960394975360125e-05, + "loss": 0.3402, + "num_input_tokens_seen": 3218528, + "step": 3205 + }, + { + "epoch": 1.5134370579915135, + "grad_norm": 0.5728883743286133, + "learning_rate": 4.960029413666796e-05, + "loss": 0.3866, + "num_input_tokens_seen": 3225152, + "step": 3210 + }, + { + "epoch": 1.515794436586516, + "grad_norm": 0.5331249237060547, + "learning_rate": 4.9596621862086935e-05, + "loss": 0.3509, + "num_input_tokens_seen": 3230272, + "step": 3215 + }, + { + "epoch": 1.5181518151815183, + "grad_norm": 0.4300689101219177, + "learning_rate": 4.95929329323448e-05, + "loss": 0.3919, + "num_input_tokens_seen": 3235584, + "step": 3220 + }, + { + "epoch": 1.5205091937765205, + "grad_norm": 0.5622859001159668, + "learning_rate": 4.958922734993945e-05, + "loss": 0.3582, + "num_input_tokens_seen": 3240064, + "step": 3225 + }, + { + "epoch": 1.522866572371523, + "grad_norm": 0.6185344457626343, + "learning_rate": 4.958550511738004e-05, + "loss": 0.2998, + "num_input_tokens_seen": 3244704, + "step": 3230 + }, + { + "epoch": 1.525223950966525, + "grad_norm": 0.8041464686393738, + "learning_rate": 4.958176623718703e-05, + "loss": 0.3376, + "num_input_tokens_seen": 3248992, + "step": 3235 + }, + { + "epoch": 1.5275813295615275, + "grad_norm": 0.5777222514152527, + "learning_rate": 4.957801071189212e-05, + "loss": 0.3157, + "num_input_tokens_seen": 3254368, + "step": 3240 + }, + { + "epoch": 1.5299387081565299, + "grad_norm": 0.45562466979026794, + "learning_rate": 4.9574238544038306e-05, + "loss": 0.3361, + "num_input_tokens_seen": 3259488, + "step": 3245 + }, + { + "epoch": 1.5322960867515323, + "grad_norm": 0.44011425971984863, + "learning_rate": 4.9570449736179844e-05, + "loss": 0.4104, + "num_input_tokens_seen": 3264224, + "step": 3250 + }, + { + "epoch": 1.5346534653465347, + "grad_norm": 0.735593855381012, + "learning_rate": 4.956664429088225e-05, + "loss": 0.3166, + "num_input_tokens_seen": 3269888, + "step": 3255 + }, + { + "epoch": 1.537010843941537, + "grad_norm": 0.5494565367698669, + "learning_rate": 4.956282221072232e-05, + "loss": 0.3755, + "num_input_tokens_seen": 3275040, + "step": 3260 + }, + { + "epoch": 1.5393682225365395, + "grad_norm": 0.6973879337310791, + "learning_rate": 4.95589834982881e-05, + "loss": 0.3519, + "num_input_tokens_seen": 3279712, + "step": 3265 + }, + { + "epoch": 1.541725601131542, + "grad_norm": 0.5479940176010132, + "learning_rate": 4.955512815617892e-05, + "loss": 0.3818, + "num_input_tokens_seen": 3283776, + "step": 3270 + }, + { + "epoch": 1.544082979726544, + "grad_norm": 0.43221572041511536, + "learning_rate": 4.955125618700534e-05, + "loss": 0.352, + "num_input_tokens_seen": 3288928, + "step": 3275 + }, + { + "epoch": 1.5464403583215465, + "grad_norm": 0.34842416644096375, + "learning_rate": 4.95473675933892e-05, + "loss": 0.3268, + "num_input_tokens_seen": 3293504, + "step": 3280 + }, + { + "epoch": 1.5487977369165487, + "grad_norm": 0.3987444341182709, + "learning_rate": 4.954346237796359e-05, + "loss": 0.3078, + "num_input_tokens_seen": 3297792, + "step": 3285 + }, + { + "epoch": 1.551155115511551, + "grad_norm": 0.47722724080085754, + "learning_rate": 4.953954054337287e-05, + "loss": 0.4145, + "num_input_tokens_seen": 3304736, + "step": 3290 + }, + { + "epoch": 1.5535124941065535, + "grad_norm": 0.4886819124221802, + "learning_rate": 4.953560209227261e-05, + "loss": 0.3458, + "num_input_tokens_seen": 3310048, + "step": 3295 + }, + { + "epoch": 1.5558698727015559, + "grad_norm": 0.44439026713371277, + "learning_rate": 4.953164702732969e-05, + "loss": 0.3898, + "num_input_tokens_seen": 3315328, + "step": 3300 + }, + { + "epoch": 1.5582272512965583, + "grad_norm": 0.597527027130127, + "learning_rate": 4.95276753512222e-05, + "loss": 0.3703, + "num_input_tokens_seen": 3320416, + "step": 3305 + }, + { + "epoch": 1.5605846298915607, + "grad_norm": 0.5193409323692322, + "learning_rate": 4.952368706663948e-05, + "loss": 0.3707, + "num_input_tokens_seen": 3325056, + "step": 3310 + }, + { + "epoch": 1.562942008486563, + "grad_norm": 0.6953625679016113, + "learning_rate": 4.951968217628214e-05, + "loss": 0.3412, + "num_input_tokens_seen": 3328672, + "step": 3315 + }, + { + "epoch": 1.5652993870815655, + "grad_norm": 0.48664993047714233, + "learning_rate": 4.9515660682862026e-05, + "loss": 0.2625, + "num_input_tokens_seen": 3333376, + "step": 3320 + }, + { + "epoch": 1.5676567656765676, + "grad_norm": 0.39638063311576843, + "learning_rate": 4.95116225891022e-05, + "loss": 0.3449, + "num_input_tokens_seen": 3339424, + "step": 3325 + }, + { + "epoch": 1.57001414427157, + "grad_norm": 0.4986254870891571, + "learning_rate": 4.950756789773699e-05, + "loss": 0.3474, + "num_input_tokens_seen": 3343968, + "step": 3330 + }, + { + "epoch": 1.5723715228665722, + "grad_norm": 0.7905935645103455, + "learning_rate": 4.9503496611511974e-05, + "loss": 0.346, + "num_input_tokens_seen": 3349856, + "step": 3335 + }, + { + "epoch": 1.5747289014615746, + "grad_norm": 0.6550210118293762, + "learning_rate": 4.9499408733183924e-05, + "loss": 0.2999, + "num_input_tokens_seen": 3355232, + "step": 3340 + }, + { + "epoch": 1.577086280056577, + "grad_norm": 0.6625834703445435, + "learning_rate": 4.94953042655209e-05, + "loss": 0.3254, + "num_input_tokens_seen": 3360288, + "step": 3345 + }, + { + "epoch": 1.5794436586515794, + "grad_norm": 0.9705275893211365, + "learning_rate": 4.949118321130215e-05, + "loss": 0.3278, + "num_input_tokens_seen": 3364768, + "step": 3350 + }, + { + "epoch": 1.5818010372465818, + "grad_norm": 0.37506476044654846, + "learning_rate": 4.948704557331817e-05, + "loss": 0.3174, + "num_input_tokens_seen": 3369632, + "step": 3355 + }, + { + "epoch": 1.5841584158415842, + "grad_norm": 0.39219480752944946, + "learning_rate": 4.94828913543707e-05, + "loss": 0.3815, + "num_input_tokens_seen": 3375264, + "step": 3360 + }, + { + "epoch": 1.5865157944365866, + "grad_norm": 0.5678951144218445, + "learning_rate": 4.947872055727269e-05, + "loss": 0.3204, + "num_input_tokens_seen": 3380064, + "step": 3365 + }, + { + "epoch": 1.588873173031589, + "grad_norm": 0.5203983187675476, + "learning_rate": 4.9474533184848324e-05, + "loss": 0.3409, + "num_input_tokens_seen": 3384768, + "step": 3370 + }, + { + "epoch": 1.5912305516265912, + "grad_norm": 0.8494237065315247, + "learning_rate": 4.9470329239933e-05, + "loss": 0.3885, + "num_input_tokens_seen": 3388832, + "step": 3375 + }, + { + "epoch": 1.5935879302215936, + "grad_norm": 0.49740317463874817, + "learning_rate": 4.9466108725373353e-05, + "loss": 0.3912, + "num_input_tokens_seen": 3392896, + "step": 3380 + }, + { + "epoch": 1.5959453088165958, + "grad_norm": 0.6710426211357117, + "learning_rate": 4.9461871644027206e-05, + "loss": 0.3437, + "num_input_tokens_seen": 3398144, + "step": 3385 + }, + { + "epoch": 1.5983026874115982, + "grad_norm": 0.8679342269897461, + "learning_rate": 4.945761799876365e-05, + "loss": 0.4273, + "num_input_tokens_seen": 3403232, + "step": 3390 + }, + { + "epoch": 1.6006600660066006, + "grad_norm": 0.5451124906539917, + "learning_rate": 4.945334779246295e-05, + "loss": 0.3309, + "num_input_tokens_seen": 3408128, + "step": 3395 + }, + { + "epoch": 1.603017444601603, + "grad_norm": 0.42672768235206604, + "learning_rate": 4.9449061028016605e-05, + "loss": 0.32, + "num_input_tokens_seen": 3412864, + "step": 3400 + }, + { + "epoch": 1.6053748231966054, + "grad_norm": 0.364302396774292, + "learning_rate": 4.944475770832732e-05, + "loss": 0.301, + "num_input_tokens_seen": 3418016, + "step": 3405 + }, + { + "epoch": 1.6077322017916078, + "grad_norm": 0.4660206437110901, + "learning_rate": 4.944043783630899e-05, + "loss": 0.318, + "num_input_tokens_seen": 3422688, + "step": 3410 + }, + { + "epoch": 1.6100895803866102, + "grad_norm": 0.47168800234794617, + "learning_rate": 4.943610141488678e-05, + "loss": 0.3183, + "num_input_tokens_seen": 3428192, + "step": 3415 + }, + { + "epoch": 1.6124469589816126, + "grad_norm": 0.3789424002170563, + "learning_rate": 4.943174844699699e-05, + "loss": 0.3695, + "num_input_tokens_seen": 3432960, + "step": 3420 + }, + { + "epoch": 1.6148043375766148, + "grad_norm": 0.4532814025878906, + "learning_rate": 4.942737893558716e-05, + "loss": 0.3424, + "num_input_tokens_seen": 3437664, + "step": 3425 + }, + { + "epoch": 1.6171617161716172, + "grad_norm": 0.6027848720550537, + "learning_rate": 4.9422992883616025e-05, + "loss": 0.3863, + "num_input_tokens_seen": 3443008, + "step": 3430 + }, + { + "epoch": 1.6195190947666194, + "grad_norm": 0.676633358001709, + "learning_rate": 4.941859029405353e-05, + "loss": 0.3383, + "num_input_tokens_seen": 3447648, + "step": 3435 + }, + { + "epoch": 1.6218764733616218, + "grad_norm": 0.5310275554656982, + "learning_rate": 4.9414171169880805e-05, + "loss": 0.3167, + "num_input_tokens_seen": 3452032, + "step": 3440 + }, + { + "epoch": 1.6242338519566242, + "grad_norm": 0.46268507838249207, + "learning_rate": 4.940973551409018e-05, + "loss": 0.2917, + "num_input_tokens_seen": 3457408, + "step": 3445 + }, + { + "epoch": 1.6265912305516266, + "grad_norm": 0.483889102935791, + "learning_rate": 4.940528332968518e-05, + "loss": 0.3425, + "num_input_tokens_seen": 3462400, + "step": 3450 + }, + { + "epoch": 1.628948609146629, + "grad_norm": 0.3777027428150177, + "learning_rate": 4.940081461968051e-05, + "loss": 0.3014, + "num_input_tokens_seen": 3466816, + "step": 3455 + }, + { + "epoch": 1.6313059877416314, + "grad_norm": 0.9813194870948792, + "learning_rate": 4.93963293871021e-05, + "loss": 0.3751, + "num_input_tokens_seen": 3471744, + "step": 3460 + }, + { + "epoch": 1.6336633663366338, + "grad_norm": 0.5338661670684814, + "learning_rate": 4.939182763498703e-05, + "loss": 0.346, + "num_input_tokens_seen": 3476608, + "step": 3465 + }, + { + "epoch": 1.6360207449316362, + "grad_norm": 0.39694884419441223, + "learning_rate": 4.938730936638357e-05, + "loss": 0.2983, + "num_input_tokens_seen": 3480544, + "step": 3470 + }, + { + "epoch": 1.6383781235266384, + "grad_norm": 0.48222458362579346, + "learning_rate": 4.938277458435122e-05, + "loss": 0.4228, + "num_input_tokens_seen": 3486944, + "step": 3475 + }, + { + "epoch": 1.6407355021216408, + "grad_norm": 0.37583044171333313, + "learning_rate": 4.937822329196059e-05, + "loss": 0.3336, + "num_input_tokens_seen": 3492256, + "step": 3480 + }, + { + "epoch": 1.643092880716643, + "grad_norm": 0.5098620653152466, + "learning_rate": 4.937365549229352e-05, + "loss": 0.4736, + "num_input_tokens_seen": 3499072, + "step": 3485 + }, + { + "epoch": 1.6454502593116453, + "grad_norm": 0.5319552421569824, + "learning_rate": 4.936907118844302e-05, + "loss": 0.3836, + "num_input_tokens_seen": 3502912, + "step": 3490 + }, + { + "epoch": 1.6478076379066477, + "grad_norm": 0.7459515333175659, + "learning_rate": 4.936447038351325e-05, + "loss": 0.3896, + "num_input_tokens_seen": 3507808, + "step": 3495 + }, + { + "epoch": 1.6501650165016502, + "grad_norm": 0.7741000652313232, + "learning_rate": 4.9359853080619586e-05, + "loss": 0.346, + "num_input_tokens_seen": 3512896, + "step": 3500 + }, + { + "epoch": 1.6525223950966526, + "grad_norm": 0.5267006158828735, + "learning_rate": 4.9355219282888543e-05, + "loss": 0.33, + "num_input_tokens_seen": 3517632, + "step": 3505 + }, + { + "epoch": 1.654879773691655, + "grad_norm": 0.4636184275150299, + "learning_rate": 4.9350568993457805e-05, + "loss": 0.301, + "num_input_tokens_seen": 3522464, + "step": 3510 + }, + { + "epoch": 1.6572371522866574, + "grad_norm": 0.5543109178543091, + "learning_rate": 4.934590221547624e-05, + "loss": 0.3666, + "num_input_tokens_seen": 3527008, + "step": 3515 + }, + { + "epoch": 1.6595945308816598, + "grad_norm": 0.5278931856155396, + "learning_rate": 4.934121895210387e-05, + "loss": 0.3726, + "num_input_tokens_seen": 3531584, + "step": 3520 + }, + { + "epoch": 1.661951909476662, + "grad_norm": 2.010517120361328, + "learning_rate": 4.9336519206511894e-05, + "loss": 0.3767, + "num_input_tokens_seen": 3537920, + "step": 3525 + }, + { + "epoch": 1.6643092880716643, + "grad_norm": 1.1119837760925293, + "learning_rate": 4.9331802981882645e-05, + "loss": 0.3791, + "num_input_tokens_seen": 3543360, + "step": 3530 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.4123317301273346, + "learning_rate": 4.9327070281409636e-05, + "loss": 0.283, + "num_input_tokens_seen": 3548160, + "step": 3535 + }, + { + "epoch": 1.669024045261669, + "grad_norm": 0.5203651785850525, + "learning_rate": 4.932232110829753e-05, + "loss": 0.3312, + "num_input_tokens_seen": 3552288, + "step": 3540 + }, + { + "epoch": 1.6713814238566713, + "grad_norm": 0.5884875059127808, + "learning_rate": 4.931755546576215e-05, + "loss": 0.2622, + "num_input_tokens_seen": 3556704, + "step": 3545 + }, + { + "epoch": 1.6737388024516737, + "grad_norm": 0.4344152510166168, + "learning_rate": 4.931277335703045e-05, + "loss": 0.3029, + "num_input_tokens_seen": 3561952, + "step": 3550 + }, + { + "epoch": 1.6760961810466761, + "grad_norm": 0.3734598457813263, + "learning_rate": 4.930797478534057e-05, + "loss": 0.263, + "num_input_tokens_seen": 3566304, + "step": 3555 + }, + { + "epoch": 1.6784535596416785, + "grad_norm": 0.6265285611152649, + "learning_rate": 4.930315975394175e-05, + "loss": 0.3676, + "num_input_tokens_seen": 3571648, + "step": 3560 + }, + { + "epoch": 1.680810938236681, + "grad_norm": 0.4415866434574127, + "learning_rate": 4.929832826609443e-05, + "loss": 0.3392, + "num_input_tokens_seen": 3577728, + "step": 3565 + }, + { + "epoch": 1.6831683168316833, + "grad_norm": 0.3261505365371704, + "learning_rate": 4.9293480325070154e-05, + "loss": 0.2723, + "num_input_tokens_seen": 3582400, + "step": 3570 + }, + { + "epoch": 1.6855256954266855, + "grad_norm": 0.42135539650917053, + "learning_rate": 4.928861593415161e-05, + "loss": 0.2782, + "num_input_tokens_seen": 3587968, + "step": 3575 + }, + { + "epoch": 1.687883074021688, + "grad_norm": 0.5497151613235474, + "learning_rate": 4.928373509663264e-05, + "loss": 0.4102, + "num_input_tokens_seen": 3593408, + "step": 3580 + }, + { + "epoch": 1.69024045261669, + "grad_norm": 0.6480590105056763, + "learning_rate": 4.9278837815818226e-05, + "loss": 0.4636, + "num_input_tokens_seen": 3598592, + "step": 3585 + }, + { + "epoch": 1.6925978312116925, + "grad_norm": 0.5959892868995667, + "learning_rate": 4.9273924095024454e-05, + "loss": 0.3462, + "num_input_tokens_seen": 3603872, + "step": 3590 + }, + { + "epoch": 1.694955209806695, + "grad_norm": 1.0242400169372559, + "learning_rate": 4.926899393757858e-05, + "loss": 0.3725, + "num_input_tokens_seen": 3608448, + "step": 3595 + }, + { + "epoch": 1.6973125884016973, + "grad_norm": 0.5425651669502258, + "learning_rate": 4.926404734681895e-05, + "loss": 0.3564, + "num_input_tokens_seen": 3612896, + "step": 3600 + }, + { + "epoch": 1.6996699669966997, + "grad_norm": 0.35184019804000854, + "learning_rate": 4.925908432609508e-05, + "loss": 0.34, + "num_input_tokens_seen": 3618336, + "step": 3605 + }, + { + "epoch": 1.702027345591702, + "grad_norm": 0.5574763417243958, + "learning_rate": 4.925410487876759e-05, + "loss": 0.395, + "num_input_tokens_seen": 3622656, + "step": 3610 + }, + { + "epoch": 1.7043847241867045, + "grad_norm": 0.4601038098335266, + "learning_rate": 4.9249109008208204e-05, + "loss": 0.3485, + "num_input_tokens_seen": 3628896, + "step": 3615 + }, + { + "epoch": 1.706742102781707, + "grad_norm": 0.39471808075904846, + "learning_rate": 4.92440967177998e-05, + "loss": 0.3009, + "num_input_tokens_seen": 3633952, + "step": 3620 + }, + { + "epoch": 1.709099481376709, + "grad_norm": 0.6780459880828857, + "learning_rate": 4.923906801093637e-05, + "loss": 0.3845, + "num_input_tokens_seen": 3638752, + "step": 3625 + }, + { + "epoch": 1.7114568599717115, + "grad_norm": 0.6021654009819031, + "learning_rate": 4.923402289102299e-05, + "loss": 0.3763, + "num_input_tokens_seen": 3642912, + "step": 3630 + }, + { + "epoch": 1.7138142385667137, + "grad_norm": 0.7846017479896545, + "learning_rate": 4.922896136147589e-05, + "loss": 0.3438, + "num_input_tokens_seen": 3648352, + "step": 3635 + }, + { + "epoch": 1.716171617161716, + "grad_norm": 0.4043896794319153, + "learning_rate": 4.9223883425722386e-05, + "loss": 0.3631, + "num_input_tokens_seen": 3652928, + "step": 3640 + }, + { + "epoch": 1.7185289957567185, + "grad_norm": 0.4544791281223297, + "learning_rate": 4.921878908720091e-05, + "loss": 0.3654, + "num_input_tokens_seen": 3658016, + "step": 3645 + }, + { + "epoch": 1.7208863743517209, + "grad_norm": 0.4194674491882324, + "learning_rate": 4.9213678349361e-05, + "loss": 0.331, + "num_input_tokens_seen": 3662400, + "step": 3650 + }, + { + "epoch": 1.7232437529467233, + "grad_norm": 0.5959081649780273, + "learning_rate": 4.920855121566332e-05, + "loss": 0.3664, + "num_input_tokens_seen": 3666400, + "step": 3655 + }, + { + "epoch": 1.7256011315417257, + "grad_norm": 0.5944045782089233, + "learning_rate": 4.9203407689579595e-05, + "loss": 0.3997, + "num_input_tokens_seen": 3672128, + "step": 3660 + }, + { + "epoch": 1.727958510136728, + "grad_norm": 0.42679205536842346, + "learning_rate": 4.919824777459268e-05, + "loss": 0.3686, + "num_input_tokens_seen": 3676576, + "step": 3665 + }, + { + "epoch": 1.7303158887317305, + "grad_norm": 0.5103271007537842, + "learning_rate": 4.9193071474196504e-05, + "loss": 0.3784, + "num_input_tokens_seen": 3682240, + "step": 3670 + }, + { + "epoch": 1.7326732673267327, + "grad_norm": 0.4569803774356842, + "learning_rate": 4.918787879189613e-05, + "loss": 0.3737, + "num_input_tokens_seen": 3688128, + "step": 3675 + }, + { + "epoch": 1.735030645921735, + "grad_norm": 0.4899881184101105, + "learning_rate": 4.918266973120768e-05, + "loss": 0.3629, + "num_input_tokens_seen": 3693888, + "step": 3680 + }, + { + "epoch": 1.7373880245167372, + "grad_norm": 0.7389105558395386, + "learning_rate": 4.917744429565837e-05, + "loss": 0.3597, + "num_input_tokens_seen": 3698368, + "step": 3685 + }, + { + "epoch": 1.7397454031117396, + "grad_norm": 0.2842647433280945, + "learning_rate": 4.917220248878651e-05, + "loss": 0.3549, + "num_input_tokens_seen": 3703040, + "step": 3690 + }, + { + "epoch": 1.742102781706742, + "grad_norm": 0.6885014176368713, + "learning_rate": 4.9166944314141514e-05, + "loss": 0.3143, + "num_input_tokens_seen": 3707616, + "step": 3695 + }, + { + "epoch": 1.7444601603017444, + "grad_norm": 0.6131570339202881, + "learning_rate": 4.916166977528384e-05, + "loss": 0.3624, + "num_input_tokens_seen": 3712832, + "step": 3700 + }, + { + "epoch": 1.7468175388967468, + "grad_norm": 0.41307759284973145, + "learning_rate": 4.915637887578505e-05, + "loss": 0.3448, + "num_input_tokens_seen": 3717440, + "step": 3705 + }, + { + "epoch": 1.7491749174917492, + "grad_norm": 0.3188974857330322, + "learning_rate": 4.9151071619227785e-05, + "loss": 0.3036, + "num_input_tokens_seen": 3722592, + "step": 3710 + }, + { + "epoch": 1.7515322960867516, + "grad_norm": 0.35153788328170776, + "learning_rate": 4.914574800920576e-05, + "loss": 0.3528, + "num_input_tokens_seen": 3727520, + "step": 3715 + }, + { + "epoch": 1.753889674681754, + "grad_norm": 0.3927820026874542, + "learning_rate": 4.914040804932376e-05, + "loss": 0.3825, + "num_input_tokens_seen": 3732096, + "step": 3720 + }, + { + "epoch": 1.7562470532767562, + "grad_norm": 0.6324740052223206, + "learning_rate": 4.913505174319765e-05, + "loss": 0.3579, + "num_input_tokens_seen": 3736480, + "step": 3725 + }, + { + "epoch": 1.7586044318717586, + "grad_norm": 0.4476220905780792, + "learning_rate": 4.9129679094454346e-05, + "loss": 0.3721, + "num_input_tokens_seen": 3741152, + "step": 3730 + }, + { + "epoch": 1.7609618104667608, + "grad_norm": 0.45035359263420105, + "learning_rate": 4.912429010673184e-05, + "loss": 0.353, + "num_input_tokens_seen": 3746080, + "step": 3735 + }, + { + "epoch": 1.7633191890617632, + "grad_norm": 0.8100205659866333, + "learning_rate": 4.9118884783679204e-05, + "loss": 0.3575, + "num_input_tokens_seen": 3751616, + "step": 3740 + }, + { + "epoch": 1.7656765676567656, + "grad_norm": 0.7003343105316162, + "learning_rate": 4.911346312895654e-05, + "loss": 0.3398, + "num_input_tokens_seen": 3757024, + "step": 3745 + }, + { + "epoch": 1.768033946251768, + "grad_norm": 0.417478084564209, + "learning_rate": 4.910802514623503e-05, + "loss": 0.3585, + "num_input_tokens_seen": 3761344, + "step": 3750 + }, + { + "epoch": 1.7703913248467704, + "grad_norm": 0.6788808703422546, + "learning_rate": 4.91025708391969e-05, + "loss": 0.358, + "num_input_tokens_seen": 3766336, + "step": 3755 + }, + { + "epoch": 1.7727487034417728, + "grad_norm": 0.4504188299179077, + "learning_rate": 4.9097100211535455e-05, + "loss": 0.305, + "num_input_tokens_seen": 3770560, + "step": 3760 + }, + { + "epoch": 1.7751060820367752, + "grad_norm": 0.44032421708106995, + "learning_rate": 4.909161326695501e-05, + "loss": 0.2708, + "num_input_tokens_seen": 3774592, + "step": 3765 + }, + { + "epoch": 1.7774634606317776, + "grad_norm": 0.7612159252166748, + "learning_rate": 4.908611000917096e-05, + "loss": 0.3995, + "num_input_tokens_seen": 3780064, + "step": 3770 + }, + { + "epoch": 1.7798208392267798, + "grad_norm": 0.5138895511627197, + "learning_rate": 4.908059044190974e-05, + "loss": 0.3535, + "num_input_tokens_seen": 3784480, + "step": 3775 + }, + { + "epoch": 1.7821782178217822, + "grad_norm": 1.0788884162902832, + "learning_rate": 4.907505456890882e-05, + "loss": 0.4446, + "num_input_tokens_seen": 3791424, + "step": 3780 + }, + { + "epoch": 1.7845355964167844, + "grad_norm": 0.47593849897384644, + "learning_rate": 4.9069502393916725e-05, + "loss": 0.324, + "num_input_tokens_seen": 3796736, + "step": 3785 + }, + { + "epoch": 1.7868929750117868, + "grad_norm": 0.39372241497039795, + "learning_rate": 4.9063933920693006e-05, + "loss": 0.3606, + "num_input_tokens_seen": 3802016, + "step": 3790 + }, + { + "epoch": 1.7892503536067892, + "grad_norm": 0.6128644943237305, + "learning_rate": 4.9058349153008253e-05, + "loss": 0.333, + "num_input_tokens_seen": 3806848, + "step": 3795 + }, + { + "epoch": 1.7916077322017916, + "grad_norm": 0.5885056257247925, + "learning_rate": 4.905274809464409e-05, + "loss": 0.4201, + "num_input_tokens_seen": 3812160, + "step": 3800 + }, + { + "epoch": 1.793965110796794, + "grad_norm": 0.4901723861694336, + "learning_rate": 4.904713074939318e-05, + "loss": 0.3406, + "num_input_tokens_seen": 3817184, + "step": 3805 + }, + { + "epoch": 1.7963224893917964, + "grad_norm": 0.4148782789707184, + "learning_rate": 4.90414971210592e-05, + "loss": 0.3339, + "num_input_tokens_seen": 3822848, + "step": 3810 + }, + { + "epoch": 1.7986798679867988, + "grad_norm": 0.41852235794067383, + "learning_rate": 4.9035847213456874e-05, + "loss": 0.3911, + "num_input_tokens_seen": 3827296, + "step": 3815 + }, + { + "epoch": 1.8010372465818012, + "grad_norm": 0.7410247921943665, + "learning_rate": 4.903018103041192e-05, + "loss": 0.3419, + "num_input_tokens_seen": 3831616, + "step": 3820 + }, + { + "epoch": 1.8033946251768034, + "grad_norm": 1.1095752716064453, + "learning_rate": 4.902449857576109e-05, + "loss": 0.4215, + "num_input_tokens_seen": 3836480, + "step": 3825 + }, + { + "epoch": 1.8057520037718058, + "grad_norm": 0.5020027160644531, + "learning_rate": 4.901879985335217e-05, + "loss": 0.3538, + "num_input_tokens_seen": 3841920, + "step": 3830 + }, + { + "epoch": 1.808109382366808, + "grad_norm": 0.5382871031761169, + "learning_rate": 4.901308486704395e-05, + "loss": 0.4023, + "num_input_tokens_seen": 3847968, + "step": 3835 + }, + { + "epoch": 1.8104667609618104, + "grad_norm": 0.4193156063556671, + "learning_rate": 4.900735362070621e-05, + "loss": 0.3677, + "num_input_tokens_seen": 3852736, + "step": 3840 + }, + { + "epoch": 1.8128241395568128, + "grad_norm": 0.5103532075881958, + "learning_rate": 4.900160611821978e-05, + "loss": 0.338, + "num_input_tokens_seen": 3857824, + "step": 3845 + }, + { + "epoch": 1.8151815181518152, + "grad_norm": 0.5766565203666687, + "learning_rate": 4.8995842363476465e-05, + "loss": 0.3181, + "num_input_tokens_seen": 3864256, + "step": 3850 + }, + { + "epoch": 1.8175388967468176, + "grad_norm": 0.38607776165008545, + "learning_rate": 4.89900623603791e-05, + "loss": 0.2858, + "num_input_tokens_seen": 3869920, + "step": 3855 + }, + { + "epoch": 1.81989627534182, + "grad_norm": 0.6073411107063293, + "learning_rate": 4.8984266112841504e-05, + "loss": 0.3126, + "num_input_tokens_seen": 3876128, + "step": 3860 + }, + { + "epoch": 1.8222536539368224, + "grad_norm": 0.5288861393928528, + "learning_rate": 4.8978453624788494e-05, + "loss": 0.3763, + "num_input_tokens_seen": 3880896, + "step": 3865 + }, + { + "epoch": 1.8246110325318248, + "grad_norm": 0.34140440821647644, + "learning_rate": 4.8972624900155904e-05, + "loss": 0.3361, + "num_input_tokens_seen": 3885504, + "step": 3870 + }, + { + "epoch": 1.826968411126827, + "grad_norm": 0.8635503649711609, + "learning_rate": 4.896677994289055e-05, + "loss": 0.3712, + "num_input_tokens_seen": 3890048, + "step": 3875 + }, + { + "epoch": 1.8293257897218294, + "grad_norm": 0.4903278052806854, + "learning_rate": 4.896091875695024e-05, + "loss": 0.3404, + "num_input_tokens_seen": 3895296, + "step": 3880 + }, + { + "epoch": 1.8316831683168315, + "grad_norm": 0.32797330617904663, + "learning_rate": 4.895504134630376e-05, + "loss": 0.3607, + "num_input_tokens_seen": 3899808, + "step": 3885 + }, + { + "epoch": 1.834040546911834, + "grad_norm": 0.4531113803386688, + "learning_rate": 4.89491477149309e-05, + "loss": 0.3884, + "num_input_tokens_seen": 3905760, + "step": 3890 + }, + { + "epoch": 1.8363979255068363, + "grad_norm": 0.4707378149032593, + "learning_rate": 4.894323786682243e-05, + "loss": 0.4128, + "num_input_tokens_seen": 3912160, + "step": 3895 + }, + { + "epoch": 1.8387553041018387, + "grad_norm": 0.6973928213119507, + "learning_rate": 4.89373118059801e-05, + "loss": 0.3617, + "num_input_tokens_seen": 3917120, + "step": 3900 + }, + { + "epoch": 1.8411126826968411, + "grad_norm": 0.6425697803497314, + "learning_rate": 4.893136953641663e-05, + "loss": 0.3255, + "num_input_tokens_seen": 3922752, + "step": 3905 + }, + { + "epoch": 1.8434700612918435, + "grad_norm": 0.44391846656799316, + "learning_rate": 4.892541106215571e-05, + "loss": 0.3212, + "num_input_tokens_seen": 3927840, + "step": 3910 + }, + { + "epoch": 1.845827439886846, + "grad_norm": 0.5907256603240967, + "learning_rate": 4.891943638723203e-05, + "loss": 0.3117, + "num_input_tokens_seen": 3932864, + "step": 3915 + }, + { + "epoch": 1.8481848184818483, + "grad_norm": 0.8090701103210449, + "learning_rate": 4.891344551569124e-05, + "loss": 0.4036, + "num_input_tokens_seen": 3938016, + "step": 3920 + }, + { + "epoch": 1.8505421970768505, + "grad_norm": 0.5774091482162476, + "learning_rate": 4.890743845158993e-05, + "loss": 0.3464, + "num_input_tokens_seen": 3944960, + "step": 3925 + }, + { + "epoch": 1.852899575671853, + "grad_norm": 0.6599173545837402, + "learning_rate": 4.890141519899569e-05, + "loss": 0.3816, + "num_input_tokens_seen": 3949760, + "step": 3930 + }, + { + "epoch": 1.855256954266855, + "grad_norm": 0.5072494745254517, + "learning_rate": 4.8895375761987056e-05, + "loss": 0.294, + "num_input_tokens_seen": 3954656, + "step": 3935 + }, + { + "epoch": 1.8576143328618575, + "grad_norm": 0.37289783358573914, + "learning_rate": 4.888932014465352e-05, + "loss": 0.3054, + "num_input_tokens_seen": 3959136, + "step": 3940 + }, + { + "epoch": 1.85997171145686, + "grad_norm": 0.4260596036911011, + "learning_rate": 4.888324835109554e-05, + "loss": 0.3453, + "num_input_tokens_seen": 3963776, + "step": 3945 + }, + { + "epoch": 1.8623290900518623, + "grad_norm": 0.6524696946144104, + "learning_rate": 4.887716038542451e-05, + "loss": 0.3226, + "num_input_tokens_seen": 3968896, + "step": 3950 + }, + { + "epoch": 1.8646864686468647, + "grad_norm": 0.3887139558792114, + "learning_rate": 4.88710562517628e-05, + "loss": 0.2908, + "num_input_tokens_seen": 3973824, + "step": 3955 + }, + { + "epoch": 1.8670438472418671, + "grad_norm": 0.4233371913433075, + "learning_rate": 4.886493595424372e-05, + "loss": 0.338, + "num_input_tokens_seen": 3979776, + "step": 3960 + }, + { + "epoch": 1.8694012258368695, + "grad_norm": 0.48795488476753235, + "learning_rate": 4.88587994970115e-05, + "loss": 0.3167, + "num_input_tokens_seen": 3985344, + "step": 3965 + }, + { + "epoch": 1.871758604431872, + "grad_norm": 0.37394246459007263, + "learning_rate": 4.8852646884221346e-05, + "loss": 0.3847, + "num_input_tokens_seen": 3990144, + "step": 3970 + }, + { + "epoch": 1.874115983026874, + "grad_norm": 0.30283793807029724, + "learning_rate": 4.884647812003938e-05, + "loss": 0.2998, + "num_input_tokens_seen": 3994464, + "step": 3975 + }, + { + "epoch": 1.8764733616218765, + "grad_norm": 0.4020773470401764, + "learning_rate": 4.884029320864268e-05, + "loss": 0.3504, + "num_input_tokens_seen": 3999808, + "step": 3980 + }, + { + "epoch": 1.8788307402168787, + "grad_norm": 0.3164846897125244, + "learning_rate": 4.883409215421924e-05, + "loss": 0.3427, + "num_input_tokens_seen": 4004032, + "step": 3985 + }, + { + "epoch": 1.881188118811881, + "grad_norm": 0.43037664890289307, + "learning_rate": 4.8827874960968e-05, + "loss": 0.3776, + "num_input_tokens_seen": 4008960, + "step": 3990 + }, + { + "epoch": 1.8835454974068835, + "grad_norm": 0.25492656230926514, + "learning_rate": 4.882164163309881e-05, + "loss": 0.3237, + "num_input_tokens_seen": 4013120, + "step": 3995 + }, + { + "epoch": 1.8859028760018859, + "grad_norm": 0.42692649364471436, + "learning_rate": 4.8815392174832464e-05, + "loss": 0.2773, + "num_input_tokens_seen": 4017696, + "step": 4000 + }, + { + "epoch": 1.8882602545968883, + "grad_norm": 0.41296637058258057, + "learning_rate": 4.880912659040067e-05, + "loss": 0.3653, + "num_input_tokens_seen": 4022464, + "step": 4005 + }, + { + "epoch": 1.8906176331918907, + "grad_norm": 0.5438706278800964, + "learning_rate": 4.8802844884046064e-05, + "loss": 0.3126, + "num_input_tokens_seen": 4026912, + "step": 4010 + }, + { + "epoch": 1.892975011786893, + "grad_norm": 0.525915801525116, + "learning_rate": 4.8796547060022165e-05, + "loss": 0.304, + "num_input_tokens_seen": 4032000, + "step": 4015 + }, + { + "epoch": 1.8953323903818955, + "grad_norm": 0.5359346270561218, + "learning_rate": 4.879023312259346e-05, + "loss": 0.3139, + "num_input_tokens_seen": 4036608, + "step": 4020 + }, + { + "epoch": 1.8976897689768977, + "grad_norm": 0.5543795824050903, + "learning_rate": 4.87839030760353e-05, + "loss": 0.3933, + "num_input_tokens_seen": 4041280, + "step": 4025 + }, + { + "epoch": 1.9000471475719, + "grad_norm": 0.3648849129676819, + "learning_rate": 4.877755692463397e-05, + "loss": 0.3349, + "num_input_tokens_seen": 4047520, + "step": 4030 + }, + { + "epoch": 1.9024045261669023, + "grad_norm": 0.49733713269233704, + "learning_rate": 4.877119467268666e-05, + "loss": 0.3489, + "num_input_tokens_seen": 4053696, + "step": 4035 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 0.5659785270690918, + "learning_rate": 4.876481632450144e-05, + "loss": 0.3416, + "num_input_tokens_seen": 4057888, + "step": 4040 + }, + { + "epoch": 1.907119283356907, + "grad_norm": 0.3728081285953522, + "learning_rate": 4.875842188439731e-05, + "loss": 0.3459, + "num_input_tokens_seen": 4063520, + "step": 4045 + }, + { + "epoch": 1.9094766619519095, + "grad_norm": 0.5313366055488586, + "learning_rate": 4.875201135670413e-05, + "loss": 0.3276, + "num_input_tokens_seen": 4068384, + "step": 4050 + }, + { + "epoch": 1.9118340405469119, + "grad_norm": 0.4423787295818329, + "learning_rate": 4.874558474576268e-05, + "loss": 0.3516, + "num_input_tokens_seen": 4072768, + "step": 4055 + }, + { + "epoch": 1.9141914191419143, + "grad_norm": 0.43578940629959106, + "learning_rate": 4.8739142055924636e-05, + "loss": 0.3761, + "num_input_tokens_seen": 4078400, + "step": 4060 + }, + { + "epoch": 1.9165487977369167, + "grad_norm": 0.5131593346595764, + "learning_rate": 4.873268329155255e-05, + "loss": 0.3224, + "num_input_tokens_seen": 4083168, + "step": 4065 + }, + { + "epoch": 1.918906176331919, + "grad_norm": 0.6057640910148621, + "learning_rate": 4.872620845701984e-05, + "loss": 0.2616, + "num_input_tokens_seen": 4087904, + "step": 4070 + }, + { + "epoch": 1.9212635549269212, + "grad_norm": 0.48330196738243103, + "learning_rate": 4.871971755671084e-05, + "loss": 0.4113, + "num_input_tokens_seen": 4092704, + "step": 4075 + }, + { + "epoch": 1.9236209335219236, + "grad_norm": 0.4203997850418091, + "learning_rate": 4.8713210595020744e-05, + "loss": 0.4093, + "num_input_tokens_seen": 4097984, + "step": 4080 + }, + { + "epoch": 1.9259783121169258, + "grad_norm": 0.42204803228378296, + "learning_rate": 4.870668757635562e-05, + "loss": 0.3985, + "num_input_tokens_seen": 4102976, + "step": 4085 + }, + { + "epoch": 1.9283356907119282, + "grad_norm": 0.4091854393482208, + "learning_rate": 4.870014850513241e-05, + "loss": 0.3688, + "num_input_tokens_seen": 4108896, + "step": 4090 + }, + { + "epoch": 1.9306930693069306, + "grad_norm": 0.40593913197517395, + "learning_rate": 4.869359338577894e-05, + "loss": 0.3368, + "num_input_tokens_seen": 4114336, + "step": 4095 + }, + { + "epoch": 1.933050447901933, + "grad_norm": 0.7564314007759094, + "learning_rate": 4.868702222273388e-05, + "loss": 0.4137, + "num_input_tokens_seen": 4118944, + "step": 4100 + }, + { + "epoch": 1.9354078264969354, + "grad_norm": 0.5491163730621338, + "learning_rate": 4.8680435020446784e-05, + "loss": 0.3902, + "num_input_tokens_seen": 4123968, + "step": 4105 + }, + { + "epoch": 1.9377652050919378, + "grad_norm": 0.4665642976760864, + "learning_rate": 4.8673831783378055e-05, + "loss": 0.3575, + "num_input_tokens_seen": 4129120, + "step": 4110 + }, + { + "epoch": 1.9401225836869402, + "grad_norm": 0.521999180316925, + "learning_rate": 4.866721251599896e-05, + "loss": 0.3392, + "num_input_tokens_seen": 4133536, + "step": 4115 + }, + { + "epoch": 1.9424799622819426, + "grad_norm": 0.6245047450065613, + "learning_rate": 4.86605772227916e-05, + "loss": 0.3744, + "num_input_tokens_seen": 4138208, + "step": 4120 + }, + { + "epoch": 1.9448373408769448, + "grad_norm": 0.860499382019043, + "learning_rate": 4.8653925908248974e-05, + "loss": 0.3633, + "num_input_tokens_seen": 4142656, + "step": 4125 + }, + { + "epoch": 1.9471947194719472, + "grad_norm": 0.5430553555488586, + "learning_rate": 4.8647258576874875e-05, + "loss": 0.3609, + "num_input_tokens_seen": 4148224, + "step": 4130 + }, + { + "epoch": 1.9495520980669494, + "grad_norm": 0.3605160117149353, + "learning_rate": 4.864057523318398e-05, + "loss": 0.3829, + "num_input_tokens_seen": 4152800, + "step": 4135 + }, + { + "epoch": 1.9519094766619518, + "grad_norm": 0.49815213680267334, + "learning_rate": 4.863387588170178e-05, + "loss": 0.4406, + "num_input_tokens_seen": 4160608, + "step": 4140 + }, + { + "epoch": 1.9542668552569542, + "grad_norm": 0.9321429133415222, + "learning_rate": 4.8627160526964646e-05, + "loss": 0.366, + "num_input_tokens_seen": 4165056, + "step": 4145 + }, + { + "epoch": 1.9566242338519566, + "grad_norm": 0.5945044755935669, + "learning_rate": 4.862042917351974e-05, + "loss": 0.3072, + "num_input_tokens_seen": 4170048, + "step": 4150 + }, + { + "epoch": 1.958981612446959, + "grad_norm": 0.8122310638427734, + "learning_rate": 4.861368182592508e-05, + "loss": 0.3584, + "num_input_tokens_seen": 4175520, + "step": 4155 + }, + { + "epoch": 1.9613389910419614, + "grad_norm": 0.5157389640808105, + "learning_rate": 4.8606918488749516e-05, + "loss": 0.3788, + "num_input_tokens_seen": 4182176, + "step": 4160 + }, + { + "epoch": 1.9636963696369638, + "grad_norm": 0.28033071756362915, + "learning_rate": 4.8600139166572725e-05, + "loss": 0.2928, + "num_input_tokens_seen": 4187552, + "step": 4165 + }, + { + "epoch": 1.9660537482319662, + "grad_norm": 0.7532975077629089, + "learning_rate": 4.8593343863985186e-05, + "loss": 0.4365, + "num_input_tokens_seen": 4193056, + "step": 4170 + }, + { + "epoch": 1.9684111268269684, + "grad_norm": 0.6312500238418579, + "learning_rate": 4.8586532585588226e-05, + "loss": 0.3363, + "num_input_tokens_seen": 4197792, + "step": 4175 + }, + { + "epoch": 1.9707685054219708, + "grad_norm": 0.4279022216796875, + "learning_rate": 4.8579705335993994e-05, + "loss": 0.3701, + "num_input_tokens_seen": 4201984, + "step": 4180 + }, + { + "epoch": 1.973125884016973, + "grad_norm": 1.1027402877807617, + "learning_rate": 4.8572862119825414e-05, + "loss": 0.3733, + "num_input_tokens_seen": 4208000, + "step": 4185 + }, + { + "epoch": 1.9754832626119754, + "grad_norm": 0.4556558132171631, + "learning_rate": 4.856600294171627e-05, + "loss": 0.346, + "num_input_tokens_seen": 4213056, + "step": 4190 + }, + { + "epoch": 1.9778406412069778, + "grad_norm": 0.505276620388031, + "learning_rate": 4.855912780631111e-05, + "loss": 0.3633, + "num_input_tokens_seen": 4217952, + "step": 4195 + }, + { + "epoch": 1.9801980198019802, + "grad_norm": 0.44905775785446167, + "learning_rate": 4.855223671826533e-05, + "loss": 0.3238, + "num_input_tokens_seen": 4222336, + "step": 4200 + }, + { + "epoch": 1.9825553983969826, + "grad_norm": 0.4933197498321533, + "learning_rate": 4.854532968224509e-05, + "loss": 0.3209, + "num_input_tokens_seen": 4227840, + "step": 4205 + }, + { + "epoch": 1.984912776991985, + "grad_norm": 0.5219621062278748, + "learning_rate": 4.853840670292737e-05, + "loss": 0.3294, + "num_input_tokens_seen": 4233280, + "step": 4210 + }, + { + "epoch": 1.9872701555869874, + "grad_norm": 0.4660954773426056, + "learning_rate": 4.853146778499995e-05, + "loss": 0.3761, + "num_input_tokens_seen": 4237280, + "step": 4215 + }, + { + "epoch": 1.9896275341819898, + "grad_norm": 0.3480290472507477, + "learning_rate": 4.8524512933161384e-05, + "loss": 0.2775, + "num_input_tokens_seen": 4242016, + "step": 4220 + }, + { + "epoch": 1.991984912776992, + "grad_norm": 0.8185409903526306, + "learning_rate": 4.851754215212103e-05, + "loss": 0.4219, + "num_input_tokens_seen": 4247008, + "step": 4225 + }, + { + "epoch": 1.9943422913719944, + "grad_norm": 0.4691927433013916, + "learning_rate": 4.851055544659902e-05, + "loss": 0.3697, + "num_input_tokens_seen": 4252768, + "step": 4230 + }, + { + "epoch": 1.9966996699669965, + "grad_norm": 0.4996395409107208, + "learning_rate": 4.850355282132628e-05, + "loss": 0.3351, + "num_input_tokens_seen": 4257568, + "step": 4235 + }, + { + "epoch": 1.999057048561999, + "grad_norm": 0.6886594891548157, + "learning_rate": 4.849653428104452e-05, + "loss": 0.3162, + "num_input_tokens_seen": 4262496, + "step": 4240 + }, + { + "epoch": 2.000942951438001, + "eval_loss": 0.33889830112457275, + "eval_runtime": 25.5799, + "eval_samples_per_second": 36.865, + "eval_steps_per_second": 9.226, + "num_input_tokens_seen": 4266592, + "step": 4244 + }, + { + "epoch": 2.0014144271570014, + "grad_norm": 0.337459534406662, + "learning_rate": 4.848949983050621e-05, + "loss": 0.3499, + "num_input_tokens_seen": 4267584, + "step": 4245 + }, + { + "epoch": 2.0037718057520038, + "grad_norm": 0.4674561321735382, + "learning_rate": 4.84824494744746e-05, + "loss": 0.3028, + "num_input_tokens_seen": 4272640, + "step": 4250 + }, + { + "epoch": 2.006129184347006, + "grad_norm": 0.36924025416374207, + "learning_rate": 4.847538321772372e-05, + "loss": 0.3197, + "num_input_tokens_seen": 4278080, + "step": 4255 + }, + { + "epoch": 2.0084865629420086, + "grad_norm": 0.77660071849823, + "learning_rate": 4.8468301065038355e-05, + "loss": 0.3762, + "num_input_tokens_seen": 4282496, + "step": 4260 + }, + { + "epoch": 2.010843941537011, + "grad_norm": 0.43062132596969604, + "learning_rate": 4.846120302121405e-05, + "loss": 0.2875, + "num_input_tokens_seen": 4287456, + "step": 4265 + }, + { + "epoch": 2.0132013201320134, + "grad_norm": 0.5843988060951233, + "learning_rate": 4.845408909105714e-05, + "loss": 0.3108, + "num_input_tokens_seen": 4292576, + "step": 4270 + }, + { + "epoch": 2.0155586987270158, + "grad_norm": 0.4873974025249481, + "learning_rate": 4.844695927938467e-05, + "loss": 0.375, + "num_input_tokens_seen": 4297600, + "step": 4275 + }, + { + "epoch": 2.0179160773220177, + "grad_norm": 0.5445888042449951, + "learning_rate": 4.8439813591024475e-05, + "loss": 0.3299, + "num_input_tokens_seen": 4303744, + "step": 4280 + }, + { + "epoch": 2.02027345591702, + "grad_norm": 0.5442112684249878, + "learning_rate": 4.843265203081513e-05, + "loss": 0.3235, + "num_input_tokens_seen": 4309312, + "step": 4285 + }, + { + "epoch": 2.0226308345120225, + "grad_norm": 0.3582642674446106, + "learning_rate": 4.842547460360596e-05, + "loss": 0.355, + "num_input_tokens_seen": 4314496, + "step": 4290 + }, + { + "epoch": 2.024988213107025, + "grad_norm": 0.29797136783599854, + "learning_rate": 4.841828131425703e-05, + "loss": 0.3679, + "num_input_tokens_seen": 4319552, + "step": 4295 + }, + { + "epoch": 2.0273455917020273, + "grad_norm": 0.49905478954315186, + "learning_rate": 4.8411072167639135e-05, + "loss": 0.3487, + "num_input_tokens_seen": 4324448, + "step": 4300 + }, + { + "epoch": 2.0297029702970297, + "grad_norm": 0.6814953088760376, + "learning_rate": 4.840384716863383e-05, + "loss": 0.3308, + "num_input_tokens_seen": 4329760, + "step": 4305 + }, + { + "epoch": 2.032060348892032, + "grad_norm": 0.4640512764453888, + "learning_rate": 4.83966063221334e-05, + "loss": 0.3391, + "num_input_tokens_seen": 4334880, + "step": 4310 + }, + { + "epoch": 2.0344177274870345, + "grad_norm": 0.4870789349079132, + "learning_rate": 4.8389349633040836e-05, + "loss": 0.4029, + "num_input_tokens_seen": 4340096, + "step": 4315 + }, + { + "epoch": 2.036775106082037, + "grad_norm": 0.6941541433334351, + "learning_rate": 4.8382077106269895e-05, + "loss": 0.3425, + "num_input_tokens_seen": 4345344, + "step": 4320 + }, + { + "epoch": 2.0391324846770393, + "grad_norm": 0.43181923031806946, + "learning_rate": 4.837478874674502e-05, + "loss": 0.3732, + "num_input_tokens_seen": 4351360, + "step": 4325 + }, + { + "epoch": 2.0414898632720413, + "grad_norm": 0.614492654800415, + "learning_rate": 4.8367484559401406e-05, + "loss": 0.2915, + "num_input_tokens_seen": 4356192, + "step": 4330 + }, + { + "epoch": 2.0438472418670437, + "grad_norm": 0.35067081451416016, + "learning_rate": 4.8360164549184945e-05, + "loss": 0.3752, + "num_input_tokens_seen": 4362496, + "step": 4335 + }, + { + "epoch": 2.046204620462046, + "grad_norm": 0.5147934556007385, + "learning_rate": 4.835282872105225e-05, + "loss": 0.2947, + "num_input_tokens_seen": 4366752, + "step": 4340 + }, + { + "epoch": 2.0485619990570485, + "grad_norm": 0.42161324620246887, + "learning_rate": 4.8345477079970654e-05, + "loss": 0.3437, + "num_input_tokens_seen": 4371648, + "step": 4345 + }, + { + "epoch": 2.050919377652051, + "grad_norm": 0.29862701892852783, + "learning_rate": 4.8338109630918185e-05, + "loss": 0.3853, + "num_input_tokens_seen": 4376864, + "step": 4350 + }, + { + "epoch": 2.0532767562470533, + "grad_norm": 0.5709139704704285, + "learning_rate": 4.833072637888356e-05, + "loss": 0.344, + "num_input_tokens_seen": 4381408, + "step": 4355 + }, + { + "epoch": 2.0556341348420557, + "grad_norm": 0.4882689416408539, + "learning_rate": 4.832332732886625e-05, + "loss": 0.3074, + "num_input_tokens_seen": 4385888, + "step": 4360 + }, + { + "epoch": 2.057991513437058, + "grad_norm": 0.5616267323493958, + "learning_rate": 4.831591248587637e-05, + "loss": 0.3537, + "num_input_tokens_seen": 4390432, + "step": 4365 + }, + { + "epoch": 2.0603488920320605, + "grad_norm": 0.3552066683769226, + "learning_rate": 4.830848185493474e-05, + "loss": 0.3666, + "num_input_tokens_seen": 4395296, + "step": 4370 + }, + { + "epoch": 2.062706270627063, + "grad_norm": 0.38998690247535706, + "learning_rate": 4.83010354410729e-05, + "loss": 0.3784, + "num_input_tokens_seen": 4400192, + "step": 4375 + }, + { + "epoch": 2.065063649222065, + "grad_norm": 0.35014912486076355, + "learning_rate": 4.829357324933303e-05, + "loss": 0.291, + "num_input_tokens_seen": 4405632, + "step": 4380 + }, + { + "epoch": 2.0674210278170673, + "grad_norm": 0.4858569800853729, + "learning_rate": 4.828609528476804e-05, + "loss": 0.3277, + "num_input_tokens_seen": 4410656, + "step": 4385 + }, + { + "epoch": 2.0697784064120697, + "grad_norm": 0.3907780051231384, + "learning_rate": 4.827860155244149e-05, + "loss": 0.3374, + "num_input_tokens_seen": 4415680, + "step": 4390 + }, + { + "epoch": 2.072135785007072, + "grad_norm": 0.6515645384788513, + "learning_rate": 4.827109205742763e-05, + "loss": 0.2841, + "num_input_tokens_seen": 4421024, + "step": 4395 + }, + { + "epoch": 2.0744931636020745, + "grad_norm": 0.4231064021587372, + "learning_rate": 4.826356680481138e-05, + "loss": 0.3453, + "num_input_tokens_seen": 4425376, + "step": 4400 + }, + { + "epoch": 2.076850542197077, + "grad_norm": 0.39276957511901855, + "learning_rate": 4.825602579968832e-05, + "loss": 0.3071, + "num_input_tokens_seen": 4430304, + "step": 4405 + }, + { + "epoch": 2.0792079207920793, + "grad_norm": 0.5359079241752625, + "learning_rate": 4.8248469047164725e-05, + "loss": 0.3724, + "num_input_tokens_seen": 4435520, + "step": 4410 + }, + { + "epoch": 2.0815652993870817, + "grad_norm": 0.31293511390686035, + "learning_rate": 4.824089655235749e-05, + "loss": 0.3839, + "num_input_tokens_seen": 4440256, + "step": 4415 + }, + { + "epoch": 2.083922677982084, + "grad_norm": 0.46326059103012085, + "learning_rate": 4.8233308320394224e-05, + "loss": 0.3444, + "num_input_tokens_seen": 4445152, + "step": 4420 + }, + { + "epoch": 2.0862800565770865, + "grad_norm": 0.6923593282699585, + "learning_rate": 4.822570435641314e-05, + "loss": 0.3664, + "num_input_tokens_seen": 4449632, + "step": 4425 + }, + { + "epoch": 2.0886374351720884, + "grad_norm": 0.40182819962501526, + "learning_rate": 4.8218084665563126e-05, + "loss": 0.3159, + "num_input_tokens_seen": 4455136, + "step": 4430 + }, + { + "epoch": 2.090994813767091, + "grad_norm": 0.40102046728134155, + "learning_rate": 4.821044925300372e-05, + "loss": 0.2705, + "num_input_tokens_seen": 4459808, + "step": 4435 + }, + { + "epoch": 2.0933521923620932, + "grad_norm": 0.32904955744743347, + "learning_rate": 4.8202798123905125e-05, + "loss": 0.3446, + "num_input_tokens_seen": 4466080, + "step": 4440 + }, + { + "epoch": 2.0957095709570956, + "grad_norm": 0.39740505814552307, + "learning_rate": 4.819513128344814e-05, + "loss": 0.3904, + "num_input_tokens_seen": 4473184, + "step": 4445 + }, + { + "epoch": 2.098066949552098, + "grad_norm": 0.4749106466770172, + "learning_rate": 4.8187448736824234e-05, + "loss": 0.3221, + "num_input_tokens_seen": 4477824, + "step": 4450 + }, + { + "epoch": 2.1004243281471005, + "grad_norm": 0.32755059003829956, + "learning_rate": 4.817975048923552e-05, + "loss": 0.3039, + "num_input_tokens_seen": 4482624, + "step": 4455 + }, + { + "epoch": 2.102781706742103, + "grad_norm": 0.3057654798030853, + "learning_rate": 4.8172036545894714e-05, + "loss": 0.2772, + "num_input_tokens_seen": 4487488, + "step": 4460 + }, + { + "epoch": 2.1051390853371053, + "grad_norm": 0.31737032532691956, + "learning_rate": 4.8164306912025174e-05, + "loss": 0.2762, + "num_input_tokens_seen": 4492768, + "step": 4465 + }, + { + "epoch": 2.1074964639321077, + "grad_norm": 0.8580963611602783, + "learning_rate": 4.81565615928609e-05, + "loss": 0.3468, + "num_input_tokens_seen": 4498208, + "step": 4470 + }, + { + "epoch": 2.10985384252711, + "grad_norm": 0.37068232893943787, + "learning_rate": 4.814880059364647e-05, + "loss": 0.3168, + "num_input_tokens_seen": 4502880, + "step": 4475 + }, + { + "epoch": 2.112211221122112, + "grad_norm": 0.32691729068756104, + "learning_rate": 4.8141023919637117e-05, + "loss": 0.2529, + "num_input_tokens_seen": 4508032, + "step": 4480 + }, + { + "epoch": 2.1145685997171144, + "grad_norm": 0.5827587842941284, + "learning_rate": 4.813323157609868e-05, + "loss": 0.3519, + "num_input_tokens_seen": 4513568, + "step": 4485 + }, + { + "epoch": 2.116925978312117, + "grad_norm": 0.45014235377311707, + "learning_rate": 4.812542356830761e-05, + "loss": 0.368, + "num_input_tokens_seen": 4518976, + "step": 4490 + }, + { + "epoch": 2.119283356907119, + "grad_norm": 0.5354323387145996, + "learning_rate": 4.811759990155095e-05, + "loss": 0.3603, + "num_input_tokens_seen": 4522912, + "step": 4495 + }, + { + "epoch": 2.1216407355021216, + "grad_norm": 0.30999067425727844, + "learning_rate": 4.810976058112635e-05, + "loss": 0.3513, + "num_input_tokens_seen": 4527520, + "step": 4500 + }, + { + "epoch": 2.123998114097124, + "grad_norm": 0.6007501482963562, + "learning_rate": 4.810190561234208e-05, + "loss": 0.332, + "num_input_tokens_seen": 4532416, + "step": 4505 + }, + { + "epoch": 2.1263554926921264, + "grad_norm": 0.46144551038742065, + "learning_rate": 4.809403500051698e-05, + "loss": 0.3218, + "num_input_tokens_seen": 4537952, + "step": 4510 + }, + { + "epoch": 2.128712871287129, + "grad_norm": 0.6658599972724915, + "learning_rate": 4.8086148750980493e-05, + "loss": 0.2989, + "num_input_tokens_seen": 4542528, + "step": 4515 + }, + { + "epoch": 2.1310702498821312, + "grad_norm": 0.8446155786514282, + "learning_rate": 4.807824686907266e-05, + "loss": 0.4116, + "num_input_tokens_seen": 4547008, + "step": 4520 + }, + { + "epoch": 2.1334276284771336, + "grad_norm": 0.3059292435646057, + "learning_rate": 4.807032936014409e-05, + "loss": 0.3103, + "num_input_tokens_seen": 4551680, + "step": 4525 + }, + { + "epoch": 2.1357850070721356, + "grad_norm": 0.34065672755241394, + "learning_rate": 4.806239622955599e-05, + "loss": 0.3818, + "num_input_tokens_seen": 4557216, + "step": 4530 + }, + { + "epoch": 2.138142385667138, + "grad_norm": 0.8002220392227173, + "learning_rate": 4.805444748268013e-05, + "loss": 0.3201, + "num_input_tokens_seen": 4562464, + "step": 4535 + }, + { + "epoch": 2.1404997642621404, + "grad_norm": 0.6143326163291931, + "learning_rate": 4.8046483124898865e-05, + "loss": 0.3763, + "num_input_tokens_seen": 4567776, + "step": 4540 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.6652312874794006, + "learning_rate": 4.803850316160512e-05, + "loss": 0.3436, + "num_input_tokens_seen": 4573152, + "step": 4545 + }, + { + "epoch": 2.145214521452145, + "grad_norm": 0.5703457593917847, + "learning_rate": 4.803050759820238e-05, + "loss": 0.3789, + "num_input_tokens_seen": 4578048, + "step": 4550 + }, + { + "epoch": 2.1475719000471476, + "grad_norm": 0.43659019470214844, + "learning_rate": 4.802249644010469e-05, + "loss": 0.3216, + "num_input_tokens_seen": 4582336, + "step": 4555 + }, + { + "epoch": 2.14992927864215, + "grad_norm": 0.6278697848320007, + "learning_rate": 4.8014469692736674e-05, + "loss": 0.3741, + "num_input_tokens_seen": 4587360, + "step": 4560 + }, + { + "epoch": 2.1522866572371524, + "grad_norm": 0.4343399703502655, + "learning_rate": 4.80064273615335e-05, + "loss": 0.3652, + "num_input_tokens_seen": 4591872, + "step": 4565 + }, + { + "epoch": 2.154644035832155, + "grad_norm": 0.3364063501358032, + "learning_rate": 4.7998369451940886e-05, + "loss": 0.3571, + "num_input_tokens_seen": 4596640, + "step": 4570 + }, + { + "epoch": 2.157001414427157, + "grad_norm": 0.8682699203491211, + "learning_rate": 4.7990295969415086e-05, + "loss": 0.3704, + "num_input_tokens_seen": 4600672, + "step": 4575 + }, + { + "epoch": 2.159358793022159, + "grad_norm": 0.7738638520240784, + "learning_rate": 4.798220691942294e-05, + "loss": 0.3881, + "num_input_tokens_seen": 4606080, + "step": 4580 + }, + { + "epoch": 2.1617161716171616, + "grad_norm": 0.6731601357460022, + "learning_rate": 4.797410230744177e-05, + "loss": 0.3156, + "num_input_tokens_seen": 4610816, + "step": 4585 + }, + { + "epoch": 2.164073550212164, + "grad_norm": 0.610457181930542, + "learning_rate": 4.796598213895951e-05, + "loss": 0.3242, + "num_input_tokens_seen": 4615968, + "step": 4590 + }, + { + "epoch": 2.1664309288071664, + "grad_norm": 0.402705579996109, + "learning_rate": 4.795784641947455e-05, + "loss": 0.3621, + "num_input_tokens_seen": 4620288, + "step": 4595 + }, + { + "epoch": 2.1687883074021688, + "grad_norm": 0.6227291822433472, + "learning_rate": 4.794969515449587e-05, + "loss": 0.391, + "num_input_tokens_seen": 4625568, + "step": 4600 + }, + { + "epoch": 2.171145685997171, + "grad_norm": 0.34683024883270264, + "learning_rate": 4.794152834954293e-05, + "loss": 0.4285, + "num_input_tokens_seen": 4630560, + "step": 4605 + }, + { + "epoch": 2.1735030645921736, + "grad_norm": 0.5114995837211609, + "learning_rate": 4.7933346010145764e-05, + "loss": 0.342, + "num_input_tokens_seen": 4636192, + "step": 4610 + }, + { + "epoch": 2.175860443187176, + "grad_norm": 0.4832593500614166, + "learning_rate": 4.7925148141844874e-05, + "loss": 0.3614, + "num_input_tokens_seen": 4641344, + "step": 4615 + }, + { + "epoch": 2.1782178217821784, + "grad_norm": 0.40831518173217773, + "learning_rate": 4.7916934750191306e-05, + "loss": 0.3284, + "num_input_tokens_seen": 4646528, + "step": 4620 + }, + { + "epoch": 2.1805752003771808, + "grad_norm": 0.33611077070236206, + "learning_rate": 4.790870584074661e-05, + "loss": 0.3607, + "num_input_tokens_seen": 4651616, + "step": 4625 + }, + { + "epoch": 2.1829325789721827, + "grad_norm": 0.46523162722587585, + "learning_rate": 4.790046141908285e-05, + "loss": 0.3442, + "num_input_tokens_seen": 4656672, + "step": 4630 + }, + { + "epoch": 2.185289957567185, + "grad_norm": 0.4341171383857727, + "learning_rate": 4.7892201490782583e-05, + "loss": 0.3334, + "num_input_tokens_seen": 4662144, + "step": 4635 + }, + { + "epoch": 2.1876473361621875, + "grad_norm": 0.43916624784469604, + "learning_rate": 4.788392606143888e-05, + "loss": 0.3363, + "num_input_tokens_seen": 4667488, + "step": 4640 + }, + { + "epoch": 2.19000471475719, + "grad_norm": 0.36775800585746765, + "learning_rate": 4.787563513665528e-05, + "loss": 0.2712, + "num_input_tokens_seen": 4672480, + "step": 4645 + }, + { + "epoch": 2.1923620933521923, + "grad_norm": 0.44327497482299805, + "learning_rate": 4.786732872204585e-05, + "loss": 0.3913, + "num_input_tokens_seen": 4677024, + "step": 4650 + }, + { + "epoch": 2.1947194719471947, + "grad_norm": 0.3258865177631378, + "learning_rate": 4.785900682323513e-05, + "loss": 0.3316, + "num_input_tokens_seen": 4681728, + "step": 4655 + }, + { + "epoch": 2.197076850542197, + "grad_norm": 0.3674670457839966, + "learning_rate": 4.7850669445858134e-05, + "loss": 0.3348, + "num_input_tokens_seen": 4686016, + "step": 4660 + }, + { + "epoch": 2.1994342291371995, + "grad_norm": 0.47524750232696533, + "learning_rate": 4.784231659556037e-05, + "loss": 0.3154, + "num_input_tokens_seen": 4690240, + "step": 4665 + }, + { + "epoch": 2.201791607732202, + "grad_norm": 0.44110605120658875, + "learning_rate": 4.7833948277997834e-05, + "loss": 0.3362, + "num_input_tokens_seen": 4694848, + "step": 4670 + }, + { + "epoch": 2.2041489863272044, + "grad_norm": 0.9092927575111389, + "learning_rate": 4.782556449883696e-05, + "loss": 0.4023, + "num_input_tokens_seen": 4699136, + "step": 4675 + }, + { + "epoch": 2.2065063649222063, + "grad_norm": 0.6434308886528015, + "learning_rate": 4.7817165263754694e-05, + "loss": 0.3456, + "num_input_tokens_seen": 4704384, + "step": 4680 + }, + { + "epoch": 2.2088637435172087, + "grad_norm": 0.6509522795677185, + "learning_rate": 4.780875057843842e-05, + "loss": 0.3362, + "num_input_tokens_seen": 4709632, + "step": 4685 + }, + { + "epoch": 2.211221122112211, + "grad_norm": 0.41875961422920227, + "learning_rate": 4.7800320448585984e-05, + "loss": 0.285, + "num_input_tokens_seen": 4714560, + "step": 4690 + }, + { + "epoch": 2.2135785007072135, + "grad_norm": 0.33401262760162354, + "learning_rate": 4.779187487990571e-05, + "loss": 0.335, + "num_input_tokens_seen": 4719008, + "step": 4695 + }, + { + "epoch": 2.215935879302216, + "grad_norm": 0.3725087642669678, + "learning_rate": 4.778341387811634e-05, + "loss": 0.3469, + "num_input_tokens_seen": 4723136, + "step": 4700 + }, + { + "epoch": 2.2182932578972183, + "grad_norm": 0.620409369468689, + "learning_rate": 4.7774937448947124e-05, + "loss": 0.324, + "num_input_tokens_seen": 4729024, + "step": 4705 + }, + { + "epoch": 2.2206506364922207, + "grad_norm": 0.46329838037490845, + "learning_rate": 4.77664455981377e-05, + "loss": 0.3484, + "num_input_tokens_seen": 4733152, + "step": 4710 + }, + { + "epoch": 2.223008015087223, + "grad_norm": 0.4821571111679077, + "learning_rate": 4.775793833143818e-05, + "loss": 0.3537, + "num_input_tokens_seen": 4737824, + "step": 4715 + }, + { + "epoch": 2.2253653936822255, + "grad_norm": 0.5188208818435669, + "learning_rate": 4.7749415654609106e-05, + "loss": 0.3608, + "num_input_tokens_seen": 4741792, + "step": 4720 + }, + { + "epoch": 2.227722772277228, + "grad_norm": 0.493938684463501, + "learning_rate": 4.774087757342146e-05, + "loss": 0.3329, + "num_input_tokens_seen": 4746976, + "step": 4725 + }, + { + "epoch": 2.23008015087223, + "grad_norm": 0.5535350441932678, + "learning_rate": 4.773232409365663e-05, + "loss": 0.3527, + "num_input_tokens_seen": 4752544, + "step": 4730 + }, + { + "epoch": 2.2324375294672323, + "grad_norm": 0.4631662666797638, + "learning_rate": 4.772375522110648e-05, + "loss": 0.3389, + "num_input_tokens_seen": 4756928, + "step": 4735 + }, + { + "epoch": 2.2347949080622347, + "grad_norm": 0.5504041910171509, + "learning_rate": 4.771517096157325e-05, + "loss": 0.474, + "num_input_tokens_seen": 4764800, + "step": 4740 + }, + { + "epoch": 2.237152286657237, + "grad_norm": 0.3957507014274597, + "learning_rate": 4.770657132086962e-05, + "loss": 0.4188, + "num_input_tokens_seen": 4771712, + "step": 4745 + }, + { + "epoch": 2.2395096652522395, + "grad_norm": 0.45872169733047485, + "learning_rate": 4.769795630481869e-05, + "loss": 0.3812, + "num_input_tokens_seen": 4776896, + "step": 4750 + }, + { + "epoch": 2.241867043847242, + "grad_norm": 0.5916630625724792, + "learning_rate": 4.768932591925396e-05, + "loss": 0.3582, + "num_input_tokens_seen": 4782496, + "step": 4755 + }, + { + "epoch": 2.2442244224422443, + "grad_norm": 0.4334796667098999, + "learning_rate": 4.768068017001933e-05, + "loss": 0.3365, + "num_input_tokens_seen": 4788480, + "step": 4760 + }, + { + "epoch": 2.2465818010372467, + "grad_norm": 0.34215229749679565, + "learning_rate": 4.7672019062969126e-05, + "loss": 0.3164, + "num_input_tokens_seen": 4793024, + "step": 4765 + }, + { + "epoch": 2.248939179632249, + "grad_norm": 0.3826615512371063, + "learning_rate": 4.7663342603968064e-05, + "loss": 0.2986, + "num_input_tokens_seen": 4798336, + "step": 4770 + }, + { + "epoch": 2.251296558227251, + "grad_norm": 0.6471920013427734, + "learning_rate": 4.765465079889124e-05, + "loss": 0.355, + "num_input_tokens_seen": 4802336, + "step": 4775 + }, + { + "epoch": 2.2536539368222535, + "grad_norm": 0.49995169043540955, + "learning_rate": 4.7645943653624156e-05, + "loss": 0.3237, + "num_input_tokens_seen": 4807776, + "step": 4780 + }, + { + "epoch": 2.256011315417256, + "grad_norm": 0.5232663750648499, + "learning_rate": 4.763722117406272e-05, + "loss": 0.3318, + "num_input_tokens_seen": 4812896, + "step": 4785 + }, + { + "epoch": 2.2583686940122583, + "grad_norm": 0.6610704064369202, + "learning_rate": 4.762848336611317e-05, + "loss": 0.303, + "num_input_tokens_seen": 4818560, + "step": 4790 + }, + { + "epoch": 2.2607260726072607, + "grad_norm": 0.31775858998298645, + "learning_rate": 4.7619730235692186e-05, + "loss": 0.3572, + "num_input_tokens_seen": 4822912, + "step": 4795 + }, + { + "epoch": 2.263083451202263, + "grad_norm": 0.48804187774658203, + "learning_rate": 4.761096178872677e-05, + "loss": 0.2948, + "num_input_tokens_seen": 4827456, + "step": 4800 + }, + { + "epoch": 2.2654408297972655, + "grad_norm": 0.680256724357605, + "learning_rate": 4.760217803115433e-05, + "loss": 0.4376, + "num_input_tokens_seen": 4832576, + "step": 4805 + }, + { + "epoch": 2.267798208392268, + "grad_norm": 0.640396237373352, + "learning_rate": 4.7593378968922625e-05, + "loss": 0.3704, + "num_input_tokens_seen": 4836736, + "step": 4810 + }, + { + "epoch": 2.2701555869872703, + "grad_norm": 0.45978838205337524, + "learning_rate": 4.758456460798979e-05, + "loss": 0.3118, + "num_input_tokens_seen": 4841312, + "step": 4815 + }, + { + "epoch": 2.2725129655822727, + "grad_norm": 0.5210190415382385, + "learning_rate": 4.7575734954324306e-05, + "loss": 0.3473, + "num_input_tokens_seen": 4845984, + "step": 4820 + }, + { + "epoch": 2.274870344177275, + "grad_norm": 0.3590638339519501, + "learning_rate": 4.756689001390501e-05, + "loss": 0.3245, + "num_input_tokens_seen": 4851104, + "step": 4825 + }, + { + "epoch": 2.2772277227722775, + "grad_norm": 0.3150668144226074, + "learning_rate": 4.75580297927211e-05, + "loss": 0.298, + "num_input_tokens_seen": 4856384, + "step": 4830 + }, + { + "epoch": 2.2795851013672794, + "grad_norm": 0.9879144430160522, + "learning_rate": 4.754915429677211e-05, + "loss": 0.3576, + "num_input_tokens_seen": 4861632, + "step": 4835 + }, + { + "epoch": 2.281942479962282, + "grad_norm": 0.4380239248275757, + "learning_rate": 4.7540263532067935e-05, + "loss": 0.3738, + "num_input_tokens_seen": 4866400, + "step": 4840 + }, + { + "epoch": 2.2842998585572842, + "grad_norm": 0.5049492716789246, + "learning_rate": 4.7531357504628795e-05, + "loss": 0.3019, + "num_input_tokens_seen": 4871744, + "step": 4845 + }, + { + "epoch": 2.2866572371522866, + "grad_norm": 0.46013495326042175, + "learning_rate": 4.752243622048523e-05, + "loss": 0.3703, + "num_input_tokens_seen": 4876992, + "step": 4850 + }, + { + "epoch": 2.289014615747289, + "grad_norm": 0.5540921688079834, + "learning_rate": 4.751349968567814e-05, + "loss": 0.3446, + "num_input_tokens_seen": 4882304, + "step": 4855 + }, + { + "epoch": 2.2913719943422914, + "grad_norm": 0.35101351141929626, + "learning_rate": 4.7504547906258745e-05, + "loss": 0.374, + "num_input_tokens_seen": 4886080, + "step": 4860 + }, + { + "epoch": 2.293729372937294, + "grad_norm": 0.4519904851913452, + "learning_rate": 4.749558088828857e-05, + "loss": 0.3133, + "num_input_tokens_seen": 4891264, + "step": 4865 + }, + { + "epoch": 2.2960867515322962, + "grad_norm": 0.6172090172767639, + "learning_rate": 4.7486598637839474e-05, + "loss": 0.2965, + "num_input_tokens_seen": 4897856, + "step": 4870 + }, + { + "epoch": 2.298444130127298, + "grad_norm": 0.6096044778823853, + "learning_rate": 4.747760116099363e-05, + "loss": 0.3165, + "num_input_tokens_seen": 4902240, + "step": 4875 + }, + { + "epoch": 2.3008015087223006, + "grad_norm": 0.9810012578964233, + "learning_rate": 4.746858846384351e-05, + "loss": 0.3481, + "num_input_tokens_seen": 4906912, + "step": 4880 + }, + { + "epoch": 2.303158887317303, + "grad_norm": 0.32078516483306885, + "learning_rate": 4.7459560552491917e-05, + "loss": 0.3462, + "num_input_tokens_seen": 4911488, + "step": 4885 + }, + { + "epoch": 2.3055162659123054, + "grad_norm": 0.4471733272075653, + "learning_rate": 4.745051743305192e-05, + "loss": 0.3622, + "num_input_tokens_seen": 4916032, + "step": 4890 + }, + { + "epoch": 2.307873644507308, + "grad_norm": 0.44092872738838196, + "learning_rate": 4.7441459111646916e-05, + "loss": 0.3373, + "num_input_tokens_seen": 4921152, + "step": 4895 + }, + { + "epoch": 2.31023102310231, + "grad_norm": 0.5008043646812439, + "learning_rate": 4.743238559441058e-05, + "loss": 0.3061, + "num_input_tokens_seen": 4925280, + "step": 4900 + }, + { + "epoch": 2.3125884016973126, + "grad_norm": 0.4369982182979584, + "learning_rate": 4.742329688748688e-05, + "loss": 0.3196, + "num_input_tokens_seen": 4930496, + "step": 4905 + }, + { + "epoch": 2.314945780292315, + "grad_norm": 0.48036307096481323, + "learning_rate": 4.7414192997030084e-05, + "loss": 0.3308, + "num_input_tokens_seen": 4935968, + "step": 4910 + }, + { + "epoch": 2.3173031588873174, + "grad_norm": 0.33819082379341125, + "learning_rate": 4.7405073929204715e-05, + "loss": 0.3816, + "num_input_tokens_seen": 4941312, + "step": 4915 + }, + { + "epoch": 2.31966053748232, + "grad_norm": 0.6478641629219055, + "learning_rate": 4.7395939690185584e-05, + "loss": 0.3198, + "num_input_tokens_seen": 4946208, + "step": 4920 + }, + { + "epoch": 2.322017916077322, + "grad_norm": 0.3568427264690399, + "learning_rate": 4.738679028615778e-05, + "loss": 0.2722, + "num_input_tokens_seen": 4951072, + "step": 4925 + }, + { + "epoch": 2.3243752946723246, + "grad_norm": 0.35138458013534546, + "learning_rate": 4.7377625723316664e-05, + "loss": 0.341, + "num_input_tokens_seen": 4954624, + "step": 4930 + }, + { + "epoch": 2.3267326732673266, + "grad_norm": 0.3695351183414459, + "learning_rate": 4.736844600786785e-05, + "loss": 0.3139, + "num_input_tokens_seen": 4959584, + "step": 4935 + }, + { + "epoch": 2.329090051862329, + "grad_norm": 0.6069442629814148, + "learning_rate": 4.7359251146027225e-05, + "loss": 0.3846, + "num_input_tokens_seen": 4963776, + "step": 4940 + }, + { + "epoch": 2.3314474304573314, + "grad_norm": 0.5011512637138367, + "learning_rate": 4.7350041144020915e-05, + "loss": 0.321, + "num_input_tokens_seen": 4968448, + "step": 4945 + }, + { + "epoch": 2.333804809052334, + "grad_norm": 0.3263905346393585, + "learning_rate": 4.734081600808531e-05, + "loss": 0.3105, + "num_input_tokens_seen": 4973280, + "step": 4950 + }, + { + "epoch": 2.336162187647336, + "grad_norm": 0.43932414054870605, + "learning_rate": 4.7331575744467047e-05, + "loss": 0.3682, + "num_input_tokens_seen": 4977984, + "step": 4955 + }, + { + "epoch": 2.3385195662423386, + "grad_norm": 0.3687242865562439, + "learning_rate": 4.7322320359423e-05, + "loss": 0.3235, + "num_input_tokens_seen": 4983136, + "step": 4960 + }, + { + "epoch": 2.340876944837341, + "grad_norm": 0.39061713218688965, + "learning_rate": 4.7313049859220294e-05, + "loss": 0.3633, + "num_input_tokens_seen": 4988032, + "step": 4965 + }, + { + "epoch": 2.3432343234323434, + "grad_norm": 0.5410740971565247, + "learning_rate": 4.7303764250136284e-05, + "loss": 0.3219, + "num_input_tokens_seen": 4994176, + "step": 4970 + }, + { + "epoch": 2.3455917020273453, + "grad_norm": 0.7204230427742004, + "learning_rate": 4.7294463538458544e-05, + "loss": 0.3753, + "num_input_tokens_seen": 5000480, + "step": 4975 + }, + { + "epoch": 2.3479490806223478, + "grad_norm": 0.45537179708480835, + "learning_rate": 4.728514773048489e-05, + "loss": 0.3342, + "num_input_tokens_seen": 5004960, + "step": 4980 + }, + { + "epoch": 2.35030645921735, + "grad_norm": 0.43479108810424805, + "learning_rate": 4.727581683252334e-05, + "loss": 0.3763, + "num_input_tokens_seen": 5009120, + "step": 4985 + }, + { + "epoch": 2.3526638378123526, + "grad_norm": 0.40350717306137085, + "learning_rate": 4.7266470850892175e-05, + "loss": 0.3479, + "num_input_tokens_seen": 5013984, + "step": 4990 + }, + { + "epoch": 2.355021216407355, + "grad_norm": 0.4354986548423767, + "learning_rate": 4.725710979191983e-05, + "loss": 0.353, + "num_input_tokens_seen": 5018880, + "step": 4995 + }, + { + "epoch": 2.3573785950023574, + "grad_norm": 0.42793503403663635, + "learning_rate": 4.7247733661944995e-05, + "loss": 0.358, + "num_input_tokens_seen": 5024000, + "step": 5000 + }, + { + "epoch": 2.3597359735973598, + "grad_norm": 0.6097524166107178, + "learning_rate": 4.723834246731654e-05, + "loss": 0.3548, + "num_input_tokens_seen": 5028064, + "step": 5005 + }, + { + "epoch": 2.362093352192362, + "grad_norm": 0.5392748117446899, + "learning_rate": 4.722893621439354e-05, + "loss": 0.3467, + "num_input_tokens_seen": 5032768, + "step": 5010 + }, + { + "epoch": 2.3644507307873646, + "grad_norm": 0.3692268133163452, + "learning_rate": 4.721951490954528e-05, + "loss": 0.3226, + "num_input_tokens_seen": 5037024, + "step": 5015 + }, + { + "epoch": 2.366808109382367, + "grad_norm": 0.4168626070022583, + "learning_rate": 4.7210078559151226e-05, + "loss": 0.4129, + "num_input_tokens_seen": 5042688, + "step": 5020 + }, + { + "epoch": 2.3691654879773694, + "grad_norm": 0.7557758092880249, + "learning_rate": 4.720062716960103e-05, + "loss": 0.3405, + "num_input_tokens_seen": 5049120, + "step": 5025 + }, + { + "epoch": 2.3715228665723718, + "grad_norm": 0.4674548804759979, + "learning_rate": 4.719116074729453e-05, + "loss": 0.2857, + "num_input_tokens_seen": 5054560, + "step": 5030 + }, + { + "epoch": 2.3738802451673737, + "grad_norm": 0.4881629943847656, + "learning_rate": 4.7181679298641745e-05, + "loss": 0.3195, + "num_input_tokens_seen": 5059232, + "step": 5035 + }, + { + "epoch": 2.376237623762376, + "grad_norm": 0.449289470911026, + "learning_rate": 4.717218283006287e-05, + "loss": 0.3294, + "num_input_tokens_seen": 5063936, + "step": 5040 + }, + { + "epoch": 2.3785950023573785, + "grad_norm": 0.5258601307868958, + "learning_rate": 4.716267134798826e-05, + "loss": 0.3231, + "num_input_tokens_seen": 5069920, + "step": 5045 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 1.0178159475326538, + "learning_rate": 4.7153144858858464e-05, + "loss": 0.3683, + "num_input_tokens_seen": 5076128, + "step": 5050 + }, + { + "epoch": 2.3833097595473833, + "grad_norm": 0.40310874581336975, + "learning_rate": 4.714360336912415e-05, + "loss": 0.3696, + "num_input_tokens_seen": 5080448, + "step": 5055 + }, + { + "epoch": 2.3856671381423857, + "grad_norm": 0.6676579117774963, + "learning_rate": 4.713404688524619e-05, + "loss": 0.3477, + "num_input_tokens_seen": 5085312, + "step": 5060 + }, + { + "epoch": 2.388024516737388, + "grad_norm": 0.6524103283882141, + "learning_rate": 4.712447541369556e-05, + "loss": 0.362, + "num_input_tokens_seen": 5089632, + "step": 5065 + }, + { + "epoch": 2.3903818953323905, + "grad_norm": 0.7232765555381775, + "learning_rate": 4.711488896095342e-05, + "loss": 0.3197, + "num_input_tokens_seen": 5094208, + "step": 5070 + }, + { + "epoch": 2.3927392739273925, + "grad_norm": 0.37477824091911316, + "learning_rate": 4.710528753351108e-05, + "loss": 0.3695, + "num_input_tokens_seen": 5099424, + "step": 5075 + }, + { + "epoch": 2.395096652522395, + "grad_norm": 0.7015263438224792, + "learning_rate": 4.709567113786995e-05, + "loss": 0.3344, + "num_input_tokens_seen": 5104896, + "step": 5080 + }, + { + "epoch": 2.3974540311173973, + "grad_norm": 0.4963046908378601, + "learning_rate": 4.7086039780541616e-05, + "loss": 0.3316, + "num_input_tokens_seen": 5109952, + "step": 5085 + }, + { + "epoch": 2.3998114097123997, + "grad_norm": 0.8637708425521851, + "learning_rate": 4.707639346804778e-05, + "loss": 0.3817, + "num_input_tokens_seen": 5115040, + "step": 5090 + }, + { + "epoch": 2.402168788307402, + "grad_norm": 0.47111111879348755, + "learning_rate": 4.706673220692025e-05, + "loss": 0.3497, + "num_input_tokens_seen": 5120864, + "step": 5095 + }, + { + "epoch": 2.4045261669024045, + "grad_norm": 0.6770507097244263, + "learning_rate": 4.7057056003701e-05, + "loss": 0.3194, + "num_input_tokens_seen": 5125408, + "step": 5100 + }, + { + "epoch": 2.406883545497407, + "grad_norm": 0.4650384783744812, + "learning_rate": 4.704736486494208e-05, + "loss": 0.3218, + "num_input_tokens_seen": 5130560, + "step": 5105 + }, + { + "epoch": 2.4092409240924093, + "grad_norm": 0.3980252742767334, + "learning_rate": 4.703765879720568e-05, + "loss": 0.3002, + "num_input_tokens_seen": 5134784, + "step": 5110 + }, + { + "epoch": 2.4115983026874117, + "grad_norm": 0.39544251561164856, + "learning_rate": 4.702793780706409e-05, + "loss": 0.3868, + "num_input_tokens_seen": 5142208, + "step": 5115 + }, + { + "epoch": 2.413955681282414, + "grad_norm": 0.7151238918304443, + "learning_rate": 4.7018201901099697e-05, + "loss": 0.3905, + "num_input_tokens_seen": 5147744, + "step": 5120 + }, + { + "epoch": 2.4163130598774165, + "grad_norm": 0.5716570019721985, + "learning_rate": 4.7008451085905e-05, + "loss": 0.3365, + "num_input_tokens_seen": 5154272, + "step": 5125 + }, + { + "epoch": 2.418670438472419, + "grad_norm": 0.35204872488975525, + "learning_rate": 4.6998685368082585e-05, + "loss": 0.2787, + "num_input_tokens_seen": 5159488, + "step": 5130 + }, + { + "epoch": 2.421027817067421, + "grad_norm": 0.4118606448173523, + "learning_rate": 4.6988904754245134e-05, + "loss": 0.3718, + "num_input_tokens_seen": 5164352, + "step": 5135 + }, + { + "epoch": 2.4233851956624233, + "grad_norm": 0.699285089969635, + "learning_rate": 4.697910925101542e-05, + "loss": 0.4109, + "num_input_tokens_seen": 5169920, + "step": 5140 + }, + { + "epoch": 2.4257425742574257, + "grad_norm": 0.3951466381549835, + "learning_rate": 4.696929886502628e-05, + "loss": 0.3603, + "num_input_tokens_seen": 5173792, + "step": 5145 + }, + { + "epoch": 2.428099952852428, + "grad_norm": 0.72333163022995, + "learning_rate": 4.695947360292065e-05, + "loss": 0.4567, + "num_input_tokens_seen": 5181568, + "step": 5150 + }, + { + "epoch": 2.4304573314474305, + "grad_norm": 0.571545422077179, + "learning_rate": 4.6949633471351526e-05, + "loss": 0.37, + "num_input_tokens_seen": 5188992, + "step": 5155 + }, + { + "epoch": 2.432814710042433, + "grad_norm": 0.7226229906082153, + "learning_rate": 4.693977847698199e-05, + "loss": 0.3258, + "num_input_tokens_seen": 5194176, + "step": 5160 + }, + { + "epoch": 2.4351720886374353, + "grad_norm": 0.37246382236480713, + "learning_rate": 4.692990862648515e-05, + "loss": 0.3638, + "num_input_tokens_seen": 5198656, + "step": 5165 + }, + { + "epoch": 2.4375294672324377, + "grad_norm": 0.5303564071655273, + "learning_rate": 4.692002392654422e-05, + "loss": 0.321, + "num_input_tokens_seen": 5203968, + "step": 5170 + }, + { + "epoch": 2.4398868458274396, + "grad_norm": 0.32406777143478394, + "learning_rate": 4.691012438385243e-05, + "loss": 0.3893, + "num_input_tokens_seen": 5208384, + "step": 5175 + }, + { + "epoch": 2.442244224422442, + "grad_norm": 0.6238230466842651, + "learning_rate": 4.69002100051131e-05, + "loss": 0.3437, + "num_input_tokens_seen": 5212544, + "step": 5180 + }, + { + "epoch": 2.4446016030174444, + "grad_norm": 0.42823758721351624, + "learning_rate": 4.6890280797039544e-05, + "loss": 0.328, + "num_input_tokens_seen": 5218080, + "step": 5185 + }, + { + "epoch": 2.446958981612447, + "grad_norm": 0.4311583340167999, + "learning_rate": 4.688033676635517e-05, + "loss": 0.3202, + "num_input_tokens_seen": 5222784, + "step": 5190 + }, + { + "epoch": 2.4493163602074493, + "grad_norm": 0.4463573694229126, + "learning_rate": 4.68703779197934e-05, + "loss": 0.3599, + "num_input_tokens_seen": 5228832, + "step": 5195 + }, + { + "epoch": 2.4516737388024517, + "grad_norm": 0.482952356338501, + "learning_rate": 4.686040426409767e-05, + "loss": 0.2922, + "num_input_tokens_seen": 5233536, + "step": 5200 + }, + { + "epoch": 2.454031117397454, + "grad_norm": 0.40931636095046997, + "learning_rate": 4.685041580602148e-05, + "loss": 0.3314, + "num_input_tokens_seen": 5238560, + "step": 5205 + }, + { + "epoch": 2.4563884959924565, + "grad_norm": 0.6283487677574158, + "learning_rate": 4.684041255232832e-05, + "loss": 0.3922, + "num_input_tokens_seen": 5242560, + "step": 5210 + }, + { + "epoch": 2.458745874587459, + "grad_norm": 0.3876926898956299, + "learning_rate": 4.683039450979172e-05, + "loss": 0.3361, + "num_input_tokens_seen": 5247552, + "step": 5215 + }, + { + "epoch": 2.4611032531824613, + "grad_norm": 0.3513420820236206, + "learning_rate": 4.6820361685195224e-05, + "loss": 0.3617, + "num_input_tokens_seen": 5252448, + "step": 5220 + }, + { + "epoch": 2.4634606317774637, + "grad_norm": 0.6885468363761902, + "learning_rate": 4.681031408533238e-05, + "loss": 0.346, + "num_input_tokens_seen": 5256544, + "step": 5225 + }, + { + "epoch": 2.465818010372466, + "grad_norm": 0.5747681856155396, + "learning_rate": 4.6800251717006735e-05, + "loss": 0.323, + "num_input_tokens_seen": 5261184, + "step": 5230 + }, + { + "epoch": 2.468175388967468, + "grad_norm": 0.37003451585769653, + "learning_rate": 4.6790174587031834e-05, + "loss": 0.3917, + "num_input_tokens_seen": 5266144, + "step": 5235 + }, + { + "epoch": 2.4705327675624704, + "grad_norm": 0.8930907249450684, + "learning_rate": 4.678008270223123e-05, + "loss": 0.3514, + "num_input_tokens_seen": 5272960, + "step": 5240 + }, + { + "epoch": 2.472890146157473, + "grad_norm": 0.5440678000450134, + "learning_rate": 4.676997606943847e-05, + "loss": 0.3679, + "num_input_tokens_seen": 5279136, + "step": 5245 + }, + { + "epoch": 2.4752475247524752, + "grad_norm": 0.427359402179718, + "learning_rate": 4.6759854695497066e-05, + "loss": 0.3377, + "num_input_tokens_seen": 5284448, + "step": 5250 + }, + { + "epoch": 2.4776049033474776, + "grad_norm": 0.4317522346973419, + "learning_rate": 4.674971858726054e-05, + "loss": 0.3607, + "num_input_tokens_seen": 5289408, + "step": 5255 + }, + { + "epoch": 2.47996228194248, + "grad_norm": 0.6638148427009583, + "learning_rate": 4.673956775159236e-05, + "loss": 0.3709, + "num_input_tokens_seen": 5294688, + "step": 5260 + }, + { + "epoch": 2.4823196605374824, + "grad_norm": 0.4366839826107025, + "learning_rate": 4.672940219536599e-05, + "loss": 0.3502, + "num_input_tokens_seen": 5299360, + "step": 5265 + }, + { + "epoch": 2.484677039132485, + "grad_norm": 0.5469561815261841, + "learning_rate": 4.671922192546485e-05, + "loss": 0.3503, + "num_input_tokens_seen": 5305472, + "step": 5270 + }, + { + "epoch": 2.487034417727487, + "grad_norm": 0.43334731459617615, + "learning_rate": 4.6709026948782333e-05, + "loss": 0.3541, + "num_input_tokens_seen": 5311680, + "step": 5275 + }, + { + "epoch": 2.489391796322489, + "grad_norm": 0.477250874042511, + "learning_rate": 4.669881727222179e-05, + "loss": 0.3444, + "num_input_tokens_seen": 5317184, + "step": 5280 + }, + { + "epoch": 2.4917491749174916, + "grad_norm": 0.3371625244617462, + "learning_rate": 4.66885929026965e-05, + "loss": 0.2768, + "num_input_tokens_seen": 5321952, + "step": 5285 + }, + { + "epoch": 2.494106553512494, + "grad_norm": 0.4648168087005615, + "learning_rate": 4.667835384712973e-05, + "loss": 0.322, + "num_input_tokens_seen": 5328800, + "step": 5290 + }, + { + "epoch": 2.4964639321074964, + "grad_norm": 0.3893931806087494, + "learning_rate": 4.666810011245466e-05, + "loss": 0.3806, + "num_input_tokens_seen": 5333472, + "step": 5295 + }, + { + "epoch": 2.498821310702499, + "grad_norm": 0.40954509377479553, + "learning_rate": 4.665783170561443e-05, + "loss": 0.3407, + "num_input_tokens_seen": 5338368, + "step": 5300 + }, + { + "epoch": 2.501178689297501, + "grad_norm": 0.5158207416534424, + "learning_rate": 4.664754863356211e-05, + "loss": 0.3763, + "num_input_tokens_seen": 5343200, + "step": 5305 + }, + { + "epoch": 2.501178689297501, + "eval_loss": 0.3401831090450287, + "eval_runtime": 25.5933, + "eval_samples_per_second": 36.846, + "eval_steps_per_second": 9.221, + "num_input_tokens_seen": 5343200, + "step": 5305 + }, + { + "epoch": 2.5035360678925036, + "grad_norm": 0.4078710079193115, + "learning_rate": 4.6637250903260686e-05, + "loss": 0.3159, + "num_input_tokens_seen": 5348128, + "step": 5310 + }, + { + "epoch": 2.505893446487506, + "grad_norm": 0.4351803958415985, + "learning_rate": 4.6626938521683096e-05, + "loss": 0.3384, + "num_input_tokens_seen": 5353504, + "step": 5315 + }, + { + "epoch": 2.5082508250825084, + "grad_norm": 0.44958218932151794, + "learning_rate": 4.661661149581218e-05, + "loss": 0.3361, + "num_input_tokens_seen": 5357760, + "step": 5320 + }, + { + "epoch": 2.510608203677511, + "grad_norm": 0.5204352736473083, + "learning_rate": 4.660626983264068e-05, + "loss": 0.3012, + "num_input_tokens_seen": 5362144, + "step": 5325 + }, + { + "epoch": 2.512965582272513, + "grad_norm": 0.3531167209148407, + "learning_rate": 4.6595913539171295e-05, + "loss": 0.2909, + "num_input_tokens_seen": 5366560, + "step": 5330 + }, + { + "epoch": 2.515322960867515, + "grad_norm": 0.4690229296684265, + "learning_rate": 4.658554262241659e-05, + "loss": 0.3612, + "num_input_tokens_seen": 5371168, + "step": 5335 + }, + { + "epoch": 2.5176803394625176, + "grad_norm": 0.2799920439720154, + "learning_rate": 4.6575157089399045e-05, + "loss": 0.2948, + "num_input_tokens_seen": 5376096, + "step": 5340 + }, + { + "epoch": 2.52003771805752, + "grad_norm": 0.5557320713996887, + "learning_rate": 4.656475694715104e-05, + "loss": 0.3738, + "num_input_tokens_seen": 5380928, + "step": 5345 + }, + { + "epoch": 2.5223950966525224, + "grad_norm": 0.5934690237045288, + "learning_rate": 4.655434220271484e-05, + "loss": 0.3425, + "num_input_tokens_seen": 5385600, + "step": 5350 + }, + { + "epoch": 2.5247524752475248, + "grad_norm": 0.6798064112663269, + "learning_rate": 4.6543912863142605e-05, + "loss": 0.3377, + "num_input_tokens_seen": 5389760, + "step": 5355 + }, + { + "epoch": 2.527109853842527, + "grad_norm": 0.7072538733482361, + "learning_rate": 4.653346893549638e-05, + "loss": 0.4002, + "num_input_tokens_seen": 5395296, + "step": 5360 + }, + { + "epoch": 2.5294672324375296, + "grad_norm": 0.5974365472793579, + "learning_rate": 4.652301042684808e-05, + "loss": 0.3461, + "num_input_tokens_seen": 5399072, + "step": 5365 + }, + { + "epoch": 2.531824611032532, + "grad_norm": 0.425037145614624, + "learning_rate": 4.651253734427949e-05, + "loss": 0.33, + "num_input_tokens_seen": 5404128, + "step": 5370 + }, + { + "epoch": 2.534181989627534, + "grad_norm": 0.6531172394752502, + "learning_rate": 4.650204969488228e-05, + "loss": 0.3099, + "num_input_tokens_seen": 5408736, + "step": 5375 + }, + { + "epoch": 2.5365393682225363, + "grad_norm": 0.45137104392051697, + "learning_rate": 4.649154748575796e-05, + "loss": 0.3374, + "num_input_tokens_seen": 5414080, + "step": 5380 + }, + { + "epoch": 2.5388967468175387, + "grad_norm": 0.4100959599018097, + "learning_rate": 4.648103072401793e-05, + "loss": 0.3723, + "num_input_tokens_seen": 5418784, + "step": 5385 + }, + { + "epoch": 2.541254125412541, + "grad_norm": 0.3213784992694855, + "learning_rate": 4.647049941678342e-05, + "loss": 0.3153, + "num_input_tokens_seen": 5424704, + "step": 5390 + }, + { + "epoch": 2.5436115040075435, + "grad_norm": 0.31116926670074463, + "learning_rate": 4.645995357118551e-05, + "loss": 0.3186, + "num_input_tokens_seen": 5428928, + "step": 5395 + }, + { + "epoch": 2.545968882602546, + "grad_norm": 0.4729045331478119, + "learning_rate": 4.644939319436513e-05, + "loss": 0.3183, + "num_input_tokens_seen": 5433856, + "step": 5400 + }, + { + "epoch": 2.5483262611975483, + "grad_norm": 0.37601184844970703, + "learning_rate": 4.643881829347305e-05, + "loss": 0.3377, + "num_input_tokens_seen": 5438688, + "step": 5405 + }, + { + "epoch": 2.5506836397925507, + "grad_norm": 0.3878232538700104, + "learning_rate": 4.6428228875669885e-05, + "loss": 0.2605, + "num_input_tokens_seen": 5444672, + "step": 5410 + }, + { + "epoch": 2.553041018387553, + "grad_norm": 0.31358110904693604, + "learning_rate": 4.6417624948126065e-05, + "loss": 0.2719, + "num_input_tokens_seen": 5450720, + "step": 5415 + }, + { + "epoch": 2.5553983969825556, + "grad_norm": 0.4788685142993927, + "learning_rate": 4.6407006518021845e-05, + "loss": 0.2946, + "num_input_tokens_seen": 5454432, + "step": 5420 + }, + { + "epoch": 2.557755775577558, + "grad_norm": 0.3112141489982605, + "learning_rate": 4.639637359254731e-05, + "loss": 0.209, + "num_input_tokens_seen": 5459424, + "step": 5425 + }, + { + "epoch": 2.5601131541725604, + "grad_norm": 0.31030625104904175, + "learning_rate": 4.6385726178902333e-05, + "loss": 0.2772, + "num_input_tokens_seen": 5464288, + "step": 5430 + }, + { + "epoch": 2.5624705327675623, + "grad_norm": 0.41614022850990295, + "learning_rate": 4.6375064284296645e-05, + "loss": 0.4234, + "num_input_tokens_seen": 5468736, + "step": 5435 + }, + { + "epoch": 2.5648279113625647, + "grad_norm": 0.42958956956863403, + "learning_rate": 4.636438791594975e-05, + "loss": 0.2564, + "num_input_tokens_seen": 5474400, + "step": 5440 + }, + { + "epoch": 2.567185289957567, + "grad_norm": 0.4016191065311432, + "learning_rate": 4.635369708109095e-05, + "loss": 0.375, + "num_input_tokens_seen": 5479808, + "step": 5445 + }, + { + "epoch": 2.5695426685525695, + "grad_norm": 0.4656152129173279, + "learning_rate": 4.6342991786959374e-05, + "loss": 0.26, + "num_input_tokens_seen": 5485632, + "step": 5450 + }, + { + "epoch": 2.571900047147572, + "grad_norm": 0.335584819316864, + "learning_rate": 4.6332272040803895e-05, + "loss": 0.3554, + "num_input_tokens_seen": 5492128, + "step": 5455 + }, + { + "epoch": 2.5742574257425743, + "grad_norm": 0.3789937496185303, + "learning_rate": 4.632153784988321e-05, + "loss": 0.3775, + "num_input_tokens_seen": 5496480, + "step": 5460 + }, + { + "epoch": 2.5766148043375767, + "grad_norm": 0.34694379568099976, + "learning_rate": 4.631078922146578e-05, + "loss": 0.302, + "num_input_tokens_seen": 5501344, + "step": 5465 + }, + { + "epoch": 2.578972182932579, + "grad_norm": 0.5970708727836609, + "learning_rate": 4.630002616282984e-05, + "loss": 0.343, + "num_input_tokens_seen": 5506304, + "step": 5470 + }, + { + "epoch": 2.581329561527581, + "grad_norm": 0.4309825003147125, + "learning_rate": 4.628924868126341e-05, + "loss": 0.4072, + "num_input_tokens_seen": 5510016, + "step": 5475 + }, + { + "epoch": 2.5836869401225835, + "grad_norm": 0.4807042181491852, + "learning_rate": 4.6278456784064274e-05, + "loss": 0.3401, + "num_input_tokens_seen": 5515456, + "step": 5480 + }, + { + "epoch": 2.586044318717586, + "grad_norm": 0.40027356147766113, + "learning_rate": 4.626765047853996e-05, + "loss": 0.3094, + "num_input_tokens_seen": 5521568, + "step": 5485 + }, + { + "epoch": 2.5884016973125883, + "grad_norm": 0.6948379874229431, + "learning_rate": 4.625682977200777e-05, + "loss": 0.3497, + "num_input_tokens_seen": 5526304, + "step": 5490 + }, + { + "epoch": 2.5907590759075907, + "grad_norm": 0.514423131942749, + "learning_rate": 4.624599467179475e-05, + "loss": 0.317, + "num_input_tokens_seen": 5532608, + "step": 5495 + }, + { + "epoch": 2.593116454502593, + "grad_norm": 0.46846672892570496, + "learning_rate": 4.623514518523768e-05, + "loss": 0.2875, + "num_input_tokens_seen": 5537312, + "step": 5500 + }, + { + "epoch": 2.5954738330975955, + "grad_norm": 0.36562490463256836, + "learning_rate": 4.622428131968313e-05, + "loss": 0.3212, + "num_input_tokens_seen": 5542720, + "step": 5505 + }, + { + "epoch": 2.597831211692598, + "grad_norm": 0.4648624658584595, + "learning_rate": 4.621340308248733e-05, + "loss": 0.3078, + "num_input_tokens_seen": 5546816, + "step": 5510 + }, + { + "epoch": 2.6001885902876003, + "grad_norm": 0.3514614701271057, + "learning_rate": 4.6202510481016313e-05, + "loss": 0.3943, + "num_input_tokens_seen": 5551936, + "step": 5515 + }, + { + "epoch": 2.6025459688826027, + "grad_norm": 0.46834951639175415, + "learning_rate": 4.61916035226458e-05, + "loss": 0.4378, + "num_input_tokens_seen": 5557120, + "step": 5520 + }, + { + "epoch": 2.604903347477605, + "grad_norm": 0.27222225069999695, + "learning_rate": 4.618068221476123e-05, + "loss": 0.3265, + "num_input_tokens_seen": 5561728, + "step": 5525 + }, + { + "epoch": 2.6072607260726075, + "grad_norm": 0.4475250542163849, + "learning_rate": 4.6169746564757786e-05, + "loss": 0.3742, + "num_input_tokens_seen": 5566656, + "step": 5530 + }, + { + "epoch": 2.6096181046676095, + "grad_norm": 0.33297693729400635, + "learning_rate": 4.615879658004035e-05, + "loss": 0.3187, + "num_input_tokens_seen": 5570720, + "step": 5535 + }, + { + "epoch": 2.611975483262612, + "grad_norm": 0.575234055519104, + "learning_rate": 4.614783226802349e-05, + "loss": 0.4105, + "num_input_tokens_seen": 5575328, + "step": 5540 + }, + { + "epoch": 2.6143328618576143, + "grad_norm": 0.40309321880340576, + "learning_rate": 4.613685363613151e-05, + "loss": 0.3775, + "num_input_tokens_seen": 5580352, + "step": 5545 + }, + { + "epoch": 2.6166902404526167, + "grad_norm": 0.4837341606616974, + "learning_rate": 4.612586069179839e-05, + "loss": 0.3138, + "num_input_tokens_seen": 5585440, + "step": 5550 + }, + { + "epoch": 2.619047619047619, + "grad_norm": 0.48334795236587524, + "learning_rate": 4.6114853442467785e-05, + "loss": 0.3539, + "num_input_tokens_seen": 5590208, + "step": 5555 + }, + { + "epoch": 2.6214049976426215, + "grad_norm": 0.31988632678985596, + "learning_rate": 4.6103831895593084e-05, + "loss": 0.3234, + "num_input_tokens_seen": 5595872, + "step": 5560 + }, + { + "epoch": 2.623762376237624, + "grad_norm": 0.5064396858215332, + "learning_rate": 4.609279605863732e-05, + "loss": 0.3104, + "num_input_tokens_seen": 5601536, + "step": 5565 + }, + { + "epoch": 2.6261197548326263, + "grad_norm": 0.6056028604507446, + "learning_rate": 4.6081745939073214e-05, + "loss": 0.3069, + "num_input_tokens_seen": 5605760, + "step": 5570 + }, + { + "epoch": 2.6284771334276282, + "grad_norm": 0.38130852580070496, + "learning_rate": 4.607068154438314e-05, + "loss": 0.302, + "num_input_tokens_seen": 5610976, + "step": 5575 + }, + { + "epoch": 2.6308345120226306, + "grad_norm": 0.35791876912117004, + "learning_rate": 4.6059602882059183e-05, + "loss": 0.3101, + "num_input_tokens_seen": 5616352, + "step": 5580 + }, + { + "epoch": 2.633191890617633, + "grad_norm": 0.3547684848308563, + "learning_rate": 4.604850995960304e-05, + "loss": 0.3663, + "num_input_tokens_seen": 5621312, + "step": 5585 + }, + { + "epoch": 2.6355492692126354, + "grad_norm": 0.3916061222553253, + "learning_rate": 4.6037402784526076e-05, + "loss": 0.3539, + "num_input_tokens_seen": 5626880, + "step": 5590 + }, + { + "epoch": 2.637906647807638, + "grad_norm": 0.3663788139820099, + "learning_rate": 4.602628136434934e-05, + "loss": 0.2889, + "num_input_tokens_seen": 5632192, + "step": 5595 + }, + { + "epoch": 2.6402640264026402, + "grad_norm": 0.4341062903404236, + "learning_rate": 4.601514570660349e-05, + "loss": 0.3736, + "num_input_tokens_seen": 5637536, + "step": 5600 + }, + { + "epoch": 2.6426214049976426, + "grad_norm": 0.22827006876468658, + "learning_rate": 4.600399581882884e-05, + "loss": 0.2737, + "num_input_tokens_seen": 5642944, + "step": 5605 + }, + { + "epoch": 2.644978783592645, + "grad_norm": 0.544339120388031, + "learning_rate": 4.5992831708575325e-05, + "loss": 0.3031, + "num_input_tokens_seen": 5647808, + "step": 5610 + }, + { + "epoch": 2.6473361621876474, + "grad_norm": 0.5029906034469604, + "learning_rate": 4.598165338340254e-05, + "loss": 0.3508, + "num_input_tokens_seen": 5652576, + "step": 5615 + }, + { + "epoch": 2.64969354078265, + "grad_norm": 0.43827584385871887, + "learning_rate": 4.597046085087967e-05, + "loss": 0.3706, + "num_input_tokens_seen": 5658592, + "step": 5620 + }, + { + "epoch": 2.6520509193776522, + "grad_norm": 0.41535767912864685, + "learning_rate": 4.595925411858555e-05, + "loss": 0.3922, + "num_input_tokens_seen": 5663040, + "step": 5625 + }, + { + "epoch": 2.6544082979726547, + "grad_norm": 0.34725111722946167, + "learning_rate": 4.594803319410861e-05, + "loss": 0.4167, + "num_input_tokens_seen": 5667456, + "step": 5630 + }, + { + "epoch": 2.6567656765676566, + "grad_norm": 0.43256303668022156, + "learning_rate": 4.59367980850469e-05, + "loss": 0.2556, + "num_input_tokens_seen": 5672736, + "step": 5635 + }, + { + "epoch": 2.659123055162659, + "grad_norm": 0.4522285461425781, + "learning_rate": 4.5925548799008074e-05, + "loss": 0.3284, + "num_input_tokens_seen": 5678080, + "step": 5640 + }, + { + "epoch": 2.6614804337576614, + "grad_norm": 0.40707260370254517, + "learning_rate": 4.591428534360938e-05, + "loss": 0.3557, + "num_input_tokens_seen": 5682304, + "step": 5645 + }, + { + "epoch": 2.663837812352664, + "grad_norm": 0.4815312325954437, + "learning_rate": 4.590300772647768e-05, + "loss": 0.3265, + "num_input_tokens_seen": 5688032, + "step": 5650 + }, + { + "epoch": 2.666195190947666, + "grad_norm": 0.353821724653244, + "learning_rate": 4.589171595524938e-05, + "loss": 0.371, + "num_input_tokens_seen": 5692256, + "step": 5655 + }, + { + "epoch": 2.6685525695426686, + "grad_norm": 0.6598552465438843, + "learning_rate": 4.588041003757053e-05, + "loss": 0.3085, + "num_input_tokens_seen": 5696448, + "step": 5660 + }, + { + "epoch": 2.670909948137671, + "grad_norm": 0.7067870497703552, + "learning_rate": 4.586908998109672e-05, + "loss": 0.3023, + "num_input_tokens_seen": 5701088, + "step": 5665 + }, + { + "epoch": 2.6732673267326734, + "grad_norm": 0.3998774290084839, + "learning_rate": 4.5857755793493116e-05, + "loss": 0.3467, + "num_input_tokens_seen": 5707008, + "step": 5670 + }, + { + "epoch": 2.6756247053276754, + "grad_norm": 0.4875812232494354, + "learning_rate": 4.584640748243447e-05, + "loss": 0.2996, + "num_input_tokens_seen": 5711648, + "step": 5675 + }, + { + "epoch": 2.677982083922678, + "grad_norm": 0.42242082953453064, + "learning_rate": 4.583504505560508e-05, + "loss": 0.3406, + "num_input_tokens_seen": 5716384, + "step": 5680 + }, + { + "epoch": 2.68033946251768, + "grad_norm": 0.3848675787448883, + "learning_rate": 4.5823668520698805e-05, + "loss": 0.3201, + "num_input_tokens_seen": 5721408, + "step": 5685 + }, + { + "epoch": 2.6826968411126826, + "grad_norm": 0.3834408223628998, + "learning_rate": 4.581227788541907e-05, + "loss": 0.3727, + "num_input_tokens_seen": 5727552, + "step": 5690 + }, + { + "epoch": 2.685054219707685, + "grad_norm": 0.4546240568161011, + "learning_rate": 4.580087315747884e-05, + "loss": 0.3413, + "num_input_tokens_seen": 5731424, + "step": 5695 + }, + { + "epoch": 2.6874115983026874, + "grad_norm": 0.4147045612335205, + "learning_rate": 4.5789454344600616e-05, + "loss": 0.387, + "num_input_tokens_seen": 5737504, + "step": 5700 + }, + { + "epoch": 2.68976897689769, + "grad_norm": 0.3859056830406189, + "learning_rate": 4.577802145451644e-05, + "loss": 0.315, + "num_input_tokens_seen": 5741600, + "step": 5705 + }, + { + "epoch": 2.692126355492692, + "grad_norm": 0.6656709313392639, + "learning_rate": 4.57665744949679e-05, + "loss": 0.3294, + "num_input_tokens_seen": 5746528, + "step": 5710 + }, + { + "epoch": 2.6944837340876946, + "grad_norm": 0.4493334889411926, + "learning_rate": 4.575511347370609e-05, + "loss": 0.3696, + "num_input_tokens_seen": 5751040, + "step": 5715 + }, + { + "epoch": 2.696841112682697, + "grad_norm": 0.6425447463989258, + "learning_rate": 4.574363839849164e-05, + "loss": 0.3617, + "num_input_tokens_seen": 5756736, + "step": 5720 + }, + { + "epoch": 2.6991984912776994, + "grad_norm": 0.4269389808177948, + "learning_rate": 4.573214927709469e-05, + "loss": 0.3675, + "num_input_tokens_seen": 5761376, + "step": 5725 + }, + { + "epoch": 2.701555869872702, + "grad_norm": 0.32747066020965576, + "learning_rate": 4.572064611729488e-05, + "loss": 0.3742, + "num_input_tokens_seen": 5766272, + "step": 5730 + }, + { + "epoch": 2.7039132484677038, + "grad_norm": 0.5462536811828613, + "learning_rate": 4.57091289268814e-05, + "loss": 0.3245, + "num_input_tokens_seen": 5770944, + "step": 5735 + }, + { + "epoch": 2.706270627062706, + "grad_norm": 0.44281479716300964, + "learning_rate": 4.569759771365287e-05, + "loss": 0.3623, + "num_input_tokens_seen": 5776288, + "step": 5740 + }, + { + "epoch": 2.7086280056577086, + "grad_norm": 0.3550892770290375, + "learning_rate": 4.568605248541747e-05, + "loss": 0.3328, + "num_input_tokens_seen": 5780544, + "step": 5745 + }, + { + "epoch": 2.710985384252711, + "grad_norm": 0.43307650089263916, + "learning_rate": 4.5674493249992835e-05, + "loss": 0.3439, + "num_input_tokens_seen": 5785600, + "step": 5750 + }, + { + "epoch": 2.7133427628477134, + "grad_norm": 0.477135568857193, + "learning_rate": 4.566292001520609e-05, + "loss": 0.3038, + "num_input_tokens_seen": 5790080, + "step": 5755 + }, + { + "epoch": 2.7157001414427158, + "grad_norm": 0.3772336542606354, + "learning_rate": 4.5651332788893855e-05, + "loss": 0.3359, + "num_input_tokens_seen": 5794368, + "step": 5760 + }, + { + "epoch": 2.718057520037718, + "grad_norm": 0.4019562304019928, + "learning_rate": 4.563973157890221e-05, + "loss": 0.3183, + "num_input_tokens_seen": 5798784, + "step": 5765 + }, + { + "epoch": 2.7204148986327206, + "grad_norm": 0.34942927956581116, + "learning_rate": 4.5628116393086695e-05, + "loss": 0.3153, + "num_input_tokens_seen": 5803456, + "step": 5770 + }, + { + "epoch": 2.7227722772277225, + "grad_norm": 0.28316596150398254, + "learning_rate": 4.561648723931233e-05, + "loss": 0.3572, + "num_input_tokens_seen": 5809088, + "step": 5775 + }, + { + "epoch": 2.725129655822725, + "grad_norm": 0.4670548737049103, + "learning_rate": 4.56048441254536e-05, + "loss": 0.383, + "num_input_tokens_seen": 5815488, + "step": 5780 + }, + { + "epoch": 2.7274870344177273, + "grad_norm": 0.5313908457756042, + "learning_rate": 4.5593187059394405e-05, + "loss": 0.2959, + "num_input_tokens_seen": 5820512, + "step": 5785 + }, + { + "epoch": 2.7298444130127297, + "grad_norm": 0.4181777238845825, + "learning_rate": 4.558151604902814e-05, + "loss": 0.3268, + "num_input_tokens_seen": 5824864, + "step": 5790 + }, + { + "epoch": 2.732201791607732, + "grad_norm": 0.5321410894393921, + "learning_rate": 4.5569831102257616e-05, + "loss": 0.3237, + "num_input_tokens_seen": 5829312, + "step": 5795 + }, + { + "epoch": 2.7345591702027345, + "grad_norm": 0.6256006360054016, + "learning_rate": 4.555813222699507e-05, + "loss": 0.3551, + "num_input_tokens_seen": 5833824, + "step": 5800 + }, + { + "epoch": 2.736916548797737, + "grad_norm": 0.4565264582633972, + "learning_rate": 4.554641943116222e-05, + "loss": 0.3521, + "num_input_tokens_seen": 5839488, + "step": 5805 + }, + { + "epoch": 2.7392739273927393, + "grad_norm": 0.3447866439819336, + "learning_rate": 4.5534692722690145e-05, + "loss": 0.3018, + "num_input_tokens_seen": 5844288, + "step": 5810 + }, + { + "epoch": 2.7416313059877417, + "grad_norm": 0.4555065333843231, + "learning_rate": 4.5522952109519374e-05, + "loss": 0.323, + "num_input_tokens_seen": 5848736, + "step": 5815 + }, + { + "epoch": 2.743988684582744, + "grad_norm": 0.3925100564956665, + "learning_rate": 4.551119759959987e-05, + "loss": 0.3613, + "num_input_tokens_seen": 5853856, + "step": 5820 + }, + { + "epoch": 2.7463460631777465, + "grad_norm": 0.31637144088745117, + "learning_rate": 4.549942920089097e-05, + "loss": 0.3014, + "num_input_tokens_seen": 5859712, + "step": 5825 + }, + { + "epoch": 2.748703441772749, + "grad_norm": 0.47829070687294006, + "learning_rate": 4.548764692136146e-05, + "loss": 0.3397, + "num_input_tokens_seen": 5864448, + "step": 5830 + }, + { + "epoch": 2.751060820367751, + "grad_norm": 0.523245096206665, + "learning_rate": 4.547585076898948e-05, + "loss": 0.3419, + "num_input_tokens_seen": 5869408, + "step": 5835 + }, + { + "epoch": 2.7534181989627533, + "grad_norm": 0.35943564772605896, + "learning_rate": 4.5464040751762584e-05, + "loss": 0.2768, + "num_input_tokens_seen": 5874816, + "step": 5840 + }, + { + "epoch": 2.7557755775577557, + "grad_norm": 0.24591171741485596, + "learning_rate": 4.5452216877677714e-05, + "loss": 0.3636, + "num_input_tokens_seen": 5881536, + "step": 5845 + }, + { + "epoch": 2.758132956152758, + "grad_norm": 0.5315858125686646, + "learning_rate": 4.54403791547412e-05, + "loss": 0.327, + "num_input_tokens_seen": 5886336, + "step": 5850 + }, + { + "epoch": 2.7604903347477605, + "grad_norm": 0.25814348459243774, + "learning_rate": 4.542852759096874e-05, + "loss": 0.3514, + "num_input_tokens_seen": 5891200, + "step": 5855 + }, + { + "epoch": 2.762847713342763, + "grad_norm": 0.45375505089759827, + "learning_rate": 4.54166621943854e-05, + "loss": 0.3381, + "num_input_tokens_seen": 5896128, + "step": 5860 + }, + { + "epoch": 2.7652050919377653, + "grad_norm": 0.5645458698272705, + "learning_rate": 4.5404782973025636e-05, + "loss": 0.307, + "num_input_tokens_seen": 5900480, + "step": 5865 + }, + { + "epoch": 2.7675624705327677, + "grad_norm": 0.4194570779800415, + "learning_rate": 4.5392889934933236e-05, + "loss": 0.3089, + "num_input_tokens_seen": 5905120, + "step": 5870 + }, + { + "epoch": 2.7699198491277697, + "grad_norm": 0.4954720437526703, + "learning_rate": 4.538098308816137e-05, + "loss": 0.3714, + "num_input_tokens_seen": 5909984, + "step": 5875 + }, + { + "epoch": 2.772277227722772, + "grad_norm": 0.40657395124435425, + "learning_rate": 4.536906244077252e-05, + "loss": 0.3473, + "num_input_tokens_seen": 5914848, + "step": 5880 + }, + { + "epoch": 2.7746346063177745, + "grad_norm": 0.29153499007225037, + "learning_rate": 4.535712800083858e-05, + "loss": 0.3702, + "num_input_tokens_seen": 5919872, + "step": 5885 + }, + { + "epoch": 2.776991984912777, + "grad_norm": 0.36553409695625305, + "learning_rate": 4.53451797764407e-05, + "loss": 0.3625, + "num_input_tokens_seen": 5924672, + "step": 5890 + }, + { + "epoch": 2.7793493635077793, + "grad_norm": 0.35539716482162476, + "learning_rate": 4.533321777566944e-05, + "loss": 0.3311, + "num_input_tokens_seen": 5930496, + "step": 5895 + }, + { + "epoch": 2.7817067421027817, + "grad_norm": 0.31760916113853455, + "learning_rate": 4.532124200662463e-05, + "loss": 0.3383, + "num_input_tokens_seen": 5935808, + "step": 5900 + }, + { + "epoch": 2.784064120697784, + "grad_norm": 0.49867138266563416, + "learning_rate": 4.530925247741546e-05, + "loss": 0.3439, + "num_input_tokens_seen": 5940352, + "step": 5905 + }, + { + "epoch": 2.7864214992927865, + "grad_norm": 0.5589591264724731, + "learning_rate": 4.529724919616042e-05, + "loss": 0.3659, + "num_input_tokens_seen": 5945184, + "step": 5910 + }, + { + "epoch": 2.788778877887789, + "grad_norm": 0.7056329846382141, + "learning_rate": 4.5285232170987314e-05, + "loss": 0.3855, + "num_input_tokens_seen": 5950208, + "step": 5915 + }, + { + "epoch": 2.7911362564827913, + "grad_norm": 0.48985081911087036, + "learning_rate": 4.527320141003325e-05, + "loss": 0.4055, + "num_input_tokens_seen": 5956224, + "step": 5920 + }, + { + "epoch": 2.7934936350777937, + "grad_norm": 0.5604062080383301, + "learning_rate": 4.5261156921444655e-05, + "loss": 0.3397, + "num_input_tokens_seen": 5961408, + "step": 5925 + }, + { + "epoch": 2.795851013672796, + "grad_norm": 0.5245674848556519, + "learning_rate": 4.524909871337724e-05, + "loss": 0.3185, + "num_input_tokens_seen": 5966144, + "step": 5930 + }, + { + "epoch": 2.798208392267798, + "grad_norm": 0.36675459146499634, + "learning_rate": 4.5237026793996e-05, + "loss": 0.3409, + "num_input_tokens_seen": 5971712, + "step": 5935 + }, + { + "epoch": 2.8005657708628005, + "grad_norm": 0.5101695656776428, + "learning_rate": 4.52249411714752e-05, + "loss": 0.3633, + "num_input_tokens_seen": 5976736, + "step": 5940 + }, + { + "epoch": 2.802923149457803, + "grad_norm": 0.3938312828540802, + "learning_rate": 4.5212841853998425e-05, + "loss": 0.3081, + "num_input_tokens_seen": 5982176, + "step": 5945 + }, + { + "epoch": 2.8052805280528053, + "grad_norm": 0.3888830542564392, + "learning_rate": 4.520072884975851e-05, + "loss": 0.3249, + "num_input_tokens_seen": 5987072, + "step": 5950 + }, + { + "epoch": 2.8076379066478077, + "grad_norm": 0.45854857563972473, + "learning_rate": 4.518860216695754e-05, + "loss": 0.343, + "num_input_tokens_seen": 5992736, + "step": 5955 + }, + { + "epoch": 2.80999528524281, + "grad_norm": 0.36851581931114197, + "learning_rate": 4.5176461813806904e-05, + "loss": 0.247, + "num_input_tokens_seen": 5998464, + "step": 5960 + }, + { + "epoch": 2.8123526638378125, + "grad_norm": 0.3677513301372528, + "learning_rate": 4.516430779852721e-05, + "loss": 0.2961, + "num_input_tokens_seen": 6003584, + "step": 5965 + }, + { + "epoch": 2.814710042432815, + "grad_norm": 0.39788421988487244, + "learning_rate": 4.515214012934833e-05, + "loss": 0.3431, + "num_input_tokens_seen": 6008160, + "step": 5970 + }, + { + "epoch": 2.817067421027817, + "grad_norm": 0.7038438320159912, + "learning_rate": 4.51399588145094e-05, + "loss": 0.3331, + "num_input_tokens_seen": 6012864, + "step": 5975 + }, + { + "epoch": 2.8194247996228192, + "grad_norm": 0.3094949424266815, + "learning_rate": 4.5127763862258755e-05, + "loss": 0.3656, + "num_input_tokens_seen": 6018176, + "step": 5980 + }, + { + "epoch": 2.8217821782178216, + "grad_norm": 0.2778588831424713, + "learning_rate": 4.5115555280853996e-05, + "loss": 0.3702, + "num_input_tokens_seen": 6024192, + "step": 5985 + }, + { + "epoch": 2.824139556812824, + "grad_norm": 0.47830042243003845, + "learning_rate": 4.5103333078561956e-05, + "loss": 0.2375, + "num_input_tokens_seen": 6028960, + "step": 5990 + }, + { + "epoch": 2.8264969354078264, + "grad_norm": 0.6398104429244995, + "learning_rate": 4.509109726365867e-05, + "loss": 0.3054, + "num_input_tokens_seen": 6034464, + "step": 5995 + }, + { + "epoch": 2.828854314002829, + "grad_norm": 0.38603663444519043, + "learning_rate": 4.50788478444294e-05, + "loss": 0.419, + "num_input_tokens_seen": 6039296, + "step": 6000 + }, + { + "epoch": 2.8312116925978312, + "grad_norm": 0.6143643260002136, + "learning_rate": 4.5066584829168625e-05, + "loss": 0.2975, + "num_input_tokens_seen": 6043744, + "step": 6005 + }, + { + "epoch": 2.8335690711928336, + "grad_norm": 0.32614412903785706, + "learning_rate": 4.505430822618002e-05, + "loss": 0.3118, + "num_input_tokens_seen": 6048384, + "step": 6010 + }, + { + "epoch": 2.835926449787836, + "grad_norm": 0.3683755099773407, + "learning_rate": 4.5042018043776475e-05, + "loss": 0.272, + "num_input_tokens_seen": 6053824, + "step": 6015 + }, + { + "epoch": 2.8382838283828384, + "grad_norm": 0.612296998500824, + "learning_rate": 4.502971429028005e-05, + "loss": 0.4179, + "num_input_tokens_seen": 6058592, + "step": 6020 + }, + { + "epoch": 2.840641206977841, + "grad_norm": 0.2383977621793747, + "learning_rate": 4.501739697402204e-05, + "loss": 0.242, + "num_input_tokens_seen": 6063680, + "step": 6025 + }, + { + "epoch": 2.8429985855728432, + "grad_norm": 0.3190314471721649, + "learning_rate": 4.500506610334287e-05, + "loss": 0.4011, + "num_input_tokens_seen": 6069056, + "step": 6030 + }, + { + "epoch": 2.845355964167845, + "grad_norm": 0.4055706858634949, + "learning_rate": 4.499272168659218e-05, + "loss": 0.3382, + "num_input_tokens_seen": 6075168, + "step": 6035 + }, + { + "epoch": 2.8477133427628476, + "grad_norm": 0.42957603931427, + "learning_rate": 4.4980363732128746e-05, + "loss": 0.3224, + "num_input_tokens_seen": 6081728, + "step": 6040 + }, + { + "epoch": 2.85007072135785, + "grad_norm": 0.4983263909816742, + "learning_rate": 4.4967992248320575e-05, + "loss": 0.4342, + "num_input_tokens_seen": 6086592, + "step": 6045 + }, + { + "epoch": 2.8524280999528524, + "grad_norm": 0.47873181104660034, + "learning_rate": 4.495560724354476e-05, + "loss": 0.3835, + "num_input_tokens_seen": 6091008, + "step": 6050 + }, + { + "epoch": 2.854785478547855, + "grad_norm": 0.3917851150035858, + "learning_rate": 4.494320872618761e-05, + "loss": 0.2866, + "num_input_tokens_seen": 6096928, + "step": 6055 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.38597267866134644, + "learning_rate": 4.493079670464454e-05, + "loss": 0.3622, + "num_input_tokens_seen": 6101888, + "step": 6060 + }, + { + "epoch": 2.8595002357378596, + "grad_norm": 0.5605831146240234, + "learning_rate": 4.491837118732014e-05, + "loss": 0.3356, + "num_input_tokens_seen": 6107680, + "step": 6065 + }, + { + "epoch": 2.861857614332862, + "grad_norm": 0.44489774107933044, + "learning_rate": 4.490593218262813e-05, + "loss": 0.3419, + "num_input_tokens_seen": 6112608, + "step": 6070 + }, + { + "epoch": 2.864214992927864, + "grad_norm": 0.3855955898761749, + "learning_rate": 4.4893479698991336e-05, + "loss": 0.3328, + "num_input_tokens_seen": 6117536, + "step": 6075 + }, + { + "epoch": 2.8665723715228664, + "grad_norm": 0.3526040017604828, + "learning_rate": 4.488101374484176e-05, + "loss": 0.3544, + "num_input_tokens_seen": 6122400, + "step": 6080 + }, + { + "epoch": 2.8689297501178688, + "grad_norm": 0.5818087458610535, + "learning_rate": 4.486853432862047e-05, + "loss": 0.3353, + "num_input_tokens_seen": 6126976, + "step": 6085 + }, + { + "epoch": 2.871287128712871, + "grad_norm": 0.37775614857673645, + "learning_rate": 4.485604145877771e-05, + "loss": 0.327, + "num_input_tokens_seen": 6132704, + "step": 6090 + }, + { + "epoch": 2.8736445073078736, + "grad_norm": 0.5405126214027405, + "learning_rate": 4.4843535143772785e-05, + "loss": 0.3354, + "num_input_tokens_seen": 6137024, + "step": 6095 + }, + { + "epoch": 2.876001885902876, + "grad_norm": 0.40874284505844116, + "learning_rate": 4.483101539207413e-05, + "loss": 0.2962, + "num_input_tokens_seen": 6142784, + "step": 6100 + }, + { + "epoch": 2.8783592644978784, + "grad_norm": 0.6189165115356445, + "learning_rate": 4.481848221215926e-05, + "loss": 0.3274, + "num_input_tokens_seen": 6148672, + "step": 6105 + }, + { + "epoch": 2.880716643092881, + "grad_norm": 0.29911142587661743, + "learning_rate": 4.480593561251479e-05, + "loss": 0.335, + "num_input_tokens_seen": 6154208, + "step": 6110 + }, + { + "epoch": 2.883074021687883, + "grad_norm": 0.9257929921150208, + "learning_rate": 4.479337560163645e-05, + "loss": 0.4669, + "num_input_tokens_seen": 6159232, + "step": 6115 + }, + { + "epoch": 2.8854314002828856, + "grad_norm": 0.34572848677635193, + "learning_rate": 4.4780802188029004e-05, + "loss": 0.3283, + "num_input_tokens_seen": 6164768, + "step": 6120 + }, + { + "epoch": 2.887788778877888, + "grad_norm": 0.40576791763305664, + "learning_rate": 4.4768215380206314e-05, + "loss": 0.3625, + "num_input_tokens_seen": 6169248, + "step": 6125 + }, + { + "epoch": 2.8901461574728904, + "grad_norm": 0.3918079435825348, + "learning_rate": 4.475561518669132e-05, + "loss": 0.3588, + "num_input_tokens_seen": 6174016, + "step": 6130 + }, + { + "epoch": 2.8925035360678923, + "grad_norm": 0.4480747580528259, + "learning_rate": 4.474300161601601e-05, + "loss": 0.3696, + "num_input_tokens_seen": 6179616, + "step": 6135 + }, + { + "epoch": 2.8948609146628947, + "grad_norm": 0.4197780191898346, + "learning_rate": 4.4730374676721445e-05, + "loss": 0.3778, + "num_input_tokens_seen": 6184352, + "step": 6140 + }, + { + "epoch": 2.897218293257897, + "grad_norm": 0.46022525429725647, + "learning_rate": 4.4717734377357725e-05, + "loss": 0.3385, + "num_input_tokens_seen": 6188608, + "step": 6145 + }, + { + "epoch": 2.8995756718528995, + "grad_norm": 0.6210612058639526, + "learning_rate": 4.4705080726483996e-05, + "loss": 0.3438, + "num_input_tokens_seen": 6193440, + "step": 6150 + }, + { + "epoch": 2.901933050447902, + "grad_norm": 0.44350749254226685, + "learning_rate": 4.469241373266846e-05, + "loss": 0.3499, + "num_input_tokens_seen": 6199296, + "step": 6155 + }, + { + "epoch": 2.9042904290429044, + "grad_norm": 0.4173169732093811, + "learning_rate": 4.467973340448834e-05, + "loss": 0.3189, + "num_input_tokens_seen": 6204800, + "step": 6160 + }, + { + "epoch": 2.9066478076379068, + "grad_norm": 0.43419989943504333, + "learning_rate": 4.466703975052989e-05, + "loss": 0.3504, + "num_input_tokens_seen": 6210144, + "step": 6165 + }, + { + "epoch": 2.909005186232909, + "grad_norm": 0.3248017728328705, + "learning_rate": 4.46543327793884e-05, + "loss": 0.3541, + "num_input_tokens_seen": 6215040, + "step": 6170 + }, + { + "epoch": 2.911362564827911, + "grad_norm": 0.31983983516693115, + "learning_rate": 4.464161249966816e-05, + "loss": 0.2775, + "num_input_tokens_seen": 6219488, + "step": 6175 + }, + { + "epoch": 2.9137199434229135, + "grad_norm": 0.3826760947704315, + "learning_rate": 4.4628878919982476e-05, + "loss": 0.3764, + "num_input_tokens_seen": 6224320, + "step": 6180 + }, + { + "epoch": 2.916077322017916, + "grad_norm": 0.6506315469741821, + "learning_rate": 4.4616132048953675e-05, + "loss": 0.3184, + "num_input_tokens_seen": 6230272, + "step": 6185 + }, + { + "epoch": 2.9184347006129183, + "grad_norm": 0.6094279885292053, + "learning_rate": 4.4603371895213063e-05, + "loss": 0.3273, + "num_input_tokens_seen": 6234784, + "step": 6190 + }, + { + "epoch": 2.9207920792079207, + "grad_norm": 0.3913220763206482, + "learning_rate": 4.4590598467400947e-05, + "loss": 0.4286, + "num_input_tokens_seen": 6239904, + "step": 6195 + }, + { + "epoch": 2.923149457802923, + "grad_norm": 0.3823224902153015, + "learning_rate": 4.457781177416663e-05, + "loss": 0.3082, + "num_input_tokens_seen": 6245344, + "step": 6200 + }, + { + "epoch": 2.9255068363979255, + "grad_norm": 0.28660935163497925, + "learning_rate": 4.456501182416839e-05, + "loss": 0.3156, + "num_input_tokens_seen": 6249440, + "step": 6205 + }, + { + "epoch": 2.927864214992928, + "grad_norm": 0.5452511310577393, + "learning_rate": 4.4552198626073484e-05, + "loss": 0.3668, + "num_input_tokens_seen": 6254432, + "step": 6210 + }, + { + "epoch": 2.9302215935879303, + "grad_norm": 0.36164772510528564, + "learning_rate": 4.453937218855814e-05, + "loss": 0.331, + "num_input_tokens_seen": 6259520, + "step": 6215 + }, + { + "epoch": 2.9325789721829327, + "grad_norm": 0.37885963916778564, + "learning_rate": 4.452653252030754e-05, + "loss": 0.3534, + "num_input_tokens_seen": 6264448, + "step": 6220 + }, + { + "epoch": 2.934936350777935, + "grad_norm": 0.28464701771736145, + "learning_rate": 4.4513679630015845e-05, + "loss": 0.3729, + "num_input_tokens_seen": 6269664, + "step": 6225 + }, + { + "epoch": 2.9372937293729375, + "grad_norm": 0.5597991347312927, + "learning_rate": 4.450081352638617e-05, + "loss": 0.3477, + "num_input_tokens_seen": 6274400, + "step": 6230 + }, + { + "epoch": 2.9396511079679395, + "grad_norm": 0.39115530252456665, + "learning_rate": 4.448793421813054e-05, + "loss": 0.3553, + "num_input_tokens_seen": 6279296, + "step": 6235 + }, + { + "epoch": 2.942008486562942, + "grad_norm": 0.41875678300857544, + "learning_rate": 4.447504171396997e-05, + "loss": 0.3525, + "num_input_tokens_seen": 6284224, + "step": 6240 + }, + { + "epoch": 2.9443658651579443, + "grad_norm": 0.6139435172080994, + "learning_rate": 4.446213602263437e-05, + "loss": 0.3704, + "num_input_tokens_seen": 6288512, + "step": 6245 + }, + { + "epoch": 2.9467232437529467, + "grad_norm": 0.43573233485221863, + "learning_rate": 4.444921715286261e-05, + "loss": 0.32, + "num_input_tokens_seen": 6294688, + "step": 6250 + }, + { + "epoch": 2.949080622347949, + "grad_norm": 0.38673725724220276, + "learning_rate": 4.443628511340247e-05, + "loss": 0.2777, + "num_input_tokens_seen": 6299392, + "step": 6255 + }, + { + "epoch": 2.9514380009429515, + "grad_norm": 0.5108609199523926, + "learning_rate": 4.442333991301064e-05, + "loss": 0.3635, + "num_input_tokens_seen": 6304160, + "step": 6260 + }, + { + "epoch": 2.953795379537954, + "grad_norm": 0.7423079013824463, + "learning_rate": 4.441038156045274e-05, + "loss": 0.4457, + "num_input_tokens_seen": 6311136, + "step": 6265 + }, + { + "epoch": 2.9561527581329563, + "grad_norm": 0.29764580726623535, + "learning_rate": 4.4397410064503284e-05, + "loss": 0.3425, + "num_input_tokens_seen": 6315264, + "step": 6270 + }, + { + "epoch": 2.9585101367279583, + "grad_norm": 0.5864250063896179, + "learning_rate": 4.438442543394568e-05, + "loss": 0.3076, + "num_input_tokens_seen": 6320192, + "step": 6275 + }, + { + "epoch": 2.9608675153229607, + "grad_norm": 0.424582839012146, + "learning_rate": 4.437142767757225e-05, + "loss": 0.3308, + "num_input_tokens_seen": 6324864, + "step": 6280 + }, + { + "epoch": 2.963224893917963, + "grad_norm": 0.40160873532295227, + "learning_rate": 4.4358416804184175e-05, + "loss": 0.3907, + "num_input_tokens_seen": 6329536, + "step": 6285 + }, + { + "epoch": 2.9655822725129655, + "grad_norm": 0.7903292179107666, + "learning_rate": 4.434539282259155e-05, + "loss": 0.3359, + "num_input_tokens_seen": 6335424, + "step": 6290 + }, + { + "epoch": 2.967939651107968, + "grad_norm": 0.7187665104866028, + "learning_rate": 4.433235574161333e-05, + "loss": 0.3869, + "num_input_tokens_seen": 6340160, + "step": 6295 + }, + { + "epoch": 2.9702970297029703, + "grad_norm": 0.5584414601325989, + "learning_rate": 4.431930557007732e-05, + "loss": 0.3364, + "num_input_tokens_seen": 6344288, + "step": 6300 + }, + { + "epoch": 2.9726544082979727, + "grad_norm": 0.6487731337547302, + "learning_rate": 4.4306242316820234e-05, + "loss": 0.3293, + "num_input_tokens_seen": 6349088, + "step": 6305 + }, + { + "epoch": 2.975011786892975, + "grad_norm": 0.3859809339046478, + "learning_rate": 4.42931659906876e-05, + "loss": 0.3292, + "num_input_tokens_seen": 6353248, + "step": 6310 + }, + { + "epoch": 2.9773691654879775, + "grad_norm": 0.3834514915943146, + "learning_rate": 4.4280076600533834e-05, + "loss": 0.3464, + "num_input_tokens_seen": 6359200, + "step": 6315 + }, + { + "epoch": 2.97972654408298, + "grad_norm": 0.35841840505599976, + "learning_rate": 4.426697415522218e-05, + "loss": 0.347, + "num_input_tokens_seen": 6364448, + "step": 6320 + }, + { + "epoch": 2.9820839226779823, + "grad_norm": 0.2982954978942871, + "learning_rate": 4.425385866362471e-05, + "loss": 0.3021, + "num_input_tokens_seen": 6368928, + "step": 6325 + }, + { + "epoch": 2.9844413012729847, + "grad_norm": 0.37796565890312195, + "learning_rate": 4.424073013462236e-05, + "loss": 0.374, + "num_input_tokens_seen": 6374016, + "step": 6330 + }, + { + "epoch": 2.9867986798679866, + "grad_norm": 0.45703303813934326, + "learning_rate": 4.422758857710487e-05, + "loss": 0.3368, + "num_input_tokens_seen": 6378816, + "step": 6335 + }, + { + "epoch": 2.989156058462989, + "grad_norm": 0.3674054443836212, + "learning_rate": 4.421443399997081e-05, + "loss": 0.3415, + "num_input_tokens_seen": 6384416, + "step": 6340 + }, + { + "epoch": 2.9915134370579914, + "grad_norm": 0.30195674300193787, + "learning_rate": 4.420126641212755e-05, + "loss": 0.3275, + "num_input_tokens_seen": 6388768, + "step": 6345 + }, + { + "epoch": 2.993870815652994, + "grad_norm": 0.395389199256897, + "learning_rate": 4.418808582249132e-05, + "loss": 0.3815, + "num_input_tokens_seen": 6393312, + "step": 6350 + }, + { + "epoch": 2.9962281942479962, + "grad_norm": 0.5456263422966003, + "learning_rate": 4.4174892239987095e-05, + "loss": 0.4015, + "num_input_tokens_seen": 6397728, + "step": 6355 + }, + { + "epoch": 2.9985855728429986, + "grad_norm": 0.48159775137901306, + "learning_rate": 4.416168567354868e-05, + "loss": 0.3208, + "num_input_tokens_seen": 6402080, + "step": 6360 + }, + { + "epoch": 3.000942951438001, + "grad_norm": 0.5352911949157715, + "learning_rate": 4.414846613211866e-05, + "loss": 0.3416, + "num_input_tokens_seen": 6407104, + "step": 6365 + }, + { + "epoch": 3.0014144271570014, + "eval_loss": 0.3454541265964508, + "eval_runtime": 25.627, + "eval_samples_per_second": 36.797, + "eval_steps_per_second": 9.209, + "num_input_tokens_seen": 6407840, + "step": 6366 + }, + { + "epoch": 3.0033003300330035, + "grad_norm": 0.3301316797733307, + "learning_rate": 4.413523362464842e-05, + "loss": 0.366, + "num_input_tokens_seen": 6411808, + "step": 6370 + }, + { + "epoch": 3.005657708628006, + "grad_norm": 0.7868497967720032, + "learning_rate": 4.4121988160098107e-05, + "loss": 0.3944, + "num_input_tokens_seen": 6418912, + "step": 6375 + }, + { + "epoch": 3.008015087223008, + "grad_norm": 0.45215240120887756, + "learning_rate": 4.410872974743665e-05, + "loss": 0.3281, + "num_input_tokens_seen": 6424864, + "step": 6380 + }, + { + "epoch": 3.01037246581801, + "grad_norm": 0.41506296396255493, + "learning_rate": 4.409545839564174e-05, + "loss": 0.3353, + "num_input_tokens_seen": 6430496, + "step": 6385 + }, + { + "epoch": 3.0127298444130126, + "grad_norm": 0.5750104784965515, + "learning_rate": 4.408217411369985e-05, + "loss": 0.3462, + "num_input_tokens_seen": 6436160, + "step": 6390 + }, + { + "epoch": 3.015087223008015, + "grad_norm": 0.3247506618499756, + "learning_rate": 4.406887691060619e-05, + "loss": 0.4313, + "num_input_tokens_seen": 6444320, + "step": 6395 + }, + { + "epoch": 3.0174446016030174, + "grad_norm": 0.5505650043487549, + "learning_rate": 4.40555667953647e-05, + "loss": 0.3558, + "num_input_tokens_seen": 6449120, + "step": 6400 + }, + { + "epoch": 3.01980198019802, + "grad_norm": 0.5332307815551758, + "learning_rate": 4.404224377698812e-05, + "loss": 0.3435, + "num_input_tokens_seen": 6453248, + "step": 6405 + }, + { + "epoch": 3.022159358793022, + "grad_norm": 0.3894548714160919, + "learning_rate": 4.402890786449787e-05, + "loss": 0.3349, + "num_input_tokens_seen": 6457664, + "step": 6410 + }, + { + "epoch": 3.0245167373880246, + "grad_norm": 0.3249974846839905, + "learning_rate": 4.401555906692413e-05, + "loss": 0.3565, + "num_input_tokens_seen": 6463520, + "step": 6415 + }, + { + "epoch": 3.026874115983027, + "grad_norm": 0.39273813366889954, + "learning_rate": 4.4002197393305785e-05, + "loss": 0.3335, + "num_input_tokens_seen": 6468672, + "step": 6420 + }, + { + "epoch": 3.0292314945780294, + "grad_norm": 0.3143294155597687, + "learning_rate": 4.398882285269048e-05, + "loss": 0.3027, + "num_input_tokens_seen": 6474048, + "step": 6425 + }, + { + "epoch": 3.0315888731730314, + "grad_norm": 0.41795414686203003, + "learning_rate": 4.397543545413453e-05, + "loss": 0.3903, + "num_input_tokens_seen": 6479936, + "step": 6430 + }, + { + "epoch": 3.033946251768034, + "grad_norm": 0.4412952661514282, + "learning_rate": 4.396203520670297e-05, + "loss": 0.3492, + "num_input_tokens_seen": 6486528, + "step": 6435 + }, + { + "epoch": 3.036303630363036, + "grad_norm": 0.45036277174949646, + "learning_rate": 4.394862211946954e-05, + "loss": 0.3362, + "num_input_tokens_seen": 6492928, + "step": 6440 + }, + { + "epoch": 3.0386610089580386, + "grad_norm": 0.5828651189804077, + "learning_rate": 4.393519620151667e-05, + "loss": 0.2877, + "num_input_tokens_seen": 6496992, + "step": 6445 + }, + { + "epoch": 3.041018387553041, + "grad_norm": 0.45617571473121643, + "learning_rate": 4.3921757461935475e-05, + "loss": 0.2819, + "num_input_tokens_seen": 6502368, + "step": 6450 + }, + { + "epoch": 3.0433757661480434, + "grad_norm": 0.5463685393333435, + "learning_rate": 4.3908305909825764e-05, + "loss": 0.304, + "num_input_tokens_seen": 6507840, + "step": 6455 + }, + { + "epoch": 3.045733144743046, + "grad_norm": 0.49420443177223206, + "learning_rate": 4.389484155429602e-05, + "loss": 0.4189, + "num_input_tokens_seen": 6512832, + "step": 6460 + }, + { + "epoch": 3.048090523338048, + "grad_norm": 0.6852856278419495, + "learning_rate": 4.388136440446337e-05, + "loss": 0.3809, + "num_input_tokens_seen": 6518400, + "step": 6465 + }, + { + "epoch": 3.0504479019330506, + "grad_norm": 0.3296712636947632, + "learning_rate": 4.386787446945365e-05, + "loss": 0.3803, + "num_input_tokens_seen": 6523200, + "step": 6470 + }, + { + "epoch": 3.052805280528053, + "grad_norm": 0.5105844140052795, + "learning_rate": 4.38543717584013e-05, + "loss": 0.3256, + "num_input_tokens_seen": 6527904, + "step": 6475 + }, + { + "epoch": 3.055162659123055, + "grad_norm": 0.36181432008743286, + "learning_rate": 4.384085628044945e-05, + "loss": 0.3215, + "num_input_tokens_seen": 6532192, + "step": 6480 + }, + { + "epoch": 3.0575200377180574, + "grad_norm": 0.3591940701007843, + "learning_rate": 4.382732804474988e-05, + "loss": 0.3623, + "num_input_tokens_seen": 6537216, + "step": 6485 + }, + { + "epoch": 3.0598774163130598, + "grad_norm": 0.4190884232521057, + "learning_rate": 4.381378706046296e-05, + "loss": 0.3429, + "num_input_tokens_seen": 6541664, + "step": 6490 + }, + { + "epoch": 3.062234794908062, + "grad_norm": 0.615077018737793, + "learning_rate": 4.380023333675775e-05, + "loss": 0.3161, + "num_input_tokens_seen": 6547008, + "step": 6495 + }, + { + "epoch": 3.0645921735030646, + "grad_norm": 0.46370574831962585, + "learning_rate": 4.378666688281191e-05, + "loss": 0.3654, + "num_input_tokens_seen": 6551552, + "step": 6500 + }, + { + "epoch": 3.066949552098067, + "grad_norm": 0.26646289229393005, + "learning_rate": 4.37730877078117e-05, + "loss": 0.2983, + "num_input_tokens_seen": 6556640, + "step": 6505 + }, + { + "epoch": 3.0693069306930694, + "grad_norm": 0.6444278359413147, + "learning_rate": 4.375949582095204e-05, + "loss": 0.2873, + "num_input_tokens_seen": 6561152, + "step": 6510 + }, + { + "epoch": 3.0716643092880718, + "grad_norm": 0.3220682442188263, + "learning_rate": 4.374589123143641e-05, + "loss": 0.362, + "num_input_tokens_seen": 6566560, + "step": 6515 + }, + { + "epoch": 3.074021687883074, + "grad_norm": 0.259213924407959, + "learning_rate": 4.373227394847692e-05, + "loss": 0.278, + "num_input_tokens_seen": 6571680, + "step": 6520 + }, + { + "epoch": 3.0763790664780766, + "grad_norm": 0.6307334303855896, + "learning_rate": 4.3718643981294286e-05, + "loss": 0.3855, + "num_input_tokens_seen": 6576032, + "step": 6525 + }, + { + "epoch": 3.0787364450730785, + "grad_norm": 0.35197630524635315, + "learning_rate": 4.370500133911777e-05, + "loss": 0.3502, + "num_input_tokens_seen": 6581280, + "step": 6530 + }, + { + "epoch": 3.081093823668081, + "grad_norm": 0.7070784568786621, + "learning_rate": 4.369134603118526e-05, + "loss": 0.394, + "num_input_tokens_seen": 6585696, + "step": 6535 + }, + { + "epoch": 3.0834512022630833, + "grad_norm": 0.5804458260536194, + "learning_rate": 4.36776780667432e-05, + "loss": 0.2745, + "num_input_tokens_seen": 6591104, + "step": 6540 + }, + { + "epoch": 3.0858085808580857, + "grad_norm": 0.4025607705116272, + "learning_rate": 4.36639974550466e-05, + "loss": 0.3362, + "num_input_tokens_seen": 6596448, + "step": 6545 + }, + { + "epoch": 3.088165959453088, + "grad_norm": 0.5487650036811829, + "learning_rate": 4.365030420535904e-05, + "loss": 0.3458, + "num_input_tokens_seen": 6600256, + "step": 6550 + }, + { + "epoch": 3.0905233380480905, + "grad_norm": 0.41709378361701965, + "learning_rate": 4.3636598326952674e-05, + "loss": 0.317, + "num_input_tokens_seen": 6605792, + "step": 6555 + }, + { + "epoch": 3.092880716643093, + "grad_norm": 0.407279908657074, + "learning_rate": 4.362287982910818e-05, + "loss": 0.3875, + "num_input_tokens_seen": 6610048, + "step": 6560 + }, + { + "epoch": 3.0952380952380953, + "grad_norm": 0.5400933027267456, + "learning_rate": 4.3609148721114785e-05, + "loss": 0.3345, + "num_input_tokens_seen": 6614848, + "step": 6565 + }, + { + "epoch": 3.0975954738330977, + "grad_norm": 0.4922398626804352, + "learning_rate": 4.3595405012270275e-05, + "loss": 0.368, + "num_input_tokens_seen": 6620256, + "step": 6570 + }, + { + "epoch": 3.0999528524281, + "grad_norm": 0.8416628837585449, + "learning_rate": 4.3581648711880954e-05, + "loss": 0.3334, + "num_input_tokens_seen": 6625440, + "step": 6575 + }, + { + "epoch": 3.102310231023102, + "grad_norm": 0.5063835978507996, + "learning_rate": 4.356787982926165e-05, + "loss": 0.3278, + "num_input_tokens_seen": 6631200, + "step": 6580 + }, + { + "epoch": 3.1046676096181045, + "grad_norm": 0.47339075803756714, + "learning_rate": 4.3554098373735715e-05, + "loss": 0.3373, + "num_input_tokens_seen": 6636160, + "step": 6585 + }, + { + "epoch": 3.107024988213107, + "grad_norm": 0.40245380997657776, + "learning_rate": 4.3540304354635014e-05, + "loss": 0.3325, + "num_input_tokens_seen": 6641792, + "step": 6590 + }, + { + "epoch": 3.1093823668081093, + "grad_norm": 0.4220947027206421, + "learning_rate": 4.352649778129993e-05, + "loss": 0.3225, + "num_input_tokens_seen": 6646656, + "step": 6595 + }, + { + "epoch": 3.1117397454031117, + "grad_norm": 0.48473021388053894, + "learning_rate": 4.3512678663079306e-05, + "loss": 0.3516, + "num_input_tokens_seen": 6651168, + "step": 6600 + }, + { + "epoch": 3.114097123998114, + "grad_norm": 0.38695916533470154, + "learning_rate": 4.3498847009330555e-05, + "loss": 0.2682, + "num_input_tokens_seen": 6655744, + "step": 6605 + }, + { + "epoch": 3.1164545025931165, + "grad_norm": 0.264726847410202, + "learning_rate": 4.3485002829419493e-05, + "loss": 0.2944, + "num_input_tokens_seen": 6661536, + "step": 6610 + }, + { + "epoch": 3.118811881188119, + "grad_norm": 0.7667772769927979, + "learning_rate": 4.3471146132720485e-05, + "loss": 0.3333, + "num_input_tokens_seen": 6665728, + "step": 6615 + }, + { + "epoch": 3.1211692597831213, + "grad_norm": 0.3840348422527313, + "learning_rate": 4.345727692861634e-05, + "loss": 0.4022, + "num_input_tokens_seen": 6670688, + "step": 6620 + }, + { + "epoch": 3.1235266383781237, + "grad_norm": 0.4101231098175049, + "learning_rate": 4.3443395226498315e-05, + "loss": 0.3723, + "num_input_tokens_seen": 6674816, + "step": 6625 + }, + { + "epoch": 3.1258840169731257, + "grad_norm": 0.5684316158294678, + "learning_rate": 4.3429501035766194e-05, + "loss": 0.3621, + "num_input_tokens_seen": 6680192, + "step": 6630 + }, + { + "epoch": 3.128241395568128, + "grad_norm": 0.5049511790275574, + "learning_rate": 4.341559436582815e-05, + "loss": 0.2966, + "num_input_tokens_seen": 6685120, + "step": 6635 + }, + { + "epoch": 3.1305987741631305, + "grad_norm": 0.5635237097740173, + "learning_rate": 4.340167522610085e-05, + "loss": 0.3392, + "num_input_tokens_seen": 6690528, + "step": 6640 + }, + { + "epoch": 3.132956152758133, + "grad_norm": 0.3036704957485199, + "learning_rate": 4.338774362600938e-05, + "loss": 0.3088, + "num_input_tokens_seen": 6696352, + "step": 6645 + }, + { + "epoch": 3.1353135313531353, + "grad_norm": 0.3908867835998535, + "learning_rate": 4.337379957498728e-05, + "loss": 0.3234, + "num_input_tokens_seen": 6701280, + "step": 6650 + }, + { + "epoch": 3.1376709099481377, + "grad_norm": 0.3600652813911438, + "learning_rate": 4.3359843082476515e-05, + "loss": 0.3496, + "num_input_tokens_seen": 6706144, + "step": 6655 + }, + { + "epoch": 3.14002828854314, + "grad_norm": 0.23100684583187103, + "learning_rate": 4.334587415792747e-05, + "loss": 0.3283, + "num_input_tokens_seen": 6711936, + "step": 6660 + }, + { + "epoch": 3.1423856671381425, + "grad_norm": 0.2702731192111969, + "learning_rate": 4.333189281079895e-05, + "loss": 0.3323, + "num_input_tokens_seen": 6716384, + "step": 6665 + }, + { + "epoch": 3.144743045733145, + "grad_norm": 0.4665803909301758, + "learning_rate": 4.3317899050558175e-05, + "loss": 0.3209, + "num_input_tokens_seen": 6721664, + "step": 6670 + }, + { + "epoch": 3.1471004243281473, + "grad_norm": 0.3370196521282196, + "learning_rate": 4.3303892886680766e-05, + "loss": 0.2513, + "num_input_tokens_seen": 6726880, + "step": 6675 + }, + { + "epoch": 3.1494578029231493, + "grad_norm": 0.31395432353019714, + "learning_rate": 4.328987432865075e-05, + "loss": 0.3242, + "num_input_tokens_seen": 6732544, + "step": 6680 + }, + { + "epoch": 3.1518151815181517, + "grad_norm": 0.4532095789909363, + "learning_rate": 4.327584338596054e-05, + "loss": 0.3854, + "num_input_tokens_seen": 6740288, + "step": 6685 + }, + { + "epoch": 3.154172560113154, + "grad_norm": 0.654516339302063, + "learning_rate": 4.3261800068110935e-05, + "loss": 0.3489, + "num_input_tokens_seen": 6747040, + "step": 6690 + }, + { + "epoch": 3.1565299387081565, + "grad_norm": 0.4508136212825775, + "learning_rate": 4.324774438461112e-05, + "loss": 0.3355, + "num_input_tokens_seen": 6752096, + "step": 6695 + }, + { + "epoch": 3.158887317303159, + "grad_norm": 0.3940248191356659, + "learning_rate": 4.323367634497866e-05, + "loss": 0.3602, + "num_input_tokens_seen": 6757184, + "step": 6700 + }, + { + "epoch": 3.1612446958981613, + "grad_norm": 0.30912843346595764, + "learning_rate": 4.3219595958739444e-05, + "loss": 0.3254, + "num_input_tokens_seen": 6762688, + "step": 6705 + }, + { + "epoch": 3.1636020744931637, + "grad_norm": 0.2995823919773102, + "learning_rate": 4.320550323542778e-05, + "loss": 0.2857, + "num_input_tokens_seen": 6766720, + "step": 6710 + }, + { + "epoch": 3.165959453088166, + "grad_norm": 0.30918025970458984, + "learning_rate": 4.319139818458629e-05, + "loss": 0.3725, + "num_input_tokens_seen": 6771200, + "step": 6715 + }, + { + "epoch": 3.1683168316831685, + "grad_norm": 0.5461475849151611, + "learning_rate": 4.317728081576596e-05, + "loss": 0.3868, + "num_input_tokens_seen": 6776096, + "step": 6720 + }, + { + "epoch": 3.170674210278171, + "grad_norm": 0.506752610206604, + "learning_rate": 4.316315113852612e-05, + "loss": 0.3655, + "num_input_tokens_seen": 6780192, + "step": 6725 + }, + { + "epoch": 3.173031588873173, + "grad_norm": 0.3573331832885742, + "learning_rate": 4.314900916243442e-05, + "loss": 0.3591, + "num_input_tokens_seen": 6784896, + "step": 6730 + }, + { + "epoch": 3.1753889674681752, + "grad_norm": 0.44676539301872253, + "learning_rate": 4.313485489706683e-05, + "loss": 0.3457, + "num_input_tokens_seen": 6789568, + "step": 6735 + }, + { + "epoch": 3.1777463460631776, + "grad_norm": 0.5225008130073547, + "learning_rate": 4.3120688352007685e-05, + "loss": 0.3614, + "num_input_tokens_seen": 6793664, + "step": 6740 + }, + { + "epoch": 3.18010372465818, + "grad_norm": 0.34455734491348267, + "learning_rate": 4.31065095368496e-05, + "loss": 0.3053, + "num_input_tokens_seen": 6798816, + "step": 6745 + }, + { + "epoch": 3.1824611032531824, + "grad_norm": 0.35491707921028137, + "learning_rate": 4.309231846119349e-05, + "loss": 0.3296, + "num_input_tokens_seen": 6804416, + "step": 6750 + }, + { + "epoch": 3.184818481848185, + "grad_norm": 0.5704344511032104, + "learning_rate": 4.3078115134648604e-05, + "loss": 0.3656, + "num_input_tokens_seen": 6809568, + "step": 6755 + }, + { + "epoch": 3.1871758604431872, + "grad_norm": 0.40071767568588257, + "learning_rate": 4.306389956683246e-05, + "loss": 0.3823, + "num_input_tokens_seen": 6814912, + "step": 6760 + }, + { + "epoch": 3.1895332390381896, + "grad_norm": 0.5717672109603882, + "learning_rate": 4.304967176737088e-05, + "loss": 0.3472, + "num_input_tokens_seen": 6819584, + "step": 6765 + }, + { + "epoch": 3.191890617633192, + "grad_norm": 0.4986872971057892, + "learning_rate": 4.3035431745897955e-05, + "loss": 0.3496, + "num_input_tokens_seen": 6825536, + "step": 6770 + }, + { + "epoch": 3.1942479962281944, + "grad_norm": 0.5337924957275391, + "learning_rate": 4.3021179512056065e-05, + "loss": 0.3612, + "num_input_tokens_seen": 6831264, + "step": 6775 + }, + { + "epoch": 3.1966053748231964, + "grad_norm": 0.4053051769733429, + "learning_rate": 4.3006915075495854e-05, + "loss": 0.3154, + "num_input_tokens_seen": 6835648, + "step": 6780 + }, + { + "epoch": 3.198962753418199, + "grad_norm": 0.39639541506767273, + "learning_rate": 4.299263844587623e-05, + "loss": 0.2929, + "num_input_tokens_seen": 6839936, + "step": 6785 + }, + { + "epoch": 3.201320132013201, + "grad_norm": 0.31200799345970154, + "learning_rate": 4.297834963286436e-05, + "loss": 0.3262, + "num_input_tokens_seen": 6844736, + "step": 6790 + }, + { + "epoch": 3.2036775106082036, + "grad_norm": 0.4378182291984558, + "learning_rate": 4.296404864613566e-05, + "loss": 0.3306, + "num_input_tokens_seen": 6849536, + "step": 6795 + }, + { + "epoch": 3.206034889203206, + "grad_norm": 0.32660427689552307, + "learning_rate": 4.2949735495373764e-05, + "loss": 0.3359, + "num_input_tokens_seen": 6853728, + "step": 6800 + }, + { + "epoch": 3.2083922677982084, + "grad_norm": 0.6423951387405396, + "learning_rate": 4.29354101902706e-05, + "loss": 0.3459, + "num_input_tokens_seen": 6859264, + "step": 6805 + }, + { + "epoch": 3.210749646393211, + "grad_norm": 0.310724139213562, + "learning_rate": 4.292107274052626e-05, + "loss": 0.3356, + "num_input_tokens_seen": 6864352, + "step": 6810 + }, + { + "epoch": 3.213107024988213, + "grad_norm": 0.46618762612342834, + "learning_rate": 4.2906723155849105e-05, + "loss": 0.2805, + "num_input_tokens_seen": 6868576, + "step": 6815 + }, + { + "epoch": 3.2154644035832156, + "grad_norm": 0.4740270674228668, + "learning_rate": 4.28923614459557e-05, + "loss": 0.2944, + "num_input_tokens_seen": 6874080, + "step": 6820 + }, + { + "epoch": 3.217821782178218, + "grad_norm": 0.44964632391929626, + "learning_rate": 4.287798762057081e-05, + "loss": 0.3604, + "num_input_tokens_seen": 6879680, + "step": 6825 + }, + { + "epoch": 3.22017916077322, + "grad_norm": 0.3496374785900116, + "learning_rate": 4.2863601689427434e-05, + "loss": 0.3909, + "num_input_tokens_seen": 6885760, + "step": 6830 + }, + { + "epoch": 3.2225365393682224, + "grad_norm": 0.36368870735168457, + "learning_rate": 4.284920366226672e-05, + "loss": 0.3429, + "num_input_tokens_seen": 6890784, + "step": 6835 + }, + { + "epoch": 3.2248939179632248, + "grad_norm": 0.4467063844203949, + "learning_rate": 4.2834793548838054e-05, + "loss": 0.4289, + "num_input_tokens_seen": 6896352, + "step": 6840 + }, + { + "epoch": 3.227251296558227, + "grad_norm": 0.4671189486980438, + "learning_rate": 4.2820371358898974e-05, + "loss": 0.3951, + "num_input_tokens_seen": 6901216, + "step": 6845 + }, + { + "epoch": 3.2296086751532296, + "grad_norm": 0.3710731863975525, + "learning_rate": 4.280593710221521e-05, + "loss": 0.3105, + "num_input_tokens_seen": 6906016, + "step": 6850 + }, + { + "epoch": 3.231966053748232, + "grad_norm": 0.43807461857795715, + "learning_rate": 4.279149078856066e-05, + "loss": 0.3594, + "num_input_tokens_seen": 6911040, + "step": 6855 + }, + { + "epoch": 3.2343234323432344, + "grad_norm": 0.46474841237068176, + "learning_rate": 4.277703242771739e-05, + "loss": 0.349, + "num_input_tokens_seen": 6916160, + "step": 6860 + }, + { + "epoch": 3.236680810938237, + "grad_norm": 0.6257234215736389, + "learning_rate": 4.276256202947562e-05, + "loss": 0.3766, + "num_input_tokens_seen": 6920384, + "step": 6865 + }, + { + "epoch": 3.239038189533239, + "grad_norm": 0.307049959897995, + "learning_rate": 4.2748079603633714e-05, + "loss": 0.3166, + "num_input_tokens_seen": 6924960, + "step": 6870 + }, + { + "epoch": 3.2413955681282416, + "grad_norm": 0.39434412121772766, + "learning_rate": 4.2733585159998194e-05, + "loss": 0.3369, + "num_input_tokens_seen": 6929280, + "step": 6875 + }, + { + "epoch": 3.2437529467232435, + "grad_norm": 0.4757706820964813, + "learning_rate": 4.27190787083837e-05, + "loss": 0.3506, + "num_input_tokens_seen": 6935488, + "step": 6880 + }, + { + "epoch": 3.246110325318246, + "grad_norm": 0.46673890948295593, + "learning_rate": 4.270456025861303e-05, + "loss": 0.3613, + "num_input_tokens_seen": 6939616, + "step": 6885 + }, + { + "epoch": 3.2484677039132484, + "grad_norm": 0.3548860251903534, + "learning_rate": 4.269002982051709e-05, + "loss": 0.336, + "num_input_tokens_seen": 6946464, + "step": 6890 + }, + { + "epoch": 3.2508250825082508, + "grad_norm": 0.5105555057525635, + "learning_rate": 4.267548740393489e-05, + "loss": 0.3725, + "num_input_tokens_seen": 6951104, + "step": 6895 + }, + { + "epoch": 3.253182461103253, + "grad_norm": 0.5590119361877441, + "learning_rate": 4.266093301871358e-05, + "loss": 0.326, + "num_input_tokens_seen": 6957088, + "step": 6900 + }, + { + "epoch": 3.2555398396982556, + "grad_norm": 0.3830565810203552, + "learning_rate": 4.2646366674708396e-05, + "loss": 0.3639, + "num_input_tokens_seen": 6963776, + "step": 6905 + }, + { + "epoch": 3.257897218293258, + "grad_norm": 0.30161598324775696, + "learning_rate": 4.263178838178269e-05, + "loss": 0.3095, + "num_input_tokens_seen": 6968960, + "step": 6910 + }, + { + "epoch": 3.2602545968882604, + "grad_norm": 0.5000569820404053, + "learning_rate": 4.261719814980787e-05, + "loss": 0.3626, + "num_input_tokens_seen": 6974240, + "step": 6915 + }, + { + "epoch": 3.2626119754832628, + "grad_norm": 0.2444571554660797, + "learning_rate": 4.260259598866346e-05, + "loss": 0.3399, + "num_input_tokens_seen": 6979104, + "step": 6920 + }, + { + "epoch": 3.264969354078265, + "grad_norm": 0.2735518217086792, + "learning_rate": 4.258798190823705e-05, + "loss": 0.3547, + "num_input_tokens_seen": 6984864, + "step": 6925 + }, + { + "epoch": 3.2673267326732676, + "grad_norm": 0.33663997054100037, + "learning_rate": 4.2573355918424304e-05, + "loss": 0.327, + "num_input_tokens_seen": 6988768, + "step": 6930 + }, + { + "epoch": 3.2696841112682695, + "grad_norm": 0.4529820680618286, + "learning_rate": 4.2558718029128954e-05, + "loss": 0.3331, + "num_input_tokens_seen": 6995040, + "step": 6935 + }, + { + "epoch": 3.272041489863272, + "grad_norm": 0.5414720177650452, + "learning_rate": 4.254406825026278e-05, + "loss": 0.3361, + "num_input_tokens_seen": 7000032, + "step": 6940 + }, + { + "epoch": 3.2743988684582743, + "grad_norm": 0.3310602307319641, + "learning_rate": 4.252940659174561e-05, + "loss": 0.3413, + "num_input_tokens_seen": 7004480, + "step": 6945 + }, + { + "epoch": 3.2767562470532767, + "grad_norm": 0.5035929083824158, + "learning_rate": 4.2514733063505325e-05, + "loss": 0.3296, + "num_input_tokens_seen": 7009216, + "step": 6950 + }, + { + "epoch": 3.279113625648279, + "grad_norm": 0.39100396633148193, + "learning_rate": 4.250004767547785e-05, + "loss": 0.3031, + "num_input_tokens_seen": 7014368, + "step": 6955 + }, + { + "epoch": 3.2814710042432815, + "grad_norm": 0.311513215303421, + "learning_rate": 4.2485350437607126e-05, + "loss": 0.3304, + "num_input_tokens_seen": 7018880, + "step": 6960 + }, + { + "epoch": 3.283828382838284, + "grad_norm": 0.4044381380081177, + "learning_rate": 4.247064135984512e-05, + "loss": 0.3399, + "num_input_tokens_seen": 7023264, + "step": 6965 + }, + { + "epoch": 3.2861857614332863, + "grad_norm": 0.4778646230697632, + "learning_rate": 4.245592045215182e-05, + "loss": 0.3788, + "num_input_tokens_seen": 7028000, + "step": 6970 + }, + { + "epoch": 3.2885431400282887, + "grad_norm": 0.6327771544456482, + "learning_rate": 4.244118772449523e-05, + "loss": 0.3397, + "num_input_tokens_seen": 7032960, + "step": 6975 + }, + { + "epoch": 3.2909005186232907, + "grad_norm": 0.29962462186813354, + "learning_rate": 4.2426443186851364e-05, + "loss": 0.3481, + "num_input_tokens_seen": 7037504, + "step": 6980 + }, + { + "epoch": 3.293257897218293, + "grad_norm": 0.2872562110424042, + "learning_rate": 4.24116868492042e-05, + "loss": 0.3123, + "num_input_tokens_seen": 7042304, + "step": 6985 + }, + { + "epoch": 3.2956152758132955, + "grad_norm": 0.5513589382171631, + "learning_rate": 4.239691872154573e-05, + "loss": 0.3599, + "num_input_tokens_seen": 7046528, + "step": 6990 + }, + { + "epoch": 3.297972654408298, + "grad_norm": 0.5263110399246216, + "learning_rate": 4.238213881387594e-05, + "loss": 0.3468, + "num_input_tokens_seen": 7050496, + "step": 6995 + }, + { + "epoch": 3.3003300330033003, + "grad_norm": 0.5062713027000427, + "learning_rate": 4.236734713620276e-05, + "loss": 0.322, + "num_input_tokens_seen": 7055776, + "step": 7000 + }, + { + "epoch": 3.3026874115983027, + "grad_norm": 0.49360981583595276, + "learning_rate": 4.235254369854214e-05, + "loss": 0.3384, + "num_input_tokens_seen": 7061248, + "step": 7005 + }, + { + "epoch": 3.305044790193305, + "grad_norm": 0.4987942576408386, + "learning_rate": 4.233772851091792e-05, + "loss": 0.2965, + "num_input_tokens_seen": 7065856, + "step": 7010 + }, + { + "epoch": 3.3074021687883075, + "grad_norm": 0.5760272741317749, + "learning_rate": 4.2322901583361976e-05, + "loss": 0.3573, + "num_input_tokens_seen": 7069568, + "step": 7015 + }, + { + "epoch": 3.30975954738331, + "grad_norm": 0.6260614991188049, + "learning_rate": 4.2308062925914075e-05, + "loss": 0.3832, + "num_input_tokens_seen": 7074592, + "step": 7020 + }, + { + "epoch": 3.3121169259783123, + "grad_norm": 0.42620420455932617, + "learning_rate": 4.229321254862195e-05, + "loss": 0.3406, + "num_input_tokens_seen": 7080032, + "step": 7025 + }, + { + "epoch": 3.3144743045733147, + "grad_norm": 0.37441033124923706, + "learning_rate": 4.2278350461541275e-05, + "loss": 0.3467, + "num_input_tokens_seen": 7083968, + "step": 7030 + }, + { + "epoch": 3.3168316831683167, + "grad_norm": 0.3430050015449524, + "learning_rate": 4.226347667473564e-05, + "loss": 0.341, + "num_input_tokens_seen": 7089600, + "step": 7035 + }, + { + "epoch": 3.319189061763319, + "grad_norm": 0.3519374430179596, + "learning_rate": 4.2248591198276565e-05, + "loss": 0.3595, + "num_input_tokens_seen": 7094976, + "step": 7040 + }, + { + "epoch": 3.3215464403583215, + "grad_norm": 0.26223182678222656, + "learning_rate": 4.223369404224348e-05, + "loss": 0.3312, + "num_input_tokens_seen": 7099424, + "step": 7045 + }, + { + "epoch": 3.323903818953324, + "grad_norm": 0.32610106468200684, + "learning_rate": 4.2218785216723736e-05, + "loss": 0.3861, + "num_input_tokens_seen": 7104448, + "step": 7050 + }, + { + "epoch": 3.3262611975483263, + "grad_norm": 0.39423272013664246, + "learning_rate": 4.220386473181256e-05, + "loss": 0.3006, + "num_input_tokens_seen": 7109440, + "step": 7055 + }, + { + "epoch": 3.3286185761433287, + "grad_norm": 0.43025487661361694, + "learning_rate": 4.2188932597613104e-05, + "loss": 0.3463, + "num_input_tokens_seen": 7114496, + "step": 7060 + }, + { + "epoch": 3.330975954738331, + "grad_norm": 0.3621689975261688, + "learning_rate": 4.21739888242364e-05, + "loss": 0.3464, + "num_input_tokens_seen": 7119360, + "step": 7065 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.5284667611122131, + "learning_rate": 4.2159033421801345e-05, + "loss": 0.3639, + "num_input_tokens_seen": 7124160, + "step": 7070 + }, + { + "epoch": 3.335690711928336, + "grad_norm": 0.38007164001464844, + "learning_rate": 4.214406640043471e-05, + "loss": 0.3472, + "num_input_tokens_seen": 7128960, + "step": 7075 + }, + { + "epoch": 3.338048090523338, + "grad_norm": 0.3376415967941284, + "learning_rate": 4.212908777027117e-05, + "loss": 0.3344, + "num_input_tokens_seen": 7133888, + "step": 7080 + }, + { + "epoch": 3.3404054691183402, + "grad_norm": 0.2982708513736725, + "learning_rate": 4.2114097541453224e-05, + "loss": 0.3186, + "num_input_tokens_seen": 7138400, + "step": 7085 + }, + { + "epoch": 3.3427628477133426, + "grad_norm": 0.35456693172454834, + "learning_rate": 4.209909572413123e-05, + "loss": 0.324, + "num_input_tokens_seen": 7144544, + "step": 7090 + }, + { + "epoch": 3.345120226308345, + "grad_norm": 0.38172248005867004, + "learning_rate": 4.2084082328463415e-05, + "loss": 0.3391, + "num_input_tokens_seen": 7149312, + "step": 7095 + }, + { + "epoch": 3.3474776049033474, + "grad_norm": 0.3818110525608063, + "learning_rate": 4.206905736461582e-05, + "loss": 0.3544, + "num_input_tokens_seen": 7154208, + "step": 7100 + }, + { + "epoch": 3.34983498349835, + "grad_norm": 0.40138980746269226, + "learning_rate": 4.205402084276232e-05, + "loss": 0.3391, + "num_input_tokens_seen": 7158528, + "step": 7105 + }, + { + "epoch": 3.3521923620933523, + "grad_norm": 0.3062674105167389, + "learning_rate": 4.203897277308465e-05, + "loss": 0.3316, + "num_input_tokens_seen": 7163040, + "step": 7110 + }, + { + "epoch": 3.3545497406883547, + "grad_norm": 0.400146484375, + "learning_rate": 4.202391316577232e-05, + "loss": 0.3547, + "num_input_tokens_seen": 7169024, + "step": 7115 + }, + { + "epoch": 3.356907119283357, + "grad_norm": 0.490060031414032, + "learning_rate": 4.200884203102269e-05, + "loss": 0.3083, + "num_input_tokens_seen": 7173664, + "step": 7120 + }, + { + "epoch": 3.3592644978783595, + "grad_norm": 0.7506343722343445, + "learning_rate": 4.199375937904089e-05, + "loss": 0.3382, + "num_input_tokens_seen": 7178688, + "step": 7125 + }, + { + "epoch": 3.361621876473362, + "grad_norm": 0.33447685837745667, + "learning_rate": 4.1978665220039884e-05, + "loss": 0.3338, + "num_input_tokens_seen": 7183680, + "step": 7130 + }, + { + "epoch": 3.363979255068364, + "grad_norm": 0.3768433630466461, + "learning_rate": 4.196355956424039e-05, + "loss": 0.3709, + "num_input_tokens_seen": 7188352, + "step": 7135 + }, + { + "epoch": 3.366336633663366, + "grad_norm": 0.41123610734939575, + "learning_rate": 4.194844242187096e-05, + "loss": 0.2947, + "num_input_tokens_seen": 7193632, + "step": 7140 + }, + { + "epoch": 3.3686940122583686, + "grad_norm": 0.31357258558273315, + "learning_rate": 4.1933313803167865e-05, + "loss": 0.3379, + "num_input_tokens_seen": 7197568, + "step": 7145 + }, + { + "epoch": 3.371051390853371, + "grad_norm": 0.33958500623703003, + "learning_rate": 4.19181737183752e-05, + "loss": 0.3135, + "num_input_tokens_seen": 7202880, + "step": 7150 + }, + { + "epoch": 3.3734087694483734, + "grad_norm": 0.515180766582489, + "learning_rate": 4.19030221777448e-05, + "loss": 0.4093, + "num_input_tokens_seen": 7207264, + "step": 7155 + }, + { + "epoch": 3.375766148043376, + "grad_norm": 0.36941829323768616, + "learning_rate": 4.188785919153625e-05, + "loss": 0.3271, + "num_input_tokens_seen": 7213472, + "step": 7160 + }, + { + "epoch": 3.3781235266383782, + "grad_norm": 0.30250394344329834, + "learning_rate": 4.187268477001689e-05, + "loss": 0.3153, + "num_input_tokens_seen": 7218656, + "step": 7165 + }, + { + "epoch": 3.3804809052333806, + "grad_norm": 0.3479789197444916, + "learning_rate": 4.1857498923461826e-05, + "loss": 0.3503, + "num_input_tokens_seen": 7224224, + "step": 7170 + }, + { + "epoch": 3.382838283828383, + "grad_norm": 0.39322566986083984, + "learning_rate": 4.184230166215386e-05, + "loss": 0.333, + "num_input_tokens_seen": 7228864, + "step": 7175 + }, + { + "epoch": 3.385195662423385, + "grad_norm": 0.7707170248031616, + "learning_rate": 4.182709299638356e-05, + "loss": 0.3262, + "num_input_tokens_seen": 7233824, + "step": 7180 + }, + { + "epoch": 3.3875530410183874, + "grad_norm": 0.3338298201560974, + "learning_rate": 4.1811872936449196e-05, + "loss": 0.2858, + "num_input_tokens_seen": 7239584, + "step": 7185 + }, + { + "epoch": 3.38991041961339, + "grad_norm": 0.26867911219596863, + "learning_rate": 4.179664149265676e-05, + "loss": 0.3357, + "num_input_tokens_seen": 7244992, + "step": 7190 + }, + { + "epoch": 3.392267798208392, + "grad_norm": 0.5586385130882263, + "learning_rate": 4.178139867531995e-05, + "loss": 0.4239, + "num_input_tokens_seen": 7249184, + "step": 7195 + }, + { + "epoch": 3.3946251768033946, + "grad_norm": 0.34849226474761963, + "learning_rate": 4.1766144494760164e-05, + "loss": 0.3141, + "num_input_tokens_seen": 7254976, + "step": 7200 + }, + { + "epoch": 3.396982555398397, + "grad_norm": 0.5017176866531372, + "learning_rate": 4.1750878961306494e-05, + "loss": 0.3468, + "num_input_tokens_seen": 7259712, + "step": 7205 + }, + { + "epoch": 3.3993399339933994, + "grad_norm": 0.3247958719730377, + "learning_rate": 4.1735602085295736e-05, + "loss": 0.3267, + "num_input_tokens_seen": 7263904, + "step": 7210 + }, + { + "epoch": 3.401697312588402, + "grad_norm": 0.39122113585472107, + "learning_rate": 4.172031387707234e-05, + "loss": 0.3606, + "num_input_tokens_seen": 7268288, + "step": 7215 + }, + { + "epoch": 3.404054691183404, + "grad_norm": 0.34737977385520935, + "learning_rate": 4.1705014346988436e-05, + "loss": 0.3543, + "num_input_tokens_seen": 7272128, + "step": 7220 + }, + { + "epoch": 3.4064120697784066, + "grad_norm": 0.442315012216568, + "learning_rate": 4.168970350540384e-05, + "loss": 0.3365, + "num_input_tokens_seen": 7276448, + "step": 7225 + }, + { + "epoch": 3.408769448373409, + "grad_norm": 0.5443242192268372, + "learning_rate": 4.167438136268601e-05, + "loss": 0.3171, + "num_input_tokens_seen": 7280896, + "step": 7230 + }, + { + "epoch": 3.411126826968411, + "grad_norm": 0.3454189598560333, + "learning_rate": 4.165904792921006e-05, + "loss": 0.328, + "num_input_tokens_seen": 7286080, + "step": 7235 + }, + { + "epoch": 3.4134842055634134, + "grad_norm": 0.3422050476074219, + "learning_rate": 4.164370321535875e-05, + "loss": 0.3334, + "num_input_tokens_seen": 7291232, + "step": 7240 + }, + { + "epoch": 3.4158415841584158, + "grad_norm": 0.3912779986858368, + "learning_rate": 4.162834723152247e-05, + "loss": 0.3102, + "num_input_tokens_seen": 7297184, + "step": 7245 + }, + { + "epoch": 3.418198962753418, + "grad_norm": 0.3484586775302887, + "learning_rate": 4.1612979988099256e-05, + "loss": 0.3845, + "num_input_tokens_seen": 7303360, + "step": 7250 + }, + { + "epoch": 3.4205563413484206, + "grad_norm": 0.3196077048778534, + "learning_rate": 4.159760149549476e-05, + "loss": 0.2799, + "num_input_tokens_seen": 7308544, + "step": 7255 + }, + { + "epoch": 3.422913719943423, + "grad_norm": 0.3459513783454895, + "learning_rate": 4.1582211764122245e-05, + "loss": 0.3215, + "num_input_tokens_seen": 7313120, + "step": 7260 + }, + { + "epoch": 3.4252710985384254, + "grad_norm": 0.3081681430339813, + "learning_rate": 4.156681080440261e-05, + "loss": 0.2799, + "num_input_tokens_seen": 7318464, + "step": 7265 + }, + { + "epoch": 3.4276284771334278, + "grad_norm": 0.35959020256996155, + "learning_rate": 4.155139862676432e-05, + "loss": 0.3399, + "num_input_tokens_seen": 7323168, + "step": 7270 + }, + { + "epoch": 3.42998585572843, + "grad_norm": 0.27512577176094055, + "learning_rate": 4.1535975241643465e-05, + "loss": 0.3535, + "num_input_tokens_seen": 7328320, + "step": 7275 + }, + { + "epoch": 3.432343234323432, + "grad_norm": 0.4230688214302063, + "learning_rate": 4.1520540659483704e-05, + "loss": 0.3172, + "num_input_tokens_seen": 7334656, + "step": 7280 + }, + { + "epoch": 3.4347006129184345, + "grad_norm": 0.4056749939918518, + "learning_rate": 4.15050948907363e-05, + "loss": 0.3115, + "num_input_tokens_seen": 7339616, + "step": 7285 + }, + { + "epoch": 3.437057991513437, + "grad_norm": 0.34734827280044556, + "learning_rate": 4.148963794586006e-05, + "loss": 0.3039, + "num_input_tokens_seen": 7344256, + "step": 7290 + }, + { + "epoch": 3.4394153701084393, + "grad_norm": 0.41231539845466614, + "learning_rate": 4.1474169835321394e-05, + "loss": 0.3034, + "num_input_tokens_seen": 7348320, + "step": 7295 + }, + { + "epoch": 3.4417727487034417, + "grad_norm": 0.41511043906211853, + "learning_rate": 4.145869056959426e-05, + "loss": 0.3494, + "num_input_tokens_seen": 7352640, + "step": 7300 + }, + { + "epoch": 3.444130127298444, + "grad_norm": 0.38274136185646057, + "learning_rate": 4.1443200159160144e-05, + "loss": 0.3021, + "num_input_tokens_seen": 7357376, + "step": 7305 + }, + { + "epoch": 3.4464875058934465, + "grad_norm": 0.34482643008232117, + "learning_rate": 4.142769861450811e-05, + "loss": 0.3379, + "num_input_tokens_seen": 7361600, + "step": 7310 + }, + { + "epoch": 3.448844884488449, + "grad_norm": 0.46022915840148926, + "learning_rate": 4.1412185946134754e-05, + "loss": 0.3006, + "num_input_tokens_seen": 7366592, + "step": 7315 + }, + { + "epoch": 3.4512022630834513, + "grad_norm": 0.4138903021812439, + "learning_rate": 4.13966621645442e-05, + "loss": 0.3597, + "num_input_tokens_seen": 7371520, + "step": 7320 + }, + { + "epoch": 3.4535596416784538, + "grad_norm": 0.23350323736667633, + "learning_rate": 4.138112728024809e-05, + "loss": 0.3938, + "num_input_tokens_seen": 7376896, + "step": 7325 + }, + { + "epoch": 3.455917020273456, + "grad_norm": 0.4832562804222107, + "learning_rate": 4.13655813037656e-05, + "loss": 0.3203, + "num_input_tokens_seen": 7381472, + "step": 7330 + }, + { + "epoch": 3.458274398868458, + "grad_norm": 0.5827515721321106, + "learning_rate": 4.13500242456234e-05, + "loss": 0.251, + "num_input_tokens_seen": 7385728, + "step": 7335 + }, + { + "epoch": 3.4606317774634605, + "grad_norm": 0.2791096568107605, + "learning_rate": 4.133445611635569e-05, + "loss": 0.3027, + "num_input_tokens_seen": 7390432, + "step": 7340 + }, + { + "epoch": 3.462989156058463, + "grad_norm": 0.4526396691799164, + "learning_rate": 4.1318876926504136e-05, + "loss": 0.3525, + "num_input_tokens_seen": 7396128, + "step": 7345 + }, + { + "epoch": 3.4653465346534653, + "grad_norm": 0.44763338565826416, + "learning_rate": 4.1303286686617914e-05, + "loss": 0.424, + "num_input_tokens_seen": 7400960, + "step": 7350 + }, + { + "epoch": 3.4677039132484677, + "grad_norm": 0.4249245226383209, + "learning_rate": 4.128768540725367e-05, + "loss": 0.3688, + "num_input_tokens_seen": 7405888, + "step": 7355 + }, + { + "epoch": 3.47006129184347, + "grad_norm": 0.5481520891189575, + "learning_rate": 4.127207309897553e-05, + "loss": 0.3286, + "num_input_tokens_seen": 7411072, + "step": 7360 + }, + { + "epoch": 3.4724186704384725, + "grad_norm": 0.26251220703125, + "learning_rate": 4.125644977235509e-05, + "loss": 0.2952, + "num_input_tokens_seen": 7417120, + "step": 7365 + }, + { + "epoch": 3.474776049033475, + "grad_norm": 0.36725980043411255, + "learning_rate": 4.1240815437971404e-05, + "loss": 0.2812, + "num_input_tokens_seen": 7422592, + "step": 7370 + }, + { + "epoch": 3.4771334276284773, + "grad_norm": 0.5320187211036682, + "learning_rate": 4.1225170106410985e-05, + "loss": 0.3708, + "num_input_tokens_seen": 7426848, + "step": 7375 + }, + { + "epoch": 3.4794908062234793, + "grad_norm": 0.2510869801044464, + "learning_rate": 4.120951378826778e-05, + "loss": 0.3425, + "num_input_tokens_seen": 7431712, + "step": 7380 + }, + { + "epoch": 3.4818481848184817, + "grad_norm": 0.7908915877342224, + "learning_rate": 4.1193846494143197e-05, + "loss": 0.402, + "num_input_tokens_seen": 7437088, + "step": 7385 + }, + { + "epoch": 3.484205563413484, + "grad_norm": 0.32001951336860657, + "learning_rate": 4.117816823464605e-05, + "loss": 0.3288, + "num_input_tokens_seen": 7442240, + "step": 7390 + }, + { + "epoch": 3.4865629420084865, + "grad_norm": 0.5326070189476013, + "learning_rate": 4.1162479020392596e-05, + "loss": 0.3278, + "num_input_tokens_seen": 7446944, + "step": 7395 + }, + { + "epoch": 3.488920320603489, + "grad_norm": 0.3698699474334717, + "learning_rate": 4.114677886200651e-05, + "loss": 0.3588, + "num_input_tokens_seen": 7451328, + "step": 7400 + }, + { + "epoch": 3.4912776991984913, + "grad_norm": 0.3573928475379944, + "learning_rate": 4.113106777011886e-05, + "loss": 0.3463, + "num_input_tokens_seen": 7456000, + "step": 7405 + }, + { + "epoch": 3.4936350777934937, + "grad_norm": 0.38411208987236023, + "learning_rate": 4.111534575536815e-05, + "loss": 0.318, + "num_input_tokens_seen": 7460384, + "step": 7410 + }, + { + "epoch": 3.495992456388496, + "grad_norm": 0.30085501074790955, + "learning_rate": 4.109961282840024e-05, + "loss": 0.3443, + "num_input_tokens_seen": 7465920, + "step": 7415 + }, + { + "epoch": 3.4983498349834985, + "grad_norm": 0.6141192317008972, + "learning_rate": 4.1083868999868415e-05, + "loss": 0.4146, + "num_input_tokens_seen": 7470752, + "step": 7420 + }, + { + "epoch": 3.500707213578501, + "grad_norm": 0.43399783968925476, + "learning_rate": 4.106811428043331e-05, + "loss": 0.3181, + "num_input_tokens_seen": 7475168, + "step": 7425 + }, + { + "epoch": 3.5016501650165015, + "eval_loss": 0.3337898552417755, + "eval_runtime": 25.7057, + "eval_samples_per_second": 36.684, + "eval_steps_per_second": 9.181, + "num_input_tokens_seen": 7476544, + "step": 7427 + }, + { + "epoch": 3.5030645921735033, + "grad_norm": 0.34314587712287903, + "learning_rate": 4.1052348680762966e-05, + "loss": 0.3042, + "num_input_tokens_seen": 7479648, + "step": 7430 + }, + { + "epoch": 3.5054219707685053, + "grad_norm": 0.38773131370544434, + "learning_rate": 4.1036572211532766e-05, + "loss": 0.2632, + "num_input_tokens_seen": 7483872, + "step": 7435 + }, + { + "epoch": 3.5077793493635077, + "grad_norm": 0.2727902829647064, + "learning_rate": 4.1020784883425455e-05, + "loss": 0.2842, + "num_input_tokens_seen": 7489152, + "step": 7440 + }, + { + "epoch": 3.51013672795851, + "grad_norm": 0.6295969486236572, + "learning_rate": 4.1004986707131164e-05, + "loss": 0.2849, + "num_input_tokens_seen": 7493760, + "step": 7445 + }, + { + "epoch": 3.5124941065535125, + "grad_norm": 0.45467373728752136, + "learning_rate": 4.0989177693347324e-05, + "loss": 0.3554, + "num_input_tokens_seen": 7497952, + "step": 7450 + }, + { + "epoch": 3.514851485148515, + "grad_norm": 0.406804621219635, + "learning_rate": 4.0973357852778736e-05, + "loss": 0.3246, + "num_input_tokens_seen": 7503872, + "step": 7455 + }, + { + "epoch": 3.5172088637435173, + "grad_norm": 0.4299904406070709, + "learning_rate": 4.0957527196137515e-05, + "loss": 0.3069, + "num_input_tokens_seen": 7509152, + "step": 7460 + }, + { + "epoch": 3.5195662423385197, + "grad_norm": 1.0067267417907715, + "learning_rate": 4.094168573414311e-05, + "loss": 0.3992, + "num_input_tokens_seen": 7513952, + "step": 7465 + }, + { + "epoch": 3.521923620933522, + "grad_norm": 0.3407234847545624, + "learning_rate": 4.092583347752228e-05, + "loss": 0.3826, + "num_input_tokens_seen": 7518720, + "step": 7470 + }, + { + "epoch": 3.524280999528524, + "grad_norm": 0.38068684935569763, + "learning_rate": 4.0909970437009096e-05, + "loss": 0.3723, + "num_input_tokens_seen": 7523648, + "step": 7475 + }, + { + "epoch": 3.5266383781235264, + "grad_norm": 0.5647323727607727, + "learning_rate": 4.089409662334492e-05, + "loss": 0.3581, + "num_input_tokens_seen": 7528256, + "step": 7480 + }, + { + "epoch": 3.528995756718529, + "grad_norm": 0.4298003315925598, + "learning_rate": 4.087821204727844e-05, + "loss": 0.342, + "num_input_tokens_seen": 7533248, + "step": 7485 + }, + { + "epoch": 3.5313531353135312, + "grad_norm": 0.4847729802131653, + "learning_rate": 4.086231671956561e-05, + "loss": 0.3096, + "num_input_tokens_seen": 7538272, + "step": 7490 + }, + { + "epoch": 3.5337105139085336, + "grad_norm": 0.4504232108592987, + "learning_rate": 4.084641065096966e-05, + "loss": 0.3577, + "num_input_tokens_seen": 7543872, + "step": 7495 + }, + { + "epoch": 3.536067892503536, + "grad_norm": 0.32481640577316284, + "learning_rate": 4.083049385226109e-05, + "loss": 0.348, + "num_input_tokens_seen": 7548576, + "step": 7500 + }, + { + "epoch": 3.5384252710985384, + "grad_norm": 0.34437650442123413, + "learning_rate": 4.081456633421768e-05, + "loss": 0.3283, + "num_input_tokens_seen": 7554336, + "step": 7505 + }, + { + "epoch": 3.540782649693541, + "grad_norm": 0.3357120454311371, + "learning_rate": 4.079862810762447e-05, + "loss": 0.3314, + "num_input_tokens_seen": 7558880, + "step": 7510 + }, + { + "epoch": 3.5431400282885432, + "grad_norm": 0.3733872175216675, + "learning_rate": 4.078267918327374e-05, + "loss": 0.3121, + "num_input_tokens_seen": 7563264, + "step": 7515 + }, + { + "epoch": 3.5454974068835456, + "grad_norm": 0.4892962872982025, + "learning_rate": 4.0766719571965024e-05, + "loss": 0.2978, + "num_input_tokens_seen": 7568064, + "step": 7520 + }, + { + "epoch": 3.547854785478548, + "grad_norm": 0.8500136137008667, + "learning_rate": 4.0750749284505075e-05, + "loss": 0.4, + "num_input_tokens_seen": 7574432, + "step": 7525 + }, + { + "epoch": 3.5502121640735504, + "grad_norm": 0.2844037413597107, + "learning_rate": 4.073476833170789e-05, + "loss": 0.341, + "num_input_tokens_seen": 7579328, + "step": 7530 + }, + { + "epoch": 3.5525695426685524, + "grad_norm": 0.3046455979347229, + "learning_rate": 4.0718776724394694e-05, + "loss": 0.3496, + "num_input_tokens_seen": 7584608, + "step": 7535 + }, + { + "epoch": 3.554926921263555, + "grad_norm": 0.6133196353912354, + "learning_rate": 4.07027744733939e-05, + "loss": 0.3704, + "num_input_tokens_seen": 7589600, + "step": 7540 + }, + { + "epoch": 3.557284299858557, + "grad_norm": 0.4132906496524811, + "learning_rate": 4.068676158954116e-05, + "loss": 0.3208, + "num_input_tokens_seen": 7594624, + "step": 7545 + }, + { + "epoch": 3.5596416784535596, + "grad_norm": 0.29842138290405273, + "learning_rate": 4.0670738083679294e-05, + "loss": 0.4167, + "num_input_tokens_seen": 7599360, + "step": 7550 + }, + { + "epoch": 3.561999057048562, + "grad_norm": 0.5938999056816101, + "learning_rate": 4.065470396665836e-05, + "loss": 0.3185, + "num_input_tokens_seen": 7603872, + "step": 7555 + }, + { + "epoch": 3.5643564356435644, + "grad_norm": 0.5742672681808472, + "learning_rate": 4.063865924933554e-05, + "loss": 0.2951, + "num_input_tokens_seen": 7608160, + "step": 7560 + }, + { + "epoch": 3.566713814238567, + "grad_norm": 0.29801857471466064, + "learning_rate": 4.0622603942575256e-05, + "loss": 0.3469, + "num_input_tokens_seen": 7614336, + "step": 7565 + }, + { + "epoch": 3.569071192833569, + "grad_norm": 0.35748597979545593, + "learning_rate": 4.060653805724904e-05, + "loss": 0.3469, + "num_input_tokens_seen": 7618592, + "step": 7570 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.34315744042396545, + "learning_rate": 4.059046160423565e-05, + "loss": 0.3782, + "num_input_tokens_seen": 7623296, + "step": 7575 + }, + { + "epoch": 3.5737859500235736, + "grad_norm": 0.5821058750152588, + "learning_rate": 4.057437459442095e-05, + "loss": 0.3249, + "num_input_tokens_seen": 7629152, + "step": 7580 + }, + { + "epoch": 3.576143328618576, + "grad_norm": 0.6026676297187805, + "learning_rate": 4.0558277038697965e-05, + "loss": 0.2802, + "num_input_tokens_seen": 7633312, + "step": 7585 + }, + { + "epoch": 3.5785007072135784, + "grad_norm": 0.41986292600631714, + "learning_rate": 4.054216894796688e-05, + "loss": 0.3842, + "num_input_tokens_seen": 7637376, + "step": 7590 + }, + { + "epoch": 3.580858085808581, + "grad_norm": 0.270597368478775, + "learning_rate": 4.052605033313499e-05, + "loss": 0.3242, + "num_input_tokens_seen": 7642080, + "step": 7595 + }, + { + "epoch": 3.583215464403583, + "grad_norm": 0.4310343265533447, + "learning_rate": 4.050992120511673e-05, + "loss": 0.376, + "num_input_tokens_seen": 7646752, + "step": 7600 + }, + { + "epoch": 3.5855728429985856, + "grad_norm": 0.37201228737831116, + "learning_rate": 4.049378157483365e-05, + "loss": 0.3522, + "num_input_tokens_seen": 7651200, + "step": 7605 + }, + { + "epoch": 3.587930221593588, + "grad_norm": 0.32910874485969543, + "learning_rate": 4.04776314532144e-05, + "loss": 0.3158, + "num_input_tokens_seen": 7655456, + "step": 7610 + }, + { + "epoch": 3.5902876001885904, + "grad_norm": 0.41995853185653687, + "learning_rate": 4.046147085119476e-05, + "loss": 0.3285, + "num_input_tokens_seen": 7659360, + "step": 7615 + }, + { + "epoch": 3.592644978783593, + "grad_norm": 0.46149513125419617, + "learning_rate": 4.044529977971758e-05, + "loss": 0.3842, + "num_input_tokens_seen": 7663904, + "step": 7620 + }, + { + "epoch": 3.595002357378595, + "grad_norm": 0.5851768255233765, + "learning_rate": 4.0429118249732836e-05, + "loss": 0.3624, + "num_input_tokens_seen": 7668352, + "step": 7625 + }, + { + "epoch": 3.5973597359735976, + "grad_norm": 0.5964788794517517, + "learning_rate": 4.041292627219752e-05, + "loss": 0.2902, + "num_input_tokens_seen": 7673952, + "step": 7630 + }, + { + "epoch": 3.5997171145685996, + "grad_norm": 0.35485413670539856, + "learning_rate": 4.039672385807577e-05, + "loss": 0.3332, + "num_input_tokens_seen": 7679648, + "step": 7635 + }, + { + "epoch": 3.602074493163602, + "grad_norm": 0.36772477626800537, + "learning_rate": 4.038051101833875e-05, + "loss": 0.345, + "num_input_tokens_seen": 7684192, + "step": 7640 + }, + { + "epoch": 3.6044318717586044, + "grad_norm": 0.32999181747436523, + "learning_rate": 4.03642877639647e-05, + "loss": 0.3417, + "num_input_tokens_seen": 7689376, + "step": 7645 + }, + { + "epoch": 3.6067892503536068, + "grad_norm": 0.33961400389671326, + "learning_rate": 4.0348054105938894e-05, + "loss": 0.3376, + "num_input_tokens_seen": 7694336, + "step": 7650 + }, + { + "epoch": 3.609146628948609, + "grad_norm": 0.3032572865486145, + "learning_rate": 4.033181005525366e-05, + "loss": 0.34, + "num_input_tokens_seen": 7699648, + "step": 7655 + }, + { + "epoch": 3.6115040075436116, + "grad_norm": 0.48672226071357727, + "learning_rate": 4.031555562290838e-05, + "loss": 0.3396, + "num_input_tokens_seen": 7705184, + "step": 7660 + }, + { + "epoch": 3.613861386138614, + "grad_norm": 0.5833381414413452, + "learning_rate": 4.029929081990945e-05, + "loss": 0.3822, + "num_input_tokens_seen": 7712064, + "step": 7665 + }, + { + "epoch": 3.6162187647336164, + "grad_norm": 0.6606496572494507, + "learning_rate": 4.0283015657270274e-05, + "loss": 0.36, + "num_input_tokens_seen": 7717216, + "step": 7670 + }, + { + "epoch": 3.6185761433286183, + "grad_norm": 0.4374718964099884, + "learning_rate": 4.02667301460113e-05, + "loss": 0.3376, + "num_input_tokens_seen": 7722848, + "step": 7675 + }, + { + "epoch": 3.6209335219236207, + "grad_norm": 0.31789925694465637, + "learning_rate": 4.0250434297159964e-05, + "loss": 0.3086, + "num_input_tokens_seen": 7727872, + "step": 7680 + }, + { + "epoch": 3.623290900518623, + "grad_norm": 0.23457783460617065, + "learning_rate": 4.023412812175071e-05, + "loss": 0.2848, + "num_input_tokens_seen": 7733056, + "step": 7685 + }, + { + "epoch": 3.6256482791136255, + "grad_norm": 0.24783359467983246, + "learning_rate": 4.0217811630824985e-05, + "loss": 0.2994, + "num_input_tokens_seen": 7737792, + "step": 7690 + }, + { + "epoch": 3.628005657708628, + "grad_norm": 0.3667454719543457, + "learning_rate": 4.020148483543118e-05, + "loss": 0.3002, + "num_input_tokens_seen": 7742016, + "step": 7695 + }, + { + "epoch": 3.6303630363036303, + "grad_norm": 0.2731368839740753, + "learning_rate": 4.0185147746624704e-05, + "loss": 0.3534, + "num_input_tokens_seen": 7749632, + "step": 7700 + }, + { + "epoch": 3.6327204148986327, + "grad_norm": 0.3570387065410614, + "learning_rate": 4.016880037546794e-05, + "loss": 0.2503, + "num_input_tokens_seen": 7754656, + "step": 7705 + }, + { + "epoch": 3.635077793493635, + "grad_norm": 0.4274637997150421, + "learning_rate": 4.0152442733030194e-05, + "loss": 0.2873, + "num_input_tokens_seen": 7759424, + "step": 7710 + }, + { + "epoch": 3.6374351720886375, + "grad_norm": 0.4985097348690033, + "learning_rate": 4.013607483038776e-05, + "loss": 0.3557, + "num_input_tokens_seen": 7764672, + "step": 7715 + }, + { + "epoch": 3.63979255068364, + "grad_norm": 0.3069036602973938, + "learning_rate": 4.011969667862386e-05, + "loss": 0.2654, + "num_input_tokens_seen": 7770880, + "step": 7720 + }, + { + "epoch": 3.6421499292786423, + "grad_norm": 0.3092837333679199, + "learning_rate": 4.010330828882868e-05, + "loss": 0.3242, + "num_input_tokens_seen": 7775744, + "step": 7725 + }, + { + "epoch": 3.6445073078736447, + "grad_norm": 0.28503021597862244, + "learning_rate": 4.008690967209932e-05, + "loss": 0.3886, + "num_input_tokens_seen": 7780320, + "step": 7730 + }, + { + "epoch": 3.6468646864686467, + "grad_norm": 0.6116894483566284, + "learning_rate": 4.0070500839539804e-05, + "loss": 0.389, + "num_input_tokens_seen": 7784768, + "step": 7735 + }, + { + "epoch": 3.649222065063649, + "grad_norm": 0.3477393686771393, + "learning_rate": 4.005408180226109e-05, + "loss": 0.4148, + "num_input_tokens_seen": 7790464, + "step": 7740 + }, + { + "epoch": 3.6515794436586515, + "grad_norm": 0.2875749468803406, + "learning_rate": 4.003765257138102e-05, + "loss": 0.3415, + "num_input_tokens_seen": 7795584, + "step": 7745 + }, + { + "epoch": 3.653936822253654, + "grad_norm": 0.8324252367019653, + "learning_rate": 4.0021213158024355e-05, + "loss": 0.3321, + "num_input_tokens_seen": 7800864, + "step": 7750 + }, + { + "epoch": 3.6562942008486563, + "grad_norm": 0.6089072227478027, + "learning_rate": 4.0004763573322754e-05, + "loss": 0.3108, + "num_input_tokens_seen": 7804896, + "step": 7755 + }, + { + "epoch": 3.6586515794436587, + "grad_norm": 0.62898850440979, + "learning_rate": 3.998830382841476e-05, + "loss": 0.3697, + "num_input_tokens_seen": 7809376, + "step": 7760 + }, + { + "epoch": 3.661008958038661, + "grad_norm": 0.26908648014068604, + "learning_rate": 3.997183393444579e-05, + "loss": 0.3357, + "num_input_tokens_seen": 7813920, + "step": 7765 + }, + { + "epoch": 3.6633663366336635, + "grad_norm": 0.38090693950653076, + "learning_rate": 3.995535390256814e-05, + "loss": 0.2941, + "num_input_tokens_seen": 7818112, + "step": 7770 + }, + { + "epoch": 3.6657237152286655, + "grad_norm": 0.3942584693431854, + "learning_rate": 3.993886374394096e-05, + "loss": 0.3649, + "num_input_tokens_seen": 7823104, + "step": 7775 + }, + { + "epoch": 3.668081093823668, + "grad_norm": 0.40635213255882263, + "learning_rate": 3.9922363469730274e-05, + "loss": 0.3153, + "num_input_tokens_seen": 7828256, + "step": 7780 + }, + { + "epoch": 3.6704384724186703, + "grad_norm": 0.44275325536727905, + "learning_rate": 3.9905853091108936e-05, + "loss": 0.3864, + "num_input_tokens_seen": 7833600, + "step": 7785 + }, + { + "epoch": 3.6727958510136727, + "grad_norm": 0.32199588418006897, + "learning_rate": 3.988933261925667e-05, + "loss": 0.3169, + "num_input_tokens_seen": 7838208, + "step": 7790 + }, + { + "epoch": 3.675153229608675, + "grad_norm": 0.4776056110858917, + "learning_rate": 3.9872802065360004e-05, + "loss": 0.3095, + "num_input_tokens_seen": 7843296, + "step": 7795 + }, + { + "epoch": 3.6775106082036775, + "grad_norm": 0.31870514154434204, + "learning_rate": 3.98562614406123e-05, + "loss": 0.2924, + "num_input_tokens_seen": 7849024, + "step": 7800 + }, + { + "epoch": 3.67986798679868, + "grad_norm": 0.3101949691772461, + "learning_rate": 3.983971075621377e-05, + "loss": 0.3246, + "num_input_tokens_seen": 7853792, + "step": 7805 + }, + { + "epoch": 3.6822253653936823, + "grad_norm": 0.25170284509658813, + "learning_rate": 3.982315002337138e-05, + "loss": 0.3254, + "num_input_tokens_seen": 7857824, + "step": 7810 + }, + { + "epoch": 3.6845827439886847, + "grad_norm": 0.37578195333480835, + "learning_rate": 3.980657925329896e-05, + "loss": 0.3348, + "num_input_tokens_seen": 7863008, + "step": 7815 + }, + { + "epoch": 3.686940122583687, + "grad_norm": 0.5825908184051514, + "learning_rate": 3.97899984572171e-05, + "loss": 0.4061, + "num_input_tokens_seen": 7867968, + "step": 7820 + }, + { + "epoch": 3.6892975011786895, + "grad_norm": 0.43807128071784973, + "learning_rate": 3.977340764635319e-05, + "loss": 0.327, + "num_input_tokens_seen": 7872928, + "step": 7825 + }, + { + "epoch": 3.691654879773692, + "grad_norm": 0.3425138294696808, + "learning_rate": 3.975680683194138e-05, + "loss": 0.2949, + "num_input_tokens_seen": 7878304, + "step": 7830 + }, + { + "epoch": 3.694012258368694, + "grad_norm": 0.28184938430786133, + "learning_rate": 3.9740196025222654e-05, + "loss": 0.2929, + "num_input_tokens_seen": 7883168, + "step": 7835 + }, + { + "epoch": 3.6963696369636962, + "grad_norm": 0.7152112126350403, + "learning_rate": 3.9723575237444686e-05, + "loss": 0.3431, + "num_input_tokens_seen": 7887456, + "step": 7840 + }, + { + "epoch": 3.6987270155586986, + "grad_norm": 0.3027220666408539, + "learning_rate": 3.9706944479861964e-05, + "loss": 0.3166, + "num_input_tokens_seen": 7892000, + "step": 7845 + }, + { + "epoch": 3.701084394153701, + "grad_norm": 0.46027541160583496, + "learning_rate": 3.9690303763735703e-05, + "loss": 0.3351, + "num_input_tokens_seen": 7896544, + "step": 7850 + }, + { + "epoch": 3.7034417727487035, + "grad_norm": 0.2832137942314148, + "learning_rate": 3.967365310033385e-05, + "loss": 0.3651, + "num_input_tokens_seen": 7901536, + "step": 7855 + }, + { + "epoch": 3.705799151343706, + "grad_norm": 0.2617887854576111, + "learning_rate": 3.965699250093113e-05, + "loss": 0.2984, + "num_input_tokens_seen": 7906592, + "step": 7860 + }, + { + "epoch": 3.7081565299387083, + "grad_norm": 0.574702262878418, + "learning_rate": 3.964032197680895e-05, + "loss": 0.3868, + "num_input_tokens_seen": 7911776, + "step": 7865 + }, + { + "epoch": 3.7105139085337107, + "grad_norm": 0.27855563163757324, + "learning_rate": 3.9623641539255466e-05, + "loss": 0.3396, + "num_input_tokens_seen": 7917024, + "step": 7870 + }, + { + "epoch": 3.7128712871287126, + "grad_norm": 0.3394208252429962, + "learning_rate": 3.9606951199565524e-05, + "loss": 0.3284, + "num_input_tokens_seen": 7922592, + "step": 7875 + }, + { + "epoch": 3.715228665723715, + "grad_norm": 0.3490373194217682, + "learning_rate": 3.9590250969040694e-05, + "loss": 0.3398, + "num_input_tokens_seen": 7927776, + "step": 7880 + }, + { + "epoch": 3.7175860443187174, + "grad_norm": 0.474835604429245, + "learning_rate": 3.9573540858989243e-05, + "loss": 0.3718, + "num_input_tokens_seen": 7933184, + "step": 7885 + }, + { + "epoch": 3.71994342291372, + "grad_norm": 0.7927852272987366, + "learning_rate": 3.955682088072612e-05, + "loss": 0.3715, + "num_input_tokens_seen": 7937152, + "step": 7890 + }, + { + "epoch": 3.7223008015087222, + "grad_norm": 0.6793241500854492, + "learning_rate": 3.9540091045572954e-05, + "loss": 0.3508, + "num_input_tokens_seen": 7942464, + "step": 7895 + }, + { + "epoch": 3.7246581801037246, + "grad_norm": 0.6564528942108154, + "learning_rate": 3.9523351364858054e-05, + "loss": 0.3651, + "num_input_tokens_seen": 7946848, + "step": 7900 + }, + { + "epoch": 3.727015558698727, + "grad_norm": 0.30711662769317627, + "learning_rate": 3.9506601849916405e-05, + "loss": 0.3474, + "num_input_tokens_seen": 7951872, + "step": 7905 + }, + { + "epoch": 3.7293729372937294, + "grad_norm": 0.8347296118736267, + "learning_rate": 3.9489842512089615e-05, + "loss": 0.3731, + "num_input_tokens_seen": 7957120, + "step": 7910 + }, + { + "epoch": 3.731730315888732, + "grad_norm": 0.3027005195617676, + "learning_rate": 3.9473073362725996e-05, + "loss": 0.3683, + "num_input_tokens_seen": 7963616, + "step": 7915 + }, + { + "epoch": 3.7340876944837342, + "grad_norm": 0.4129179120063782, + "learning_rate": 3.945629441318047e-05, + "loss": 0.3632, + "num_input_tokens_seen": 7969248, + "step": 7920 + }, + { + "epoch": 3.7364450730787366, + "grad_norm": 0.6572999954223633, + "learning_rate": 3.943950567481459e-05, + "loss": 0.3039, + "num_input_tokens_seen": 7974656, + "step": 7925 + }, + { + "epoch": 3.738802451673739, + "grad_norm": 0.47175171971321106, + "learning_rate": 3.942270715899655e-05, + "loss": 0.309, + "num_input_tokens_seen": 7981184, + "step": 7930 + }, + { + "epoch": 3.741159830268741, + "grad_norm": 0.5228359699249268, + "learning_rate": 3.940589887710118e-05, + "loss": 0.3189, + "num_input_tokens_seen": 7985856, + "step": 7935 + }, + { + "epoch": 3.7435172088637434, + "grad_norm": 0.2554144561290741, + "learning_rate": 3.9389080840509896e-05, + "loss": 0.3496, + "num_input_tokens_seen": 7993088, + "step": 7940 + }, + { + "epoch": 3.745874587458746, + "grad_norm": 0.45008203387260437, + "learning_rate": 3.937225306061073e-05, + "loss": 0.3037, + "num_input_tokens_seen": 7997760, + "step": 7945 + }, + { + "epoch": 3.748231966053748, + "grad_norm": 0.40278905630111694, + "learning_rate": 3.9355415548798305e-05, + "loss": 0.3592, + "num_input_tokens_seen": 8003648, + "step": 7950 + }, + { + "epoch": 3.7505893446487506, + "grad_norm": 0.48782920837402344, + "learning_rate": 3.933856831647385e-05, + "loss": 0.3285, + "num_input_tokens_seen": 8009248, + "step": 7955 + }, + { + "epoch": 3.752946723243753, + "grad_norm": 0.37180396914482117, + "learning_rate": 3.932171137504516e-05, + "loss": 0.3082, + "num_input_tokens_seen": 8014464, + "step": 7960 + }, + { + "epoch": 3.7553041018387554, + "grad_norm": 0.3682900071144104, + "learning_rate": 3.9304844735926616e-05, + "loss": 0.3064, + "num_input_tokens_seen": 8018560, + "step": 7965 + }, + { + "epoch": 3.757661480433758, + "grad_norm": 0.29135486483573914, + "learning_rate": 3.928796841053916e-05, + "loss": 0.3193, + "num_input_tokens_seen": 8023712, + "step": 7970 + }, + { + "epoch": 3.7600188590287598, + "grad_norm": 0.2507135272026062, + "learning_rate": 3.927108241031028e-05, + "loss": 0.2871, + "num_input_tokens_seen": 8028736, + "step": 7975 + }, + { + "epoch": 3.762376237623762, + "grad_norm": 0.30187729001045227, + "learning_rate": 3.925418674667405e-05, + "loss": 0.2458, + "num_input_tokens_seen": 8033632, + "step": 7980 + }, + { + "epoch": 3.7647336162187646, + "grad_norm": 0.2713507115840912, + "learning_rate": 3.923728143107104e-05, + "loss": 0.3467, + "num_input_tokens_seen": 8037376, + "step": 7985 + }, + { + "epoch": 3.767090994813767, + "grad_norm": 0.4225744903087616, + "learning_rate": 3.92203664749484e-05, + "loss": 0.3143, + "num_input_tokens_seen": 8041952, + "step": 7990 + }, + { + "epoch": 3.7694483734087694, + "grad_norm": 0.40959686040878296, + "learning_rate": 3.920344188975978e-05, + "loss": 0.3748, + "num_input_tokens_seen": 8046752, + "step": 7995 + }, + { + "epoch": 3.7718057520037718, + "grad_norm": 0.3203642964363098, + "learning_rate": 3.9186507686965364e-05, + "loss": 0.369, + "num_input_tokens_seen": 8051136, + "step": 8000 + }, + { + "epoch": 3.774163130598774, + "grad_norm": 0.4387960433959961, + "learning_rate": 3.9169563878031836e-05, + "loss": 0.3636, + "num_input_tokens_seen": 8057280, + "step": 8005 + }, + { + "epoch": 3.7765205091937766, + "grad_norm": 0.3671746253967285, + "learning_rate": 3.9152610474432406e-05, + "loss": 0.2879, + "num_input_tokens_seen": 8062688, + "step": 8010 + }, + { + "epoch": 3.778877887788779, + "grad_norm": 0.36160576343536377, + "learning_rate": 3.9135647487646755e-05, + "loss": 0.3228, + "num_input_tokens_seen": 8067584, + "step": 8015 + }, + { + "epoch": 3.7812352663837814, + "grad_norm": 0.42800724506378174, + "learning_rate": 3.911867492916107e-05, + "loss": 0.2799, + "num_input_tokens_seen": 8073056, + "step": 8020 + }, + { + "epoch": 3.783592644978784, + "grad_norm": 0.6113935708999634, + "learning_rate": 3.9101692810468016e-05, + "loss": 0.3712, + "num_input_tokens_seen": 8077504, + "step": 8025 + }, + { + "epoch": 3.785950023573786, + "grad_norm": 0.35703912377357483, + "learning_rate": 3.908470114306673e-05, + "loss": 0.2875, + "num_input_tokens_seen": 8082720, + "step": 8030 + }, + { + "epoch": 3.7883074021687886, + "grad_norm": 0.41651415824890137, + "learning_rate": 3.9067699938462804e-05, + "loss": 0.293, + "num_input_tokens_seen": 8086944, + "step": 8035 + }, + { + "epoch": 3.7906647807637905, + "grad_norm": 0.2658780515193939, + "learning_rate": 3.905068920816831e-05, + "loss": 0.2914, + "num_input_tokens_seen": 8092448, + "step": 8040 + }, + { + "epoch": 3.793022159358793, + "grad_norm": 0.35078880190849304, + "learning_rate": 3.903366896370175e-05, + "loss": 0.397, + "num_input_tokens_seen": 8097344, + "step": 8045 + }, + { + "epoch": 3.7953795379537953, + "grad_norm": 0.5351168513298035, + "learning_rate": 3.901663921658809e-05, + "loss": 0.2591, + "num_input_tokens_seen": 8102016, + "step": 8050 + }, + { + "epoch": 3.7977369165487977, + "grad_norm": 0.31240180134773254, + "learning_rate": 3.89995999783587e-05, + "loss": 0.3074, + "num_input_tokens_seen": 8107456, + "step": 8055 + }, + { + "epoch": 3.8000942951438, + "grad_norm": 0.43206551671028137, + "learning_rate": 3.8982551260551415e-05, + "loss": 0.3712, + "num_input_tokens_seen": 8112576, + "step": 8060 + }, + { + "epoch": 3.8024516737388026, + "grad_norm": 0.318299263715744, + "learning_rate": 3.896549307471046e-05, + "loss": 0.3163, + "num_input_tokens_seen": 8116928, + "step": 8065 + }, + { + "epoch": 3.804809052333805, + "grad_norm": 0.31804320216178894, + "learning_rate": 3.894842543238647e-05, + "loss": 0.3021, + "num_input_tokens_seen": 8121376, + "step": 8070 + }, + { + "epoch": 3.807166430928807, + "grad_norm": 0.6161442399024963, + "learning_rate": 3.89313483451365e-05, + "loss": 0.3651, + "num_input_tokens_seen": 8125952, + "step": 8075 + }, + { + "epoch": 3.8095238095238093, + "grad_norm": 0.3411281108856201, + "learning_rate": 3.8914261824524e-05, + "loss": 0.2714, + "num_input_tokens_seen": 8130688, + "step": 8080 + }, + { + "epoch": 3.8118811881188117, + "grad_norm": 0.3914463222026825, + "learning_rate": 3.889716588211879e-05, + "loss": 0.3436, + "num_input_tokens_seen": 8136032, + "step": 8085 + }, + { + "epoch": 3.814238566713814, + "grad_norm": 0.3909475803375244, + "learning_rate": 3.8880060529497085e-05, + "loss": 0.3717, + "num_input_tokens_seen": 8141024, + "step": 8090 + }, + { + "epoch": 3.8165959453088165, + "grad_norm": 0.308993935585022, + "learning_rate": 3.8862945778241465e-05, + "loss": 0.3055, + "num_input_tokens_seen": 8146176, + "step": 8095 + }, + { + "epoch": 3.818953323903819, + "grad_norm": 0.4064185619354248, + "learning_rate": 3.8845821639940884e-05, + "loss": 0.3267, + "num_input_tokens_seen": 8151232, + "step": 8100 + }, + { + "epoch": 3.8213107024988213, + "grad_norm": 0.2300092577934265, + "learning_rate": 3.882868812619063e-05, + "loss": 0.314, + "num_input_tokens_seen": 8156192, + "step": 8105 + }, + { + "epoch": 3.8236680810938237, + "grad_norm": 0.39955225586891174, + "learning_rate": 3.8811545248592375e-05, + "loss": 0.3227, + "num_input_tokens_seen": 8160640, + "step": 8110 + }, + { + "epoch": 3.826025459688826, + "grad_norm": 0.3457110822200775, + "learning_rate": 3.879439301875409e-05, + "loss": 0.3005, + "num_input_tokens_seen": 8165024, + "step": 8115 + }, + { + "epoch": 3.8283828382838285, + "grad_norm": 0.4725908637046814, + "learning_rate": 3.877723144829012e-05, + "loss": 0.3499, + "num_input_tokens_seen": 8171296, + "step": 8120 + }, + { + "epoch": 3.830740216878831, + "grad_norm": 0.3871127665042877, + "learning_rate": 3.87600605488211e-05, + "loss": 0.3209, + "num_input_tokens_seen": 8176576, + "step": 8125 + }, + { + "epoch": 3.8330975954738333, + "grad_norm": 0.2449956089258194, + "learning_rate": 3.8742880331973994e-05, + "loss": 0.3083, + "num_input_tokens_seen": 8181216, + "step": 8130 + }, + { + "epoch": 3.8354549740688357, + "grad_norm": 0.3613891005516052, + "learning_rate": 3.8725690809382096e-05, + "loss": 0.2772, + "num_input_tokens_seen": 8186016, + "step": 8135 + }, + { + "epoch": 3.8378123526638377, + "grad_norm": 0.24348165094852448, + "learning_rate": 3.870849199268497e-05, + "loss": 0.3994, + "num_input_tokens_seen": 8190944, + "step": 8140 + }, + { + "epoch": 3.84016973125884, + "grad_norm": 0.3899194002151489, + "learning_rate": 3.869128389352848e-05, + "loss": 0.3201, + "num_input_tokens_seen": 8195200, + "step": 8145 + }, + { + "epoch": 3.8425271098538425, + "grad_norm": 0.3198568820953369, + "learning_rate": 3.867406652356479e-05, + "loss": 0.3729, + "num_input_tokens_seen": 8199552, + "step": 8150 + }, + { + "epoch": 3.844884488448845, + "grad_norm": 0.3087399899959564, + "learning_rate": 3.865683989445234e-05, + "loss": 0.3663, + "num_input_tokens_seen": 8204064, + "step": 8155 + }, + { + "epoch": 3.8472418670438473, + "grad_norm": 0.5621525049209595, + "learning_rate": 3.863960401785581e-05, + "loss": 0.3332, + "num_input_tokens_seen": 8208832, + "step": 8160 + }, + { + "epoch": 3.8495992456388497, + "grad_norm": 0.4408474564552307, + "learning_rate": 3.862235890544619e-05, + "loss": 0.3618, + "num_input_tokens_seen": 8213696, + "step": 8165 + }, + { + "epoch": 3.851956624233852, + "grad_norm": 0.4640212059020996, + "learning_rate": 3.8605104568900685e-05, + "loss": 0.3505, + "num_input_tokens_seen": 8218048, + "step": 8170 + }, + { + "epoch": 3.854314002828854, + "grad_norm": 0.32545095682144165, + "learning_rate": 3.858784101990276e-05, + "loss": 0.3185, + "num_input_tokens_seen": 8221856, + "step": 8175 + }, + { + "epoch": 3.8566713814238565, + "grad_norm": 0.6053699851036072, + "learning_rate": 3.857056827014213e-05, + "loss": 0.3206, + "num_input_tokens_seen": 8227584, + "step": 8180 + }, + { + "epoch": 3.859028760018859, + "grad_norm": 0.5172849893569946, + "learning_rate": 3.8553286331314705e-05, + "loss": 0.3441, + "num_input_tokens_seen": 8233280, + "step": 8185 + }, + { + "epoch": 3.8613861386138613, + "grad_norm": 0.384685754776001, + "learning_rate": 3.8535995215122656e-05, + "loss": 0.3527, + "num_input_tokens_seen": 8237568, + "step": 8190 + }, + { + "epoch": 3.8637435172088637, + "grad_norm": 0.34352609515190125, + "learning_rate": 3.851869493327434e-05, + "loss": 0.3189, + "num_input_tokens_seen": 8243200, + "step": 8195 + }, + { + "epoch": 3.866100895803866, + "grad_norm": 0.3110477924346924, + "learning_rate": 3.8501385497484354e-05, + "loss": 0.3286, + "num_input_tokens_seen": 8247520, + "step": 8200 + }, + { + "epoch": 3.8684582743988685, + "grad_norm": 0.4806363880634308, + "learning_rate": 3.848406691947345e-05, + "loss": 0.3738, + "num_input_tokens_seen": 8252192, + "step": 8205 + }, + { + "epoch": 3.870815652993871, + "grad_norm": 0.4991491436958313, + "learning_rate": 3.84667392109686e-05, + "loss": 0.3562, + "num_input_tokens_seen": 8257024, + "step": 8210 + }, + { + "epoch": 3.8731730315888733, + "grad_norm": 0.5074194073677063, + "learning_rate": 3.844940238370295e-05, + "loss": 0.3319, + "num_input_tokens_seen": 8261216, + "step": 8215 + }, + { + "epoch": 3.8755304101838757, + "grad_norm": 0.3802829682826996, + "learning_rate": 3.843205644941582e-05, + "loss": 0.3602, + "num_input_tokens_seen": 8266176, + "step": 8220 + }, + { + "epoch": 3.877887788778878, + "grad_norm": 0.3699110746383667, + "learning_rate": 3.84147014198527e-05, + "loss": 0.3285, + "num_input_tokens_seen": 8271360, + "step": 8225 + }, + { + "epoch": 3.8802451673738805, + "grad_norm": 0.4186588227748871, + "learning_rate": 3.8397337306765254e-05, + "loss": 0.3231, + "num_input_tokens_seen": 8276800, + "step": 8230 + }, + { + "epoch": 3.882602545968883, + "grad_norm": 0.35234659910202026, + "learning_rate": 3.8379964121911246e-05, + "loss": 0.3426, + "num_input_tokens_seen": 8282560, + "step": 8235 + }, + { + "epoch": 3.884959924563885, + "grad_norm": 0.37024471163749695, + "learning_rate": 3.836258187705464e-05, + "loss": 0.3246, + "num_input_tokens_seen": 8288480, + "step": 8240 + }, + { + "epoch": 3.8873173031588872, + "grad_norm": 0.37278226017951965, + "learning_rate": 3.8345190583965506e-05, + "loss": 0.2696, + "num_input_tokens_seen": 8294592, + "step": 8245 + }, + { + "epoch": 3.8896746817538896, + "grad_norm": 0.6180894374847412, + "learning_rate": 3.832779025442004e-05, + "loss": 0.3762, + "num_input_tokens_seen": 8300192, + "step": 8250 + }, + { + "epoch": 3.892032060348892, + "grad_norm": 0.6257302761077881, + "learning_rate": 3.831038090020058e-05, + "loss": 0.3185, + "num_input_tokens_seen": 8306720, + "step": 8255 + }, + { + "epoch": 3.8943894389438944, + "grad_norm": 0.2908874452114105, + "learning_rate": 3.829296253309553e-05, + "loss": 0.3844, + "num_input_tokens_seen": 8312096, + "step": 8260 + }, + { + "epoch": 3.896746817538897, + "grad_norm": 0.5328723788261414, + "learning_rate": 3.8275535164899445e-05, + "loss": 0.3511, + "num_input_tokens_seen": 8320704, + "step": 8265 + }, + { + "epoch": 3.8991041961338992, + "grad_norm": 0.2807946503162384, + "learning_rate": 3.8258098807412954e-05, + "loss": 0.299, + "num_input_tokens_seen": 8325120, + "step": 8270 + }, + { + "epoch": 3.901461574728901, + "grad_norm": 0.3721446096897125, + "learning_rate": 3.824065347244277e-05, + "loss": 0.3863, + "num_input_tokens_seen": 8330944, + "step": 8275 + }, + { + "epoch": 3.9038189533239036, + "grad_norm": 0.4620278477668762, + "learning_rate": 3.8223199171801674e-05, + "loss": 0.3324, + "num_input_tokens_seen": 8335968, + "step": 8280 + }, + { + "epoch": 3.906176331918906, + "grad_norm": 0.6547933220863342, + "learning_rate": 3.820573591730856e-05, + "loss": 0.3246, + "num_input_tokens_seen": 8340576, + "step": 8285 + }, + { + "epoch": 3.9085337105139084, + "grad_norm": 0.4623752236366272, + "learning_rate": 3.818826372078834e-05, + "loss": 0.3415, + "num_input_tokens_seen": 8346080, + "step": 8290 + }, + { + "epoch": 3.910891089108911, + "grad_norm": 0.6003925800323486, + "learning_rate": 3.8170782594072e-05, + "loss": 0.3945, + "num_input_tokens_seen": 8352000, + "step": 8295 + }, + { + "epoch": 3.913248467703913, + "grad_norm": 0.31285998225212097, + "learning_rate": 3.8153292548996576e-05, + "loss": 0.3775, + "num_input_tokens_seen": 8356064, + "step": 8300 + }, + { + "epoch": 3.9156058462989156, + "grad_norm": 0.33924874663352966, + "learning_rate": 3.813579359740514e-05, + "loss": 0.3244, + "num_input_tokens_seen": 8360448, + "step": 8305 + }, + { + "epoch": 3.917963224893918, + "grad_norm": 0.5403138995170593, + "learning_rate": 3.811828575114678e-05, + "loss": 0.3375, + "num_input_tokens_seen": 8364896, + "step": 8310 + }, + { + "epoch": 3.9203206034889204, + "grad_norm": 0.34791430830955505, + "learning_rate": 3.8100769022076634e-05, + "loss": 0.2933, + "num_input_tokens_seen": 8370592, + "step": 8315 + }, + { + "epoch": 3.922677982083923, + "grad_norm": 0.267951101064682, + "learning_rate": 3.808324342205583e-05, + "loss": 0.2932, + "num_input_tokens_seen": 8374688, + "step": 8320 + }, + { + "epoch": 3.9250353606789252, + "grad_norm": 0.42862194776535034, + "learning_rate": 3.806570896295153e-05, + "loss": 0.3833, + "num_input_tokens_seen": 8379872, + "step": 8325 + }, + { + "epoch": 3.9273927392739276, + "grad_norm": 0.23938220739364624, + "learning_rate": 3.804816565663685e-05, + "loss": 0.2867, + "num_input_tokens_seen": 8383808, + "step": 8330 + }, + { + "epoch": 3.92975011786893, + "grad_norm": 0.8749874234199524, + "learning_rate": 3.8030613514990953e-05, + "loss": 0.3565, + "num_input_tokens_seen": 8388800, + "step": 8335 + }, + { + "epoch": 3.932107496463932, + "grad_norm": 0.2950035631656647, + "learning_rate": 3.8013052549898945e-05, + "loss": 0.2608, + "num_input_tokens_seen": 8393312, + "step": 8340 + }, + { + "epoch": 3.9344648750589344, + "grad_norm": 0.4227633774280548, + "learning_rate": 3.799548277325192e-05, + "loss": 0.3141, + "num_input_tokens_seen": 8398144, + "step": 8345 + }, + { + "epoch": 3.936822253653937, + "grad_norm": 0.276938796043396, + "learning_rate": 3.7977904196946936e-05, + "loss": 0.2547, + "num_input_tokens_seen": 8402720, + "step": 8350 + }, + { + "epoch": 3.939179632248939, + "grad_norm": 0.33811822533607483, + "learning_rate": 3.796031683288701e-05, + "loss": 0.2759, + "num_input_tokens_seen": 8408512, + "step": 8355 + }, + { + "epoch": 3.9415370108439416, + "grad_norm": 0.4131723642349243, + "learning_rate": 3.794272069298111e-05, + "loss": 0.3338, + "num_input_tokens_seen": 8413120, + "step": 8360 + }, + { + "epoch": 3.943894389438944, + "grad_norm": 0.667405366897583, + "learning_rate": 3.792511578914415e-05, + "loss": 0.3734, + "num_input_tokens_seen": 8417920, + "step": 8365 + }, + { + "epoch": 3.9462517680339464, + "grad_norm": 0.22697119414806366, + "learning_rate": 3.7907502133296966e-05, + "loss": 0.326, + "num_input_tokens_seen": 8422368, + "step": 8370 + }, + { + "epoch": 3.9486091466289484, + "grad_norm": 0.2646130919456482, + "learning_rate": 3.788987973736634e-05, + "loss": 0.3137, + "num_input_tokens_seen": 8428160, + "step": 8375 + }, + { + "epoch": 3.9509665252239508, + "grad_norm": 0.4451930820941925, + "learning_rate": 3.7872248613284953e-05, + "loss": 0.3822, + "num_input_tokens_seen": 8432448, + "step": 8380 + }, + { + "epoch": 3.953323903818953, + "grad_norm": 0.2583085894584656, + "learning_rate": 3.785460877299141e-05, + "loss": 0.3181, + "num_input_tokens_seen": 8436928, + "step": 8385 + }, + { + "epoch": 3.9556812824139556, + "grad_norm": 0.33510932326316833, + "learning_rate": 3.783696022843021e-05, + "loss": 0.3425, + "num_input_tokens_seen": 8441408, + "step": 8390 + }, + { + "epoch": 3.958038661008958, + "grad_norm": 0.5335872173309326, + "learning_rate": 3.781930299155175e-05, + "loss": 0.3653, + "num_input_tokens_seen": 8445120, + "step": 8395 + }, + { + "epoch": 3.9603960396039604, + "grad_norm": 0.4740268290042877, + "learning_rate": 3.78016370743123e-05, + "loss": 0.4037, + "num_input_tokens_seen": 8449632, + "step": 8400 + }, + { + "epoch": 3.9627534181989628, + "grad_norm": 0.5060139298439026, + "learning_rate": 3.7783962488674035e-05, + "loss": 0.32, + "num_input_tokens_seen": 8455392, + "step": 8405 + }, + { + "epoch": 3.965110796793965, + "grad_norm": 0.23826119303703308, + "learning_rate": 3.7766279246604976e-05, + "loss": 0.3315, + "num_input_tokens_seen": 8461216, + "step": 8410 + }, + { + "epoch": 3.9674681753889676, + "grad_norm": 0.33873987197875977, + "learning_rate": 3.7748587360079016e-05, + "loss": 0.3534, + "num_input_tokens_seen": 8465888, + "step": 8415 + }, + { + "epoch": 3.96982555398397, + "grad_norm": 0.3835511803627014, + "learning_rate": 3.7730886841075897e-05, + "loss": 0.3106, + "num_input_tokens_seen": 8471136, + "step": 8420 + }, + { + "epoch": 3.9721829325789724, + "grad_norm": 0.3193999230861664, + "learning_rate": 3.7713177701581195e-05, + "loss": 0.3137, + "num_input_tokens_seen": 8477088, + "step": 8425 + }, + { + "epoch": 3.9745403111739748, + "grad_norm": 0.28745242953300476, + "learning_rate": 3.769545995358635e-05, + "loss": 0.3156, + "num_input_tokens_seen": 8481440, + "step": 8430 + }, + { + "epoch": 3.976897689768977, + "grad_norm": 0.37394413352012634, + "learning_rate": 3.767773360908862e-05, + "loss": 0.2786, + "num_input_tokens_seen": 8486176, + "step": 8435 + }, + { + "epoch": 3.979255068363979, + "grad_norm": 0.40252310037612915, + "learning_rate": 3.765999868009108e-05, + "loss": 0.4247, + "num_input_tokens_seen": 8491648, + "step": 8440 + }, + { + "epoch": 3.9816124469589815, + "grad_norm": 0.48004916310310364, + "learning_rate": 3.76422551786026e-05, + "loss": 0.3248, + "num_input_tokens_seen": 8496512, + "step": 8445 + }, + { + "epoch": 3.983969825553984, + "grad_norm": 0.3522685468196869, + "learning_rate": 3.762450311663791e-05, + "loss": 0.3433, + "num_input_tokens_seen": 8502208, + "step": 8450 + }, + { + "epoch": 3.9863272041489863, + "grad_norm": 0.5333105325698853, + "learning_rate": 3.7606742506217466e-05, + "loss": 0.3952, + "num_input_tokens_seen": 8507200, + "step": 8455 + }, + { + "epoch": 3.9886845827439887, + "grad_norm": 0.4708607494831085, + "learning_rate": 3.758897335936756e-05, + "loss": 0.3382, + "num_input_tokens_seen": 8511648, + "step": 8460 + }, + { + "epoch": 3.991041961338991, + "grad_norm": 0.26699569821357727, + "learning_rate": 3.757119568812024e-05, + "loss": 0.305, + "num_input_tokens_seen": 8519520, + "step": 8465 + }, + { + "epoch": 3.9933993399339935, + "grad_norm": 0.2731078267097473, + "learning_rate": 3.7553409504513366e-05, + "loss": 0.3471, + "num_input_tokens_seen": 8523488, + "step": 8470 + }, + { + "epoch": 3.9957567185289955, + "grad_norm": 0.3829655945301056, + "learning_rate": 3.75356148205905e-05, + "loss": 0.3123, + "num_input_tokens_seen": 8528576, + "step": 8475 + }, + { + "epoch": 3.998114097123998, + "grad_norm": 0.5224087834358215, + "learning_rate": 3.7517811648401016e-05, + "loss": 0.3566, + "num_input_tokens_seen": 8533824, + "step": 8480 + }, + { + "epoch": 4.000471475719, + "grad_norm": 0.28016045689582825, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.319, + "num_input_tokens_seen": 8537824, + "step": 8485 + }, + { + "epoch": 4.001885902876002, + "eval_loss": 0.34270626306533813, + "eval_runtime": 25.6282, + "eval_samples_per_second": 36.795, + "eval_steps_per_second": 9.209, + "num_input_tokens_seen": 8540672, + "step": 8488 + }, + { + "epoch": 4.002828854314003, + "grad_norm": 0.3556959331035614, + "learning_rate": 3.74821798874483e-05, + "loss": 0.3446, + "num_input_tokens_seen": 8542784, + "step": 8490 + }, + { + "epoch": 4.005186232909005, + "grad_norm": 0.23847424983978271, + "learning_rate": 3.7464351322812475e-05, + "loss": 0.3472, + "num_input_tokens_seen": 8547456, + "step": 8495 + }, + { + "epoch": 4.0075436115040075, + "grad_norm": 0.35787782073020935, + "learning_rate": 3.744651431816482e-05, + "loss": 0.3202, + "num_input_tokens_seen": 8552704, + "step": 8500 + }, + { + "epoch": 4.00990099009901, + "grad_norm": 0.539198637008667, + "learning_rate": 3.742866888558334e-05, + "loss": 0.3175, + "num_input_tokens_seen": 8558112, + "step": 8505 + }, + { + "epoch": 4.012258368694012, + "grad_norm": 0.39528992772102356, + "learning_rate": 3.741081503715176e-05, + "loss": 0.2685, + "num_input_tokens_seen": 8563200, + "step": 8510 + }, + { + "epoch": 4.014615747289015, + "grad_norm": 0.6931132078170776, + "learning_rate": 3.739295278495949e-05, + "loss": 0.4571, + "num_input_tokens_seen": 8568128, + "step": 8515 + }, + { + "epoch": 4.016973125884017, + "grad_norm": 0.26602450013160706, + "learning_rate": 3.7375082141101626e-05, + "loss": 0.3292, + "num_input_tokens_seen": 8572736, + "step": 8520 + }, + { + "epoch": 4.0193305044790195, + "grad_norm": 0.3703839182853699, + "learning_rate": 3.735720311767896e-05, + "loss": 0.2942, + "num_input_tokens_seen": 8577248, + "step": 8525 + }, + { + "epoch": 4.021687883074022, + "grad_norm": 0.4040640592575073, + "learning_rate": 3.7339315726797946e-05, + "loss": 0.305, + "num_input_tokens_seen": 8581120, + "step": 8530 + }, + { + "epoch": 4.024045261669024, + "grad_norm": 0.3386871814727783, + "learning_rate": 3.7321419980570724e-05, + "loss": 0.364, + "num_input_tokens_seen": 8586176, + "step": 8535 + }, + { + "epoch": 4.026402640264027, + "grad_norm": 0.3966901898384094, + "learning_rate": 3.7303515891115056e-05, + "loss": 0.3565, + "num_input_tokens_seen": 8590816, + "step": 8540 + }, + { + "epoch": 4.028760018859029, + "grad_norm": 0.42853039503097534, + "learning_rate": 3.72856034705544e-05, + "loss": 0.2625, + "num_input_tokens_seen": 8595104, + "step": 8545 + }, + { + "epoch": 4.0311173974540315, + "grad_norm": 0.31907787919044495, + "learning_rate": 3.7267682731017805e-05, + "loss": 0.2417, + "num_input_tokens_seen": 8601632, + "step": 8550 + }, + { + "epoch": 4.033474776049033, + "grad_norm": 0.2203151136636734, + "learning_rate": 3.7249753684639996e-05, + "loss": 0.2881, + "num_input_tokens_seen": 8605984, + "step": 8555 + }, + { + "epoch": 4.035832154644035, + "grad_norm": 0.6225719451904297, + "learning_rate": 3.7231816343561295e-05, + "loss": 0.3913, + "num_input_tokens_seen": 8611168, + "step": 8560 + }, + { + "epoch": 4.038189533239038, + "grad_norm": 0.6804258823394775, + "learning_rate": 3.7213870719927655e-05, + "loss": 0.4417, + "num_input_tokens_seen": 8615552, + "step": 8565 + }, + { + "epoch": 4.04054691183404, + "grad_norm": 0.606192946434021, + "learning_rate": 3.719591682589063e-05, + "loss": 0.4181, + "num_input_tokens_seen": 8621024, + "step": 8570 + }, + { + "epoch": 4.042904290429043, + "grad_norm": 0.4545203149318695, + "learning_rate": 3.7177954673607386e-05, + "loss": 0.3342, + "num_input_tokens_seen": 8626016, + "step": 8575 + }, + { + "epoch": 4.045261669024045, + "grad_norm": 0.3594398498535156, + "learning_rate": 3.715998427524066e-05, + "loss": 0.334, + "num_input_tokens_seen": 8630528, + "step": 8580 + }, + { + "epoch": 4.0476190476190474, + "grad_norm": 0.326364666223526, + "learning_rate": 3.714200564295879e-05, + "loss": 0.2935, + "num_input_tokens_seen": 8636096, + "step": 8585 + }, + { + "epoch": 4.04997642621405, + "grad_norm": 0.3589053750038147, + "learning_rate": 3.712401878893569e-05, + "loss": 0.3388, + "num_input_tokens_seen": 8640608, + "step": 8590 + }, + { + "epoch": 4.052333804809052, + "grad_norm": 0.41294896602630615, + "learning_rate": 3.7106023725350826e-05, + "loss": 0.3056, + "num_input_tokens_seen": 8645664, + "step": 8595 + }, + { + "epoch": 4.054691183404055, + "grad_norm": 0.19971616566181183, + "learning_rate": 3.708802046438924e-05, + "loss": 0.342, + "num_input_tokens_seen": 8650272, + "step": 8600 + }, + { + "epoch": 4.057048561999057, + "grad_norm": 0.38022929430007935, + "learning_rate": 3.7070009018241505e-05, + "loss": 0.3211, + "num_input_tokens_seen": 8655360, + "step": 8605 + }, + { + "epoch": 4.0594059405940595, + "grad_norm": 0.5690507888793945, + "learning_rate": 3.7051989399103765e-05, + "loss": 0.3289, + "num_input_tokens_seen": 8660160, + "step": 8610 + }, + { + "epoch": 4.061763319189062, + "grad_norm": 0.3727482855319977, + "learning_rate": 3.703396161917767e-05, + "loss": 0.3422, + "num_input_tokens_seen": 8664672, + "step": 8615 + }, + { + "epoch": 4.064120697784064, + "grad_norm": 0.6443054676055908, + "learning_rate": 3.701592569067041e-05, + "loss": 0.3036, + "num_input_tokens_seen": 8670176, + "step": 8620 + }, + { + "epoch": 4.066478076379067, + "grad_norm": 0.4386352598667145, + "learning_rate": 3.699788162579469e-05, + "loss": 0.3073, + "num_input_tokens_seen": 8675744, + "step": 8625 + }, + { + "epoch": 4.068835454974069, + "grad_norm": 0.2635610103607178, + "learning_rate": 3.697982943676874e-05, + "loss": 0.32, + "num_input_tokens_seen": 8680448, + "step": 8630 + }, + { + "epoch": 4.0711928335690715, + "grad_norm": 0.3054110109806061, + "learning_rate": 3.696176913581625e-05, + "loss": 0.3416, + "num_input_tokens_seen": 8685216, + "step": 8635 + }, + { + "epoch": 4.073550212164074, + "grad_norm": 0.38394248485565186, + "learning_rate": 3.694370073516645e-05, + "loss": 0.301, + "num_input_tokens_seen": 8690592, + "step": 8640 + }, + { + "epoch": 4.075907590759076, + "grad_norm": 0.33442050218582153, + "learning_rate": 3.692562424705402e-05, + "loss": 0.246, + "num_input_tokens_seen": 8695680, + "step": 8645 + }, + { + "epoch": 4.078264969354079, + "grad_norm": 0.6289357542991638, + "learning_rate": 3.690753968371913e-05, + "loss": 0.3974, + "num_input_tokens_seen": 8701216, + "step": 8650 + }, + { + "epoch": 4.08062234794908, + "grad_norm": 0.24870531260967255, + "learning_rate": 3.688944705740743e-05, + "loss": 0.2889, + "num_input_tokens_seen": 8706176, + "step": 8655 + }, + { + "epoch": 4.082979726544083, + "grad_norm": 0.3096323311328888, + "learning_rate": 3.6871346380370004e-05, + "loss": 0.2986, + "num_input_tokens_seen": 8711040, + "step": 8660 + }, + { + "epoch": 4.085337105139085, + "grad_norm": 0.31963813304901123, + "learning_rate": 3.68532376648634e-05, + "loss": 0.3716, + "num_input_tokens_seen": 8715520, + "step": 8665 + }, + { + "epoch": 4.087694483734087, + "grad_norm": 0.3680574595928192, + "learning_rate": 3.683512092314962e-05, + "loss": 0.3446, + "num_input_tokens_seen": 8720256, + "step": 8670 + }, + { + "epoch": 4.09005186232909, + "grad_norm": 0.28509172797203064, + "learning_rate": 3.681699616749609e-05, + "loss": 0.3329, + "num_input_tokens_seen": 8724256, + "step": 8675 + }, + { + "epoch": 4.092409240924092, + "grad_norm": 0.3652224540710449, + "learning_rate": 3.6798863410175654e-05, + "loss": 0.3693, + "num_input_tokens_seen": 8730048, + "step": 8680 + }, + { + "epoch": 4.094766619519095, + "grad_norm": 0.4106779992580414, + "learning_rate": 3.678072266346658e-05, + "loss": 0.2734, + "num_input_tokens_seen": 8734656, + "step": 8685 + }, + { + "epoch": 4.097123998114097, + "grad_norm": 0.2630663216114044, + "learning_rate": 3.676257393965257e-05, + "loss": 0.3354, + "num_input_tokens_seen": 8739488, + "step": 8690 + }, + { + "epoch": 4.099481376709099, + "grad_norm": 0.4668046236038208, + "learning_rate": 3.6744417251022686e-05, + "loss": 0.3771, + "num_input_tokens_seen": 8744928, + "step": 8695 + }, + { + "epoch": 4.101838755304102, + "grad_norm": 0.3120458126068115, + "learning_rate": 3.6726252609871416e-05, + "loss": 0.3437, + "num_input_tokens_seen": 8751872, + "step": 8700 + }, + { + "epoch": 4.104196133899104, + "grad_norm": 0.49215278029441833, + "learning_rate": 3.670808002849862e-05, + "loss": 0.2901, + "num_input_tokens_seen": 8756448, + "step": 8705 + }, + { + "epoch": 4.106553512494107, + "grad_norm": 0.32126447558403015, + "learning_rate": 3.6689899519209526e-05, + "loss": 0.3299, + "num_input_tokens_seen": 8760832, + "step": 8710 + }, + { + "epoch": 4.108910891089109, + "grad_norm": 0.3220272362232208, + "learning_rate": 3.667171109431474e-05, + "loss": 0.3441, + "num_input_tokens_seen": 8764832, + "step": 8715 + }, + { + "epoch": 4.111268269684111, + "grad_norm": 0.6157819032669067, + "learning_rate": 3.665351476613025e-05, + "loss": 0.3211, + "num_input_tokens_seen": 8769312, + "step": 8720 + }, + { + "epoch": 4.113625648279114, + "grad_norm": 0.4754011332988739, + "learning_rate": 3.663531054697734e-05, + "loss": 0.3025, + "num_input_tokens_seen": 8775360, + "step": 8725 + }, + { + "epoch": 4.115983026874116, + "grad_norm": 0.3579901158809662, + "learning_rate": 3.661709844918269e-05, + "loss": 0.3386, + "num_input_tokens_seen": 8781536, + "step": 8730 + }, + { + "epoch": 4.118340405469119, + "grad_norm": 0.4197659194469452, + "learning_rate": 3.659887848507829e-05, + "loss": 0.2985, + "num_input_tokens_seen": 8787328, + "step": 8735 + }, + { + "epoch": 4.120697784064121, + "grad_norm": 0.547514796257019, + "learning_rate": 3.658065066700147e-05, + "loss": 0.3121, + "num_input_tokens_seen": 8793376, + "step": 8740 + }, + { + "epoch": 4.123055162659123, + "grad_norm": 0.3538365662097931, + "learning_rate": 3.656241500729486e-05, + "loss": 0.3375, + "num_input_tokens_seen": 8798304, + "step": 8745 + }, + { + "epoch": 4.125412541254126, + "grad_norm": 0.3476709723472595, + "learning_rate": 3.6544171518306415e-05, + "loss": 0.3022, + "num_input_tokens_seen": 8804800, + "step": 8750 + }, + { + "epoch": 4.127769919849127, + "grad_norm": 0.3440362811088562, + "learning_rate": 3.6525920212389375e-05, + "loss": 0.2895, + "num_input_tokens_seen": 8809152, + "step": 8755 + }, + { + "epoch": 4.13012729844413, + "grad_norm": 0.3086565136909485, + "learning_rate": 3.65076611019023e-05, + "loss": 0.3679, + "num_input_tokens_seen": 8813536, + "step": 8760 + }, + { + "epoch": 4.132484677039132, + "grad_norm": 0.4114321172237396, + "learning_rate": 3.648939419920902e-05, + "loss": 0.3093, + "num_input_tokens_seen": 8818048, + "step": 8765 + }, + { + "epoch": 4.1348420556341345, + "grad_norm": 0.4630244970321655, + "learning_rate": 3.647111951667862e-05, + "loss": 0.339, + "num_input_tokens_seen": 8823008, + "step": 8770 + }, + { + "epoch": 4.137199434229137, + "grad_norm": 0.29075706005096436, + "learning_rate": 3.645283706668549e-05, + "loss": 0.327, + "num_input_tokens_seen": 8829376, + "step": 8775 + }, + { + "epoch": 4.139556812824139, + "grad_norm": 0.4972011148929596, + "learning_rate": 3.643454686160926e-05, + "loss": 0.3303, + "num_input_tokens_seen": 8833376, + "step": 8780 + }, + { + "epoch": 4.141914191419142, + "grad_norm": 0.7182678580284119, + "learning_rate": 3.6416248913834814e-05, + "loss": 0.3241, + "num_input_tokens_seen": 8838368, + "step": 8785 + }, + { + "epoch": 4.144271570014144, + "grad_norm": 0.4635830223560333, + "learning_rate": 3.639794323575227e-05, + "loss": 0.3254, + "num_input_tokens_seen": 8844096, + "step": 8790 + }, + { + "epoch": 4.1466289486091465, + "grad_norm": 0.5863334536552429, + "learning_rate": 3.6379629839757015e-05, + "loss": 0.3841, + "num_input_tokens_seen": 8849440, + "step": 8795 + }, + { + "epoch": 4.148986327204149, + "grad_norm": 0.5431289076805115, + "learning_rate": 3.6361308738249606e-05, + "loss": 0.3337, + "num_input_tokens_seen": 8854144, + "step": 8800 + }, + { + "epoch": 4.151343705799151, + "grad_norm": 0.37725257873535156, + "learning_rate": 3.6342979943635866e-05, + "loss": 0.3259, + "num_input_tokens_seen": 8858496, + "step": 8805 + }, + { + "epoch": 4.153701084394154, + "grad_norm": 0.34190040826797485, + "learning_rate": 3.63246434683268e-05, + "loss": 0.293, + "num_input_tokens_seen": 8863360, + "step": 8810 + }, + { + "epoch": 4.156058462989156, + "grad_norm": 0.29759401082992554, + "learning_rate": 3.6306299324738634e-05, + "loss": 0.3445, + "num_input_tokens_seen": 8868384, + "step": 8815 + }, + { + "epoch": 4.158415841584159, + "grad_norm": 0.6185327172279358, + "learning_rate": 3.6287947525292765e-05, + "loss": 0.3984, + "num_input_tokens_seen": 8872096, + "step": 8820 + }, + { + "epoch": 4.160773220179161, + "grad_norm": 0.3532562255859375, + "learning_rate": 3.6269588082415794e-05, + "loss": 0.3542, + "num_input_tokens_seen": 8876576, + "step": 8825 + }, + { + "epoch": 4.163130598774163, + "grad_norm": 0.39273181557655334, + "learning_rate": 3.6251221008539475e-05, + "loss": 0.3132, + "num_input_tokens_seen": 8882976, + "step": 8830 + }, + { + "epoch": 4.165487977369166, + "grad_norm": 0.3565346598625183, + "learning_rate": 3.623284631610076e-05, + "loss": 0.3449, + "num_input_tokens_seen": 8888064, + "step": 8835 + }, + { + "epoch": 4.167845355964168, + "grad_norm": 0.36075475811958313, + "learning_rate": 3.6214464017541726e-05, + "loss": 0.3051, + "num_input_tokens_seen": 8894048, + "step": 8840 + }, + { + "epoch": 4.170202734559171, + "grad_norm": 0.45840469002723694, + "learning_rate": 3.6196074125309634e-05, + "loss": 0.3766, + "num_input_tokens_seen": 8899008, + "step": 8845 + }, + { + "epoch": 4.172560113154173, + "grad_norm": 0.45325732231140137, + "learning_rate": 3.617767665185684e-05, + "loss": 0.3289, + "num_input_tokens_seen": 8903456, + "step": 8850 + }, + { + "epoch": 4.174917491749175, + "grad_norm": 0.4105195701122284, + "learning_rate": 3.61592716096409e-05, + "loss": 0.2942, + "num_input_tokens_seen": 8908384, + "step": 8855 + }, + { + "epoch": 4.177274870344177, + "grad_norm": 0.5706161856651306, + "learning_rate": 3.614085901112443e-05, + "loss": 0.343, + "num_input_tokens_seen": 8912640, + "step": 8860 + }, + { + "epoch": 4.179632248939179, + "grad_norm": 0.35235336422920227, + "learning_rate": 3.612243886877521e-05, + "loss": 0.4227, + "num_input_tokens_seen": 8917792, + "step": 8865 + }, + { + "epoch": 4.181989627534182, + "grad_norm": 0.665531575679779, + "learning_rate": 3.610401119506609e-05, + "loss": 0.3237, + "num_input_tokens_seen": 8922048, + "step": 8870 + }, + { + "epoch": 4.184347006129184, + "grad_norm": 0.35615596175193787, + "learning_rate": 3.608557600247506e-05, + "loss": 0.3509, + "num_input_tokens_seen": 8927904, + "step": 8875 + }, + { + "epoch": 4.1867043847241865, + "grad_norm": 0.36174312233924866, + "learning_rate": 3.6067133303485165e-05, + "loss": 0.3028, + "num_input_tokens_seen": 8932800, + "step": 8880 + }, + { + "epoch": 4.189061763319189, + "grad_norm": 0.299553245306015, + "learning_rate": 3.604868311058454e-05, + "loss": 0.2798, + "num_input_tokens_seen": 8937312, + "step": 8885 + }, + { + "epoch": 4.191419141914191, + "grad_norm": 0.3877962529659271, + "learning_rate": 3.603022543626642e-05, + "loss": 0.3221, + "num_input_tokens_seen": 8943008, + "step": 8890 + }, + { + "epoch": 4.193776520509194, + "grad_norm": 0.3817616105079651, + "learning_rate": 3.601176029302909e-05, + "loss": 0.3923, + "num_input_tokens_seen": 8949184, + "step": 8895 + }, + { + "epoch": 4.196133899104196, + "grad_norm": 0.40011703968048096, + "learning_rate": 3.599328769337586e-05, + "loss": 0.3326, + "num_input_tokens_seen": 8954816, + "step": 8900 + }, + { + "epoch": 4.1984912776991985, + "grad_norm": 0.4178822636604309, + "learning_rate": 3.5974807649815155e-05, + "loss": 0.3606, + "num_input_tokens_seen": 8960064, + "step": 8905 + }, + { + "epoch": 4.200848656294201, + "grad_norm": 0.35430586338043213, + "learning_rate": 3.59563201748604e-05, + "loss": 0.3372, + "num_input_tokens_seen": 8964288, + "step": 8910 + }, + { + "epoch": 4.203206034889203, + "grad_norm": 0.26685285568237305, + "learning_rate": 3.593782528103003e-05, + "loss": 0.3262, + "num_input_tokens_seen": 8969888, + "step": 8915 + }, + { + "epoch": 4.205563413484206, + "grad_norm": 0.4294887185096741, + "learning_rate": 3.591932298084756e-05, + "loss": 0.339, + "num_input_tokens_seen": 8974400, + "step": 8920 + }, + { + "epoch": 4.207920792079208, + "grad_norm": 0.30690181255340576, + "learning_rate": 3.590081328684147e-05, + "loss": 0.3463, + "num_input_tokens_seen": 8980416, + "step": 8925 + }, + { + "epoch": 4.2102781706742105, + "grad_norm": 0.4274744689464569, + "learning_rate": 3.588229621154529e-05, + "loss": 0.3393, + "num_input_tokens_seen": 8986880, + "step": 8930 + }, + { + "epoch": 4.212635549269213, + "grad_norm": 0.32489731907844543, + "learning_rate": 3.586377176749749e-05, + "loss": 0.3318, + "num_input_tokens_seen": 8991872, + "step": 8935 + }, + { + "epoch": 4.214992927864215, + "grad_norm": 0.3409070372581482, + "learning_rate": 3.5845239967241593e-05, + "loss": 0.3184, + "num_input_tokens_seen": 8996960, + "step": 8940 + }, + { + "epoch": 4.217350306459218, + "grad_norm": 0.34720465540885925, + "learning_rate": 3.582670082332607e-05, + "loss": 0.3749, + "num_input_tokens_seen": 9001152, + "step": 8945 + }, + { + "epoch": 4.21970768505422, + "grad_norm": 0.5437415242195129, + "learning_rate": 3.580815434830437e-05, + "loss": 0.3418, + "num_input_tokens_seen": 9007360, + "step": 8950 + }, + { + "epoch": 4.222065063649222, + "grad_norm": 0.43062734603881836, + "learning_rate": 3.5789600554734895e-05, + "loss": 0.3482, + "num_input_tokens_seen": 9013280, + "step": 8955 + }, + { + "epoch": 4.224422442244224, + "grad_norm": 0.5460294485092163, + "learning_rate": 3.577103945518103e-05, + "loss": 0.3355, + "num_input_tokens_seen": 9018528, + "step": 8960 + }, + { + "epoch": 4.226779820839226, + "grad_norm": 0.39477717876434326, + "learning_rate": 3.575247106221108e-05, + "loss": 0.3003, + "num_input_tokens_seen": 9023264, + "step": 8965 + }, + { + "epoch": 4.229137199434229, + "grad_norm": 0.4292011559009552, + "learning_rate": 3.5733895388398305e-05, + "loss": 0.3644, + "num_input_tokens_seen": 9028672, + "step": 8970 + }, + { + "epoch": 4.231494578029231, + "grad_norm": 0.3252149224281311, + "learning_rate": 3.571531244632088e-05, + "loss": 0.3311, + "num_input_tokens_seen": 9032704, + "step": 8975 + }, + { + "epoch": 4.233851956624234, + "grad_norm": 0.36198124289512634, + "learning_rate": 3.569672224856191e-05, + "loss": 0.3807, + "num_input_tokens_seen": 9037664, + "step": 8980 + }, + { + "epoch": 4.236209335219236, + "grad_norm": 0.30529162287712097, + "learning_rate": 3.5678124807709435e-05, + "loss": 0.2982, + "num_input_tokens_seen": 9042880, + "step": 8985 + }, + { + "epoch": 4.238566713814238, + "grad_norm": 0.4373253285884857, + "learning_rate": 3.565952013635635e-05, + "loss": 0.347, + "num_input_tokens_seen": 9048832, + "step": 8990 + }, + { + "epoch": 4.240924092409241, + "grad_norm": 0.46100062131881714, + "learning_rate": 3.564090824710049e-05, + "loss": 0.317, + "num_input_tokens_seen": 9053984, + "step": 8995 + }, + { + "epoch": 4.243281471004243, + "grad_norm": 0.45523926615715027, + "learning_rate": 3.5622289152544565e-05, + "loss": 0.2961, + "num_input_tokens_seen": 9059584, + "step": 9000 + }, + { + "epoch": 4.245638849599246, + "grad_norm": 0.30469515919685364, + "learning_rate": 3.560366286529615e-05, + "loss": 0.3743, + "num_input_tokens_seen": 9064480, + "step": 9005 + }, + { + "epoch": 4.247996228194248, + "grad_norm": 0.4471295177936554, + "learning_rate": 3.558502939796771e-05, + "loss": 0.3277, + "num_input_tokens_seen": 9070528, + "step": 9010 + }, + { + "epoch": 4.2503536067892504, + "grad_norm": 0.4942092001438141, + "learning_rate": 3.556638876317655e-05, + "loss": 0.3807, + "num_input_tokens_seen": 9075360, + "step": 9015 + }, + { + "epoch": 4.252710985384253, + "grad_norm": 0.24216118454933167, + "learning_rate": 3.5547740973544854e-05, + "loss": 0.3703, + "num_input_tokens_seen": 9079808, + "step": 9020 + }, + { + "epoch": 4.255068363979255, + "grad_norm": 0.4306030869483948, + "learning_rate": 3.552908604169964e-05, + "loss": 0.3085, + "num_input_tokens_seen": 9083744, + "step": 9025 + }, + { + "epoch": 4.257425742574258, + "grad_norm": 0.3752971887588501, + "learning_rate": 3.5510423980272755e-05, + "loss": 0.3652, + "num_input_tokens_seen": 9088256, + "step": 9030 + }, + { + "epoch": 4.25978312116926, + "grad_norm": 0.24241366982460022, + "learning_rate": 3.5491754801900876e-05, + "loss": 0.318, + "num_input_tokens_seen": 9093504, + "step": 9035 + }, + { + "epoch": 4.2621404997642625, + "grad_norm": 0.3530544936656952, + "learning_rate": 3.547307851922551e-05, + "loss": 0.3589, + "num_input_tokens_seen": 9098528, + "step": 9040 + }, + { + "epoch": 4.264497878359265, + "grad_norm": 0.36683255434036255, + "learning_rate": 3.5454395144892965e-05, + "loss": 0.3392, + "num_input_tokens_seen": 9103296, + "step": 9045 + }, + { + "epoch": 4.266855256954267, + "grad_norm": 0.2538736164569855, + "learning_rate": 3.543570469155434e-05, + "loss": 0.33, + "num_input_tokens_seen": 9108064, + "step": 9050 + }, + { + "epoch": 4.26921263554927, + "grad_norm": 0.3321214020252228, + "learning_rate": 3.5417007171865556e-05, + "loss": 0.2942, + "num_input_tokens_seen": 9112320, + "step": 9055 + }, + { + "epoch": 4.271570014144271, + "grad_norm": 0.31625789403915405, + "learning_rate": 3.5398302598487285e-05, + "loss": 0.359, + "num_input_tokens_seen": 9118368, + "step": 9060 + }, + { + "epoch": 4.273927392739274, + "grad_norm": 0.4034087359905243, + "learning_rate": 3.537959098408502e-05, + "loss": 0.3274, + "num_input_tokens_seen": 9124000, + "step": 9065 + }, + { + "epoch": 4.276284771334276, + "grad_norm": 0.6484527587890625, + "learning_rate": 3.5360872341328974e-05, + "loss": 0.3763, + "num_input_tokens_seen": 9129472, + "step": 9070 + }, + { + "epoch": 4.278642149929278, + "grad_norm": 0.49768930673599243, + "learning_rate": 3.5342146682894145e-05, + "loss": 0.3557, + "num_input_tokens_seen": 9133952, + "step": 9075 + }, + { + "epoch": 4.280999528524281, + "grad_norm": 0.7492267489433289, + "learning_rate": 3.532341402146027e-05, + "loss": 0.3362, + "num_input_tokens_seen": 9138944, + "step": 9080 + }, + { + "epoch": 4.283356907119283, + "grad_norm": 0.3032297194004059, + "learning_rate": 3.530467436971185e-05, + "loss": 0.2966, + "num_input_tokens_seen": 9144352, + "step": 9085 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.35454973578453064, + "learning_rate": 3.5285927740338096e-05, + "loss": 0.3498, + "num_input_tokens_seen": 9148896, + "step": 9090 + }, + { + "epoch": 4.288071664309288, + "grad_norm": 0.2612301707267761, + "learning_rate": 3.5267174146032955e-05, + "loss": 0.318, + "num_input_tokens_seen": 9153472, + "step": 9095 + }, + { + "epoch": 4.29042904290429, + "grad_norm": 0.35062772035598755, + "learning_rate": 3.524841359949508e-05, + "loss": 0.3478, + "num_input_tokens_seen": 9159392, + "step": 9100 + }, + { + "epoch": 4.292786421499293, + "grad_norm": 0.4190049171447754, + "learning_rate": 3.522964611342784e-05, + "loss": 0.3411, + "num_input_tokens_seen": 9164032, + "step": 9105 + }, + { + "epoch": 4.295143800094295, + "grad_norm": 0.3066687285900116, + "learning_rate": 3.5210871700539296e-05, + "loss": 0.3155, + "num_input_tokens_seen": 9168480, + "step": 9110 + }, + { + "epoch": 4.297501178689298, + "grad_norm": 0.2791149318218231, + "learning_rate": 3.519209037354222e-05, + "loss": 0.3092, + "num_input_tokens_seen": 9172928, + "step": 9115 + }, + { + "epoch": 4.2998585572843, + "grad_norm": 0.41061222553253174, + "learning_rate": 3.517330214515405e-05, + "loss": 0.301, + "num_input_tokens_seen": 9177504, + "step": 9120 + }, + { + "epoch": 4.302215935879302, + "grad_norm": 0.40368571877479553, + "learning_rate": 3.515450702809687e-05, + "loss": 0.3913, + "num_input_tokens_seen": 9183584, + "step": 9125 + }, + { + "epoch": 4.304573314474305, + "grad_norm": 0.40076422691345215, + "learning_rate": 3.513570503509749e-05, + "loss": 0.375, + "num_input_tokens_seen": 9188640, + "step": 9130 + }, + { + "epoch": 4.306930693069307, + "grad_norm": 0.5423717498779297, + "learning_rate": 3.511689617888733e-05, + "loss": 0.2967, + "num_input_tokens_seen": 9194016, + "step": 9135 + }, + { + "epoch": 4.30928807166431, + "grad_norm": 0.2854869067668915, + "learning_rate": 3.5098080472202464e-05, + "loss": 0.3402, + "num_input_tokens_seen": 9199040, + "step": 9140 + }, + { + "epoch": 4.311645450259312, + "grad_norm": 0.43500033020973206, + "learning_rate": 3.507925792778362e-05, + "loss": 0.3238, + "num_input_tokens_seen": 9205472, + "step": 9145 + }, + { + "epoch": 4.314002828854314, + "grad_norm": 0.5401458144187927, + "learning_rate": 3.506042855837614e-05, + "loss": 0.3492, + "num_input_tokens_seen": 9209600, + "step": 9150 + }, + { + "epoch": 4.316360207449316, + "grad_norm": 0.4648403823375702, + "learning_rate": 3.504159237672999e-05, + "loss": 0.3517, + "num_input_tokens_seen": 9214496, + "step": 9155 + }, + { + "epoch": 4.318717586044318, + "grad_norm": 0.24054236710071564, + "learning_rate": 3.502274939559975e-05, + "loss": 0.327, + "num_input_tokens_seen": 9218912, + "step": 9160 + }, + { + "epoch": 4.321074964639321, + "grad_norm": 0.585920512676239, + "learning_rate": 3.500389962774462e-05, + "loss": 0.3219, + "num_input_tokens_seen": 9223232, + "step": 9165 + }, + { + "epoch": 4.323432343234323, + "grad_norm": 0.4122132658958435, + "learning_rate": 3.498504308592838e-05, + "loss": 0.3142, + "num_input_tokens_seen": 9228224, + "step": 9170 + }, + { + "epoch": 4.3257897218293255, + "grad_norm": 0.2958455979824066, + "learning_rate": 3.4966179782919384e-05, + "loss": 0.3032, + "num_input_tokens_seen": 9232768, + "step": 9175 + }, + { + "epoch": 4.328147100424328, + "grad_norm": 0.29849526286125183, + "learning_rate": 3.4947309731490594e-05, + "loss": 0.3019, + "num_input_tokens_seen": 9238080, + "step": 9180 + }, + { + "epoch": 4.33050447901933, + "grad_norm": 0.5024157762527466, + "learning_rate": 3.4928432944419513e-05, + "loss": 0.4203, + "num_input_tokens_seen": 9243296, + "step": 9185 + }, + { + "epoch": 4.332861857614333, + "grad_norm": 0.42652732133865356, + "learning_rate": 3.490954943448824e-05, + "loss": 0.3952, + "num_input_tokens_seen": 9249440, + "step": 9190 + }, + { + "epoch": 4.335219236209335, + "grad_norm": 0.4119875729084015, + "learning_rate": 3.489065921448338e-05, + "loss": 0.3927, + "num_input_tokens_seen": 9255392, + "step": 9195 + }, + { + "epoch": 4.3375766148043375, + "grad_norm": 0.4143989086151123, + "learning_rate": 3.487176229719613e-05, + "loss": 0.2785, + "num_input_tokens_seen": 9260768, + "step": 9200 + }, + { + "epoch": 4.33993399339934, + "grad_norm": 0.27124232053756714, + "learning_rate": 3.485285869542218e-05, + "loss": 0.2958, + "num_input_tokens_seen": 9266144, + "step": 9205 + }, + { + "epoch": 4.342291371994342, + "grad_norm": 0.23463670909404755, + "learning_rate": 3.483394842196178e-05, + "loss": 0.316, + "num_input_tokens_seen": 9270336, + "step": 9210 + }, + { + "epoch": 4.344648750589345, + "grad_norm": 0.3387948274612427, + "learning_rate": 3.481503148961967e-05, + "loss": 0.3563, + "num_input_tokens_seen": 9275200, + "step": 9215 + }, + { + "epoch": 4.347006129184347, + "grad_norm": 0.31746575236320496, + "learning_rate": 3.4796107911205115e-05, + "loss": 0.3624, + "num_input_tokens_seen": 9279360, + "step": 9220 + }, + { + "epoch": 4.3493635077793495, + "grad_norm": 0.35201674699783325, + "learning_rate": 3.477717769953188e-05, + "loss": 0.3211, + "num_input_tokens_seen": 9284544, + "step": 9225 + }, + { + "epoch": 4.351720886374352, + "grad_norm": 0.3452504277229309, + "learning_rate": 3.475824086741821e-05, + "loss": 0.3177, + "num_input_tokens_seen": 9289184, + "step": 9230 + }, + { + "epoch": 4.354078264969354, + "grad_norm": 0.40407195687294006, + "learning_rate": 3.4739297427686843e-05, + "loss": 0.3393, + "num_input_tokens_seen": 9294208, + "step": 9235 + }, + { + "epoch": 4.356435643564357, + "grad_norm": 0.3594681918621063, + "learning_rate": 3.4720347393165e-05, + "loss": 0.3379, + "num_input_tokens_seen": 9298656, + "step": 9240 + }, + { + "epoch": 4.358793022159359, + "grad_norm": 0.4726978838443756, + "learning_rate": 3.4701390776684346e-05, + "loss": 0.3316, + "num_input_tokens_seen": 9304960, + "step": 9245 + }, + { + "epoch": 4.3611504007543616, + "grad_norm": 0.3077569901943207, + "learning_rate": 3.4682427591081016e-05, + "loss": 0.2962, + "num_input_tokens_seen": 9310496, + "step": 9250 + }, + { + "epoch": 4.363507779349364, + "grad_norm": 0.33487504720687866, + "learning_rate": 3.46634578491956e-05, + "loss": 0.2956, + "num_input_tokens_seen": 9314528, + "step": 9255 + }, + { + "epoch": 4.3658651579443655, + "grad_norm": 0.3171832263469696, + "learning_rate": 3.46444815638731e-05, + "loss": 0.3639, + "num_input_tokens_seen": 9319936, + "step": 9260 + }, + { + "epoch": 4.368222536539368, + "grad_norm": 0.3922940790653229, + "learning_rate": 3.462549874796298e-05, + "loss": 0.3185, + "num_input_tokens_seen": 9324608, + "step": 9265 + }, + { + "epoch": 4.37057991513437, + "grad_norm": 0.38577020168304443, + "learning_rate": 3.460650941431911e-05, + "loss": 0.2633, + "num_input_tokens_seen": 9329568, + "step": 9270 + }, + { + "epoch": 4.372937293729373, + "grad_norm": 0.25364622473716736, + "learning_rate": 3.4587513575799787e-05, + "loss": 0.3745, + "num_input_tokens_seen": 9333888, + "step": 9275 + }, + { + "epoch": 4.375294672324375, + "grad_norm": 0.37324512004852295, + "learning_rate": 3.4568511245267676e-05, + "loss": 0.2932, + "num_input_tokens_seen": 9338528, + "step": 9280 + }, + { + "epoch": 4.3776520509193775, + "grad_norm": 0.7016696333885193, + "learning_rate": 3.4549502435589896e-05, + "loss": 0.3351, + "num_input_tokens_seen": 9343488, + "step": 9285 + }, + { + "epoch": 4.38000942951438, + "grad_norm": 0.6452207565307617, + "learning_rate": 3.453048715963789e-05, + "loss": 0.4026, + "num_input_tokens_seen": 9348256, + "step": 9290 + }, + { + "epoch": 4.382366808109382, + "grad_norm": 0.4782797694206238, + "learning_rate": 3.451146543028754e-05, + "loss": 0.3441, + "num_input_tokens_seen": 9353216, + "step": 9295 + }, + { + "epoch": 4.384724186704385, + "grad_norm": 0.3794703185558319, + "learning_rate": 3.449243726041904e-05, + "loss": 0.3438, + "num_input_tokens_seen": 9358144, + "step": 9300 + }, + { + "epoch": 4.387081565299387, + "grad_norm": 0.2749648690223694, + "learning_rate": 3.4473402662917e-05, + "loss": 0.287, + "num_input_tokens_seen": 9363072, + "step": 9305 + }, + { + "epoch": 4.3894389438943895, + "grad_norm": 0.3451083302497864, + "learning_rate": 3.445436165067034e-05, + "loss": 0.3381, + "num_input_tokens_seen": 9370080, + "step": 9310 + }, + { + "epoch": 4.391796322489392, + "grad_norm": 0.3389555513858795, + "learning_rate": 3.443531423657235e-05, + "loss": 0.3407, + "num_input_tokens_seen": 9375424, + "step": 9315 + }, + { + "epoch": 4.394153701084394, + "grad_norm": 0.5615812540054321, + "learning_rate": 3.441626043352064e-05, + "loss": 0.3492, + "num_input_tokens_seen": 9380512, + "step": 9320 + }, + { + "epoch": 4.396511079679397, + "grad_norm": 0.3632600009441376, + "learning_rate": 3.439720025441715e-05, + "loss": 0.3011, + "num_input_tokens_seen": 9385024, + "step": 9325 + }, + { + "epoch": 4.398868458274399, + "grad_norm": 0.49968573451042175, + "learning_rate": 3.437813371216813e-05, + "loss": 0.3241, + "num_input_tokens_seen": 9388832, + "step": 9330 + }, + { + "epoch": 4.4012258368694015, + "grad_norm": 0.27020779252052307, + "learning_rate": 3.435906081968416e-05, + "loss": 0.3551, + "num_input_tokens_seen": 9393952, + "step": 9335 + }, + { + "epoch": 4.403583215464404, + "grad_norm": 0.5058796405792236, + "learning_rate": 3.43399815898801e-05, + "loss": 0.3377, + "num_input_tokens_seen": 9398496, + "step": 9340 + }, + { + "epoch": 4.405940594059406, + "grad_norm": 0.2877548635005951, + "learning_rate": 3.432089603567512e-05, + "loss": 0.3857, + "num_input_tokens_seen": 9402848, + "step": 9345 + }, + { + "epoch": 4.408297972654409, + "grad_norm": 0.28760799765586853, + "learning_rate": 3.430180416999264e-05, + "loss": 0.3043, + "num_input_tokens_seen": 9407520, + "step": 9350 + }, + { + "epoch": 4.41065535124941, + "grad_norm": 0.3528154790401459, + "learning_rate": 3.428270600576039e-05, + "loss": 0.2942, + "num_input_tokens_seen": 9412096, + "step": 9355 + }, + { + "epoch": 4.413012729844413, + "grad_norm": 0.2470400482416153, + "learning_rate": 3.4263601555910343e-05, + "loss": 0.3155, + "num_input_tokens_seen": 9417696, + "step": 9360 + }, + { + "epoch": 4.415370108439415, + "grad_norm": 0.3127797245979309, + "learning_rate": 3.424449083337874e-05, + "loss": 0.3155, + "num_input_tokens_seen": 9423008, + "step": 9365 + }, + { + "epoch": 4.417727487034417, + "grad_norm": 0.38712501525878906, + "learning_rate": 3.4225373851106056e-05, + "loss": 0.3356, + "num_input_tokens_seen": 9427488, + "step": 9370 + }, + { + "epoch": 4.42008486562942, + "grad_norm": 0.30650588870048523, + "learning_rate": 3.420625062203702e-05, + "loss": 0.3304, + "num_input_tokens_seen": 9432352, + "step": 9375 + }, + { + "epoch": 4.422442244224422, + "grad_norm": 0.3209199905395508, + "learning_rate": 3.4187121159120575e-05, + "loss": 0.3724, + "num_input_tokens_seen": 9437184, + "step": 9380 + }, + { + "epoch": 4.424799622819425, + "grad_norm": 0.3365941047668457, + "learning_rate": 3.41679854753099e-05, + "loss": 0.3145, + "num_input_tokens_seen": 9443264, + "step": 9385 + }, + { + "epoch": 4.427157001414427, + "grad_norm": 0.34256237745285034, + "learning_rate": 3.4148843583562385e-05, + "loss": 0.2766, + "num_input_tokens_seen": 9447392, + "step": 9390 + }, + { + "epoch": 4.429514380009429, + "grad_norm": 0.4389457702636719, + "learning_rate": 3.4129695496839595e-05, + "loss": 0.353, + "num_input_tokens_seen": 9452064, + "step": 9395 + }, + { + "epoch": 4.431871758604432, + "grad_norm": 0.6188599467277527, + "learning_rate": 3.411054122810734e-05, + "loss": 0.3668, + "num_input_tokens_seen": 9456960, + "step": 9400 + }, + { + "epoch": 4.434229137199434, + "grad_norm": 0.4012417793273926, + "learning_rate": 3.409138079033558e-05, + "loss": 0.3436, + "num_input_tokens_seen": 9461856, + "step": 9405 + }, + { + "epoch": 4.436586515794437, + "grad_norm": 0.19712156057357788, + "learning_rate": 3.407221419649846e-05, + "loss": 0.3305, + "num_input_tokens_seen": 9466848, + "step": 9410 + }, + { + "epoch": 4.438943894389439, + "grad_norm": 0.37018561363220215, + "learning_rate": 3.405304145957429e-05, + "loss": 0.3292, + "num_input_tokens_seen": 9471648, + "step": 9415 + }, + { + "epoch": 4.441301272984441, + "grad_norm": 0.39217501878738403, + "learning_rate": 3.403386259254556e-05, + "loss": 0.3299, + "num_input_tokens_seen": 9476896, + "step": 9420 + }, + { + "epoch": 4.443658651579444, + "grad_norm": 0.3325536251068115, + "learning_rate": 3.401467760839888e-05, + "loss": 0.3671, + "num_input_tokens_seen": 9481056, + "step": 9425 + }, + { + "epoch": 4.446016030174446, + "grad_norm": 0.29483625292778015, + "learning_rate": 3.3995486520125025e-05, + "loss": 0.2913, + "num_input_tokens_seen": 9486592, + "step": 9430 + }, + { + "epoch": 4.448373408769449, + "grad_norm": 0.30045005679130554, + "learning_rate": 3.39762893407189e-05, + "loss": 0.2787, + "num_input_tokens_seen": 9491136, + "step": 9435 + }, + { + "epoch": 4.450730787364451, + "grad_norm": 0.41737037897109985, + "learning_rate": 3.3957086083179525e-05, + "loss": 0.3062, + "num_input_tokens_seen": 9495488, + "step": 9440 + }, + { + "epoch": 4.4530881659594534, + "grad_norm": 0.21291759610176086, + "learning_rate": 3.393787676051003e-05, + "loss": 0.3244, + "num_input_tokens_seen": 9500064, + "step": 9445 + }, + { + "epoch": 4.455445544554456, + "grad_norm": 0.9070121645927429, + "learning_rate": 3.391866138571769e-05, + "loss": 0.3616, + "num_input_tokens_seen": 9504960, + "step": 9450 + }, + { + "epoch": 4.457802923149458, + "grad_norm": 0.5078638195991516, + "learning_rate": 3.3899439971813825e-05, + "loss": 0.3421, + "num_input_tokens_seen": 9511840, + "step": 9455 + }, + { + "epoch": 4.46016030174446, + "grad_norm": 0.44726160168647766, + "learning_rate": 3.388021253181388e-05, + "loss": 0.3489, + "num_input_tokens_seen": 9521920, + "step": 9460 + }, + { + "epoch": 4.462517680339462, + "grad_norm": 0.43945223093032837, + "learning_rate": 3.386097907873738e-05, + "loss": 0.2652, + "num_input_tokens_seen": 9527552, + "step": 9465 + }, + { + "epoch": 4.464875058934465, + "grad_norm": 0.6550853252410889, + "learning_rate": 3.38417396256079e-05, + "loss": 0.4064, + "num_input_tokens_seen": 9532960, + "step": 9470 + }, + { + "epoch": 4.467232437529467, + "grad_norm": 0.5094072222709656, + "learning_rate": 3.38224941854531e-05, + "loss": 0.3907, + "num_input_tokens_seen": 9537952, + "step": 9475 + }, + { + "epoch": 4.469589816124469, + "grad_norm": 0.2862012982368469, + "learning_rate": 3.380324277130468e-05, + "loss": 0.3365, + "num_input_tokens_seen": 9542848, + "step": 9480 + }, + { + "epoch": 4.471947194719472, + "grad_norm": 0.6431093811988831, + "learning_rate": 3.378398539619839e-05, + "loss": 0.3486, + "num_input_tokens_seen": 9547680, + "step": 9485 + }, + { + "epoch": 4.474304573314474, + "grad_norm": 0.2741422951221466, + "learning_rate": 3.3764722073174017e-05, + "loss": 0.2878, + "num_input_tokens_seen": 9552352, + "step": 9490 + }, + { + "epoch": 4.476661951909477, + "grad_norm": 0.6575927734375, + "learning_rate": 3.374545281527538e-05, + "loss": 0.3509, + "num_input_tokens_seen": 9559232, + "step": 9495 + }, + { + "epoch": 4.479019330504479, + "grad_norm": 0.5759170055389404, + "learning_rate": 3.372617763555028e-05, + "loss": 0.3499, + "num_input_tokens_seen": 9563968, + "step": 9500 + }, + { + "epoch": 4.481376709099481, + "grad_norm": 0.2732667922973633, + "learning_rate": 3.37068965470506e-05, + "loss": 0.3342, + "num_input_tokens_seen": 9568800, + "step": 9505 + }, + { + "epoch": 4.483734087694484, + "grad_norm": 0.3063904941082001, + "learning_rate": 3.368760956283217e-05, + "loss": 0.3505, + "num_input_tokens_seen": 9573472, + "step": 9510 + }, + { + "epoch": 4.486091466289486, + "grad_norm": 0.3403800129890442, + "learning_rate": 3.3668316695954824e-05, + "loss": 0.32, + "num_input_tokens_seen": 9578528, + "step": 9515 + }, + { + "epoch": 4.488448844884489, + "grad_norm": 0.3629792034626007, + "learning_rate": 3.364901795948237e-05, + "loss": 0.3449, + "num_input_tokens_seen": 9583680, + "step": 9520 + }, + { + "epoch": 4.490806223479491, + "grad_norm": 0.40915265679359436, + "learning_rate": 3.3629713366482604e-05, + "loss": 0.3746, + "num_input_tokens_seen": 9588352, + "step": 9525 + }, + { + "epoch": 4.493163602074493, + "grad_norm": 0.5095410346984863, + "learning_rate": 3.3610402930027293e-05, + "loss": 0.3455, + "num_input_tokens_seen": 9593600, + "step": 9530 + }, + { + "epoch": 4.495520980669496, + "grad_norm": 0.4838995337486267, + "learning_rate": 3.3591086663192164e-05, + "loss": 0.306, + "num_input_tokens_seen": 9599296, + "step": 9535 + }, + { + "epoch": 4.497878359264498, + "grad_norm": 0.4512448012828827, + "learning_rate": 3.3571764579056853e-05, + "loss": 0.3549, + "num_input_tokens_seen": 9603296, + "step": 9540 + }, + { + "epoch": 4.500235737859501, + "grad_norm": 0.34337693452835083, + "learning_rate": 3.3552436690704986e-05, + "loss": 0.3932, + "num_input_tokens_seen": 9609088, + "step": 9545 + }, + { + "epoch": 4.502121640735502, + "eval_loss": 0.3341943025588989, + "eval_runtime": 25.611, + "eval_samples_per_second": 36.82, + "eval_steps_per_second": 9.215, + "num_input_tokens_seen": 9613088, + "step": 9549 + }, + { + "epoch": 4.502593116454502, + "grad_norm": 0.2769257426261902, + "learning_rate": 3.3533103011224075e-05, + "loss": 0.3682, + "num_input_tokens_seen": 9613920, + "step": 9550 + }, + { + "epoch": 4.5049504950495045, + "grad_norm": 0.49829283356666565, + "learning_rate": 3.351376355370559e-05, + "loss": 0.3611, + "num_input_tokens_seen": 9619040, + "step": 9555 + }, + { + "epoch": 4.507307873644507, + "grad_norm": 0.38766393065452576, + "learning_rate": 3.349441833124489e-05, + "loss": 0.325, + "num_input_tokens_seen": 9624576, + "step": 9560 + }, + { + "epoch": 4.509665252239509, + "grad_norm": 0.46530449390411377, + "learning_rate": 3.347506735694125e-05, + "loss": 0.3145, + "num_input_tokens_seen": 9629280, + "step": 9565 + }, + { + "epoch": 4.512022630834512, + "grad_norm": 0.48804962635040283, + "learning_rate": 3.345571064389783e-05, + "loss": 0.3146, + "num_input_tokens_seen": 9633984, + "step": 9570 + }, + { + "epoch": 4.514380009429514, + "grad_norm": 0.3308086395263672, + "learning_rate": 3.343634820522169e-05, + "loss": 0.3894, + "num_input_tokens_seen": 9638944, + "step": 9575 + }, + { + "epoch": 4.5167373880245165, + "grad_norm": 0.387979656457901, + "learning_rate": 3.341698005402374e-05, + "loss": 0.3309, + "num_input_tokens_seen": 9643072, + "step": 9580 + }, + { + "epoch": 4.519094766619519, + "grad_norm": 0.26611316204071045, + "learning_rate": 3.3397606203418794e-05, + "loss": 0.3536, + "num_input_tokens_seen": 9647200, + "step": 9585 + }, + { + "epoch": 4.521452145214521, + "grad_norm": 0.2839503586292267, + "learning_rate": 3.3378226666525506e-05, + "loss": 0.2842, + "num_input_tokens_seen": 9652960, + "step": 9590 + }, + { + "epoch": 4.523809523809524, + "grad_norm": 0.5516809821128845, + "learning_rate": 3.335884145646637e-05, + "loss": 0.3844, + "num_input_tokens_seen": 9657888, + "step": 9595 + }, + { + "epoch": 4.526166902404526, + "grad_norm": 0.5253661870956421, + "learning_rate": 3.333945058636774e-05, + "loss": 0.3577, + "num_input_tokens_seen": 9664320, + "step": 9600 + }, + { + "epoch": 4.5285242809995285, + "grad_norm": 0.4610287845134735, + "learning_rate": 3.332005406935979e-05, + "loss": 0.2918, + "num_input_tokens_seen": 9670176, + "step": 9605 + }, + { + "epoch": 4.530881659594531, + "grad_norm": 0.266886830329895, + "learning_rate": 3.330065191857654e-05, + "loss": 0.2852, + "num_input_tokens_seen": 9674176, + "step": 9610 + }, + { + "epoch": 4.533239038189533, + "grad_norm": 0.3578052520751953, + "learning_rate": 3.328124414715579e-05, + "loss": 0.3244, + "num_input_tokens_seen": 9679808, + "step": 9615 + }, + { + "epoch": 4.535596416784536, + "grad_norm": 0.523036539554596, + "learning_rate": 3.326183076823917e-05, + "loss": 0.3781, + "num_input_tokens_seen": 9685504, + "step": 9620 + }, + { + "epoch": 4.537953795379538, + "grad_norm": 0.3313932716846466, + "learning_rate": 3.32424117949721e-05, + "loss": 0.334, + "num_input_tokens_seen": 9690112, + "step": 9625 + }, + { + "epoch": 4.5403111739745405, + "grad_norm": 0.4056689143180847, + "learning_rate": 3.322298724050379e-05, + "loss": 0.3512, + "num_input_tokens_seen": 9695392, + "step": 9630 + }, + { + "epoch": 4.542668552569543, + "grad_norm": 0.3357671797275543, + "learning_rate": 3.320355711798724e-05, + "loss": 0.3208, + "num_input_tokens_seen": 9699936, + "step": 9635 + }, + { + "epoch": 4.545025931164545, + "grad_norm": 0.3416486978530884, + "learning_rate": 3.31841214405792e-05, + "loss": 0.2992, + "num_input_tokens_seen": 9704736, + "step": 9640 + }, + { + "epoch": 4.547383309759548, + "grad_norm": 0.321772962808609, + "learning_rate": 3.316468022144017e-05, + "loss": 0.3187, + "num_input_tokens_seen": 9710016, + "step": 9645 + }, + { + "epoch": 4.54974068835455, + "grad_norm": 0.4166353940963745, + "learning_rate": 3.314523347373446e-05, + "loss": 0.3766, + "num_input_tokens_seen": 9714816, + "step": 9650 + }, + { + "epoch": 4.5520980669495525, + "grad_norm": 0.30527716875076294, + "learning_rate": 3.312578121063006e-05, + "loss": 0.336, + "num_input_tokens_seen": 9719744, + "step": 9655 + }, + { + "epoch": 4.554455445544555, + "grad_norm": 0.5113146901130676, + "learning_rate": 3.3106323445298734e-05, + "loss": 0.3092, + "num_input_tokens_seen": 9724960, + "step": 9660 + }, + { + "epoch": 4.5568128241395565, + "grad_norm": 0.3444417715072632, + "learning_rate": 3.308686019091595e-05, + "loss": 0.359, + "num_input_tokens_seen": 9729408, + "step": 9665 + }, + { + "epoch": 4.559170202734559, + "grad_norm": 0.37052857875823975, + "learning_rate": 3.3067391460660905e-05, + "loss": 0.3396, + "num_input_tokens_seen": 9735360, + "step": 9670 + }, + { + "epoch": 4.561527581329561, + "grad_norm": 0.5851538181304932, + "learning_rate": 3.30479172677165e-05, + "loss": 0.3312, + "num_input_tokens_seen": 9740736, + "step": 9675 + }, + { + "epoch": 4.563884959924564, + "grad_norm": 0.3912689983844757, + "learning_rate": 3.302843762526933e-05, + "loss": 0.2699, + "num_input_tokens_seen": 9745312, + "step": 9680 + }, + { + "epoch": 4.566242338519566, + "grad_norm": 0.4134732484817505, + "learning_rate": 3.30089525465097e-05, + "loss": 0.2572, + "num_input_tokens_seen": 9749472, + "step": 9685 + }, + { + "epoch": 4.5685997171145685, + "grad_norm": 0.4383890926837921, + "learning_rate": 3.2989462044631564e-05, + "loss": 0.344, + "num_input_tokens_seen": 9753568, + "step": 9690 + }, + { + "epoch": 4.570957095709571, + "grad_norm": 0.3481048345565796, + "learning_rate": 3.296996613283257e-05, + "loss": 0.3323, + "num_input_tokens_seen": 9758432, + "step": 9695 + }, + { + "epoch": 4.573314474304573, + "grad_norm": 0.34485888481140137, + "learning_rate": 3.2950464824314044e-05, + "loss": 0.2609, + "num_input_tokens_seen": 9763872, + "step": 9700 + }, + { + "epoch": 4.575671852899576, + "grad_norm": 0.29898956418037415, + "learning_rate": 3.293095813228092e-05, + "loss": 0.2804, + "num_input_tokens_seen": 9769248, + "step": 9705 + }, + { + "epoch": 4.578029231494578, + "grad_norm": 0.2621973752975464, + "learning_rate": 3.291144606994182e-05, + "loss": 0.2938, + "num_input_tokens_seen": 9773600, + "step": 9710 + }, + { + "epoch": 4.5803866100895805, + "grad_norm": 0.3463822901248932, + "learning_rate": 3.2891928650508983e-05, + "loss": 0.3376, + "num_input_tokens_seen": 9777696, + "step": 9715 + }, + { + "epoch": 4.582743988684583, + "grad_norm": 0.19136105477809906, + "learning_rate": 3.287240588719829e-05, + "loss": 0.2646, + "num_input_tokens_seen": 9781984, + "step": 9720 + }, + { + "epoch": 4.585101367279585, + "grad_norm": 0.39154186844825745, + "learning_rate": 3.28528777932292e-05, + "loss": 0.3594, + "num_input_tokens_seen": 9786912, + "step": 9725 + }, + { + "epoch": 4.587458745874588, + "grad_norm": 0.5783519148826599, + "learning_rate": 3.283334438182484e-05, + "loss": 0.3189, + "num_input_tokens_seen": 9791840, + "step": 9730 + }, + { + "epoch": 4.58981612446959, + "grad_norm": 0.22297091782093048, + "learning_rate": 3.281380566621189e-05, + "loss": 0.3761, + "num_input_tokens_seen": 9797184, + "step": 9735 + }, + { + "epoch": 4.5921735030645925, + "grad_norm": 0.35129162669181824, + "learning_rate": 3.2794261659620665e-05, + "loss": 0.3546, + "num_input_tokens_seen": 9802016, + "step": 9740 + }, + { + "epoch": 4.594530881659595, + "grad_norm": 0.3613086938858032, + "learning_rate": 3.277471237528502e-05, + "loss": 0.3018, + "num_input_tokens_seen": 9807776, + "step": 9745 + }, + { + "epoch": 4.596888260254596, + "grad_norm": 0.4147372245788574, + "learning_rate": 3.27551578264424e-05, + "loss": 0.2624, + "num_input_tokens_seen": 9813312, + "step": 9750 + }, + { + "epoch": 4.599245638849599, + "grad_norm": 0.33278703689575195, + "learning_rate": 3.273559802633383e-05, + "loss": 0.3293, + "num_input_tokens_seen": 9818144, + "step": 9755 + }, + { + "epoch": 4.601603017444601, + "grad_norm": 0.4596158266067505, + "learning_rate": 3.271603298820386e-05, + "loss": 0.3391, + "num_input_tokens_seen": 9822816, + "step": 9760 + }, + { + "epoch": 4.603960396039604, + "grad_norm": 0.34559470415115356, + "learning_rate": 3.2696462725300615e-05, + "loss": 0.3502, + "num_input_tokens_seen": 9827136, + "step": 9765 + }, + { + "epoch": 4.606317774634606, + "grad_norm": 0.3902835249900818, + "learning_rate": 3.267688725087575e-05, + "loss": 0.3992, + "num_input_tokens_seen": 9831872, + "step": 9770 + }, + { + "epoch": 4.608675153229608, + "grad_norm": 0.2768820524215698, + "learning_rate": 3.265730657818445e-05, + "loss": 0.3339, + "num_input_tokens_seen": 9836704, + "step": 9775 + }, + { + "epoch": 4.611032531824611, + "grad_norm": 0.35448285937309265, + "learning_rate": 3.263772072048541e-05, + "loss": 0.3798, + "num_input_tokens_seen": 9840928, + "step": 9780 + }, + { + "epoch": 4.613389910419613, + "grad_norm": 0.3057882487773895, + "learning_rate": 3.261812969104083e-05, + "loss": 0.3109, + "num_input_tokens_seen": 9845312, + "step": 9785 + }, + { + "epoch": 4.615747289014616, + "grad_norm": 0.26945367455482483, + "learning_rate": 3.259853350311644e-05, + "loss": 0.287, + "num_input_tokens_seen": 9850336, + "step": 9790 + }, + { + "epoch": 4.618104667609618, + "grad_norm": 0.291614830493927, + "learning_rate": 3.257893216998145e-05, + "loss": 0.3548, + "num_input_tokens_seen": 9854816, + "step": 9795 + }, + { + "epoch": 4.62046204620462, + "grad_norm": 0.35183289647102356, + "learning_rate": 3.255932570490853e-05, + "loss": 0.3395, + "num_input_tokens_seen": 9858560, + "step": 9800 + }, + { + "epoch": 4.622819424799623, + "grad_norm": 0.502435564994812, + "learning_rate": 3.253971412117387e-05, + "loss": 0.3317, + "num_input_tokens_seen": 9864096, + "step": 9805 + }, + { + "epoch": 4.625176803394625, + "grad_norm": 0.46072685718536377, + "learning_rate": 3.2520097432057086e-05, + "loss": 0.3516, + "num_input_tokens_seen": 9869024, + "step": 9810 + }, + { + "epoch": 4.627534181989628, + "grad_norm": 0.3969542384147644, + "learning_rate": 3.2500475650841275e-05, + "loss": 0.3231, + "num_input_tokens_seen": 9874432, + "step": 9815 + }, + { + "epoch": 4.62989156058463, + "grad_norm": 0.4761911928653717, + "learning_rate": 3.2480848790812965e-05, + "loss": 0.3187, + "num_input_tokens_seen": 9878400, + "step": 9820 + }, + { + "epoch": 4.632248939179632, + "grad_norm": 0.5930882096290588, + "learning_rate": 3.246121686526215e-05, + "loss": 0.3332, + "num_input_tokens_seen": 9884256, + "step": 9825 + }, + { + "epoch": 4.634606317774635, + "grad_norm": 0.33859190344810486, + "learning_rate": 3.244157988748222e-05, + "loss": 0.3614, + "num_input_tokens_seen": 9889184, + "step": 9830 + }, + { + "epoch": 4.636963696369637, + "grad_norm": 0.2782784402370453, + "learning_rate": 3.2421937870770005e-05, + "loss": 0.3891, + "num_input_tokens_seen": 9893472, + "step": 9835 + }, + { + "epoch": 4.63932107496464, + "grad_norm": 0.5263830423355103, + "learning_rate": 3.240229082842575e-05, + "loss": 0.3204, + "num_input_tokens_seen": 9897472, + "step": 9840 + }, + { + "epoch": 4.641678453559642, + "grad_norm": 0.3211168348789215, + "learning_rate": 3.238263877375309e-05, + "loss": 0.3101, + "num_input_tokens_seen": 9901568, + "step": 9845 + }, + { + "epoch": 4.644035832154644, + "grad_norm": 0.33522024750709534, + "learning_rate": 3.236298172005906e-05, + "loss": 0.3164, + "num_input_tokens_seen": 9907584, + "step": 9850 + }, + { + "epoch": 4.646393210749647, + "grad_norm": 0.729858934879303, + "learning_rate": 3.234331968065409e-05, + "loss": 0.344, + "num_input_tokens_seen": 9912160, + "step": 9855 + }, + { + "epoch": 4.648750589344649, + "grad_norm": 0.3353634476661682, + "learning_rate": 3.232365266885197e-05, + "loss": 0.3566, + "num_input_tokens_seen": 9916800, + "step": 9860 + }, + { + "epoch": 4.651107967939651, + "grad_norm": 0.45090165734291077, + "learning_rate": 3.230398069796987e-05, + "loss": 0.2603, + "num_input_tokens_seen": 9922016, + "step": 9865 + }, + { + "epoch": 4.653465346534653, + "grad_norm": 0.4330595135688782, + "learning_rate": 3.2284303781328303e-05, + "loss": 0.2714, + "num_input_tokens_seen": 9926720, + "step": 9870 + }, + { + "epoch": 4.655822725129656, + "grad_norm": 0.6311883330345154, + "learning_rate": 3.226462193225116e-05, + "loss": 0.3286, + "num_input_tokens_seen": 9931392, + "step": 9875 + }, + { + "epoch": 4.658180103724658, + "grad_norm": 0.29352840781211853, + "learning_rate": 3.224493516406563e-05, + "loss": 0.3428, + "num_input_tokens_seen": 9936576, + "step": 9880 + }, + { + "epoch": 4.66053748231966, + "grad_norm": 0.3649916350841522, + "learning_rate": 3.222524349010226e-05, + "loss": 0.3515, + "num_input_tokens_seen": 9941824, + "step": 9885 + }, + { + "epoch": 4.662894860914663, + "grad_norm": 0.32959672808647156, + "learning_rate": 3.220554692369492e-05, + "loss": 0.3343, + "num_input_tokens_seen": 9946688, + "step": 9890 + }, + { + "epoch": 4.665252239509665, + "grad_norm": 0.29050886631011963, + "learning_rate": 3.218584547818078e-05, + "loss": 0.3654, + "num_input_tokens_seen": 9953984, + "step": 9895 + }, + { + "epoch": 4.667609618104668, + "grad_norm": 0.2713654041290283, + "learning_rate": 3.216613916690032e-05, + "loss": 0.2958, + "num_input_tokens_seen": 9960032, + "step": 9900 + }, + { + "epoch": 4.66996699669967, + "grad_norm": 0.3470735549926758, + "learning_rate": 3.2146428003197317e-05, + "loss": 0.4007, + "num_input_tokens_seen": 9964704, + "step": 9905 + }, + { + "epoch": 4.672324375294672, + "grad_norm": 0.5108291506767273, + "learning_rate": 3.212671200041883e-05, + "loss": 0.3429, + "num_input_tokens_seen": 9970368, + "step": 9910 + }, + { + "epoch": 4.674681753889675, + "grad_norm": 0.6036664843559265, + "learning_rate": 3.210699117191521e-05, + "loss": 0.3085, + "num_input_tokens_seen": 9974880, + "step": 9915 + }, + { + "epoch": 4.677039132484677, + "grad_norm": 0.27084195613861084, + "learning_rate": 3.208726553104005e-05, + "loss": 0.323, + "num_input_tokens_seen": 9979936, + "step": 9920 + }, + { + "epoch": 4.67939651107968, + "grad_norm": 0.31859537959098816, + "learning_rate": 3.206753509115021e-05, + "loss": 0.3582, + "num_input_tokens_seen": 9986048, + "step": 9925 + }, + { + "epoch": 4.681753889674682, + "grad_norm": 0.34273675084114075, + "learning_rate": 3.204779986560581e-05, + "loss": 0.3208, + "num_input_tokens_seen": 9991072, + "step": 9930 + }, + { + "epoch": 4.684111268269684, + "grad_norm": 0.4195269048213959, + "learning_rate": 3.20280598677702e-05, + "loss": 0.3635, + "num_input_tokens_seen": 9995904, + "step": 9935 + }, + { + "epoch": 4.686468646864687, + "grad_norm": 0.2682470977306366, + "learning_rate": 3.200831511100998e-05, + "loss": 0.3156, + "num_input_tokens_seen": 10000000, + "step": 9940 + }, + { + "epoch": 4.688826025459689, + "grad_norm": 0.39105677604675293, + "learning_rate": 3.198856560869493e-05, + "loss": 0.3875, + "num_input_tokens_seen": 10005248, + "step": 9945 + }, + { + "epoch": 4.691183404054691, + "grad_norm": 0.30107393860816956, + "learning_rate": 3.196881137419809e-05, + "loss": 0.2715, + "num_input_tokens_seen": 10010688, + "step": 9950 + }, + { + "epoch": 4.693540782649693, + "grad_norm": 0.340632826089859, + "learning_rate": 3.194905242089568e-05, + "loss": 0.3284, + "num_input_tokens_seen": 10015392, + "step": 9955 + }, + { + "epoch": 4.6958981612446955, + "grad_norm": 0.41201427578926086, + "learning_rate": 3.192928876216714e-05, + "loss": 0.3294, + "num_input_tokens_seen": 10020512, + "step": 9960 + }, + { + "epoch": 4.698255539839698, + "grad_norm": 0.3091670572757721, + "learning_rate": 3.190952041139504e-05, + "loss": 0.282, + "num_input_tokens_seen": 10025344, + "step": 9965 + }, + { + "epoch": 4.7006129184347, + "grad_norm": 0.3980759084224701, + "learning_rate": 3.1889747381965194e-05, + "loss": 0.4114, + "num_input_tokens_seen": 10032320, + "step": 9970 + }, + { + "epoch": 4.702970297029703, + "grad_norm": 0.3451457619667053, + "learning_rate": 3.1869969687266535e-05, + "loss": 0.4097, + "num_input_tokens_seen": 10037408, + "step": 9975 + }, + { + "epoch": 4.705327675624705, + "grad_norm": 0.3202471137046814, + "learning_rate": 3.1850187340691186e-05, + "loss": 0.3674, + "num_input_tokens_seen": 10041984, + "step": 9980 + }, + { + "epoch": 4.7076850542197075, + "grad_norm": 0.32046520709991455, + "learning_rate": 3.18304003556344e-05, + "loss": 0.3289, + "num_input_tokens_seen": 10047296, + "step": 9985 + }, + { + "epoch": 4.71004243281471, + "grad_norm": 0.30060672760009766, + "learning_rate": 3.1810608745494593e-05, + "loss": 0.3323, + "num_input_tokens_seen": 10052608, + "step": 9990 + }, + { + "epoch": 4.712399811409712, + "grad_norm": 0.3985671401023865, + "learning_rate": 3.1790812523673275e-05, + "loss": 0.3383, + "num_input_tokens_seen": 10058656, + "step": 9995 + }, + { + "epoch": 4.714757190004715, + "grad_norm": 0.37636786699295044, + "learning_rate": 3.177101170357513e-05, + "loss": 0.3276, + "num_input_tokens_seen": 10063008, + "step": 10000 + }, + { + "epoch": 4.717114568599717, + "grad_norm": 0.3758547902107239, + "learning_rate": 3.175120629860791e-05, + "loss": 0.3225, + "num_input_tokens_seen": 10067008, + "step": 10005 + }, + { + "epoch": 4.7194719471947195, + "grad_norm": 0.28063446283340454, + "learning_rate": 3.1731396322182495e-05, + "loss": 0.3064, + "num_input_tokens_seen": 10071360, + "step": 10010 + }, + { + "epoch": 4.721829325789722, + "grad_norm": 0.3412522077560425, + "learning_rate": 3.171158178771285e-05, + "loss": 0.2958, + "num_input_tokens_seen": 10075840, + "step": 10015 + }, + { + "epoch": 4.724186704384724, + "grad_norm": 0.8482480645179749, + "learning_rate": 3.169176270861604e-05, + "loss": 0.38, + "num_input_tokens_seen": 10081952, + "step": 10020 + }, + { + "epoch": 4.726544082979727, + "grad_norm": 0.28666532039642334, + "learning_rate": 3.1671939098312184e-05, + "loss": 0.322, + "num_input_tokens_seen": 10086624, + "step": 10025 + }, + { + "epoch": 4.728901461574729, + "grad_norm": 0.5562250018119812, + "learning_rate": 3.16521109702245e-05, + "loss": 0.3616, + "num_input_tokens_seen": 10090464, + "step": 10030 + }, + { + "epoch": 4.7312588401697315, + "grad_norm": 0.24457742273807526, + "learning_rate": 3.1632278337779255e-05, + "loss": 0.3438, + "num_input_tokens_seen": 10095008, + "step": 10035 + }, + { + "epoch": 4.733616218764734, + "grad_norm": 0.3257565498352051, + "learning_rate": 3.161244121440573e-05, + "loss": 0.3684, + "num_input_tokens_seen": 10099712, + "step": 10040 + }, + { + "epoch": 4.735973597359736, + "grad_norm": 0.3742204010486603, + "learning_rate": 3.159259961353631e-05, + "loss": 0.3295, + "num_input_tokens_seen": 10104544, + "step": 10045 + }, + { + "epoch": 4.738330975954739, + "grad_norm": 0.36499014496803284, + "learning_rate": 3.1572753548606355e-05, + "loss": 0.3062, + "num_input_tokens_seen": 10109472, + "step": 10050 + }, + { + "epoch": 4.740688354549741, + "grad_norm": 0.39067038893699646, + "learning_rate": 3.155290303305429e-05, + "loss": 0.336, + "num_input_tokens_seen": 10114400, + "step": 10055 + }, + { + "epoch": 4.7430457331447435, + "grad_norm": 0.4909605085849762, + "learning_rate": 3.153304808032152e-05, + "loss": 0.3348, + "num_input_tokens_seen": 10119008, + "step": 10060 + }, + { + "epoch": 4.745403111739745, + "grad_norm": 0.3934621810913086, + "learning_rate": 3.151318870385248e-05, + "loss": 0.3294, + "num_input_tokens_seen": 10123584, + "step": 10065 + }, + { + "epoch": 4.7477604903347475, + "grad_norm": 0.30337047576904297, + "learning_rate": 3.149332491709458e-05, + "loss": 0.3052, + "num_input_tokens_seen": 10128512, + "step": 10070 + }, + { + "epoch": 4.75011786892975, + "grad_norm": 0.3055962920188904, + "learning_rate": 3.147345673349824e-05, + "loss": 0.2987, + "num_input_tokens_seen": 10133216, + "step": 10075 + }, + { + "epoch": 4.752475247524752, + "grad_norm": 0.37554430961608887, + "learning_rate": 3.1453584166516834e-05, + "loss": 0.2979, + "num_input_tokens_seen": 10140288, + "step": 10080 + }, + { + "epoch": 4.754832626119755, + "grad_norm": 0.41001585125923157, + "learning_rate": 3.143370722960672e-05, + "loss": 0.3575, + "num_input_tokens_seen": 10145632, + "step": 10085 + }, + { + "epoch": 4.757190004714757, + "grad_norm": 0.2585221230983734, + "learning_rate": 3.1413825936227206e-05, + "loss": 0.4577, + "num_input_tokens_seen": 10150720, + "step": 10090 + }, + { + "epoch": 4.7595473833097595, + "grad_norm": 0.3758082389831543, + "learning_rate": 3.139394029984056e-05, + "loss": 0.3329, + "num_input_tokens_seen": 10155264, + "step": 10095 + }, + { + "epoch": 4.761904761904762, + "grad_norm": 0.27375635504722595, + "learning_rate": 3.137405033391197e-05, + "loss": 0.3158, + "num_input_tokens_seen": 10159744, + "step": 10100 + }, + { + "epoch": 4.764262140499764, + "grad_norm": 0.30954647064208984, + "learning_rate": 3.1354156051909586e-05, + "loss": 0.3369, + "num_input_tokens_seen": 10164256, + "step": 10105 + }, + { + "epoch": 4.766619519094767, + "grad_norm": 0.47905170917510986, + "learning_rate": 3.133425746730445e-05, + "loss": 0.3894, + "num_input_tokens_seen": 10169088, + "step": 10110 + }, + { + "epoch": 4.768976897689769, + "grad_norm": 0.4606330692768097, + "learning_rate": 3.1314354593570535e-05, + "loss": 0.3213, + "num_input_tokens_seen": 10175008, + "step": 10115 + }, + { + "epoch": 4.7713342762847715, + "grad_norm": 0.36082032322883606, + "learning_rate": 3.129444744418472e-05, + "loss": 0.4136, + "num_input_tokens_seen": 10180032, + "step": 10120 + }, + { + "epoch": 4.773691654879774, + "grad_norm": 0.31009724736213684, + "learning_rate": 3.1274536032626766e-05, + "loss": 0.3058, + "num_input_tokens_seen": 10185600, + "step": 10125 + }, + { + "epoch": 4.776049033474776, + "grad_norm": 0.48915815353393555, + "learning_rate": 3.125462037237934e-05, + "loss": 0.3038, + "num_input_tokens_seen": 10191296, + "step": 10130 + }, + { + "epoch": 4.778406412069779, + "grad_norm": 0.525700032711029, + "learning_rate": 3.123470047692796e-05, + "loss": 0.3295, + "num_input_tokens_seen": 10196512, + "step": 10135 + }, + { + "epoch": 4.780763790664781, + "grad_norm": 0.3457680642604828, + "learning_rate": 3.121477635976104e-05, + "loss": 0.3801, + "num_input_tokens_seen": 10201888, + "step": 10140 + }, + { + "epoch": 4.7831211692597835, + "grad_norm": 0.4480958580970764, + "learning_rate": 3.119484803436982e-05, + "loss": 0.3527, + "num_input_tokens_seen": 10207552, + "step": 10145 + }, + { + "epoch": 4.785478547854785, + "grad_norm": 0.4894905686378479, + "learning_rate": 3.117491551424843e-05, + "loss": 0.3229, + "num_input_tokens_seen": 10213664, + "step": 10150 + }, + { + "epoch": 4.787835926449787, + "grad_norm": 0.363052099943161, + "learning_rate": 3.1154978812893795e-05, + "loss": 0.3223, + "num_input_tokens_seen": 10219360, + "step": 10155 + }, + { + "epoch": 4.79019330504479, + "grad_norm": 0.36013320088386536, + "learning_rate": 3.1135037943805717e-05, + "loss": 0.2979, + "num_input_tokens_seen": 10224768, + "step": 10160 + }, + { + "epoch": 4.792550683639792, + "grad_norm": 0.3564627766609192, + "learning_rate": 3.1115092920486785e-05, + "loss": 0.3521, + "num_input_tokens_seen": 10230208, + "step": 10165 + }, + { + "epoch": 4.794908062234795, + "grad_norm": 0.7450299263000488, + "learning_rate": 3.109514375644242e-05, + "loss": 0.3219, + "num_input_tokens_seen": 10235424, + "step": 10170 + }, + { + "epoch": 4.797265440829797, + "grad_norm": 0.5464006066322327, + "learning_rate": 3.107519046518083e-05, + "loss": 0.3557, + "num_input_tokens_seen": 10239488, + "step": 10175 + }, + { + "epoch": 4.799622819424799, + "grad_norm": 0.2781592607498169, + "learning_rate": 3.1055233060213035e-05, + "loss": 0.3453, + "num_input_tokens_seen": 10244608, + "step": 10180 + }, + { + "epoch": 4.801980198019802, + "grad_norm": 0.5569091439247131, + "learning_rate": 3.103527155505283e-05, + "loss": 0.3636, + "num_input_tokens_seen": 10250240, + "step": 10185 + }, + { + "epoch": 4.804337576614804, + "grad_norm": 0.4552147090435028, + "learning_rate": 3.10153059632168e-05, + "loss": 0.3626, + "num_input_tokens_seen": 10254784, + "step": 10190 + }, + { + "epoch": 4.806694955209807, + "grad_norm": 0.3537598252296448, + "learning_rate": 3.099533629822428e-05, + "loss": 0.3414, + "num_input_tokens_seen": 10260448, + "step": 10195 + }, + { + "epoch": 4.809052333804809, + "grad_norm": 0.4474385976791382, + "learning_rate": 3.097536257359737e-05, + "loss": 0.3229, + "num_input_tokens_seen": 10264352, + "step": 10200 + }, + { + "epoch": 4.811409712399811, + "grad_norm": 0.32169008255004883, + "learning_rate": 3.095538480286091e-05, + "loss": 0.2972, + "num_input_tokens_seen": 10269952, + "step": 10205 + }, + { + "epoch": 4.813767090994814, + "grad_norm": 0.4425453841686249, + "learning_rate": 3.093540299954251e-05, + "loss": 0.3262, + "num_input_tokens_seen": 10275040, + "step": 10210 + }, + { + "epoch": 4.816124469589816, + "grad_norm": 0.3907044231891632, + "learning_rate": 3.091541717717247e-05, + "loss": 0.325, + "num_input_tokens_seen": 10280032, + "step": 10215 + }, + { + "epoch": 4.818481848184819, + "grad_norm": 0.4085279107093811, + "learning_rate": 3.089542734928385e-05, + "loss": 0.3239, + "num_input_tokens_seen": 10285152, + "step": 10220 + }, + { + "epoch": 4.820839226779821, + "grad_norm": 0.32790499925613403, + "learning_rate": 3.0875433529412386e-05, + "loss": 0.3692, + "num_input_tokens_seen": 10290144, + "step": 10225 + }, + { + "epoch": 4.823196605374823, + "grad_norm": 0.4232173264026642, + "learning_rate": 3.085543573109655e-05, + "loss": 0.327, + "num_input_tokens_seen": 10295872, + "step": 10230 + }, + { + "epoch": 4.825553983969826, + "grad_norm": 0.45572271943092346, + "learning_rate": 3.0835433967877474e-05, + "loss": 0.3536, + "num_input_tokens_seen": 10300768, + "step": 10235 + }, + { + "epoch": 4.827911362564828, + "grad_norm": 0.7063683271408081, + "learning_rate": 3.081542825329901e-05, + "loss": 0.3343, + "num_input_tokens_seen": 10306080, + "step": 10240 + }, + { + "epoch": 4.830268741159831, + "grad_norm": 0.49935978651046753, + "learning_rate": 3.079541860090765e-05, + "loss": 0.3372, + "num_input_tokens_seen": 10311552, + "step": 10245 + }, + { + "epoch": 4.832626119754833, + "grad_norm": 0.40795430541038513, + "learning_rate": 3.077540502425258e-05, + "loss": 0.3263, + "num_input_tokens_seen": 10316736, + "step": 10250 + }, + { + "epoch": 4.834983498349835, + "grad_norm": 0.2694697380065918, + "learning_rate": 3.075538753688565e-05, + "loss": 0.3252, + "num_input_tokens_seen": 10320960, + "step": 10255 + }, + { + "epoch": 4.837340876944838, + "grad_norm": 0.4005540907382965, + "learning_rate": 3.073536615236132e-05, + "loss": 0.3349, + "num_input_tokens_seen": 10326336, + "step": 10260 + }, + { + "epoch": 4.839698255539839, + "grad_norm": 0.31836366653442383, + "learning_rate": 3.071534088423672e-05, + "loss": 0.3369, + "num_input_tokens_seen": 10331136, + "step": 10265 + }, + { + "epoch": 4.842055634134842, + "grad_norm": 0.4494684934616089, + "learning_rate": 3.06953117460716e-05, + "loss": 0.3228, + "num_input_tokens_seen": 10336416, + "step": 10270 + }, + { + "epoch": 4.844413012729844, + "grad_norm": 0.2866079807281494, + "learning_rate": 3.067527875142834e-05, + "loss": 0.3356, + "num_input_tokens_seen": 10340928, + "step": 10275 + }, + { + "epoch": 4.8467703913248465, + "grad_norm": 0.43424975872039795, + "learning_rate": 3.0655241913871914e-05, + "loss": 0.3501, + "num_input_tokens_seen": 10346080, + "step": 10280 + }, + { + "epoch": 4.849127769919849, + "grad_norm": 0.4052906036376953, + "learning_rate": 3.06352012469699e-05, + "loss": 0.3256, + "num_input_tokens_seen": 10350080, + "step": 10285 + }, + { + "epoch": 4.851485148514851, + "grad_norm": 0.2741474509239197, + "learning_rate": 3.061515676429248e-05, + "loss": 0.3326, + "num_input_tokens_seen": 10354880, + "step": 10290 + }, + { + "epoch": 4.853842527109854, + "grad_norm": 0.3434765040874481, + "learning_rate": 3.059510847941244e-05, + "loss": 0.3264, + "num_input_tokens_seen": 10359200, + "step": 10295 + }, + { + "epoch": 4.856199905704856, + "grad_norm": 0.36519449949264526, + "learning_rate": 3.057505640590508e-05, + "loss": 0.2992, + "num_input_tokens_seen": 10364032, + "step": 10300 + }, + { + "epoch": 4.858557284299859, + "grad_norm": 0.42669427394866943, + "learning_rate": 3.0555000557348324e-05, + "loss": 0.3474, + "num_input_tokens_seen": 10369792, + "step": 10305 + }, + { + "epoch": 4.860914662894861, + "grad_norm": 0.3908052146434784, + "learning_rate": 3.053494094732262e-05, + "loss": 0.3527, + "num_input_tokens_seen": 10375264, + "step": 10310 + }, + { + "epoch": 4.863272041489863, + "grad_norm": 0.482798308134079, + "learning_rate": 3.0514877589410985e-05, + "loss": 0.3055, + "num_input_tokens_seen": 10379712, + "step": 10315 + }, + { + "epoch": 4.865629420084866, + "grad_norm": 0.40534651279449463, + "learning_rate": 3.0494810497198944e-05, + "loss": 0.364, + "num_input_tokens_seen": 10385024, + "step": 10320 + }, + { + "epoch": 4.867986798679868, + "grad_norm": 0.4970194399356842, + "learning_rate": 3.0474739684274582e-05, + "loss": 0.3117, + "num_input_tokens_seen": 10390272, + "step": 10325 + }, + { + "epoch": 4.870344177274871, + "grad_norm": 0.5311806797981262, + "learning_rate": 3.045466516422848e-05, + "loss": 0.324, + "num_input_tokens_seen": 10394368, + "step": 10330 + }, + { + "epoch": 4.872701555869873, + "grad_norm": 0.3596271574497223, + "learning_rate": 3.043458695065376e-05, + "loss": 0.3513, + "num_input_tokens_seen": 10401056, + "step": 10335 + }, + { + "epoch": 4.875058934464875, + "grad_norm": 0.5273321866989136, + "learning_rate": 3.0414505057145998e-05, + "loss": 0.3215, + "num_input_tokens_seen": 10405408, + "step": 10340 + }, + { + "epoch": 4.877416313059878, + "grad_norm": 0.31102240085601807, + "learning_rate": 3.03944194973033e-05, + "loss": 0.3235, + "num_input_tokens_seen": 10410208, + "step": 10345 + }, + { + "epoch": 4.879773691654879, + "grad_norm": 0.3713321089744568, + "learning_rate": 3.037433028472624e-05, + "loss": 0.2951, + "num_input_tokens_seen": 10416000, + "step": 10350 + }, + { + "epoch": 4.882131070249882, + "grad_norm": 0.47417041659355164, + "learning_rate": 3.0354237433017872e-05, + "loss": 0.3276, + "num_input_tokens_seen": 10421120, + "step": 10355 + }, + { + "epoch": 4.884488448844884, + "grad_norm": 0.6182376742362976, + "learning_rate": 3.03341409557837e-05, + "loss": 0.3671, + "num_input_tokens_seen": 10425376, + "step": 10360 + }, + { + "epoch": 4.8868458274398865, + "grad_norm": 0.298306941986084, + "learning_rate": 3.0314040866631708e-05, + "loss": 0.3128, + "num_input_tokens_seen": 10431392, + "step": 10365 + }, + { + "epoch": 4.889203206034889, + "grad_norm": 0.31490248441696167, + "learning_rate": 3.02939371791723e-05, + "loss": 0.3808, + "num_input_tokens_seen": 10435968, + "step": 10370 + }, + { + "epoch": 4.891560584629891, + "grad_norm": 0.25548115372657776, + "learning_rate": 3.027382990701833e-05, + "loss": 0.342, + "num_input_tokens_seen": 10440544, + "step": 10375 + }, + { + "epoch": 4.893917963224894, + "grad_norm": 0.3817603290081024, + "learning_rate": 3.0253719063785074e-05, + "loss": 0.3437, + "num_input_tokens_seen": 10445408, + "step": 10380 + }, + { + "epoch": 4.896275341819896, + "grad_norm": 0.3664526641368866, + "learning_rate": 3.0233604663090238e-05, + "loss": 0.3118, + "num_input_tokens_seen": 10449888, + "step": 10385 + }, + { + "epoch": 4.8986327204148985, + "grad_norm": 0.35782700777053833, + "learning_rate": 3.0213486718553912e-05, + "loss": 0.3517, + "num_input_tokens_seen": 10455840, + "step": 10390 + }, + { + "epoch": 4.900990099009901, + "grad_norm": 0.3383384943008423, + "learning_rate": 3.019336524379861e-05, + "loss": 0.4046, + "num_input_tokens_seen": 10461280, + "step": 10395 + }, + { + "epoch": 4.903347477604903, + "grad_norm": 0.5603818893432617, + "learning_rate": 3.017324025244923e-05, + "loss": 0.3637, + "num_input_tokens_seen": 10467136, + "step": 10400 + }, + { + "epoch": 4.905704856199906, + "grad_norm": 0.5872529149055481, + "learning_rate": 3.0153111758133046e-05, + "loss": 0.3486, + "num_input_tokens_seen": 10472736, + "step": 10405 + }, + { + "epoch": 4.908062234794908, + "grad_norm": 0.3989713788032532, + "learning_rate": 3.0132979774479707e-05, + "loss": 0.3292, + "num_input_tokens_seen": 10478112, + "step": 10410 + }, + { + "epoch": 4.9104196133899105, + "grad_norm": 0.3427045941352844, + "learning_rate": 3.0112844315121218e-05, + "loss": 0.3101, + "num_input_tokens_seen": 10483456, + "step": 10415 + }, + { + "epoch": 4.912776991984913, + "grad_norm": 0.2850785553455353, + "learning_rate": 3.0092705393691957e-05, + "loss": 0.3324, + "num_input_tokens_seen": 10488704, + "step": 10420 + }, + { + "epoch": 4.915134370579915, + "grad_norm": 0.29116392135620117, + "learning_rate": 3.0072563023828622e-05, + "loss": 0.3096, + "num_input_tokens_seen": 10492960, + "step": 10425 + }, + { + "epoch": 4.917491749174918, + "grad_norm": 0.36261001229286194, + "learning_rate": 3.0052417219170263e-05, + "loss": 0.3674, + "num_input_tokens_seen": 10498592, + "step": 10430 + }, + { + "epoch": 4.91984912776992, + "grad_norm": 0.5486805438995361, + "learning_rate": 3.0032267993358248e-05, + "loss": 0.3376, + "num_input_tokens_seen": 10503232, + "step": 10435 + }, + { + "epoch": 4.9222065063649225, + "grad_norm": 0.32996952533721924, + "learning_rate": 3.0012115360036265e-05, + "loss": 0.349, + "num_input_tokens_seen": 10508448, + "step": 10440 + }, + { + "epoch": 4.924563884959925, + "grad_norm": 0.2840794026851654, + "learning_rate": 2.9991959332850305e-05, + "loss": 0.3207, + "num_input_tokens_seen": 10513184, + "step": 10445 + }, + { + "epoch": 4.926921263554927, + "grad_norm": 0.5541236996650696, + "learning_rate": 2.9971799925448673e-05, + "loss": 0.3015, + "num_input_tokens_seen": 10518208, + "step": 10450 + }, + { + "epoch": 4.92927864214993, + "grad_norm": 0.3308633863925934, + "learning_rate": 2.9951637151481926e-05, + "loss": 0.3267, + "num_input_tokens_seen": 10522912, + "step": 10455 + }, + { + "epoch": 4.931636020744932, + "grad_norm": 0.3368236720561981, + "learning_rate": 2.9931471024602946e-05, + "loss": 0.3628, + "num_input_tokens_seen": 10527200, + "step": 10460 + }, + { + "epoch": 4.933993399339934, + "grad_norm": 0.24189625680446625, + "learning_rate": 2.9911301558466853e-05, + "loss": 0.2936, + "num_input_tokens_seen": 10532448, + "step": 10465 + }, + { + "epoch": 4.936350777934936, + "grad_norm": 0.5244729518890381, + "learning_rate": 2.989112876673104e-05, + "loss": 0.3678, + "num_input_tokens_seen": 10537536, + "step": 10470 + }, + { + "epoch": 4.938708156529938, + "grad_norm": 0.3747365474700928, + "learning_rate": 2.9870952663055153e-05, + "loss": 0.3576, + "num_input_tokens_seen": 10542784, + "step": 10475 + }, + { + "epoch": 4.941065535124941, + "grad_norm": 0.5217799544334412, + "learning_rate": 2.9850773261101082e-05, + "loss": 0.3472, + "num_input_tokens_seen": 10547616, + "step": 10480 + }, + { + "epoch": 4.943422913719943, + "grad_norm": 0.4534219801425934, + "learning_rate": 2.983059057453294e-05, + "loss": 0.3463, + "num_input_tokens_seen": 10553952, + "step": 10485 + }, + { + "epoch": 4.945780292314946, + "grad_norm": 0.3068934679031372, + "learning_rate": 2.981040461701708e-05, + "loss": 0.2843, + "num_input_tokens_seen": 10560160, + "step": 10490 + }, + { + "epoch": 4.948137670909948, + "grad_norm": 0.3469199538230896, + "learning_rate": 2.9790215402222048e-05, + "loss": 0.3632, + "num_input_tokens_seen": 10565760, + "step": 10495 + }, + { + "epoch": 4.9504950495049505, + "grad_norm": 0.3261546790599823, + "learning_rate": 2.977002294381862e-05, + "loss": 0.3015, + "num_input_tokens_seen": 10572256, + "step": 10500 + }, + { + "epoch": 4.952852428099953, + "grad_norm": 0.41578757762908936, + "learning_rate": 2.9749827255479755e-05, + "loss": 0.3104, + "num_input_tokens_seen": 10576544, + "step": 10505 + }, + { + "epoch": 4.955209806694955, + "grad_norm": 0.3870900869369507, + "learning_rate": 2.972962835088059e-05, + "loss": 0.3686, + "num_input_tokens_seen": 10581632, + "step": 10510 + }, + { + "epoch": 4.957567185289958, + "grad_norm": 0.28176215291023254, + "learning_rate": 2.970942624369847e-05, + "loss": 0.3274, + "num_input_tokens_seen": 10586240, + "step": 10515 + }, + { + "epoch": 4.95992456388496, + "grad_norm": 0.3348817527294159, + "learning_rate": 2.9689220947612873e-05, + "loss": 0.3643, + "num_input_tokens_seen": 10592000, + "step": 10520 + }, + { + "epoch": 4.9622819424799625, + "grad_norm": 0.4259524345397949, + "learning_rate": 2.9669012476305457e-05, + "loss": 0.3854, + "num_input_tokens_seen": 10597056, + "step": 10525 + }, + { + "epoch": 4.964639321074965, + "grad_norm": 0.5532686710357666, + "learning_rate": 2.964880084346003e-05, + "loss": 0.3453, + "num_input_tokens_seen": 10601568, + "step": 10530 + }, + { + "epoch": 4.966996699669967, + "grad_norm": 0.4579300284385681, + "learning_rate": 2.962858606276253e-05, + "loss": 0.3164, + "num_input_tokens_seen": 10606880, + "step": 10535 + }, + { + "epoch": 4.96935407826497, + "grad_norm": 0.3809233605861664, + "learning_rate": 2.9608368147901038e-05, + "loss": 0.3699, + "num_input_tokens_seen": 10611392, + "step": 10540 + }, + { + "epoch": 4.971711456859972, + "grad_norm": 0.39290642738342285, + "learning_rate": 2.9588147112565756e-05, + "loss": 0.3206, + "num_input_tokens_seen": 10615968, + "step": 10545 + }, + { + "epoch": 4.974068835454974, + "grad_norm": 0.39894402027130127, + "learning_rate": 2.956792297044898e-05, + "loss": 0.3695, + "num_input_tokens_seen": 10621120, + "step": 10550 + }, + { + "epoch": 4.976426214049976, + "grad_norm": 0.4855651557445526, + "learning_rate": 2.9547695735245145e-05, + "loss": 0.3549, + "num_input_tokens_seen": 10625504, + "step": 10555 + }, + { + "epoch": 4.978783592644978, + "grad_norm": 0.35521459579467773, + "learning_rate": 2.9527465420650747e-05, + "loss": 0.3442, + "num_input_tokens_seen": 10630560, + "step": 10560 + }, + { + "epoch": 4.981140971239981, + "grad_norm": 0.3370521068572998, + "learning_rate": 2.9507232040364385e-05, + "loss": 0.3567, + "num_input_tokens_seen": 10635872, + "step": 10565 + }, + { + "epoch": 4.983498349834983, + "grad_norm": 0.30425816774368286, + "learning_rate": 2.9486995608086725e-05, + "loss": 0.3114, + "num_input_tokens_seen": 10641184, + "step": 10570 + }, + { + "epoch": 4.985855728429986, + "grad_norm": 0.432649701833725, + "learning_rate": 2.9466756137520512e-05, + "loss": 0.304, + "num_input_tokens_seen": 10646304, + "step": 10575 + }, + { + "epoch": 4.988213107024988, + "grad_norm": 0.4159397780895233, + "learning_rate": 2.9446513642370528e-05, + "loss": 0.2706, + "num_input_tokens_seen": 10651072, + "step": 10580 + }, + { + "epoch": 4.99057048561999, + "grad_norm": 0.5388597846031189, + "learning_rate": 2.9426268136343633e-05, + "loss": 0.3467, + "num_input_tokens_seen": 10656864, + "step": 10585 + }, + { + "epoch": 4.992927864214993, + "grad_norm": 0.3605559766292572, + "learning_rate": 2.9406019633148686e-05, + "loss": 0.399, + "num_input_tokens_seen": 10662112, + "step": 10590 + }, + { + "epoch": 4.995285242809995, + "grad_norm": 0.3036418855190277, + "learning_rate": 2.9385768146496616e-05, + "loss": 0.2937, + "num_input_tokens_seen": 10666560, + "step": 10595 + }, + { + "epoch": 4.997642621404998, + "grad_norm": 0.3285835087299347, + "learning_rate": 2.9365513690100345e-05, + "loss": 0.3215, + "num_input_tokens_seen": 10672032, + "step": 10600 + }, + { + "epoch": 5.0, + "grad_norm": 0.31006506085395813, + "learning_rate": 2.934525627767482e-05, + "loss": 0.3282, + "num_input_tokens_seen": 10677088, + "step": 10605 + }, + { + "epoch": 5.002357378595002, + "grad_norm": 0.4930444061756134, + "learning_rate": 2.9324995922936977e-05, + "loss": 0.3826, + "num_input_tokens_seen": 10682528, + "step": 10610 + }, + { + "epoch": 5.002357378595002, + "eval_loss": 0.33156171441078186, + "eval_runtime": 25.6149, + "eval_samples_per_second": 36.815, + "eval_steps_per_second": 9.213, + "num_input_tokens_seen": 10682528, + "step": 10610 + }, + { + "epoch": 5.004714757190005, + "grad_norm": 0.3123442828655243, + "learning_rate": 2.9304732639605764e-05, + "loss": 0.3426, + "num_input_tokens_seen": 10687872, + "step": 10615 + }, + { + "epoch": 5.007072135785007, + "grad_norm": 0.23724493384361267, + "learning_rate": 2.9284466441402087e-05, + "loss": 0.3472, + "num_input_tokens_seen": 10692224, + "step": 10620 + }, + { + "epoch": 5.00942951438001, + "grad_norm": 0.34521394968032837, + "learning_rate": 2.9264197342048837e-05, + "loss": 0.3163, + "num_input_tokens_seen": 10697248, + "step": 10625 + }, + { + "epoch": 5.011786892975012, + "grad_norm": 0.2959251403808594, + "learning_rate": 2.924392535527089e-05, + "loss": 0.2984, + "num_input_tokens_seen": 10702496, + "step": 10630 + }, + { + "epoch": 5.014144271570014, + "grad_norm": 0.28334590792655945, + "learning_rate": 2.9223650494795034e-05, + "loss": 0.3434, + "num_input_tokens_seen": 10706272, + "step": 10635 + }, + { + "epoch": 5.016501650165017, + "grad_norm": 0.3547787666320801, + "learning_rate": 2.9203372774350042e-05, + "loss": 0.3004, + "num_input_tokens_seen": 10710528, + "step": 10640 + }, + { + "epoch": 5.018859028760019, + "grad_norm": 0.34258922934532166, + "learning_rate": 2.91830922076666e-05, + "loss": 0.3191, + "num_input_tokens_seen": 10715488, + "step": 10645 + }, + { + "epoch": 5.021216407355022, + "grad_norm": 0.3021244406700134, + "learning_rate": 2.9162808808477342e-05, + "loss": 0.2998, + "num_input_tokens_seen": 10720576, + "step": 10650 + }, + { + "epoch": 5.023573785950024, + "grad_norm": 0.1748412847518921, + "learning_rate": 2.9142522590516796e-05, + "loss": 0.2623, + "num_input_tokens_seen": 10726304, + "step": 10655 + }, + { + "epoch": 5.0259311645450255, + "grad_norm": 0.2504948675632477, + "learning_rate": 2.9122233567521416e-05, + "loss": 0.2814, + "num_input_tokens_seen": 10732576, + "step": 10660 + }, + { + "epoch": 5.028288543140028, + "grad_norm": 0.4815309941768646, + "learning_rate": 2.9101941753229546e-05, + "loss": 0.3151, + "num_input_tokens_seen": 10737632, + "step": 10665 + }, + { + "epoch": 5.03064592173503, + "grad_norm": 0.3929275572299957, + "learning_rate": 2.9081647161381425e-05, + "loss": 0.2515, + "num_input_tokens_seen": 10741856, + "step": 10670 + }, + { + "epoch": 5.033003300330033, + "grad_norm": 0.296443372964859, + "learning_rate": 2.906134980571918e-05, + "loss": 0.3555, + "num_input_tokens_seen": 10746208, + "step": 10675 + }, + { + "epoch": 5.035360678925035, + "grad_norm": 0.5984843969345093, + "learning_rate": 2.9041049699986794e-05, + "loss": 0.4822, + "num_input_tokens_seen": 10750944, + "step": 10680 + }, + { + "epoch": 5.0377180575200375, + "grad_norm": 0.38284340500831604, + "learning_rate": 2.9020746857930108e-05, + "loss": 0.3747, + "num_input_tokens_seen": 10755200, + "step": 10685 + }, + { + "epoch": 5.04007543611504, + "grad_norm": 0.3170144557952881, + "learning_rate": 2.9000441293296854e-05, + "loss": 0.3184, + "num_input_tokens_seen": 10759392, + "step": 10690 + }, + { + "epoch": 5.042432814710042, + "grad_norm": 0.35387760400772095, + "learning_rate": 2.8980133019836554e-05, + "loss": 0.3519, + "num_input_tokens_seen": 10765216, + "step": 10695 + }, + { + "epoch": 5.044790193305045, + "grad_norm": 0.523595929145813, + "learning_rate": 2.8959822051300605e-05, + "loss": 0.3479, + "num_input_tokens_seen": 10771104, + "step": 10700 + }, + { + "epoch": 5.047147571900047, + "grad_norm": 0.7267592549324036, + "learning_rate": 2.893950840144221e-05, + "loss": 0.3655, + "num_input_tokens_seen": 10776064, + "step": 10705 + }, + { + "epoch": 5.0495049504950495, + "grad_norm": 0.41375482082366943, + "learning_rate": 2.89191920840164e-05, + "loss": 0.3236, + "num_input_tokens_seen": 10780960, + "step": 10710 + }, + { + "epoch": 5.051862329090052, + "grad_norm": 0.4715190827846527, + "learning_rate": 2.889887311277999e-05, + "loss": 0.3534, + "num_input_tokens_seen": 10785760, + "step": 10715 + }, + { + "epoch": 5.054219707685054, + "grad_norm": 0.3647102117538452, + "learning_rate": 2.8878551501491624e-05, + "loss": 0.3366, + "num_input_tokens_seen": 10790400, + "step": 10720 + }, + { + "epoch": 5.056577086280057, + "grad_norm": 0.35065752267837524, + "learning_rate": 2.8858227263911707e-05, + "loss": 0.3226, + "num_input_tokens_seen": 10795200, + "step": 10725 + }, + { + "epoch": 5.058934464875059, + "grad_norm": 0.4020078480243683, + "learning_rate": 2.8837900413802443e-05, + "loss": 0.3161, + "num_input_tokens_seen": 10800000, + "step": 10730 + }, + { + "epoch": 5.061291843470062, + "grad_norm": 0.3240717947483063, + "learning_rate": 2.881757096492777e-05, + "loss": 0.3282, + "num_input_tokens_seen": 10804000, + "step": 10735 + }, + { + "epoch": 5.063649222065064, + "grad_norm": 0.46127545833587646, + "learning_rate": 2.8797238931053432e-05, + "loss": 0.3252, + "num_input_tokens_seen": 10808896, + "step": 10740 + }, + { + "epoch": 5.066006600660066, + "grad_norm": 0.3387908935546875, + "learning_rate": 2.8776904325946884e-05, + "loss": 0.3058, + "num_input_tokens_seen": 10813664, + "step": 10745 + }, + { + "epoch": 5.068363979255069, + "grad_norm": 0.3102753460407257, + "learning_rate": 2.8756567163377356e-05, + "loss": 0.2563, + "num_input_tokens_seen": 10819264, + "step": 10750 + }, + { + "epoch": 5.07072135785007, + "grad_norm": 0.3256378173828125, + "learning_rate": 2.873622745711578e-05, + "loss": 0.3243, + "num_input_tokens_seen": 10824064, + "step": 10755 + }, + { + "epoch": 5.073078736445073, + "grad_norm": 0.35943832993507385, + "learning_rate": 2.8715885220934825e-05, + "loss": 0.3012, + "num_input_tokens_seen": 10829216, + "step": 10760 + }, + { + "epoch": 5.075436115040075, + "grad_norm": 0.8328166604042053, + "learning_rate": 2.869554046860887e-05, + "loss": 0.375, + "num_input_tokens_seen": 10835264, + "step": 10765 + }, + { + "epoch": 5.0777934936350775, + "grad_norm": 0.7837278246879578, + "learning_rate": 2.8675193213914014e-05, + "loss": 0.3864, + "num_input_tokens_seen": 10840384, + "step": 10770 + }, + { + "epoch": 5.08015087223008, + "grad_norm": 0.3895280063152313, + "learning_rate": 2.865484347062802e-05, + "loss": 0.3119, + "num_input_tokens_seen": 10845632, + "step": 10775 + }, + { + "epoch": 5.082508250825082, + "grad_norm": 0.2981898784637451, + "learning_rate": 2.8634491252530354e-05, + "loss": 0.3373, + "num_input_tokens_seen": 10850560, + "step": 10780 + }, + { + "epoch": 5.084865629420085, + "grad_norm": 0.3731767535209656, + "learning_rate": 2.8614136573402155e-05, + "loss": 0.3533, + "num_input_tokens_seen": 10855360, + "step": 10785 + }, + { + "epoch": 5.087223008015087, + "grad_norm": 0.27460119128227234, + "learning_rate": 2.8593779447026232e-05, + "loss": 0.3119, + "num_input_tokens_seen": 10862336, + "step": 10790 + }, + { + "epoch": 5.0895803866100895, + "grad_norm": 0.4602122902870178, + "learning_rate": 2.8573419887187047e-05, + "loss": 0.3439, + "num_input_tokens_seen": 10867264, + "step": 10795 + }, + { + "epoch": 5.091937765205092, + "grad_norm": 0.5756168365478516, + "learning_rate": 2.8553057907670706e-05, + "loss": 0.3268, + "num_input_tokens_seen": 10872384, + "step": 10800 + }, + { + "epoch": 5.094295143800094, + "grad_norm": 0.3490627706050873, + "learning_rate": 2.8532693522264963e-05, + "loss": 0.3284, + "num_input_tokens_seen": 10877760, + "step": 10805 + }, + { + "epoch": 5.096652522395097, + "grad_norm": 0.4264247715473175, + "learning_rate": 2.8512326744759193e-05, + "loss": 0.3888, + "num_input_tokens_seen": 10882816, + "step": 10810 + }, + { + "epoch": 5.099009900990099, + "grad_norm": 0.3561382293701172, + "learning_rate": 2.84919575889444e-05, + "loss": 0.3316, + "num_input_tokens_seen": 10887488, + "step": 10815 + }, + { + "epoch": 5.1013672795851015, + "grad_norm": 0.32500389218330383, + "learning_rate": 2.847158606861318e-05, + "loss": 0.2988, + "num_input_tokens_seen": 10892096, + "step": 10820 + }, + { + "epoch": 5.103724658180104, + "grad_norm": 0.40125036239624023, + "learning_rate": 2.845121219755976e-05, + "loss": 0.3603, + "num_input_tokens_seen": 10897952, + "step": 10825 + }, + { + "epoch": 5.106082036775106, + "grad_norm": 0.32944440841674805, + "learning_rate": 2.843083598957993e-05, + "loss": 0.3554, + "num_input_tokens_seen": 10903264, + "step": 10830 + }, + { + "epoch": 5.108439415370109, + "grad_norm": 0.3256315290927887, + "learning_rate": 2.8410457458471075e-05, + "loss": 0.2964, + "num_input_tokens_seen": 10908704, + "step": 10835 + }, + { + "epoch": 5.110796793965111, + "grad_norm": 0.3666996657848358, + "learning_rate": 2.839007661803215e-05, + "loss": 0.3343, + "num_input_tokens_seen": 10912928, + "step": 10840 + }, + { + "epoch": 5.1131541725601135, + "grad_norm": 0.33224087953567505, + "learning_rate": 2.836969348206368e-05, + "loss": 0.3222, + "num_input_tokens_seen": 10918560, + "step": 10845 + }, + { + "epoch": 5.115511551155116, + "grad_norm": 0.27541735768318176, + "learning_rate": 2.8349308064367736e-05, + "loss": 0.31, + "num_input_tokens_seen": 10922656, + "step": 10850 + }, + { + "epoch": 5.117868929750118, + "grad_norm": 0.29093703627586365, + "learning_rate": 2.832892037874794e-05, + "loss": 0.3499, + "num_input_tokens_seen": 10927232, + "step": 10855 + }, + { + "epoch": 5.12022630834512, + "grad_norm": 0.24403022229671478, + "learning_rate": 2.830853043900944e-05, + "loss": 0.2831, + "num_input_tokens_seen": 10932832, + "step": 10860 + }, + { + "epoch": 5.122583686940122, + "grad_norm": 0.5587069988250732, + "learning_rate": 2.828813825895893e-05, + "loss": 0.3859, + "num_input_tokens_seen": 10937888, + "step": 10865 + }, + { + "epoch": 5.124941065535125, + "grad_norm": 0.32919561862945557, + "learning_rate": 2.8267743852404598e-05, + "loss": 0.3098, + "num_input_tokens_seen": 10942720, + "step": 10870 + }, + { + "epoch": 5.127298444130127, + "grad_norm": 0.48484957218170166, + "learning_rate": 2.8247347233156164e-05, + "loss": 0.3087, + "num_input_tokens_seen": 10948288, + "step": 10875 + }, + { + "epoch": 5.129655822725129, + "grad_norm": 0.24355244636535645, + "learning_rate": 2.8226948415024813e-05, + "loss": 0.3485, + "num_input_tokens_seen": 10953152, + "step": 10880 + }, + { + "epoch": 5.132013201320132, + "grad_norm": 0.31361123919487, + "learning_rate": 2.820654741182326e-05, + "loss": 0.319, + "num_input_tokens_seen": 10958368, + "step": 10885 + }, + { + "epoch": 5.134370579915134, + "grad_norm": 0.3546243906021118, + "learning_rate": 2.8186144237365657e-05, + "loss": 0.3658, + "num_input_tokens_seen": 10963968, + "step": 10890 + }, + { + "epoch": 5.136727958510137, + "grad_norm": 0.3190551698207855, + "learning_rate": 2.816573890546767e-05, + "loss": 0.3244, + "num_input_tokens_seen": 10968288, + "step": 10895 + }, + { + "epoch": 5.139085337105139, + "grad_norm": 0.28898823261260986, + "learning_rate": 2.8145331429946387e-05, + "loss": 0.2825, + "num_input_tokens_seen": 10973888, + "step": 10900 + }, + { + "epoch": 5.141442715700141, + "grad_norm": 0.3616991639137268, + "learning_rate": 2.8124921824620364e-05, + "loss": 0.362, + "num_input_tokens_seen": 10981184, + "step": 10905 + }, + { + "epoch": 5.143800094295144, + "grad_norm": 0.35264524817466736, + "learning_rate": 2.8104510103309612e-05, + "loss": 0.3604, + "num_input_tokens_seen": 10986304, + "step": 10910 + }, + { + "epoch": 5.146157472890146, + "grad_norm": 0.5849035978317261, + "learning_rate": 2.808409627983555e-05, + "loss": 0.3431, + "num_input_tokens_seen": 10993312, + "step": 10915 + }, + { + "epoch": 5.148514851485149, + "grad_norm": 0.7035502195358276, + "learning_rate": 2.8063680368021032e-05, + "loss": 0.3467, + "num_input_tokens_seen": 10997120, + "step": 10920 + }, + { + "epoch": 5.150872230080151, + "grad_norm": 0.46414709091186523, + "learning_rate": 2.8043262381690328e-05, + "loss": 0.3236, + "num_input_tokens_seen": 11002592, + "step": 10925 + }, + { + "epoch": 5.1532296086751535, + "grad_norm": 0.3975028991699219, + "learning_rate": 2.8022842334669118e-05, + "loss": 0.3352, + "num_input_tokens_seen": 11007040, + "step": 10930 + }, + { + "epoch": 5.155586987270156, + "grad_norm": 0.70233553647995, + "learning_rate": 2.8002420240784466e-05, + "loss": 0.3262, + "num_input_tokens_seen": 11011776, + "step": 10935 + }, + { + "epoch": 5.157944365865158, + "grad_norm": 0.33096277713775635, + "learning_rate": 2.7981996113864823e-05, + "loss": 0.3409, + "num_input_tokens_seen": 11015840, + "step": 10940 + }, + { + "epoch": 5.160301744460161, + "grad_norm": 0.4018300175666809, + "learning_rate": 2.796156996774002e-05, + "loss": 0.3319, + "num_input_tokens_seen": 11021088, + "step": 10945 + }, + { + "epoch": 5.162659123055163, + "grad_norm": 0.3508104085922241, + "learning_rate": 2.7941141816241268e-05, + "loss": 0.3696, + "num_input_tokens_seen": 11026208, + "step": 10950 + }, + { + "epoch": 5.165016501650165, + "grad_norm": 0.44728922843933105, + "learning_rate": 2.792071167320111e-05, + "loss": 0.3156, + "num_input_tokens_seen": 11031104, + "step": 10955 + }, + { + "epoch": 5.167373880245167, + "grad_norm": 0.3496010899543762, + "learning_rate": 2.7900279552453452e-05, + "loss": 0.3121, + "num_input_tokens_seen": 11036032, + "step": 10960 + }, + { + "epoch": 5.169731258840169, + "grad_norm": 0.5275045037269592, + "learning_rate": 2.787984546783354e-05, + "loss": 0.3576, + "num_input_tokens_seen": 11040544, + "step": 10965 + }, + { + "epoch": 5.172088637435172, + "grad_norm": 0.33893755078315735, + "learning_rate": 2.7859409433177946e-05, + "loss": 0.3069, + "num_input_tokens_seen": 11046592, + "step": 10970 + }, + { + "epoch": 5.174446016030174, + "grad_norm": 0.2895634174346924, + "learning_rate": 2.7838971462324565e-05, + "loss": 0.2947, + "num_input_tokens_seen": 11051424, + "step": 10975 + }, + { + "epoch": 5.176803394625177, + "grad_norm": 0.3622147738933563, + "learning_rate": 2.7818531569112604e-05, + "loss": 0.2786, + "num_input_tokens_seen": 11056992, + "step": 10980 + }, + { + "epoch": 5.179160773220179, + "grad_norm": 0.32796719670295715, + "learning_rate": 2.7798089767382567e-05, + "loss": 0.4128, + "num_input_tokens_seen": 11063520, + "step": 10985 + }, + { + "epoch": 5.181518151815181, + "grad_norm": 0.4168797433376312, + "learning_rate": 2.7777646070976264e-05, + "loss": 0.3164, + "num_input_tokens_seen": 11068256, + "step": 10990 + }, + { + "epoch": 5.183875530410184, + "grad_norm": 0.3608628213405609, + "learning_rate": 2.7757200493736758e-05, + "loss": 0.3241, + "num_input_tokens_seen": 11074048, + "step": 10995 + }, + { + "epoch": 5.186232909005186, + "grad_norm": 0.32814303040504456, + "learning_rate": 2.7736753049508425e-05, + "loss": 0.3805, + "num_input_tokens_seen": 11078560, + "step": 11000 + }, + { + "epoch": 5.188590287600189, + "grad_norm": 0.4504071772098541, + "learning_rate": 2.7716303752136864e-05, + "loss": 0.3264, + "num_input_tokens_seen": 11083488, + "step": 11005 + }, + { + "epoch": 5.190947666195191, + "grad_norm": 0.8013522624969482, + "learning_rate": 2.769585261546897e-05, + "loss": 0.348, + "num_input_tokens_seen": 11087872, + "step": 11010 + }, + { + "epoch": 5.193305044790193, + "grad_norm": 0.4010031521320343, + "learning_rate": 2.767539965335285e-05, + "loss": 0.3152, + "num_input_tokens_seen": 11094144, + "step": 11015 + }, + { + "epoch": 5.195662423385196, + "grad_norm": 0.43040943145751953, + "learning_rate": 2.7654944879637863e-05, + "loss": 0.2842, + "num_input_tokens_seen": 11098976, + "step": 11020 + }, + { + "epoch": 5.198019801980198, + "grad_norm": 0.4704750180244446, + "learning_rate": 2.7634488308174593e-05, + "loss": 0.3001, + "num_input_tokens_seen": 11104960, + "step": 11025 + }, + { + "epoch": 5.200377180575201, + "grad_norm": 0.2873738408088684, + "learning_rate": 2.761402995281484e-05, + "loss": 0.3448, + "num_input_tokens_seen": 11110048, + "step": 11030 + }, + { + "epoch": 5.202734559170203, + "grad_norm": 0.31982243061065674, + "learning_rate": 2.759356982741162e-05, + "loss": 0.3317, + "num_input_tokens_seen": 11115712, + "step": 11035 + }, + { + "epoch": 5.205091937765205, + "grad_norm": 0.2583771347999573, + "learning_rate": 2.757310794581912e-05, + "loss": 0.3449, + "num_input_tokens_seen": 11121344, + "step": 11040 + }, + { + "epoch": 5.207449316360208, + "grad_norm": 0.2999914288520813, + "learning_rate": 2.7552644321892762e-05, + "loss": 0.3407, + "num_input_tokens_seen": 11127232, + "step": 11045 + }, + { + "epoch": 5.20980669495521, + "grad_norm": 0.334559828042984, + "learning_rate": 2.7532178969489097e-05, + "loss": 0.3544, + "num_input_tokens_seen": 11132832, + "step": 11050 + }, + { + "epoch": 5.212164073550213, + "grad_norm": 0.7232909202575684, + "learning_rate": 2.7511711902465886e-05, + "loss": 0.3551, + "num_input_tokens_seen": 11138752, + "step": 11055 + }, + { + "epoch": 5.214521452145214, + "grad_norm": 0.21849940717220306, + "learning_rate": 2.7491243134682033e-05, + "loss": 0.3017, + "num_input_tokens_seen": 11143904, + "step": 11060 + }, + { + "epoch": 5.2168788307402165, + "grad_norm": 0.30647900700569153, + "learning_rate": 2.7470772679997604e-05, + "loss": 0.3173, + "num_input_tokens_seen": 11148960, + "step": 11065 + }, + { + "epoch": 5.219236209335219, + "grad_norm": 0.3331144452095032, + "learning_rate": 2.7450300552273783e-05, + "loss": 0.3226, + "num_input_tokens_seen": 11154816, + "step": 11070 + }, + { + "epoch": 5.221593587930221, + "grad_norm": 0.29186251759529114, + "learning_rate": 2.7429826765372917e-05, + "loss": 0.3034, + "num_input_tokens_seen": 11159072, + "step": 11075 + }, + { + "epoch": 5.223950966525224, + "grad_norm": 0.5365554690361023, + "learning_rate": 2.740935133315845e-05, + "loss": 0.3789, + "num_input_tokens_seen": 11162880, + "step": 11080 + }, + { + "epoch": 5.226308345120226, + "grad_norm": 0.2370453029870987, + "learning_rate": 2.7388874269494974e-05, + "loss": 0.3268, + "num_input_tokens_seen": 11167264, + "step": 11085 + }, + { + "epoch": 5.2286657237152285, + "grad_norm": 0.32554417848587036, + "learning_rate": 2.7368395588248153e-05, + "loss": 0.3423, + "num_input_tokens_seen": 11171616, + "step": 11090 + }, + { + "epoch": 5.231023102310231, + "grad_norm": 0.3715270161628723, + "learning_rate": 2.7347915303284755e-05, + "loss": 0.3629, + "num_input_tokens_seen": 11176992, + "step": 11095 + }, + { + "epoch": 5.233380480905233, + "grad_norm": 0.37141889333724976, + "learning_rate": 2.7327433428472642e-05, + "loss": 0.3529, + "num_input_tokens_seen": 11181248, + "step": 11100 + }, + { + "epoch": 5.235737859500236, + "grad_norm": 0.38833948969841003, + "learning_rate": 2.7306949977680758e-05, + "loss": 0.2845, + "num_input_tokens_seen": 11187232, + "step": 11105 + }, + { + "epoch": 5.238095238095238, + "grad_norm": 0.22927996516227722, + "learning_rate": 2.728646496477908e-05, + "loss": 0.3501, + "num_input_tokens_seen": 11193024, + "step": 11110 + }, + { + "epoch": 5.2404526166902405, + "grad_norm": 0.37331268191337585, + "learning_rate": 2.7265978403638693e-05, + "loss": 0.2898, + "num_input_tokens_seen": 11198432, + "step": 11115 + }, + { + "epoch": 5.242809995285243, + "grad_norm": 0.3116408586502075, + "learning_rate": 2.7245490308131682e-05, + "loss": 0.3547, + "num_input_tokens_seen": 11202976, + "step": 11120 + }, + { + "epoch": 5.245167373880245, + "grad_norm": 0.2474062740802765, + "learning_rate": 2.7225000692131215e-05, + "loss": 0.2965, + "num_input_tokens_seen": 11207488, + "step": 11125 + }, + { + "epoch": 5.247524752475248, + "grad_norm": 0.5787055492401123, + "learning_rate": 2.720450956951145e-05, + "loss": 0.338, + "num_input_tokens_seen": 11213344, + "step": 11130 + }, + { + "epoch": 5.24988213107025, + "grad_norm": 0.5903064608573914, + "learning_rate": 2.7184016954147588e-05, + "loss": 0.391, + "num_input_tokens_seen": 11217024, + "step": 11135 + }, + { + "epoch": 5.2522395096652525, + "grad_norm": 0.28217875957489014, + "learning_rate": 2.7163522859915846e-05, + "loss": 0.3005, + "num_input_tokens_seen": 11223008, + "step": 11140 + }, + { + "epoch": 5.254596888260255, + "grad_norm": 0.272263765335083, + "learning_rate": 2.7143027300693415e-05, + "loss": 0.3252, + "num_input_tokens_seen": 11227712, + "step": 11145 + }, + { + "epoch": 5.256954266855257, + "grad_norm": 0.36216798424720764, + "learning_rate": 2.7122530290358505e-05, + "loss": 0.2222, + "num_input_tokens_seen": 11233536, + "step": 11150 + }, + { + "epoch": 5.259311645450259, + "grad_norm": 0.580386221408844, + "learning_rate": 2.7102031842790292e-05, + "loss": 0.3388, + "num_input_tokens_seen": 11239680, + "step": 11155 + }, + { + "epoch": 5.261669024045261, + "grad_norm": 0.42333850264549255, + "learning_rate": 2.7081531971868944e-05, + "loss": 0.3465, + "num_input_tokens_seen": 11244544, + "step": 11160 + }, + { + "epoch": 5.264026402640264, + "grad_norm": 0.3972530961036682, + "learning_rate": 2.7061030691475576e-05, + "loss": 0.3258, + "num_input_tokens_seen": 11249120, + "step": 11165 + }, + { + "epoch": 5.266383781235266, + "grad_norm": 0.36654335260391235, + "learning_rate": 2.7040528015492256e-05, + "loss": 0.3489, + "num_input_tokens_seen": 11254976, + "step": 11170 + }, + { + "epoch": 5.2687411598302685, + "grad_norm": 0.4890325963497162, + "learning_rate": 2.7020023957802006e-05, + "loss": 0.3658, + "num_input_tokens_seen": 11258816, + "step": 11175 + }, + { + "epoch": 5.271098538425271, + "grad_norm": 0.5389552116394043, + "learning_rate": 2.6999518532288793e-05, + "loss": 0.3395, + "num_input_tokens_seen": 11263520, + "step": 11180 + }, + { + "epoch": 5.273455917020273, + "grad_norm": 0.24421627819538116, + "learning_rate": 2.6979011752837473e-05, + "loss": 0.3537, + "num_input_tokens_seen": 11268512, + "step": 11185 + }, + { + "epoch": 5.275813295615276, + "grad_norm": 0.250417023897171, + "learning_rate": 2.6958503633333864e-05, + "loss": 0.2701, + "num_input_tokens_seen": 11272960, + "step": 11190 + }, + { + "epoch": 5.278170674210278, + "grad_norm": 0.2982870042324066, + "learning_rate": 2.693799418766466e-05, + "loss": 0.3573, + "num_input_tokens_seen": 11276992, + "step": 11195 + }, + { + "epoch": 5.2805280528052805, + "grad_norm": 0.5290795564651489, + "learning_rate": 2.6917483429717478e-05, + "loss": 0.316, + "num_input_tokens_seen": 11281568, + "step": 11200 + }, + { + "epoch": 5.282885431400283, + "grad_norm": 0.2764025926589966, + "learning_rate": 2.689697137338079e-05, + "loss": 0.3259, + "num_input_tokens_seen": 11285952, + "step": 11205 + }, + { + "epoch": 5.285242809995285, + "grad_norm": 0.5834702253341675, + "learning_rate": 2.6876458032543983e-05, + "loss": 0.352, + "num_input_tokens_seen": 11291552, + "step": 11210 + }, + { + "epoch": 5.287600188590288, + "grad_norm": 0.1734006553888321, + "learning_rate": 2.6855943421097286e-05, + "loss": 0.2882, + "num_input_tokens_seen": 11296576, + "step": 11215 + }, + { + "epoch": 5.28995756718529, + "grad_norm": 0.31716111302375793, + "learning_rate": 2.6835427552931813e-05, + "loss": 0.2895, + "num_input_tokens_seen": 11302240, + "step": 11220 + }, + { + "epoch": 5.2923149457802925, + "grad_norm": 0.2597567141056061, + "learning_rate": 2.6814910441939505e-05, + "loss": 0.3466, + "num_input_tokens_seen": 11307232, + "step": 11225 + }, + { + "epoch": 5.294672324375295, + "grad_norm": 0.7971897721290588, + "learning_rate": 2.6794392102013162e-05, + "loss": 0.3184, + "num_input_tokens_seen": 11312160, + "step": 11230 + }, + { + "epoch": 5.297029702970297, + "grad_norm": 0.5016878247261047, + "learning_rate": 2.6773872547046408e-05, + "loss": 0.4097, + "num_input_tokens_seen": 11316960, + "step": 11235 + }, + { + "epoch": 5.2993870815653, + "grad_norm": 0.38998061418533325, + "learning_rate": 2.675335179093369e-05, + "loss": 0.2907, + "num_input_tokens_seen": 11323232, + "step": 11240 + }, + { + "epoch": 5.301744460160302, + "grad_norm": 0.36618179082870483, + "learning_rate": 2.6732829847570266e-05, + "loss": 0.3159, + "num_input_tokens_seen": 11327520, + "step": 11245 + }, + { + "epoch": 5.3041018387553045, + "grad_norm": 0.27201467752456665, + "learning_rate": 2.671230673085221e-05, + "loss": 0.3278, + "num_input_tokens_seen": 11333536, + "step": 11250 + }, + { + "epoch": 5.306459217350307, + "grad_norm": 0.3483031690120697, + "learning_rate": 2.6691782454676375e-05, + "loss": 0.2848, + "num_input_tokens_seen": 11339072, + "step": 11255 + }, + { + "epoch": 5.308816595945308, + "grad_norm": 0.6289777159690857, + "learning_rate": 2.6671257032940416e-05, + "loss": 0.3154, + "num_input_tokens_seen": 11343520, + "step": 11260 + }, + { + "epoch": 5.311173974540311, + "grad_norm": 0.29151734709739685, + "learning_rate": 2.665073047954274e-05, + "loss": 0.3096, + "num_input_tokens_seen": 11348000, + "step": 11265 + }, + { + "epoch": 5.313531353135313, + "grad_norm": 0.2865524888038635, + "learning_rate": 2.6630202808382543e-05, + "loss": 0.3343, + "num_input_tokens_seen": 11352928, + "step": 11270 + }, + { + "epoch": 5.315888731730316, + "grad_norm": 0.40681830048561096, + "learning_rate": 2.6609674033359765e-05, + "loss": 0.3129, + "num_input_tokens_seen": 11358528, + "step": 11275 + }, + { + "epoch": 5.318246110325318, + "grad_norm": 0.297739177942276, + "learning_rate": 2.6589144168375113e-05, + "loss": 0.308, + "num_input_tokens_seen": 11362496, + "step": 11280 + }, + { + "epoch": 5.32060348892032, + "grad_norm": 0.2526538074016571, + "learning_rate": 2.6568613227329993e-05, + "loss": 0.3702, + "num_input_tokens_seen": 11366304, + "step": 11285 + }, + { + "epoch": 5.322960867515323, + "grad_norm": 0.34759074449539185, + "learning_rate": 2.6548081224126577e-05, + "loss": 0.346, + "num_input_tokens_seen": 11370848, + "step": 11290 + }, + { + "epoch": 5.325318246110325, + "grad_norm": 0.3298919200897217, + "learning_rate": 2.6527548172667742e-05, + "loss": 0.3242, + "num_input_tokens_seen": 11375584, + "step": 11295 + }, + { + "epoch": 5.327675624705328, + "grad_norm": 0.3420906960964203, + "learning_rate": 2.6507014086857073e-05, + "loss": 0.348, + "num_input_tokens_seen": 11380768, + "step": 11300 + }, + { + "epoch": 5.33003300330033, + "grad_norm": 0.29797980189323425, + "learning_rate": 2.6486478980598857e-05, + "loss": 0.3087, + "num_input_tokens_seen": 11386720, + "step": 11305 + }, + { + "epoch": 5.332390381895332, + "grad_norm": 0.31029585003852844, + "learning_rate": 2.646594286779807e-05, + "loss": 0.3176, + "num_input_tokens_seen": 11392128, + "step": 11310 + }, + { + "epoch": 5.334747760490335, + "grad_norm": 0.5458968877792358, + "learning_rate": 2.6445405762360386e-05, + "loss": 0.352, + "num_input_tokens_seen": 11398144, + "step": 11315 + }, + { + "epoch": 5.337105139085337, + "grad_norm": 0.45121482014656067, + "learning_rate": 2.6424867678192116e-05, + "loss": 0.3494, + "num_input_tokens_seen": 11402656, + "step": 11320 + }, + { + "epoch": 5.33946251768034, + "grad_norm": 0.5353190302848816, + "learning_rate": 2.6404328629200275e-05, + "loss": 0.3372, + "num_input_tokens_seen": 11407904, + "step": 11325 + }, + { + "epoch": 5.341819896275342, + "grad_norm": 0.5265684723854065, + "learning_rate": 2.63837886292925e-05, + "loss": 0.3418, + "num_input_tokens_seen": 11413696, + "step": 11330 + }, + { + "epoch": 5.344177274870344, + "grad_norm": 0.4462781548500061, + "learning_rate": 2.636324769237709e-05, + "loss": 0.3448, + "num_input_tokens_seen": 11419200, + "step": 11335 + }, + { + "epoch": 5.346534653465347, + "grad_norm": 0.31923559308052063, + "learning_rate": 2.634270583236296e-05, + "loss": 0.3098, + "num_input_tokens_seen": 11423744, + "step": 11340 + }, + { + "epoch": 5.348892032060349, + "grad_norm": 0.3012583553791046, + "learning_rate": 2.6322163063159678e-05, + "loss": 0.3484, + "num_input_tokens_seen": 11428384, + "step": 11345 + }, + { + "epoch": 5.351249410655352, + "grad_norm": 0.2266664206981659, + "learning_rate": 2.6301619398677406e-05, + "loss": 0.2779, + "num_input_tokens_seen": 11432256, + "step": 11350 + }, + { + "epoch": 5.353606789250353, + "grad_norm": 0.3599557876586914, + "learning_rate": 2.6281074852826914e-05, + "loss": 0.3577, + "num_input_tokens_seen": 11437024, + "step": 11355 + }, + { + "epoch": 5.355964167845356, + "grad_norm": 0.3864191174507141, + "learning_rate": 2.6260529439519565e-05, + "loss": 0.3016, + "num_input_tokens_seen": 11443520, + "step": 11360 + }, + { + "epoch": 5.358321546440358, + "grad_norm": 0.3753843605518341, + "learning_rate": 2.6239983172667337e-05, + "loss": 0.3654, + "num_input_tokens_seen": 11448000, + "step": 11365 + }, + { + "epoch": 5.36067892503536, + "grad_norm": 0.3316129446029663, + "learning_rate": 2.6219436066182747e-05, + "loss": 0.3079, + "num_input_tokens_seen": 11451648, + "step": 11370 + }, + { + "epoch": 5.363036303630363, + "grad_norm": 0.3524262607097626, + "learning_rate": 2.619888813397891e-05, + "loss": 0.3499, + "num_input_tokens_seen": 11457600, + "step": 11375 + }, + { + "epoch": 5.365393682225365, + "grad_norm": 0.6155860424041748, + "learning_rate": 2.6178339389969486e-05, + "loss": 0.3714, + "num_input_tokens_seen": 11461920, + "step": 11380 + }, + { + "epoch": 5.367751060820368, + "grad_norm": 0.3131381571292877, + "learning_rate": 2.615778984806869e-05, + "loss": 0.3233, + "num_input_tokens_seen": 11466880, + "step": 11385 + }, + { + "epoch": 5.37010843941537, + "grad_norm": 0.3370121121406555, + "learning_rate": 2.6137239522191276e-05, + "loss": 0.2884, + "num_input_tokens_seen": 11472512, + "step": 11390 + }, + { + "epoch": 5.372465818010372, + "grad_norm": 0.44733697175979614, + "learning_rate": 2.611668842625253e-05, + "loss": 0.3145, + "num_input_tokens_seen": 11477920, + "step": 11395 + }, + { + "epoch": 5.374823196605375, + "grad_norm": 0.3518962860107422, + "learning_rate": 2.6096136574168243e-05, + "loss": 0.3007, + "num_input_tokens_seen": 11482656, + "step": 11400 + }, + { + "epoch": 5.377180575200377, + "grad_norm": 0.41312360763549805, + "learning_rate": 2.6075583979854756e-05, + "loss": 0.3326, + "num_input_tokens_seen": 11488704, + "step": 11405 + }, + { + "epoch": 5.37953795379538, + "grad_norm": 0.5981171727180481, + "learning_rate": 2.6055030657228875e-05, + "loss": 0.3717, + "num_input_tokens_seen": 11493504, + "step": 11410 + }, + { + "epoch": 5.381895332390382, + "grad_norm": 0.4099712669849396, + "learning_rate": 2.6034476620207916e-05, + "loss": 0.2991, + "num_input_tokens_seen": 11499296, + "step": 11415 + }, + { + "epoch": 5.384252710985384, + "grad_norm": 0.2911272346973419, + "learning_rate": 2.601392188270968e-05, + "loss": 0.3359, + "num_input_tokens_seen": 11503360, + "step": 11420 + }, + { + "epoch": 5.386610089580387, + "grad_norm": 0.3049758970737457, + "learning_rate": 2.5993366458652428e-05, + "loss": 0.3519, + "num_input_tokens_seen": 11507200, + "step": 11425 + }, + { + "epoch": 5.388967468175389, + "grad_norm": 0.31431013345718384, + "learning_rate": 2.5972810361954912e-05, + "loss": 0.2889, + "num_input_tokens_seen": 11511520, + "step": 11430 + }, + { + "epoch": 5.391324846770392, + "grad_norm": 0.4532061815261841, + "learning_rate": 2.595225360653631e-05, + "loss": 0.3557, + "num_input_tokens_seen": 11516096, + "step": 11435 + }, + { + "epoch": 5.393682225365394, + "grad_norm": 0.2965441048145294, + "learning_rate": 2.5931696206316274e-05, + "loss": 0.3022, + "num_input_tokens_seen": 11521632, + "step": 11440 + }, + { + "epoch": 5.396039603960396, + "grad_norm": 0.24779339134693146, + "learning_rate": 2.591113817521486e-05, + "loss": 0.2738, + "num_input_tokens_seen": 11527744, + "step": 11445 + }, + { + "epoch": 5.398396982555399, + "grad_norm": 0.46024656295776367, + "learning_rate": 2.5890579527152593e-05, + "loss": 0.3469, + "num_input_tokens_seen": 11531744, + "step": 11450 + }, + { + "epoch": 5.400754361150401, + "grad_norm": 0.5130016207695007, + "learning_rate": 2.587002027605037e-05, + "loss": 0.3189, + "num_input_tokens_seen": 11536192, + "step": 11455 + }, + { + "epoch": 5.403111739745403, + "grad_norm": 0.27571287751197815, + "learning_rate": 2.584946043582955e-05, + "loss": 0.3031, + "num_input_tokens_seen": 11541152, + "step": 11460 + }, + { + "epoch": 5.405469118340405, + "grad_norm": 0.3465900421142578, + "learning_rate": 2.5828900020411816e-05, + "loss": 0.3705, + "num_input_tokens_seen": 11546560, + "step": 11465 + }, + { + "epoch": 5.4078264969354075, + "grad_norm": 0.5186586976051331, + "learning_rate": 2.580833904371932e-05, + "loss": 0.3171, + "num_input_tokens_seen": 11552000, + "step": 11470 + }, + { + "epoch": 5.41018387553041, + "grad_norm": 0.30675697326660156, + "learning_rate": 2.5787777519674538e-05, + "loss": 0.2851, + "num_input_tokens_seen": 11557088, + "step": 11475 + }, + { + "epoch": 5.412541254125412, + "grad_norm": 0.5585362911224365, + "learning_rate": 2.576721546220035e-05, + "loss": 0.3358, + "num_input_tokens_seen": 11562336, + "step": 11480 + }, + { + "epoch": 5.414898632720415, + "grad_norm": 0.4858281910419464, + "learning_rate": 2.5746652885219975e-05, + "loss": 0.2291, + "num_input_tokens_seen": 11567296, + "step": 11485 + }, + { + "epoch": 5.417256011315417, + "grad_norm": 0.384240984916687, + "learning_rate": 2.5726089802657004e-05, + "loss": 0.3423, + "num_input_tokens_seen": 11572672, + "step": 11490 + }, + { + "epoch": 5.4196133899104195, + "grad_norm": 0.3674897253513336, + "learning_rate": 2.5705526228435344e-05, + "loss": 0.2915, + "num_input_tokens_seen": 11577312, + "step": 11495 + }, + { + "epoch": 5.421970768505422, + "grad_norm": 0.26215317845344543, + "learning_rate": 2.5684962176479265e-05, + "loss": 0.3015, + "num_input_tokens_seen": 11582208, + "step": 11500 + }, + { + "epoch": 5.424328147100424, + "grad_norm": 0.3938690423965454, + "learning_rate": 2.5664397660713335e-05, + "loss": 0.3373, + "num_input_tokens_seen": 11587424, + "step": 11505 + }, + { + "epoch": 5.426685525695427, + "grad_norm": 0.39548131823539734, + "learning_rate": 2.5643832695062457e-05, + "loss": 0.3037, + "num_input_tokens_seen": 11592000, + "step": 11510 + }, + { + "epoch": 5.429042904290429, + "grad_norm": 0.2477305382490158, + "learning_rate": 2.5623267293451826e-05, + "loss": 0.3057, + "num_input_tokens_seen": 11597088, + "step": 11515 + }, + { + "epoch": 5.4314002828854315, + "grad_norm": 0.21066983044147491, + "learning_rate": 2.5602701469806923e-05, + "loss": 0.4013, + "num_input_tokens_seen": 11601568, + "step": 11520 + }, + { + "epoch": 5.433757661480434, + "grad_norm": 0.240670308470726, + "learning_rate": 2.5582135238053544e-05, + "loss": 0.3564, + "num_input_tokens_seen": 11606752, + "step": 11525 + }, + { + "epoch": 5.436115040075436, + "grad_norm": 0.27594366669654846, + "learning_rate": 2.5561568612117726e-05, + "loss": 0.2691, + "num_input_tokens_seen": 11612288, + "step": 11530 + }, + { + "epoch": 5.438472418670439, + "grad_norm": 0.44889870285987854, + "learning_rate": 2.5541001605925806e-05, + "loss": 0.2667, + "num_input_tokens_seen": 11617728, + "step": 11535 + }, + { + "epoch": 5.440829797265441, + "grad_norm": 0.3888375759124756, + "learning_rate": 2.552043423340435e-05, + "loss": 0.3449, + "num_input_tokens_seen": 11623040, + "step": 11540 + }, + { + "epoch": 5.4431871758604435, + "grad_norm": 0.3965562582015991, + "learning_rate": 2.5499866508480197e-05, + "loss": 0.3853, + "num_input_tokens_seen": 11628416, + "step": 11545 + }, + { + "epoch": 5.445544554455446, + "grad_norm": 0.36678215861320496, + "learning_rate": 2.5479298445080407e-05, + "loss": 0.3235, + "num_input_tokens_seen": 11633312, + "step": 11550 + }, + { + "epoch": 5.4479019330504475, + "grad_norm": 0.43833523988723755, + "learning_rate": 2.5458730057132273e-05, + "loss": 0.268, + "num_input_tokens_seen": 11637920, + "step": 11555 + }, + { + "epoch": 5.45025931164545, + "grad_norm": 0.2828586995601654, + "learning_rate": 2.5438161358563317e-05, + "loss": 0.3007, + "num_input_tokens_seen": 11643392, + "step": 11560 + }, + { + "epoch": 5.452616690240452, + "grad_norm": 0.2889890968799591, + "learning_rate": 2.5417592363301257e-05, + "loss": 0.1956, + "num_input_tokens_seen": 11648064, + "step": 11565 + }, + { + "epoch": 5.454974068835455, + "grad_norm": 0.4267406165599823, + "learning_rate": 2.539702308527402e-05, + "loss": 0.2408, + "num_input_tokens_seen": 11654752, + "step": 11570 + }, + { + "epoch": 5.457331447430457, + "grad_norm": 0.26854604482650757, + "learning_rate": 2.537645353840974e-05, + "loss": 0.3845, + "num_input_tokens_seen": 11659680, + "step": 11575 + }, + { + "epoch": 5.4596888260254595, + "grad_norm": 0.33521875739097595, + "learning_rate": 2.5355883736636683e-05, + "loss": 0.2662, + "num_input_tokens_seen": 11664064, + "step": 11580 + }, + { + "epoch": 5.462046204620462, + "grad_norm": 0.3837713599205017, + "learning_rate": 2.533531369388335e-05, + "loss": 0.369, + "num_input_tokens_seen": 11669760, + "step": 11585 + }, + { + "epoch": 5.464403583215464, + "grad_norm": 0.2589920163154602, + "learning_rate": 2.531474342407836e-05, + "loss": 0.3003, + "num_input_tokens_seen": 11673952, + "step": 11590 + }, + { + "epoch": 5.466760961810467, + "grad_norm": 0.4136711061000824, + "learning_rate": 2.529417294115051e-05, + "loss": 0.3811, + "num_input_tokens_seen": 11679040, + "step": 11595 + }, + { + "epoch": 5.469118340405469, + "grad_norm": 0.5280289053916931, + "learning_rate": 2.5273602259028733e-05, + "loss": 0.3172, + "num_input_tokens_seen": 11683872, + "step": 11600 + }, + { + "epoch": 5.4714757190004715, + "grad_norm": 0.34108448028564453, + "learning_rate": 2.52530313916421e-05, + "loss": 0.2725, + "num_input_tokens_seen": 11689152, + "step": 11605 + }, + { + "epoch": 5.473833097595474, + "grad_norm": 0.48919203877449036, + "learning_rate": 2.5232460352919794e-05, + "loss": 0.361, + "num_input_tokens_seen": 11694208, + "step": 11610 + }, + { + "epoch": 5.476190476190476, + "grad_norm": 0.2841789126396179, + "learning_rate": 2.5211889156791136e-05, + "loss": 0.2766, + "num_input_tokens_seen": 11701280, + "step": 11615 + }, + { + "epoch": 5.478547854785479, + "grad_norm": 0.7365268468856812, + "learning_rate": 2.5191317817185533e-05, + "loss": 0.4265, + "num_input_tokens_seen": 11706208, + "step": 11620 + }, + { + "epoch": 5.480905233380481, + "grad_norm": 0.2601851224899292, + "learning_rate": 2.5170746348032508e-05, + "loss": 0.2941, + "num_input_tokens_seen": 11710400, + "step": 11625 + }, + { + "epoch": 5.4832626119754835, + "grad_norm": 0.4461249113082886, + "learning_rate": 2.5150174763261657e-05, + "loss": 0.3284, + "num_input_tokens_seen": 11716640, + "step": 11630 + }, + { + "epoch": 5.485619990570486, + "grad_norm": 0.3398306667804718, + "learning_rate": 2.512960307680266e-05, + "loss": 0.2896, + "num_input_tokens_seen": 11721824, + "step": 11635 + }, + { + "epoch": 5.487977369165488, + "grad_norm": 0.3298700451850891, + "learning_rate": 2.5109031302585266e-05, + "loss": 0.3134, + "num_input_tokens_seen": 11727904, + "step": 11640 + }, + { + "epoch": 5.490334747760491, + "grad_norm": 0.26808542013168335, + "learning_rate": 2.5088459454539275e-05, + "loss": 0.3602, + "num_input_tokens_seen": 11733184, + "step": 11645 + }, + { + "epoch": 5.492692126355493, + "grad_norm": 0.42769598960876465, + "learning_rate": 2.506788754659456e-05, + "loss": 0.3574, + "num_input_tokens_seen": 11737792, + "step": 11650 + }, + { + "epoch": 5.4950495049504955, + "grad_norm": 0.28675466775894165, + "learning_rate": 2.5047315592681002e-05, + "loss": 0.3133, + "num_input_tokens_seen": 11741728, + "step": 11655 + }, + { + "epoch": 5.497406883545497, + "grad_norm": 0.36924755573272705, + "learning_rate": 2.5026743606728555e-05, + "loss": 0.3069, + "num_input_tokens_seen": 11746336, + "step": 11660 + }, + { + "epoch": 5.499764262140499, + "grad_norm": 0.5594235062599182, + "learning_rate": 2.500617160266714e-05, + "loss": 0.3779, + "num_input_tokens_seen": 11752480, + "step": 11665 + }, + { + "epoch": 5.502121640735502, + "grad_norm": 0.3427983820438385, + "learning_rate": 2.4985599594426747e-05, + "loss": 0.3462, + "num_input_tokens_seen": 11757120, + "step": 11670 + }, + { + "epoch": 5.502593116454502, + "eval_loss": 0.33371543884277344, + "eval_runtime": 25.6329, + "eval_samples_per_second": 36.789, + "eval_steps_per_second": 9.207, + "num_input_tokens_seen": 11757792, + "step": 11671 + }, + { + "epoch": 5.504479019330504, + "grad_norm": 0.2866566777229309, + "learning_rate": 2.496502759593732e-05, + "loss": 0.356, + "num_input_tokens_seen": 11760896, + "step": 11675 + }, + { + "epoch": 5.506836397925507, + "grad_norm": 0.3762560188770294, + "learning_rate": 2.4944455621128835e-05, + "loss": 0.3207, + "num_input_tokens_seen": 11765888, + "step": 11680 + }, + { + "epoch": 5.509193776520509, + "grad_norm": 0.3597516417503357, + "learning_rate": 2.4923883683931224e-05, + "loss": 0.3309, + "num_input_tokens_seen": 11770432, + "step": 11685 + }, + { + "epoch": 5.511551155115511, + "grad_norm": 0.5109416842460632, + "learning_rate": 2.4903311798274413e-05, + "loss": 0.3184, + "num_input_tokens_seen": 11776736, + "step": 11690 + }, + { + "epoch": 5.513908533710514, + "grad_norm": 0.28222301602363586, + "learning_rate": 2.4882739978088282e-05, + "loss": 0.3307, + "num_input_tokens_seen": 11781792, + "step": 11695 + }, + { + "epoch": 5.516265912305516, + "grad_norm": 0.2960120439529419, + "learning_rate": 2.4862168237302673e-05, + "loss": 0.3759, + "num_input_tokens_seen": 11786912, + "step": 11700 + }, + { + "epoch": 5.518623290900519, + "grad_norm": 0.3333486318588257, + "learning_rate": 2.4841596589847363e-05, + "loss": 0.3492, + "num_input_tokens_seen": 11792960, + "step": 11705 + }, + { + "epoch": 5.520980669495521, + "grad_norm": 0.42078399658203125, + "learning_rate": 2.482102504965209e-05, + "loss": 0.348, + "num_input_tokens_seen": 11797408, + "step": 11710 + }, + { + "epoch": 5.523338048090523, + "grad_norm": 0.4005667269229889, + "learning_rate": 2.480045363064648e-05, + "loss": 0.3325, + "num_input_tokens_seen": 11802176, + "step": 11715 + }, + { + "epoch": 5.525695426685526, + "grad_norm": 0.5611519813537598, + "learning_rate": 2.4779882346760124e-05, + "loss": 0.3083, + "num_input_tokens_seen": 11806976, + "step": 11720 + }, + { + "epoch": 5.528052805280528, + "grad_norm": 0.35992252826690674, + "learning_rate": 2.475931121192248e-05, + "loss": 0.3455, + "num_input_tokens_seen": 11811200, + "step": 11725 + }, + { + "epoch": 5.530410183875531, + "grad_norm": 0.31105685234069824, + "learning_rate": 2.4738740240062934e-05, + "loss": 0.2815, + "num_input_tokens_seen": 11816480, + "step": 11730 + }, + { + "epoch": 5.532767562470533, + "grad_norm": 0.3061230182647705, + "learning_rate": 2.4718169445110744e-05, + "loss": 0.3472, + "num_input_tokens_seen": 11821696, + "step": 11735 + }, + { + "epoch": 5.535124941065535, + "grad_norm": 0.3530852496623993, + "learning_rate": 2.469759884099506e-05, + "loss": 0.2693, + "num_input_tokens_seen": 11826464, + "step": 11740 + }, + { + "epoch": 5.537482319660538, + "grad_norm": 0.318890780210495, + "learning_rate": 2.4677028441644887e-05, + "loss": 0.3536, + "num_input_tokens_seen": 11832512, + "step": 11745 + }, + { + "epoch": 5.539839698255539, + "grad_norm": 0.478266179561615, + "learning_rate": 2.465645826098912e-05, + "loss": 0.3455, + "num_input_tokens_seen": 11837056, + "step": 11750 + }, + { + "epoch": 5.542197076850542, + "grad_norm": 0.5592610836029053, + "learning_rate": 2.4635888312956464e-05, + "loss": 0.3058, + "num_input_tokens_seen": 11842240, + "step": 11755 + }, + { + "epoch": 5.544554455445544, + "grad_norm": 0.32082170248031616, + "learning_rate": 2.4615318611475514e-05, + "loss": 0.3574, + "num_input_tokens_seen": 11847744, + "step": 11760 + }, + { + "epoch": 5.5469118340405466, + "grad_norm": 0.5652072429656982, + "learning_rate": 2.4594749170474655e-05, + "loss": 0.396, + "num_input_tokens_seen": 11852384, + "step": 11765 + }, + { + "epoch": 5.549269212635549, + "grad_norm": 0.33434417843818665, + "learning_rate": 2.4574180003882136e-05, + "loss": 0.2825, + "num_input_tokens_seen": 11857504, + "step": 11770 + }, + { + "epoch": 5.551626591230551, + "grad_norm": 0.41174736618995667, + "learning_rate": 2.455361112562598e-05, + "loss": 0.2886, + "num_input_tokens_seen": 11862304, + "step": 11775 + }, + { + "epoch": 5.553983969825554, + "grad_norm": 0.44143956899642944, + "learning_rate": 2.4533042549634033e-05, + "loss": 0.2927, + "num_input_tokens_seen": 11866880, + "step": 11780 + }, + { + "epoch": 5.556341348420556, + "grad_norm": 0.2632052004337311, + "learning_rate": 2.451247428983395e-05, + "loss": 0.3033, + "num_input_tokens_seen": 11872064, + "step": 11785 + }, + { + "epoch": 5.558698727015559, + "grad_norm": 0.5550410747528076, + "learning_rate": 2.449190636015315e-05, + "loss": 0.3457, + "num_input_tokens_seen": 11876864, + "step": 11790 + }, + { + "epoch": 5.561056105610561, + "grad_norm": 0.3568395972251892, + "learning_rate": 2.4471338774518844e-05, + "loss": 0.3657, + "num_input_tokens_seen": 11883136, + "step": 11795 + }, + { + "epoch": 5.563413484205563, + "grad_norm": 0.6080742478370667, + "learning_rate": 2.4450771546857997e-05, + "loss": 0.4008, + "num_input_tokens_seen": 11887424, + "step": 11800 + }, + { + "epoch": 5.565770862800566, + "grad_norm": 0.3600097894668579, + "learning_rate": 2.4430204691097342e-05, + "loss": 0.3482, + "num_input_tokens_seen": 11891968, + "step": 11805 + }, + { + "epoch": 5.568128241395568, + "grad_norm": 0.5470433235168457, + "learning_rate": 2.4409638221163353e-05, + "loss": 0.4249, + "num_input_tokens_seen": 11897408, + "step": 11810 + }, + { + "epoch": 5.570485619990571, + "grad_norm": 0.32353830337524414, + "learning_rate": 2.4389072150982254e-05, + "loss": 0.36, + "num_input_tokens_seen": 11901696, + "step": 11815 + }, + { + "epoch": 5.572842998585573, + "grad_norm": 0.4721609950065613, + "learning_rate": 2.4368506494479978e-05, + "loss": 0.3315, + "num_input_tokens_seen": 11907104, + "step": 11820 + }, + { + "epoch": 5.575200377180575, + "grad_norm": 0.27241986989974976, + "learning_rate": 2.4347941265582207e-05, + "loss": 0.3458, + "num_input_tokens_seen": 11912704, + "step": 11825 + }, + { + "epoch": 5.577557755775578, + "grad_norm": 0.3228810429573059, + "learning_rate": 2.4327376478214298e-05, + "loss": 0.3476, + "num_input_tokens_seen": 11918656, + "step": 11830 + }, + { + "epoch": 5.57991513437058, + "grad_norm": 0.41233929991722107, + "learning_rate": 2.4306812146301342e-05, + "loss": 0.352, + "num_input_tokens_seen": 11923360, + "step": 11835 + }, + { + "epoch": 5.582272512965583, + "grad_norm": 0.3416653871536255, + "learning_rate": 2.42862482837681e-05, + "loss": 0.3389, + "num_input_tokens_seen": 11928064, + "step": 11840 + }, + { + "epoch": 5.584629891560585, + "grad_norm": 0.31767791509628296, + "learning_rate": 2.4265684904539024e-05, + "loss": 0.3514, + "num_input_tokens_seen": 11934912, + "step": 11845 + }, + { + "epoch": 5.586987270155587, + "grad_norm": 0.3425365388393402, + "learning_rate": 2.4245122022538235e-05, + "loss": 0.351, + "num_input_tokens_seen": 11939456, + "step": 11850 + }, + { + "epoch": 5.58934464875059, + "grad_norm": 0.48854389786720276, + "learning_rate": 2.422455965168953e-05, + "loss": 0.359, + "num_input_tokens_seen": 11943904, + "step": 11855 + }, + { + "epoch": 5.591702027345592, + "grad_norm": 0.40167760848999023, + "learning_rate": 2.4203997805916334e-05, + "loss": 0.3755, + "num_input_tokens_seen": 11951008, + "step": 11860 + }, + { + "epoch": 5.594059405940594, + "grad_norm": 0.45779210329055786, + "learning_rate": 2.4183436499141745e-05, + "loss": 0.3316, + "num_input_tokens_seen": 11956352, + "step": 11865 + }, + { + "epoch": 5.596416784535596, + "grad_norm": 0.3633318543434143, + "learning_rate": 2.4162875745288475e-05, + "loss": 0.3035, + "num_input_tokens_seen": 11961024, + "step": 11870 + }, + { + "epoch": 5.5987741631305985, + "grad_norm": 0.23889708518981934, + "learning_rate": 2.4142315558278875e-05, + "loss": 0.3122, + "num_input_tokens_seen": 11966496, + "step": 11875 + }, + { + "epoch": 5.601131541725601, + "grad_norm": 0.32949692010879517, + "learning_rate": 2.4121755952034896e-05, + "loss": 0.3382, + "num_input_tokens_seen": 11972608, + "step": 11880 + }, + { + "epoch": 5.603488920320603, + "grad_norm": 0.38422301411628723, + "learning_rate": 2.4101196940478128e-05, + "loss": 0.316, + "num_input_tokens_seen": 11977664, + "step": 11885 + }, + { + "epoch": 5.605846298915606, + "grad_norm": 0.4144671857357025, + "learning_rate": 2.408063853752972e-05, + "loss": 0.3652, + "num_input_tokens_seen": 11982400, + "step": 11890 + }, + { + "epoch": 5.608203677510608, + "grad_norm": 0.41635721921920776, + "learning_rate": 2.406008075711042e-05, + "loss": 0.2906, + "num_input_tokens_seen": 11987968, + "step": 11895 + }, + { + "epoch": 5.6105610561056105, + "grad_norm": 0.4060874879360199, + "learning_rate": 2.4039523613140573e-05, + "loss": 0.3411, + "num_input_tokens_seen": 11993120, + "step": 11900 + }, + { + "epoch": 5.612918434700613, + "grad_norm": 0.3581308424472809, + "learning_rate": 2.4018967119540074e-05, + "loss": 0.374, + "num_input_tokens_seen": 11998400, + "step": 11905 + }, + { + "epoch": 5.615275813295615, + "grad_norm": 0.5051378011703491, + "learning_rate": 2.3998411290228386e-05, + "loss": 0.3108, + "num_input_tokens_seen": 12002592, + "step": 11910 + }, + { + "epoch": 5.617633191890618, + "grad_norm": 0.2602517604827881, + "learning_rate": 2.397785613912452e-05, + "loss": 0.3714, + "num_input_tokens_seen": 12007744, + "step": 11915 + }, + { + "epoch": 5.61999057048562, + "grad_norm": 0.36030471324920654, + "learning_rate": 2.3957301680147027e-05, + "loss": 0.3239, + "num_input_tokens_seen": 12013024, + "step": 11920 + }, + { + "epoch": 5.6223479490806225, + "grad_norm": 0.4947222173213959, + "learning_rate": 2.3936747927213987e-05, + "loss": 0.3386, + "num_input_tokens_seen": 12018016, + "step": 11925 + }, + { + "epoch": 5.624705327675625, + "grad_norm": 0.4699721038341522, + "learning_rate": 2.3916194894243016e-05, + "loss": 0.3006, + "num_input_tokens_seen": 12023008, + "step": 11930 + }, + { + "epoch": 5.627062706270627, + "grad_norm": 0.3409734070301056, + "learning_rate": 2.3895642595151217e-05, + "loss": 0.3248, + "num_input_tokens_seen": 12027808, + "step": 11935 + }, + { + "epoch": 5.62942008486563, + "grad_norm": 0.31367430090904236, + "learning_rate": 2.3875091043855222e-05, + "loss": 0.3459, + "num_input_tokens_seen": 12032736, + "step": 11940 + }, + { + "epoch": 5.631777463460632, + "grad_norm": 0.3116514980792999, + "learning_rate": 2.3854540254271135e-05, + "loss": 0.354, + "num_input_tokens_seen": 12037824, + "step": 11945 + }, + { + "epoch": 5.634134842055634, + "grad_norm": 0.3129774332046509, + "learning_rate": 2.3833990240314562e-05, + "loss": 0.3194, + "num_input_tokens_seen": 12042368, + "step": 11950 + }, + { + "epoch": 5.636492220650636, + "grad_norm": 0.3645881116390228, + "learning_rate": 2.3813441015900574e-05, + "loss": 0.3142, + "num_input_tokens_seen": 12047584, + "step": 11955 + }, + { + "epoch": 5.6388495992456384, + "grad_norm": 0.3411192297935486, + "learning_rate": 2.3792892594943713e-05, + "loss": 0.3345, + "num_input_tokens_seen": 12051968, + "step": 11960 + }, + { + "epoch": 5.641206977840641, + "grad_norm": 0.5121519565582275, + "learning_rate": 2.3772344991357966e-05, + "loss": 0.3191, + "num_input_tokens_seen": 12057216, + "step": 11965 + }, + { + "epoch": 5.643564356435643, + "grad_norm": 0.4197995364665985, + "learning_rate": 2.375179821905678e-05, + "loss": 0.3177, + "num_input_tokens_seen": 12062368, + "step": 11970 + }, + { + "epoch": 5.645921735030646, + "grad_norm": 0.3034205734729767, + "learning_rate": 2.3731252291953026e-05, + "loss": 0.3215, + "num_input_tokens_seen": 12066368, + "step": 11975 + }, + { + "epoch": 5.648279113625648, + "grad_norm": 0.33683469891548157, + "learning_rate": 2.3710707223959024e-05, + "loss": 0.3093, + "num_input_tokens_seen": 12070816, + "step": 11980 + }, + { + "epoch": 5.6506364922206505, + "grad_norm": 0.3876732885837555, + "learning_rate": 2.369016302898648e-05, + "loss": 0.3276, + "num_input_tokens_seen": 12075744, + "step": 11985 + }, + { + "epoch": 5.652993870815653, + "grad_norm": 0.5738368630409241, + "learning_rate": 2.3669619720946546e-05, + "loss": 0.4027, + "num_input_tokens_seen": 12081184, + "step": 11990 + }, + { + "epoch": 5.655351249410655, + "grad_norm": 0.21809595823287964, + "learning_rate": 2.3649077313749746e-05, + "loss": 0.2983, + "num_input_tokens_seen": 12086144, + "step": 11995 + }, + { + "epoch": 5.657708628005658, + "grad_norm": 0.5490480661392212, + "learning_rate": 2.362853582130599e-05, + "loss": 0.3625, + "num_input_tokens_seen": 12090528, + "step": 12000 + }, + { + "epoch": 5.66006600660066, + "grad_norm": 0.3277533948421478, + "learning_rate": 2.36079952575246e-05, + "loss": 0.3093, + "num_input_tokens_seen": 12094688, + "step": 12005 + }, + { + "epoch": 5.6624233851956625, + "grad_norm": 0.6330156326293945, + "learning_rate": 2.3587455636314238e-05, + "loss": 0.3504, + "num_input_tokens_seen": 12100352, + "step": 12010 + }, + { + "epoch": 5.664780763790665, + "grad_norm": 0.32686465978622437, + "learning_rate": 2.356691697158295e-05, + "loss": 0.356, + "num_input_tokens_seen": 12104768, + "step": 12015 + }, + { + "epoch": 5.667138142385667, + "grad_norm": 0.33562085032463074, + "learning_rate": 2.3546379277238107e-05, + "loss": 0.3638, + "num_input_tokens_seen": 12109568, + "step": 12020 + }, + { + "epoch": 5.66949552098067, + "grad_norm": 0.2171396166086197, + "learning_rate": 2.352584256718646e-05, + "loss": 0.3482, + "num_input_tokens_seen": 12114080, + "step": 12025 + }, + { + "epoch": 5.671852899575672, + "grad_norm": 0.3633175492286682, + "learning_rate": 2.350530685533406e-05, + "loss": 0.3358, + "num_input_tokens_seen": 12118624, + "step": 12030 + }, + { + "epoch": 5.6742102781706745, + "grad_norm": 0.43609943985939026, + "learning_rate": 2.3484772155586308e-05, + "loss": 0.3298, + "num_input_tokens_seen": 12123232, + "step": 12035 + }, + { + "epoch": 5.676567656765677, + "grad_norm": 0.3486790955066681, + "learning_rate": 2.346423848184789e-05, + "loss": 0.3627, + "num_input_tokens_seen": 12128480, + "step": 12040 + }, + { + "epoch": 5.678925035360679, + "grad_norm": 0.30455586314201355, + "learning_rate": 2.3443705848022835e-05, + "loss": 0.3093, + "num_input_tokens_seen": 12133600, + "step": 12045 + }, + { + "epoch": 5.681282413955682, + "grad_norm": 0.5124199986457825, + "learning_rate": 2.342317426801443e-05, + "loss": 0.3094, + "num_input_tokens_seen": 12137920, + "step": 12050 + }, + { + "epoch": 5.683639792550684, + "grad_norm": 0.3081676959991455, + "learning_rate": 2.3402643755725282e-05, + "loss": 0.3363, + "num_input_tokens_seen": 12143008, + "step": 12055 + }, + { + "epoch": 5.6859971711456865, + "grad_norm": 0.51214998960495, + "learning_rate": 2.338211432505724e-05, + "loss": 0.3117, + "num_input_tokens_seen": 12147328, + "step": 12060 + }, + { + "epoch": 5.688354549740688, + "grad_norm": 0.29990994930267334, + "learning_rate": 2.336158598991146e-05, + "loss": 0.347, + "num_input_tokens_seen": 12152960, + "step": 12065 + }, + { + "epoch": 5.69071192833569, + "grad_norm": 0.3831104338169098, + "learning_rate": 2.334105876418832e-05, + "loss": 0.3015, + "num_input_tokens_seen": 12157952, + "step": 12070 + }, + { + "epoch": 5.693069306930693, + "grad_norm": 0.29577645659446716, + "learning_rate": 2.3320532661787474e-05, + "loss": 0.3715, + "num_input_tokens_seen": 12163136, + "step": 12075 + }, + { + "epoch": 5.695426685525695, + "grad_norm": 0.5599243640899658, + "learning_rate": 2.330000769660779e-05, + "loss": 0.3715, + "num_input_tokens_seen": 12168512, + "step": 12080 + }, + { + "epoch": 5.697784064120698, + "grad_norm": 0.35832175612449646, + "learning_rate": 2.32794838825474e-05, + "loss": 0.2943, + "num_input_tokens_seen": 12173984, + "step": 12085 + }, + { + "epoch": 5.7001414427157, + "grad_norm": 0.3407084047794342, + "learning_rate": 2.325896123350361e-05, + "loss": 0.3285, + "num_input_tokens_seen": 12178656, + "step": 12090 + }, + { + "epoch": 5.702498821310702, + "grad_norm": 0.2767365276813507, + "learning_rate": 2.323843976337299e-05, + "loss": 0.3324, + "num_input_tokens_seen": 12184224, + "step": 12095 + }, + { + "epoch": 5.704856199905705, + "grad_norm": 0.33607590198516846, + "learning_rate": 2.3217919486051268e-05, + "loss": 0.2917, + "num_input_tokens_seen": 12189600, + "step": 12100 + }, + { + "epoch": 5.707213578500707, + "grad_norm": 0.3717793822288513, + "learning_rate": 2.319740041543339e-05, + "loss": 0.3163, + "num_input_tokens_seen": 12194208, + "step": 12105 + }, + { + "epoch": 5.70957095709571, + "grad_norm": 0.5188944339752197, + "learning_rate": 2.317688256541348e-05, + "loss": 0.3793, + "num_input_tokens_seen": 12198368, + "step": 12110 + }, + { + "epoch": 5.711928335690712, + "grad_norm": 0.3294060230255127, + "learning_rate": 2.315636594988482e-05, + "loss": 0.3458, + "num_input_tokens_seen": 12202816, + "step": 12115 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.46117135882377625, + "learning_rate": 2.3135850582739867e-05, + "loss": 0.2966, + "num_input_tokens_seen": 12207456, + "step": 12120 + }, + { + "epoch": 5.716643092880717, + "grad_norm": 0.40766385197639465, + "learning_rate": 2.3115336477870246e-05, + "loss": 0.3436, + "num_input_tokens_seen": 12212224, + "step": 12125 + }, + { + "epoch": 5.719000471475719, + "grad_norm": 0.4865358769893646, + "learning_rate": 2.3094823649166702e-05, + "loss": 0.3024, + "num_input_tokens_seen": 12217056, + "step": 12130 + }, + { + "epoch": 5.721357850070722, + "grad_norm": 0.24036511778831482, + "learning_rate": 2.3074312110519138e-05, + "loss": 0.3024, + "num_input_tokens_seen": 12221696, + "step": 12135 + }, + { + "epoch": 5.723715228665724, + "grad_norm": 0.6343191862106323, + "learning_rate": 2.305380187581658e-05, + "loss": 0.3514, + "num_input_tokens_seen": 12227040, + "step": 12140 + }, + { + "epoch": 5.726072607260726, + "grad_norm": 0.45270249247550964, + "learning_rate": 2.303329295894715e-05, + "loss": 0.313, + "num_input_tokens_seen": 12231360, + "step": 12145 + }, + { + "epoch": 5.728429985855728, + "grad_norm": 0.5143155455589294, + "learning_rate": 2.3012785373798103e-05, + "loss": 0.3222, + "num_input_tokens_seen": 12236768, + "step": 12150 + }, + { + "epoch": 5.73078736445073, + "grad_norm": 0.553474485874176, + "learning_rate": 2.299227913425578e-05, + "loss": 0.3544, + "num_input_tokens_seen": 12241120, + "step": 12155 + }, + { + "epoch": 5.733144743045733, + "grad_norm": 0.5330823063850403, + "learning_rate": 2.2971774254205617e-05, + "loss": 0.3795, + "num_input_tokens_seen": 12245664, + "step": 12160 + }, + { + "epoch": 5.735502121640735, + "grad_norm": 0.3632469177246094, + "learning_rate": 2.2951270747532115e-05, + "loss": 0.3254, + "num_input_tokens_seen": 12250336, + "step": 12165 + }, + { + "epoch": 5.7378595002357375, + "grad_norm": 0.5327730774879456, + "learning_rate": 2.2930768628118868e-05, + "loss": 0.344, + "num_input_tokens_seen": 12255168, + "step": 12170 + }, + { + "epoch": 5.74021687883074, + "grad_norm": 0.5022169351577759, + "learning_rate": 2.2910267909848507e-05, + "loss": 0.3547, + "num_input_tokens_seen": 12260544, + "step": 12175 + }, + { + "epoch": 5.742574257425742, + "grad_norm": 0.30138033628463745, + "learning_rate": 2.2889768606602735e-05, + "loss": 0.3455, + "num_input_tokens_seen": 12264512, + "step": 12180 + }, + { + "epoch": 5.744931636020745, + "grad_norm": 0.3907308876514435, + "learning_rate": 2.286927073226228e-05, + "loss": 0.2893, + "num_input_tokens_seen": 12269856, + "step": 12185 + }, + { + "epoch": 5.747289014615747, + "grad_norm": 0.27040305733680725, + "learning_rate": 2.2848774300706918e-05, + "loss": 0.3203, + "num_input_tokens_seen": 12275744, + "step": 12190 + }, + { + "epoch": 5.7496463932107496, + "grad_norm": 0.3377397358417511, + "learning_rate": 2.282827932581543e-05, + "loss": 0.3255, + "num_input_tokens_seen": 12280672, + "step": 12195 + }, + { + "epoch": 5.752003771805752, + "grad_norm": 0.34194016456604004, + "learning_rate": 2.280778582146563e-05, + "loss": 0.3601, + "num_input_tokens_seen": 12286464, + "step": 12200 + }, + { + "epoch": 5.754361150400754, + "grad_norm": 0.24377213418483734, + "learning_rate": 2.2787293801534317e-05, + "loss": 0.3148, + "num_input_tokens_seen": 12291712, + "step": 12205 + }, + { + "epoch": 5.756718528995757, + "grad_norm": 0.39527228474617004, + "learning_rate": 2.2766803279897306e-05, + "loss": 0.331, + "num_input_tokens_seen": 12296288, + "step": 12210 + }, + { + "epoch": 5.759075907590759, + "grad_norm": 0.4261415898799896, + "learning_rate": 2.2746314270429374e-05, + "loss": 0.3949, + "num_input_tokens_seen": 12301856, + "step": 12215 + }, + { + "epoch": 5.761433286185762, + "grad_norm": 0.31450197100639343, + "learning_rate": 2.2725826787004308e-05, + "loss": 0.3683, + "num_input_tokens_seen": 12306528, + "step": 12220 + }, + { + "epoch": 5.763790664780764, + "grad_norm": 0.5174238085746765, + "learning_rate": 2.2705340843494806e-05, + "loss": 0.3068, + "num_input_tokens_seen": 12311584, + "step": 12225 + }, + { + "epoch": 5.766148043375766, + "grad_norm": 0.3151135742664337, + "learning_rate": 2.2684856453772585e-05, + "loss": 0.3163, + "num_input_tokens_seen": 12316000, + "step": 12230 + }, + { + "epoch": 5.768505421970769, + "grad_norm": 0.46604403853416443, + "learning_rate": 2.266437363170826e-05, + "loss": 0.3676, + "num_input_tokens_seen": 12321248, + "step": 12235 + }, + { + "epoch": 5.770862800565771, + "grad_norm": 0.3661462366580963, + "learning_rate": 2.264389239117143e-05, + "loss": 0.3408, + "num_input_tokens_seen": 12326272, + "step": 12240 + }, + { + "epoch": 5.773220179160774, + "grad_norm": 0.5007978081703186, + "learning_rate": 2.2623412746030574e-05, + "loss": 0.3351, + "num_input_tokens_seen": 12333472, + "step": 12245 + }, + { + "epoch": 5.775577557755776, + "grad_norm": 0.3874819874763489, + "learning_rate": 2.260293471015314e-05, + "loss": 0.3117, + "num_input_tokens_seen": 12338848, + "step": 12250 + }, + { + "epoch": 5.777934936350778, + "grad_norm": 0.4748454689979553, + "learning_rate": 2.2582458297405446e-05, + "loss": 0.2971, + "num_input_tokens_seen": 12344992, + "step": 12255 + }, + { + "epoch": 5.780292314945781, + "grad_norm": 0.38189125061035156, + "learning_rate": 2.2561983521652737e-05, + "loss": 0.3802, + "num_input_tokens_seen": 12350976, + "step": 12260 + }, + { + "epoch": 5.782649693540782, + "grad_norm": 0.3323333263397217, + "learning_rate": 2.2541510396759135e-05, + "loss": 0.3363, + "num_input_tokens_seen": 12355776, + "step": 12265 + }, + { + "epoch": 5.785007072135785, + "grad_norm": 0.42777714133262634, + "learning_rate": 2.2521038936587654e-05, + "loss": 0.2865, + "num_input_tokens_seen": 12360864, + "step": 12270 + }, + { + "epoch": 5.787364450730787, + "grad_norm": 0.33167335391044617, + "learning_rate": 2.2500569155000166e-05, + "loss": 0.3961, + "num_input_tokens_seen": 12365472, + "step": 12275 + }, + { + "epoch": 5.7897218293257895, + "grad_norm": 0.2875833213329315, + "learning_rate": 2.2480101065857433e-05, + "loss": 0.3369, + "num_input_tokens_seen": 12369280, + "step": 12280 + }, + { + "epoch": 5.792079207920792, + "grad_norm": 0.7743808627128601, + "learning_rate": 2.2459634683019034e-05, + "loss": 0.365, + "num_input_tokens_seen": 12373600, + "step": 12285 + }, + { + "epoch": 5.794436586515794, + "grad_norm": 0.5326367616653442, + "learning_rate": 2.2439170020343432e-05, + "loss": 0.3058, + "num_input_tokens_seen": 12377440, + "step": 12290 + }, + { + "epoch": 5.796793965110797, + "grad_norm": 0.5094301104545593, + "learning_rate": 2.2418707091687886e-05, + "loss": 0.3654, + "num_input_tokens_seen": 12382688, + "step": 12295 + }, + { + "epoch": 5.799151343705799, + "grad_norm": 0.2645971179008484, + "learning_rate": 2.2398245910908523e-05, + "loss": 0.3392, + "num_input_tokens_seen": 12387104, + "step": 12300 + }, + { + "epoch": 5.8015087223008015, + "grad_norm": 0.38200291991233826, + "learning_rate": 2.2377786491860248e-05, + "loss": 0.3429, + "num_input_tokens_seen": 12391968, + "step": 12305 + }, + { + "epoch": 5.803866100895804, + "grad_norm": 0.3971049189567566, + "learning_rate": 2.2357328848396796e-05, + "loss": 0.3153, + "num_input_tokens_seen": 12398496, + "step": 12310 + }, + { + "epoch": 5.806223479490806, + "grad_norm": 0.311816543340683, + "learning_rate": 2.2336872994370693e-05, + "loss": 0.3556, + "num_input_tokens_seen": 12404128, + "step": 12315 + }, + { + "epoch": 5.808580858085809, + "grad_norm": 0.22647158801555634, + "learning_rate": 2.231641894363325e-05, + "loss": 0.3126, + "num_input_tokens_seen": 12408960, + "step": 12320 + }, + { + "epoch": 5.810938236680811, + "grad_norm": 0.455695778131485, + "learning_rate": 2.2295966710034565e-05, + "loss": 0.3447, + "num_input_tokens_seen": 12413440, + "step": 12325 + }, + { + "epoch": 5.8132956152758135, + "grad_norm": 0.5307115316390991, + "learning_rate": 2.22755163074235e-05, + "loss": 0.2879, + "num_input_tokens_seen": 12417376, + "step": 12330 + }, + { + "epoch": 5.815652993870816, + "grad_norm": 0.36278852820396423, + "learning_rate": 2.2255067749647685e-05, + "loss": 0.3214, + "num_input_tokens_seen": 12422784, + "step": 12335 + }, + { + "epoch": 5.818010372465818, + "grad_norm": 0.585944652557373, + "learning_rate": 2.2234621050553474e-05, + "loss": 0.3345, + "num_input_tokens_seen": 12427936, + "step": 12340 + }, + { + "epoch": 5.820367751060821, + "grad_norm": 0.5467092394828796, + "learning_rate": 2.2214176223986e-05, + "loss": 0.3537, + "num_input_tokens_seen": 12432928, + "step": 12345 + }, + { + "epoch": 5.822725129655822, + "grad_norm": 0.3609158396720886, + "learning_rate": 2.21937332837891e-05, + "loss": 0.33, + "num_input_tokens_seen": 12438176, + "step": 12350 + }, + { + "epoch": 5.825082508250825, + "grad_norm": 0.3603835701942444, + "learning_rate": 2.2173292243805354e-05, + "loss": 0.3186, + "num_input_tokens_seen": 12443616, + "step": 12355 + }, + { + "epoch": 5.827439886845827, + "grad_norm": 0.23970353603363037, + "learning_rate": 2.2152853117876036e-05, + "loss": 0.3133, + "num_input_tokens_seen": 12448416, + "step": 12360 + }, + { + "epoch": 5.829797265440829, + "grad_norm": 0.5023058652877808, + "learning_rate": 2.2132415919841144e-05, + "loss": 0.2789, + "num_input_tokens_seen": 12453856, + "step": 12365 + }, + { + "epoch": 5.832154644035832, + "grad_norm": 0.557881772518158, + "learning_rate": 2.2111980663539346e-05, + "loss": 0.373, + "num_input_tokens_seen": 12458016, + "step": 12370 + }, + { + "epoch": 5.834512022630834, + "grad_norm": 0.39922547340393066, + "learning_rate": 2.2091547362808022e-05, + "loss": 0.3396, + "num_input_tokens_seen": 12463744, + "step": 12375 + }, + { + "epoch": 5.836869401225837, + "grad_norm": 0.36987411975860596, + "learning_rate": 2.2071116031483205e-05, + "loss": 0.3378, + "num_input_tokens_seen": 12468672, + "step": 12380 + }, + { + "epoch": 5.839226779820839, + "grad_norm": 0.3362692594528198, + "learning_rate": 2.2050686683399616e-05, + "loss": 0.3021, + "num_input_tokens_seen": 12472736, + "step": 12385 + }, + { + "epoch": 5.841584158415841, + "grad_norm": 0.5146910548210144, + "learning_rate": 2.203025933239061e-05, + "loss": 0.3893, + "num_input_tokens_seen": 12476768, + "step": 12390 + }, + { + "epoch": 5.843941537010844, + "grad_norm": 0.29009199142456055, + "learning_rate": 2.2009833992288202e-05, + "loss": 0.3619, + "num_input_tokens_seen": 12482048, + "step": 12395 + }, + { + "epoch": 5.846298915605846, + "grad_norm": 0.3670559525489807, + "learning_rate": 2.1989410676923053e-05, + "loss": 0.3311, + "num_input_tokens_seen": 12488096, + "step": 12400 + }, + { + "epoch": 5.848656294200849, + "grad_norm": 0.4077605605125427, + "learning_rate": 2.196898940012444e-05, + "loss": 0.3412, + "num_input_tokens_seen": 12493504, + "step": 12405 + }, + { + "epoch": 5.851013672795851, + "grad_norm": 0.36544060707092285, + "learning_rate": 2.194857017572026e-05, + "loss": 0.2935, + "num_input_tokens_seen": 12498208, + "step": 12410 + }, + { + "epoch": 5.8533710513908535, + "grad_norm": 0.3451593816280365, + "learning_rate": 2.1928153017537034e-05, + "loss": 0.3662, + "num_input_tokens_seen": 12502656, + "step": 12415 + }, + { + "epoch": 5.855728429985856, + "grad_norm": 0.5326082706451416, + "learning_rate": 2.1907737939399862e-05, + "loss": 0.3385, + "num_input_tokens_seen": 12506688, + "step": 12420 + }, + { + "epoch": 5.858085808580858, + "grad_norm": 0.2853084206581116, + "learning_rate": 2.1887324955132463e-05, + "loss": 0.3854, + "num_input_tokens_seen": 12511744, + "step": 12425 + }, + { + "epoch": 5.860443187175861, + "grad_norm": 0.3081589937210083, + "learning_rate": 2.186691407855711e-05, + "loss": 0.3742, + "num_input_tokens_seen": 12516032, + "step": 12430 + }, + { + "epoch": 5.862800565770863, + "grad_norm": 0.4113743305206299, + "learning_rate": 2.1846505323494665e-05, + "loss": 0.3603, + "num_input_tokens_seen": 12520736, + "step": 12435 + }, + { + "epoch": 5.8651579443658655, + "grad_norm": 0.34765172004699707, + "learning_rate": 2.182609870376455e-05, + "loss": 0.3322, + "num_input_tokens_seen": 12525920, + "step": 12440 + }, + { + "epoch": 5.867515322960868, + "grad_norm": 0.4173046946525574, + "learning_rate": 2.1805694233184752e-05, + "loss": 0.3442, + "num_input_tokens_seen": 12530432, + "step": 12445 + }, + { + "epoch": 5.86987270155587, + "grad_norm": 0.3722936511039734, + "learning_rate": 2.178529192557178e-05, + "loss": 0.3248, + "num_input_tokens_seen": 12535904, + "step": 12450 + }, + { + "epoch": 5.872230080150873, + "grad_norm": 0.5713955760002136, + "learning_rate": 2.176489179474068e-05, + "loss": 0.3382, + "num_input_tokens_seen": 12540800, + "step": 12455 + }, + { + "epoch": 5.874587458745875, + "grad_norm": 0.5757285356521606, + "learning_rate": 2.1744493854505064e-05, + "loss": 0.3382, + "num_input_tokens_seen": 12546304, + "step": 12460 + }, + { + "epoch": 5.876944837340877, + "grad_norm": 0.45982226729393005, + "learning_rate": 2.1724098118677006e-05, + "loss": 0.3421, + "num_input_tokens_seen": 12551680, + "step": 12465 + }, + { + "epoch": 5.879302215935879, + "grad_norm": 0.47616589069366455, + "learning_rate": 2.1703704601067127e-05, + "loss": 0.3417, + "num_input_tokens_seen": 12555936, + "step": 12470 + }, + { + "epoch": 5.881659594530881, + "grad_norm": 0.3638717532157898, + "learning_rate": 2.1683313315484514e-05, + "loss": 0.3356, + "num_input_tokens_seen": 12560512, + "step": 12475 + }, + { + "epoch": 5.884016973125884, + "grad_norm": 0.30547505617141724, + "learning_rate": 2.1662924275736788e-05, + "loss": 0.3539, + "num_input_tokens_seen": 12565056, + "step": 12480 + }, + { + "epoch": 5.886374351720886, + "grad_norm": 0.3720332384109497, + "learning_rate": 2.1642537495629995e-05, + "loss": 0.3409, + "num_input_tokens_seen": 12569344, + "step": 12485 + }, + { + "epoch": 5.888731730315889, + "grad_norm": 0.3132162094116211, + "learning_rate": 2.1622152988968693e-05, + "loss": 0.3047, + "num_input_tokens_seen": 12574112, + "step": 12490 + }, + { + "epoch": 5.891089108910891, + "grad_norm": 0.2571845054626465, + "learning_rate": 2.1601770769555872e-05, + "loss": 0.3219, + "num_input_tokens_seen": 12578656, + "step": 12495 + }, + { + "epoch": 5.893446487505893, + "grad_norm": 0.6796435713768005, + "learning_rate": 2.1581390851193006e-05, + "loss": 0.3618, + "num_input_tokens_seen": 12586784, + "step": 12500 + }, + { + "epoch": 5.895803866100896, + "grad_norm": 0.3417738080024719, + "learning_rate": 2.1561013247679967e-05, + "loss": 0.3486, + "num_input_tokens_seen": 12591200, + "step": 12505 + }, + { + "epoch": 5.898161244695898, + "grad_norm": 0.32889264822006226, + "learning_rate": 2.1540637972815107e-05, + "loss": 0.4064, + "num_input_tokens_seen": 12595808, + "step": 12510 + }, + { + "epoch": 5.900518623290901, + "grad_norm": 0.360571026802063, + "learning_rate": 2.1520265040395156e-05, + "loss": 0.3137, + "num_input_tokens_seen": 12600288, + "step": 12515 + }, + { + "epoch": 5.902876001885903, + "grad_norm": 0.544965386390686, + "learning_rate": 2.1499894464215296e-05, + "loss": 0.3201, + "num_input_tokens_seen": 12605120, + "step": 12520 + }, + { + "epoch": 5.905233380480905, + "grad_norm": 0.36804527044296265, + "learning_rate": 2.1479526258069087e-05, + "loss": 0.3739, + "num_input_tokens_seen": 12610624, + "step": 12525 + }, + { + "epoch": 5.907590759075908, + "grad_norm": 0.46080029010772705, + "learning_rate": 2.14591604357485e-05, + "loss": 0.3155, + "num_input_tokens_seen": 12615392, + "step": 12530 + }, + { + "epoch": 5.90994813767091, + "grad_norm": 0.7551490068435669, + "learning_rate": 2.143879701104388e-05, + "loss": 0.3491, + "num_input_tokens_seen": 12621216, + "step": 12535 + }, + { + "epoch": 5.912305516265913, + "grad_norm": 0.3770047426223755, + "learning_rate": 2.141843599774397e-05, + "loss": 0.3804, + "num_input_tokens_seen": 12625824, + "step": 12540 + }, + { + "epoch": 5.914662894860915, + "grad_norm": 0.23671777546405792, + "learning_rate": 2.1398077409635845e-05, + "loss": 0.3413, + "num_input_tokens_seen": 12630752, + "step": 12545 + }, + { + "epoch": 5.9170202734559165, + "grad_norm": 0.47929975390434265, + "learning_rate": 2.1377721260504967e-05, + "loss": 0.3461, + "num_input_tokens_seen": 12636928, + "step": 12550 + }, + { + "epoch": 5.919377652050919, + "grad_norm": 0.73491370677948, + "learning_rate": 2.135736756413514e-05, + "loss": 0.3219, + "num_input_tokens_seen": 12642880, + "step": 12555 + }, + { + "epoch": 5.921735030645921, + "grad_norm": 0.30852505564689636, + "learning_rate": 2.1337016334308505e-05, + "loss": 0.3231, + "num_input_tokens_seen": 12647264, + "step": 12560 + }, + { + "epoch": 5.924092409240924, + "grad_norm": 0.2702178359031677, + "learning_rate": 2.131666758480552e-05, + "loss": 0.3054, + "num_input_tokens_seen": 12651776, + "step": 12565 + }, + { + "epoch": 5.926449787835926, + "grad_norm": 0.5111743211746216, + "learning_rate": 2.1296321329404983e-05, + "loss": 0.3125, + "num_input_tokens_seen": 12655616, + "step": 12570 + }, + { + "epoch": 5.9288071664309285, + "grad_norm": 0.3007754385471344, + "learning_rate": 2.1275977581884003e-05, + "loss": 0.301, + "num_input_tokens_seen": 12660000, + "step": 12575 + }, + { + "epoch": 5.931164545025931, + "grad_norm": 0.4077266752719879, + "learning_rate": 2.1255636356017965e-05, + "loss": 0.347, + "num_input_tokens_seen": 12664352, + "step": 12580 + }, + { + "epoch": 5.933521923620933, + "grad_norm": 0.3348545432090759, + "learning_rate": 2.1235297665580585e-05, + "loss": 0.3974, + "num_input_tokens_seen": 12669728, + "step": 12585 + }, + { + "epoch": 5.935879302215936, + "grad_norm": 0.2699664235115051, + "learning_rate": 2.121496152434383e-05, + "loss": 0.2788, + "num_input_tokens_seen": 12674720, + "step": 12590 + }, + { + "epoch": 5.938236680810938, + "grad_norm": 0.34772536158561707, + "learning_rate": 2.1194627946077962e-05, + "loss": 0.302, + "num_input_tokens_seen": 12679328, + "step": 12595 + }, + { + "epoch": 5.9405940594059405, + "grad_norm": 0.3562777042388916, + "learning_rate": 2.117429694455149e-05, + "loss": 0.3005, + "num_input_tokens_seen": 12684000, + "step": 12600 + }, + { + "epoch": 5.942951438000943, + "grad_norm": 0.5614376664161682, + "learning_rate": 2.1153968533531193e-05, + "loss": 0.3898, + "num_input_tokens_seen": 12688896, + "step": 12605 + }, + { + "epoch": 5.945308816595945, + "grad_norm": 0.30550992488861084, + "learning_rate": 2.1133642726782084e-05, + "loss": 0.307, + "num_input_tokens_seen": 12693952, + "step": 12610 + }, + { + "epoch": 5.947666195190948, + "grad_norm": 0.3325306177139282, + "learning_rate": 2.1113319538067422e-05, + "loss": 0.2641, + "num_input_tokens_seen": 12699104, + "step": 12615 + }, + { + "epoch": 5.95002357378595, + "grad_norm": 0.4059074819087982, + "learning_rate": 2.1092998981148686e-05, + "loss": 0.3071, + "num_input_tokens_seen": 12704896, + "step": 12620 + }, + { + "epoch": 5.9523809523809526, + "grad_norm": 0.3718908727169037, + "learning_rate": 2.1072681069785588e-05, + "loss": 0.3547, + "num_input_tokens_seen": 12709216, + "step": 12625 + }, + { + "epoch": 5.954738330975955, + "grad_norm": 0.33802932500839233, + "learning_rate": 2.1052365817736022e-05, + "loss": 0.3214, + "num_input_tokens_seen": 12714016, + "step": 12630 + }, + { + "epoch": 5.957095709570957, + "grad_norm": 0.3319877088069916, + "learning_rate": 2.1032053238756107e-05, + "loss": 0.341, + "num_input_tokens_seen": 12718400, + "step": 12635 + }, + { + "epoch": 5.95945308816596, + "grad_norm": 0.3984719514846802, + "learning_rate": 2.1011743346600134e-05, + "loss": 0.3739, + "num_input_tokens_seen": 12723392, + "step": 12640 + }, + { + "epoch": 5.961810466760962, + "grad_norm": 0.4781612455844879, + "learning_rate": 2.099143615502059e-05, + "loss": 0.2782, + "num_input_tokens_seen": 12728544, + "step": 12645 + }, + { + "epoch": 5.964167845355965, + "grad_norm": 0.41013383865356445, + "learning_rate": 2.0971131677768115e-05, + "loss": 0.2871, + "num_input_tokens_seen": 12733504, + "step": 12650 + }, + { + "epoch": 5.966525223950967, + "grad_norm": 0.35391321778297424, + "learning_rate": 2.0950829928591533e-05, + "loss": 0.3503, + "num_input_tokens_seen": 12738240, + "step": 12655 + }, + { + "epoch": 5.968882602545969, + "grad_norm": 0.3817015290260315, + "learning_rate": 2.0930530921237798e-05, + "loss": 0.3996, + "num_input_tokens_seen": 12743936, + "step": 12660 + }, + { + "epoch": 5.971239981140971, + "grad_norm": 0.5309619903564453, + "learning_rate": 2.0910234669452027e-05, + "loss": 0.3523, + "num_input_tokens_seen": 12749056, + "step": 12665 + }, + { + "epoch": 5.973597359735973, + "grad_norm": 0.5497778058052063, + "learning_rate": 2.0889941186977468e-05, + "loss": 0.3341, + "num_input_tokens_seen": 12754080, + "step": 12670 + }, + { + "epoch": 5.975954738330976, + "grad_norm": 0.4708418548107147, + "learning_rate": 2.086965048755547e-05, + "loss": 0.3281, + "num_input_tokens_seen": 12760352, + "step": 12675 + }, + { + "epoch": 5.978312116925978, + "grad_norm": 0.33841317892074585, + "learning_rate": 2.084936258492553e-05, + "loss": 0.2869, + "num_input_tokens_seen": 12764224, + "step": 12680 + }, + { + "epoch": 5.9806694955209805, + "grad_norm": 0.34291696548461914, + "learning_rate": 2.0829077492825226e-05, + "loss": 0.3907, + "num_input_tokens_seen": 12768192, + "step": 12685 + }, + { + "epoch": 5.983026874115983, + "grad_norm": 0.6238914728164673, + "learning_rate": 2.080879522499026e-05, + "loss": 0.4202, + "num_input_tokens_seen": 12773216, + "step": 12690 + }, + { + "epoch": 5.985384252710985, + "grad_norm": 0.3029443919658661, + "learning_rate": 2.078851579515439e-05, + "loss": 0.2895, + "num_input_tokens_seen": 12778432, + "step": 12695 + }, + { + "epoch": 5.987741631305988, + "grad_norm": 0.36125606298446655, + "learning_rate": 2.076823921704948e-05, + "loss": 0.3626, + "num_input_tokens_seen": 12782976, + "step": 12700 + }, + { + "epoch": 5.99009900990099, + "grad_norm": 0.4918132424354553, + "learning_rate": 2.0747965504405443e-05, + "loss": 0.311, + "num_input_tokens_seen": 12789728, + "step": 12705 + }, + { + "epoch": 5.9924563884959925, + "grad_norm": 0.3117121756076813, + "learning_rate": 2.072769467095027e-05, + "loss": 0.3548, + "num_input_tokens_seen": 12797312, + "step": 12710 + }, + { + "epoch": 5.994813767090995, + "grad_norm": 0.5704934000968933, + "learning_rate": 2.0707426730409974e-05, + "loss": 0.3279, + "num_input_tokens_seen": 12801600, + "step": 12715 + }, + { + "epoch": 5.997171145685997, + "grad_norm": 0.5660470724105835, + "learning_rate": 2.0687161696508644e-05, + "loss": 0.3947, + "num_input_tokens_seen": 12808512, + "step": 12720 + }, + { + "epoch": 5.999528524281, + "grad_norm": 0.3415999710559845, + "learning_rate": 2.0666899582968374e-05, + "loss": 0.3111, + "num_input_tokens_seen": 12812960, + "step": 12725 + }, + { + "epoch": 6.001885902876002, + "grad_norm": 0.4754696786403656, + "learning_rate": 2.0646640403509304e-05, + "loss": 0.3193, + "num_input_tokens_seen": 12818112, + "step": 12730 + }, + { + "epoch": 6.002828854314003, + "eval_loss": 0.3320164382457733, + "eval_runtime": 25.5837, + "eval_samples_per_second": 36.859, + "eval_steps_per_second": 9.225, + "num_input_tokens_seen": 12820704, + "step": 12732 + }, + { + "epoch": 6.0042432814710045, + "grad_norm": 0.29189154505729675, + "learning_rate": 2.0626384171849557e-05, + "loss": 0.2947, + "num_input_tokens_seen": 12824096, + "step": 12735 + }, + { + "epoch": 6.006600660066007, + "grad_norm": 0.29616105556488037, + "learning_rate": 2.0606130901705286e-05, + "loss": 0.2897, + "num_input_tokens_seen": 12829024, + "step": 12740 + }, + { + "epoch": 6.008958038661009, + "grad_norm": 0.2789301574230194, + "learning_rate": 2.0585880606790626e-05, + "loss": 0.3277, + "num_input_tokens_seen": 12834464, + "step": 12745 + }, + { + "epoch": 6.011315417256012, + "grad_norm": 0.27811866998672485, + "learning_rate": 2.056563330081771e-05, + "loss": 0.3302, + "num_input_tokens_seen": 12838144, + "step": 12750 + }, + { + "epoch": 6.013672795851014, + "grad_norm": 0.37299105525016785, + "learning_rate": 2.054538899749662e-05, + "loss": 0.2889, + "num_input_tokens_seen": 12843296, + "step": 12755 + }, + { + "epoch": 6.016030174446016, + "grad_norm": 0.5682767033576965, + "learning_rate": 2.052514771053544e-05, + "loss": 0.4066, + "num_input_tokens_seen": 12848640, + "step": 12760 + }, + { + "epoch": 6.018387553041018, + "grad_norm": 0.31825852394104004, + "learning_rate": 2.0504909453640186e-05, + "loss": 0.3841, + "num_input_tokens_seen": 12853184, + "step": 12765 + }, + { + "epoch": 6.02074493163602, + "grad_norm": 0.34163928031921387, + "learning_rate": 2.0484674240514835e-05, + "loss": 0.3473, + "num_input_tokens_seen": 12857760, + "step": 12770 + }, + { + "epoch": 6.023102310231023, + "grad_norm": 0.39364853501319885, + "learning_rate": 2.0464442084861292e-05, + "loss": 0.2884, + "num_input_tokens_seen": 12862816, + "step": 12775 + }, + { + "epoch": 6.025459688826025, + "grad_norm": 0.5167708396911621, + "learning_rate": 2.0444213000379402e-05, + "loss": 0.3364, + "num_input_tokens_seen": 12866912, + "step": 12780 + }, + { + "epoch": 6.027817067421028, + "grad_norm": 0.33098897337913513, + "learning_rate": 2.0423987000766935e-05, + "loss": 0.3336, + "num_input_tokens_seen": 12872480, + "step": 12785 + }, + { + "epoch": 6.03017444601603, + "grad_norm": 0.7472811341285706, + "learning_rate": 2.0403764099719543e-05, + "loss": 0.3239, + "num_input_tokens_seen": 12878240, + "step": 12790 + }, + { + "epoch": 6.032531824611032, + "grad_norm": 0.2824952304363251, + "learning_rate": 2.038354431093082e-05, + "loss": 0.3244, + "num_input_tokens_seen": 12885088, + "step": 12795 + }, + { + "epoch": 6.034889203206035, + "grad_norm": 0.3876200318336487, + "learning_rate": 2.036332764809222e-05, + "loss": 0.3813, + "num_input_tokens_seen": 12890080, + "step": 12800 + }, + { + "epoch": 6.037246581801037, + "grad_norm": 0.32940128445625305, + "learning_rate": 2.0343114124893097e-05, + "loss": 0.2867, + "num_input_tokens_seen": 12895616, + "step": 12805 + }, + { + "epoch": 6.03960396039604, + "grad_norm": 0.5433613657951355, + "learning_rate": 2.032290375502068e-05, + "loss": 0.3735, + "num_input_tokens_seen": 12901152, + "step": 12810 + }, + { + "epoch": 6.041961338991042, + "grad_norm": 0.32499706745147705, + "learning_rate": 2.030269655216006e-05, + "loss": 0.2873, + "num_input_tokens_seen": 12906720, + "step": 12815 + }, + { + "epoch": 6.044318717586044, + "grad_norm": 0.30888277292251587, + "learning_rate": 2.0282492529994172e-05, + "loss": 0.3193, + "num_input_tokens_seen": 12911168, + "step": 12820 + }, + { + "epoch": 6.046676096181047, + "grad_norm": 0.3062818944454193, + "learning_rate": 2.0262291702203818e-05, + "loss": 0.3068, + "num_input_tokens_seen": 12915168, + "step": 12825 + }, + { + "epoch": 6.049033474776049, + "grad_norm": 0.26907849311828613, + "learning_rate": 2.024209408246761e-05, + "loss": 0.3462, + "num_input_tokens_seen": 12920032, + "step": 12830 + }, + { + "epoch": 6.051390853371052, + "grad_norm": 0.28430411219596863, + "learning_rate": 2.0221899684462026e-05, + "loss": 0.3218, + "num_input_tokens_seen": 12924736, + "step": 12835 + }, + { + "epoch": 6.053748231966054, + "grad_norm": 0.32016822695732117, + "learning_rate": 2.0201708521861324e-05, + "loss": 0.3284, + "num_input_tokens_seen": 12930624, + "step": 12840 + }, + { + "epoch": 6.0561056105610565, + "grad_norm": 0.27949121594429016, + "learning_rate": 2.0181520608337593e-05, + "loss": 0.3107, + "num_input_tokens_seen": 12935104, + "step": 12845 + }, + { + "epoch": 6.058462989156059, + "grad_norm": 0.31472575664520264, + "learning_rate": 2.0161335957560713e-05, + "loss": 0.3153, + "num_input_tokens_seen": 12939840, + "step": 12850 + }, + { + "epoch": 6.060820367751061, + "grad_norm": 0.46242085099220276, + "learning_rate": 2.0141154583198366e-05, + "loss": 0.3601, + "num_input_tokens_seen": 12945280, + "step": 12855 + }, + { + "epoch": 6.063177746346063, + "grad_norm": 0.6045969724655151, + "learning_rate": 2.0120976498915993e-05, + "loss": 0.3832, + "num_input_tokens_seen": 12950208, + "step": 12860 + }, + { + "epoch": 6.065535124941065, + "grad_norm": 0.32732734084129333, + "learning_rate": 2.010080171837684e-05, + "loss": 0.2958, + "num_input_tokens_seen": 12954976, + "step": 12865 + }, + { + "epoch": 6.067892503536068, + "grad_norm": 0.5211916565895081, + "learning_rate": 2.0080630255241884e-05, + "loss": 0.2935, + "num_input_tokens_seen": 12960032, + "step": 12870 + }, + { + "epoch": 6.07024988213107, + "grad_norm": 0.30892589688301086, + "learning_rate": 2.0060462123169875e-05, + "loss": 0.3339, + "num_input_tokens_seen": 12965280, + "step": 12875 + }, + { + "epoch": 6.072607260726072, + "grad_norm": 0.47800540924072266, + "learning_rate": 2.0040297335817295e-05, + "loss": 0.307, + "num_input_tokens_seen": 12971552, + "step": 12880 + }, + { + "epoch": 6.074964639321075, + "grad_norm": 0.3555357754230499, + "learning_rate": 2.0020135906838383e-05, + "loss": 0.3531, + "num_input_tokens_seen": 12976736, + "step": 12885 + }, + { + "epoch": 6.077322017916077, + "grad_norm": 0.5377951860427856, + "learning_rate": 1.9999977849885072e-05, + "loss": 0.312, + "num_input_tokens_seen": 12981408, + "step": 12890 + }, + { + "epoch": 6.07967939651108, + "grad_norm": 0.2793082892894745, + "learning_rate": 1.9979823178607042e-05, + "loss": 0.2897, + "num_input_tokens_seen": 12985728, + "step": 12895 + }, + { + "epoch": 6.082036775106082, + "grad_norm": 0.410905659198761, + "learning_rate": 1.9959671906651658e-05, + "loss": 0.2793, + "num_input_tokens_seen": 12991136, + "step": 12900 + }, + { + "epoch": 6.084394153701084, + "grad_norm": 0.5663029551506042, + "learning_rate": 1.993952404766399e-05, + "loss": 0.3542, + "num_input_tokens_seen": 12995424, + "step": 12905 + }, + { + "epoch": 6.086751532296087, + "grad_norm": 0.5363618731498718, + "learning_rate": 1.991937961528681e-05, + "loss": 0.3188, + "num_input_tokens_seen": 13000512, + "step": 12910 + }, + { + "epoch": 6.089108910891089, + "grad_norm": 0.5313544273376465, + "learning_rate": 1.989923862316055e-05, + "loss": 0.3377, + "num_input_tokens_seen": 13005216, + "step": 12915 + }, + { + "epoch": 6.091466289486092, + "grad_norm": 0.29342886805534363, + "learning_rate": 1.9879101084923324e-05, + "loss": 0.3735, + "num_input_tokens_seen": 13011552, + "step": 12920 + }, + { + "epoch": 6.093823668081094, + "grad_norm": 0.384771466255188, + "learning_rate": 1.98589670142109e-05, + "loss": 0.315, + "num_input_tokens_seen": 13016960, + "step": 12925 + }, + { + "epoch": 6.096181046676096, + "grad_norm": 0.5391272306442261, + "learning_rate": 1.9838836424656714e-05, + "loss": 0.3538, + "num_input_tokens_seen": 13022176, + "step": 12930 + }, + { + "epoch": 6.098538425271099, + "grad_norm": 0.29723864793777466, + "learning_rate": 1.9818709329891823e-05, + "loss": 0.3094, + "num_input_tokens_seen": 13027104, + "step": 12935 + }, + { + "epoch": 6.100895803866101, + "grad_norm": 0.4912359416484833, + "learning_rate": 1.9798585743544938e-05, + "loss": 0.319, + "num_input_tokens_seen": 13031904, + "step": 12940 + }, + { + "epoch": 6.103253182461104, + "grad_norm": 0.4549797773361206, + "learning_rate": 1.9778465679242373e-05, + "loss": 0.3044, + "num_input_tokens_seen": 13037632, + "step": 12945 + }, + { + "epoch": 6.105610561056106, + "grad_norm": 0.277619868516922, + "learning_rate": 1.9758349150608076e-05, + "loss": 0.3144, + "num_input_tokens_seen": 13042720, + "step": 12950 + }, + { + "epoch": 6.107967939651108, + "grad_norm": 0.32661324739456177, + "learning_rate": 1.9738236171263593e-05, + "loss": 0.335, + "num_input_tokens_seen": 13047136, + "step": 12955 + }, + { + "epoch": 6.11032531824611, + "grad_norm": 0.3829572796821594, + "learning_rate": 1.9718126754828066e-05, + "loss": 0.362, + "num_input_tokens_seen": 13052032, + "step": 12960 + }, + { + "epoch": 6.112682696841112, + "grad_norm": 0.545445442199707, + "learning_rate": 1.9698020914918218e-05, + "loss": 0.4009, + "num_input_tokens_seen": 13058848, + "step": 12965 + }, + { + "epoch": 6.115040075436115, + "grad_norm": 0.25724533200263977, + "learning_rate": 1.967791866514837e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13062848, + "step": 12970 + }, + { + "epoch": 6.117397454031117, + "grad_norm": 0.2650902569293976, + "learning_rate": 1.965782001913039e-05, + "loss": 0.3366, + "num_input_tokens_seen": 13068352, + "step": 12975 + }, + { + "epoch": 6.1197548326261195, + "grad_norm": 0.4914599359035492, + "learning_rate": 1.9637724990473727e-05, + "loss": 0.3549, + "num_input_tokens_seen": 13073536, + "step": 12980 + }, + { + "epoch": 6.122112211221122, + "grad_norm": 0.3424645662307739, + "learning_rate": 1.9617633592785353e-05, + "loss": 0.3615, + "num_input_tokens_seen": 13080352, + "step": 12985 + }, + { + "epoch": 6.124469589816124, + "grad_norm": 0.2930172383785248, + "learning_rate": 1.9597545839669815e-05, + "loss": 0.3149, + "num_input_tokens_seen": 13085984, + "step": 12990 + }, + { + "epoch": 6.126826968411127, + "grad_norm": 0.46039915084838867, + "learning_rate": 1.957746174472916e-05, + "loss": 0.3427, + "num_input_tokens_seen": 13090560, + "step": 12995 + }, + { + "epoch": 6.129184347006129, + "grad_norm": 0.35142987966537476, + "learning_rate": 1.955738132156298e-05, + "loss": 0.3312, + "num_input_tokens_seen": 13096192, + "step": 13000 + }, + { + "epoch": 6.1315417256011315, + "grad_norm": 0.27614566683769226, + "learning_rate": 1.953730458376837e-05, + "loss": 0.3425, + "num_input_tokens_seen": 13101280, + "step": 13005 + }, + { + "epoch": 6.133899104196134, + "grad_norm": 0.40953943133354187, + "learning_rate": 1.9517231544939942e-05, + "loss": 0.3373, + "num_input_tokens_seen": 13108000, + "step": 13010 + }, + { + "epoch": 6.136256482791136, + "grad_norm": 0.32601162791252136, + "learning_rate": 1.949716221866978e-05, + "loss": 0.3476, + "num_input_tokens_seen": 13113888, + "step": 13015 + }, + { + "epoch": 6.138613861386139, + "grad_norm": 0.4907996356487274, + "learning_rate": 1.9477096618547476e-05, + "loss": 0.2986, + "num_input_tokens_seen": 13119552, + "step": 13020 + }, + { + "epoch": 6.140971239981141, + "grad_norm": 0.27457085251808167, + "learning_rate": 1.9457034758160085e-05, + "loss": 0.2978, + "num_input_tokens_seen": 13125280, + "step": 13025 + }, + { + "epoch": 6.1433286185761435, + "grad_norm": 0.39518141746520996, + "learning_rate": 1.9436976651092144e-05, + "loss": 0.3458, + "num_input_tokens_seen": 13129856, + "step": 13030 + }, + { + "epoch": 6.145685997171146, + "grad_norm": 0.3629002869129181, + "learning_rate": 1.9416922310925636e-05, + "loss": 0.3739, + "num_input_tokens_seen": 13134944, + "step": 13035 + }, + { + "epoch": 6.148043375766148, + "grad_norm": 0.5150750875473022, + "learning_rate": 1.9396871751239994e-05, + "loss": 0.3354, + "num_input_tokens_seen": 13139264, + "step": 13040 + }, + { + "epoch": 6.150400754361151, + "grad_norm": 0.358599990606308, + "learning_rate": 1.9376824985612104e-05, + "loss": 0.3122, + "num_input_tokens_seen": 13144896, + "step": 13045 + }, + { + "epoch": 6.152758132956153, + "grad_norm": 0.7207773923873901, + "learning_rate": 1.9356782027616264e-05, + "loss": 0.3656, + "num_input_tokens_seen": 13150112, + "step": 13050 + }, + { + "epoch": 6.1551155115511555, + "grad_norm": 0.41004425287246704, + "learning_rate": 1.933674289082421e-05, + "loss": 0.3816, + "num_input_tokens_seen": 13155008, + "step": 13055 + }, + { + "epoch": 6.157472890146157, + "grad_norm": 0.3767876923084259, + "learning_rate": 1.931670758880508e-05, + "loss": 0.372, + "num_input_tokens_seen": 13159584, + "step": 13060 + }, + { + "epoch": 6.1598302687411595, + "grad_norm": 0.3530625104904175, + "learning_rate": 1.9296676135125423e-05, + "loss": 0.3042, + "num_input_tokens_seen": 13165536, + "step": 13065 + }, + { + "epoch": 6.162187647336162, + "grad_norm": 0.3349190056324005, + "learning_rate": 1.9276648543349175e-05, + "loss": 0.3118, + "num_input_tokens_seen": 13170336, + "step": 13070 + }, + { + "epoch": 6.164545025931164, + "grad_norm": 0.2640615701675415, + "learning_rate": 1.9256624827037664e-05, + "loss": 0.3417, + "num_input_tokens_seen": 13175808, + "step": 13075 + }, + { + "epoch": 6.166902404526167, + "grad_norm": 0.3315025269985199, + "learning_rate": 1.923660499974958e-05, + "loss": 0.355, + "num_input_tokens_seen": 13180000, + "step": 13080 + }, + { + "epoch": 6.169259783121169, + "grad_norm": 0.48177972435951233, + "learning_rate": 1.921658907504101e-05, + "loss": 0.365, + "num_input_tokens_seen": 13185664, + "step": 13085 + }, + { + "epoch": 6.1716171617161715, + "grad_norm": 0.43777450919151306, + "learning_rate": 1.9196577066465353e-05, + "loss": 0.3442, + "num_input_tokens_seen": 13189600, + "step": 13090 + }, + { + "epoch": 6.173974540311174, + "grad_norm": 0.3852463960647583, + "learning_rate": 1.91765689875734e-05, + "loss": 0.3531, + "num_input_tokens_seen": 13193824, + "step": 13095 + }, + { + "epoch": 6.176331918906176, + "grad_norm": 0.2911893427371979, + "learning_rate": 1.9156564851913257e-05, + "loss": 0.3163, + "num_input_tokens_seen": 13198432, + "step": 13100 + }, + { + "epoch": 6.178689297501179, + "grad_norm": 0.4032494127750397, + "learning_rate": 1.9136564673030364e-05, + "loss": 0.3624, + "num_input_tokens_seen": 13202240, + "step": 13105 + }, + { + "epoch": 6.181046676096181, + "grad_norm": 0.7815051674842834, + "learning_rate": 1.9116568464467486e-05, + "loss": 0.3448, + "num_input_tokens_seen": 13208480, + "step": 13110 + }, + { + "epoch": 6.1834040546911835, + "grad_norm": 0.3235092759132385, + "learning_rate": 1.9096576239764697e-05, + "loss": 0.355, + "num_input_tokens_seen": 13213344, + "step": 13115 + }, + { + "epoch": 6.185761433286186, + "grad_norm": 0.364224910736084, + "learning_rate": 1.9076588012459377e-05, + "loss": 0.3273, + "num_input_tokens_seen": 13219200, + "step": 13120 + }, + { + "epoch": 6.188118811881188, + "grad_norm": 0.5729198455810547, + "learning_rate": 1.9056603796086183e-05, + "loss": 0.3333, + "num_input_tokens_seen": 13224832, + "step": 13125 + }, + { + "epoch": 6.190476190476191, + "grad_norm": 0.5475242137908936, + "learning_rate": 1.9036623604177074e-05, + "loss": 0.3375, + "num_input_tokens_seen": 13230272, + "step": 13130 + }, + { + "epoch": 6.192833569071193, + "grad_norm": 0.3165191411972046, + "learning_rate": 1.9016647450261284e-05, + "loss": 0.3065, + "num_input_tokens_seen": 13235616, + "step": 13135 + }, + { + "epoch": 6.1951909476661955, + "grad_norm": 0.28386619687080383, + "learning_rate": 1.8996675347865293e-05, + "loss": 0.3071, + "num_input_tokens_seen": 13239840, + "step": 13140 + }, + { + "epoch": 6.197548326261198, + "grad_norm": 0.2406371831893921, + "learning_rate": 1.8976707310512873e-05, + "loss": 0.3322, + "num_input_tokens_seen": 13244480, + "step": 13145 + }, + { + "epoch": 6.1999057048562, + "grad_norm": 0.4724380671977997, + "learning_rate": 1.8956743351725002e-05, + "loss": 0.3389, + "num_input_tokens_seen": 13248800, + "step": 13150 + }, + { + "epoch": 6.202263083451202, + "grad_norm": 0.439576655626297, + "learning_rate": 1.8936783485019934e-05, + "loss": 0.2879, + "num_input_tokens_seen": 13253056, + "step": 13155 + }, + { + "epoch": 6.204620462046204, + "grad_norm": 0.2964024543762207, + "learning_rate": 1.891682772391312e-05, + "loss": 0.2607, + "num_input_tokens_seen": 13258016, + "step": 13160 + }, + { + "epoch": 6.206977840641207, + "grad_norm": 0.3812639117240906, + "learning_rate": 1.8896876081917257e-05, + "loss": 0.3384, + "num_input_tokens_seen": 13263328, + "step": 13165 + }, + { + "epoch": 6.209335219236209, + "grad_norm": 0.5757248997688293, + "learning_rate": 1.887692857254223e-05, + "loss": 0.3766, + "num_input_tokens_seen": 13267040, + "step": 13170 + }, + { + "epoch": 6.211692597831211, + "grad_norm": 0.3170773983001709, + "learning_rate": 1.8856985209295148e-05, + "loss": 0.3534, + "num_input_tokens_seen": 13271648, + "step": 13175 + }, + { + "epoch": 6.214049976426214, + "grad_norm": 0.272751122713089, + "learning_rate": 1.883704600568029e-05, + "loss": 0.3064, + "num_input_tokens_seen": 13276384, + "step": 13180 + }, + { + "epoch": 6.216407355021216, + "grad_norm": 0.3813071846961975, + "learning_rate": 1.8817110975199136e-05, + "loss": 0.2761, + "num_input_tokens_seen": 13281120, + "step": 13185 + }, + { + "epoch": 6.218764733616219, + "grad_norm": 0.2788962423801422, + "learning_rate": 1.879718013135034e-05, + "loss": 0.3689, + "num_input_tokens_seen": 13285504, + "step": 13190 + }, + { + "epoch": 6.221122112211221, + "grad_norm": 0.47089117765426636, + "learning_rate": 1.8777253487629707e-05, + "loss": 0.3421, + "num_input_tokens_seen": 13290048, + "step": 13195 + }, + { + "epoch": 6.223479490806223, + "grad_norm": 0.7774358987808228, + "learning_rate": 1.8757331057530212e-05, + "loss": 0.3927, + "num_input_tokens_seen": 13294784, + "step": 13200 + }, + { + "epoch": 6.225836869401226, + "grad_norm": 0.4735918343067169, + "learning_rate": 1.8737412854541963e-05, + "loss": 0.2834, + "num_input_tokens_seen": 13300064, + "step": 13205 + }, + { + "epoch": 6.228194247996228, + "grad_norm": 0.46205976605415344, + "learning_rate": 1.8717498892152223e-05, + "loss": 0.3051, + "num_input_tokens_seen": 13304000, + "step": 13210 + }, + { + "epoch": 6.230551626591231, + "grad_norm": 0.5945736765861511, + "learning_rate": 1.8697589183845366e-05, + "loss": 0.3458, + "num_input_tokens_seen": 13309696, + "step": 13215 + }, + { + "epoch": 6.232909005186233, + "grad_norm": 0.321315199136734, + "learning_rate": 1.8677683743102902e-05, + "loss": 0.3475, + "num_input_tokens_seen": 13313952, + "step": 13220 + }, + { + "epoch": 6.235266383781235, + "grad_norm": 0.2566947638988495, + "learning_rate": 1.8657782583403436e-05, + "loss": 0.3133, + "num_input_tokens_seen": 13318592, + "step": 13225 + }, + { + "epoch": 6.237623762376238, + "grad_norm": 0.3625091314315796, + "learning_rate": 1.8637885718222687e-05, + "loss": 0.3642, + "num_input_tokens_seen": 13322720, + "step": 13230 + }, + { + "epoch": 6.23998114097124, + "grad_norm": 0.5510318279266357, + "learning_rate": 1.8617993161033463e-05, + "loss": 0.3312, + "num_input_tokens_seen": 13329344, + "step": 13235 + }, + { + "epoch": 6.242338519566243, + "grad_norm": 0.2549631893634796, + "learning_rate": 1.8598104925305644e-05, + "loss": 0.3425, + "num_input_tokens_seen": 13335136, + "step": 13240 + }, + { + "epoch": 6.244695898161245, + "grad_norm": 0.2790239751338959, + "learning_rate": 1.8578221024506183e-05, + "loss": 0.3354, + "num_input_tokens_seen": 13340736, + "step": 13245 + }, + { + "epoch": 6.247053276756247, + "grad_norm": 0.3195487856864929, + "learning_rate": 1.8558341472099126e-05, + "loss": 0.3199, + "num_input_tokens_seen": 13347072, + "step": 13250 + }, + { + "epoch": 6.24941065535125, + "grad_norm": 0.35651060938835144, + "learning_rate": 1.853846628154554e-05, + "loss": 0.3351, + "num_input_tokens_seen": 13352480, + "step": 13255 + }, + { + "epoch": 6.251768033946251, + "grad_norm": 0.2884892225265503, + "learning_rate": 1.851859546630357e-05, + "loss": 0.3491, + "num_input_tokens_seen": 13356768, + "step": 13260 + }, + { + "epoch": 6.254125412541254, + "grad_norm": 0.27733007073402405, + "learning_rate": 1.8498729039828362e-05, + "loss": 0.3281, + "num_input_tokens_seen": 13360896, + "step": 13265 + }, + { + "epoch": 6.256482791136256, + "grad_norm": 0.384342759847641, + "learning_rate": 1.847886701557213e-05, + "loss": 0.3084, + "num_input_tokens_seen": 13366272, + "step": 13270 + }, + { + "epoch": 6.258840169731259, + "grad_norm": 0.4951072633266449, + "learning_rate": 1.8459009406984073e-05, + "loss": 0.3926, + "num_input_tokens_seen": 13370464, + "step": 13275 + }, + { + "epoch": 6.261197548326261, + "grad_norm": 0.29317963123321533, + "learning_rate": 1.8439156227510425e-05, + "loss": 0.3232, + "num_input_tokens_seen": 13375424, + "step": 13280 + }, + { + "epoch": 6.263554926921263, + "grad_norm": 0.4013907313346863, + "learning_rate": 1.8419307490594396e-05, + "loss": 0.3411, + "num_input_tokens_seen": 13380736, + "step": 13285 + }, + { + "epoch": 6.265912305516266, + "grad_norm": 0.6036160588264465, + "learning_rate": 1.8399463209676214e-05, + "loss": 0.3292, + "num_input_tokens_seen": 13385088, + "step": 13290 + }, + { + "epoch": 6.268269684111268, + "grad_norm": 0.4336033761501312, + "learning_rate": 1.8379623398193074e-05, + "loss": 0.2976, + "num_input_tokens_seen": 13390208, + "step": 13295 + }, + { + "epoch": 6.270627062706271, + "grad_norm": 0.3910869061946869, + "learning_rate": 1.8359788069579153e-05, + "loss": 0.327, + "num_input_tokens_seen": 13395488, + "step": 13300 + }, + { + "epoch": 6.272984441301273, + "grad_norm": 0.37339287996292114, + "learning_rate": 1.8339957237265577e-05, + "loss": 0.3097, + "num_input_tokens_seen": 13400608, + "step": 13305 + }, + { + "epoch": 6.275341819896275, + "grad_norm": 0.3237535059452057, + "learning_rate": 1.832013091468045e-05, + "loss": 0.3084, + "num_input_tokens_seen": 13406784, + "step": 13310 + }, + { + "epoch": 6.277699198491278, + "grad_norm": 0.6424565315246582, + "learning_rate": 1.8300309115248797e-05, + "loss": 0.3114, + "num_input_tokens_seen": 13412032, + "step": 13315 + }, + { + "epoch": 6.28005657708628, + "grad_norm": 0.5660730004310608, + "learning_rate": 1.828049185239261e-05, + "loss": 0.3489, + "num_input_tokens_seen": 13420768, + "step": 13320 + }, + { + "epoch": 6.282413955681283, + "grad_norm": 0.33262452483177185, + "learning_rate": 1.8260679139530784e-05, + "loss": 0.3552, + "num_input_tokens_seen": 13424768, + "step": 13325 + }, + { + "epoch": 6.284771334276285, + "grad_norm": 0.24796904623508453, + "learning_rate": 1.8240870990079144e-05, + "loss": 0.3337, + "num_input_tokens_seen": 13428800, + "step": 13330 + }, + { + "epoch": 6.287128712871287, + "grad_norm": 0.26633334159851074, + "learning_rate": 1.8221067417450417e-05, + "loss": 0.3138, + "num_input_tokens_seen": 13433344, + "step": 13335 + }, + { + "epoch": 6.28948609146629, + "grad_norm": 0.3294439911842346, + "learning_rate": 1.820126843505425e-05, + "loss": 0.2894, + "num_input_tokens_seen": 13438272, + "step": 13340 + }, + { + "epoch": 6.291843470061292, + "grad_norm": 0.28906482458114624, + "learning_rate": 1.8181474056297158e-05, + "loss": 0.3317, + "num_input_tokens_seen": 13443104, + "step": 13345 + }, + { + "epoch": 6.294200848656295, + "grad_norm": 0.31528303027153015, + "learning_rate": 1.8161684294582547e-05, + "loss": 0.3574, + "num_input_tokens_seen": 13449152, + "step": 13350 + }, + { + "epoch": 6.296558227251296, + "grad_norm": 0.3104722201824188, + "learning_rate": 1.814189916331071e-05, + "loss": 0.3875, + "num_input_tokens_seen": 13454112, + "step": 13355 + }, + { + "epoch": 6.2989156058462985, + "grad_norm": 0.32551416754722595, + "learning_rate": 1.812211867587878e-05, + "loss": 0.3458, + "num_input_tokens_seen": 13459424, + "step": 13360 + }, + { + "epoch": 6.301272984441301, + "grad_norm": 0.46951770782470703, + "learning_rate": 1.8102342845680774e-05, + "loss": 0.3506, + "num_input_tokens_seen": 13464160, + "step": 13365 + }, + { + "epoch": 6.303630363036303, + "grad_norm": 0.273989200592041, + "learning_rate": 1.8082571686107525e-05, + "loss": 0.2984, + "num_input_tokens_seen": 13468704, + "step": 13370 + }, + { + "epoch": 6.305987741631306, + "grad_norm": 0.5083168148994446, + "learning_rate": 1.8062805210546732e-05, + "loss": 0.2847, + "num_input_tokens_seen": 13473184, + "step": 13375 + }, + { + "epoch": 6.308345120226308, + "grad_norm": 0.2872740924358368, + "learning_rate": 1.80430434323829e-05, + "loss": 0.321, + "num_input_tokens_seen": 13477824, + "step": 13380 + }, + { + "epoch": 6.3107024988213105, + "grad_norm": 0.7259162664413452, + "learning_rate": 1.8023286364997372e-05, + "loss": 0.354, + "num_input_tokens_seen": 13484288, + "step": 13385 + }, + { + "epoch": 6.313059877416313, + "grad_norm": 0.3390604555606842, + "learning_rate": 1.800353402176828e-05, + "loss": 0.3372, + "num_input_tokens_seen": 13489312, + "step": 13390 + }, + { + "epoch": 6.315417256011315, + "grad_norm": 0.3550299406051636, + "learning_rate": 1.798378641607058e-05, + "loss": 0.3364, + "num_input_tokens_seen": 13495040, + "step": 13395 + }, + { + "epoch": 6.317774634606318, + "grad_norm": 0.3501130938529968, + "learning_rate": 1.7964043561276e-05, + "loss": 0.3495, + "num_input_tokens_seen": 13499616, + "step": 13400 + }, + { + "epoch": 6.32013201320132, + "grad_norm": 0.5580697655677795, + "learning_rate": 1.794430547075307e-05, + "loss": 0.3272, + "num_input_tokens_seen": 13503616, + "step": 13405 + }, + { + "epoch": 6.3224893917963225, + "grad_norm": 0.37587592005729675, + "learning_rate": 1.792457215786707e-05, + "loss": 0.3331, + "num_input_tokens_seen": 13509440, + "step": 13410 + }, + { + "epoch": 6.324846770391325, + "grad_norm": 0.3164370357990265, + "learning_rate": 1.790484363598007e-05, + "loss": 0.3236, + "num_input_tokens_seen": 13514752, + "step": 13415 + }, + { + "epoch": 6.327204148986327, + "grad_norm": 0.3141029179096222, + "learning_rate": 1.7885119918450884e-05, + "loss": 0.3014, + "num_input_tokens_seen": 13519360, + "step": 13420 + }, + { + "epoch": 6.32956152758133, + "grad_norm": 0.37334930896759033, + "learning_rate": 1.786540101863507e-05, + "loss": 0.2995, + "num_input_tokens_seen": 13525120, + "step": 13425 + }, + { + "epoch": 6.331918906176332, + "grad_norm": 0.32376664876937866, + "learning_rate": 1.784568694988492e-05, + "loss": 0.3076, + "num_input_tokens_seen": 13530208, + "step": 13430 + }, + { + "epoch": 6.3342762847713345, + "grad_norm": 0.3221428096294403, + "learning_rate": 1.7825977725549475e-05, + "loss": 0.3514, + "num_input_tokens_seen": 13535328, + "step": 13435 + }, + { + "epoch": 6.336633663366337, + "grad_norm": 0.42701056599617004, + "learning_rate": 1.7806273358974473e-05, + "loss": 0.3202, + "num_input_tokens_seen": 13539936, + "step": 13440 + }, + { + "epoch": 6.338991041961339, + "grad_norm": 0.24929232895374298, + "learning_rate": 1.778657386350238e-05, + "loss": 0.2946, + "num_input_tokens_seen": 13544448, + "step": 13445 + }, + { + "epoch": 6.341348420556342, + "grad_norm": 0.3522411584854126, + "learning_rate": 1.776687925247235e-05, + "loss": 0.3109, + "num_input_tokens_seen": 13550336, + "step": 13450 + }, + { + "epoch": 6.343705799151344, + "grad_norm": 0.6094906330108643, + "learning_rate": 1.7747189539220237e-05, + "loss": 0.337, + "num_input_tokens_seen": 13556480, + "step": 13455 + }, + { + "epoch": 6.346063177746346, + "grad_norm": 0.5231804251670837, + "learning_rate": 1.7727504737078588e-05, + "loss": 0.3729, + "num_input_tokens_seen": 13562752, + "step": 13460 + }, + { + "epoch": 6.348420556341348, + "grad_norm": 0.3074835240840912, + "learning_rate": 1.770782485937659e-05, + "loss": 0.3368, + "num_input_tokens_seen": 13566944, + "step": 13465 + }, + { + "epoch": 6.3507779349363505, + "grad_norm": 0.4006820619106293, + "learning_rate": 1.7688149919440137e-05, + "loss": 0.2721, + "num_input_tokens_seen": 13571232, + "step": 13470 + }, + { + "epoch": 6.353135313531353, + "grad_norm": 0.37173178791999817, + "learning_rate": 1.7668479930591753e-05, + "loss": 0.3895, + "num_input_tokens_seen": 13576544, + "step": 13475 + }, + { + "epoch": 6.355492692126355, + "grad_norm": 0.3430730998516083, + "learning_rate": 1.7648814906150624e-05, + "loss": 0.3281, + "num_input_tokens_seen": 13580800, + "step": 13480 + }, + { + "epoch": 6.357850070721358, + "grad_norm": 0.4615550637245178, + "learning_rate": 1.7629154859432562e-05, + "loss": 0.2861, + "num_input_tokens_seen": 13587040, + "step": 13485 + }, + { + "epoch": 6.36020744931636, + "grad_norm": 0.35394468903541565, + "learning_rate": 1.7609499803750023e-05, + "loss": 0.3519, + "num_input_tokens_seen": 13591424, + "step": 13490 + }, + { + "epoch": 6.3625648279113625, + "grad_norm": 0.33039411902427673, + "learning_rate": 1.758984975241207e-05, + "loss": 0.3014, + "num_input_tokens_seen": 13596992, + "step": 13495 + }, + { + "epoch": 6.364922206506365, + "grad_norm": 0.3107241988182068, + "learning_rate": 1.757020471872438e-05, + "loss": 0.3299, + "num_input_tokens_seen": 13602144, + "step": 13500 + }, + { + "epoch": 6.367279585101367, + "grad_norm": 0.5458829998970032, + "learning_rate": 1.755056471598924e-05, + "loss": 0.377, + "num_input_tokens_seen": 13606528, + "step": 13505 + }, + { + "epoch": 6.36963696369637, + "grad_norm": 0.2853847146034241, + "learning_rate": 1.7530929757505527e-05, + "loss": 0.3244, + "num_input_tokens_seen": 13610624, + "step": 13510 + }, + { + "epoch": 6.371994342291372, + "grad_norm": 0.3802318871021271, + "learning_rate": 1.7511299856568695e-05, + "loss": 0.2948, + "num_input_tokens_seen": 13616256, + "step": 13515 + }, + { + "epoch": 6.3743517208863745, + "grad_norm": 0.25421902537345886, + "learning_rate": 1.749167502647078e-05, + "loss": 0.2955, + "num_input_tokens_seen": 13620832, + "step": 13520 + }, + { + "epoch": 6.376709099481377, + "grad_norm": 0.28840506076812744, + "learning_rate": 1.747205528050039e-05, + "loss": 0.2767, + "num_input_tokens_seen": 13625664, + "step": 13525 + }, + { + "epoch": 6.379066478076379, + "grad_norm": 0.4849260747432709, + "learning_rate": 1.7452440631942678e-05, + "loss": 0.3197, + "num_input_tokens_seen": 13630592, + "step": 13530 + }, + { + "epoch": 6.381423856671382, + "grad_norm": 0.3044903874397278, + "learning_rate": 1.7432831094079355e-05, + "loss": 0.333, + "num_input_tokens_seen": 13634336, + "step": 13535 + }, + { + "epoch": 6.383781235266384, + "grad_norm": 0.25960907340049744, + "learning_rate": 1.7413226680188662e-05, + "loss": 0.2768, + "num_input_tokens_seen": 13639552, + "step": 13540 + }, + { + "epoch": 6.3861386138613865, + "grad_norm": 0.43976113200187683, + "learning_rate": 1.7393627403545378e-05, + "loss": 0.401, + "num_input_tokens_seen": 13645536, + "step": 13545 + }, + { + "epoch": 6.388495992456389, + "grad_norm": 0.2209375500679016, + "learning_rate": 1.737403327742081e-05, + "loss": 0.3114, + "num_input_tokens_seen": 13650144, + "step": 13550 + }, + { + "epoch": 6.39085337105139, + "grad_norm": 0.24621650576591492, + "learning_rate": 1.7354444315082753e-05, + "loss": 0.3276, + "num_input_tokens_seen": 13655392, + "step": 13555 + }, + { + "epoch": 6.393210749646393, + "grad_norm": 0.3111286163330078, + "learning_rate": 1.7334860529795537e-05, + "loss": 0.3467, + "num_input_tokens_seen": 13660864, + "step": 13560 + }, + { + "epoch": 6.395568128241395, + "grad_norm": 0.2540666162967682, + "learning_rate": 1.7315281934819962e-05, + "loss": 0.3334, + "num_input_tokens_seen": 13665888, + "step": 13565 + }, + { + "epoch": 6.397925506836398, + "grad_norm": 0.32520076632499695, + "learning_rate": 1.7295708543413326e-05, + "loss": 0.297, + "num_input_tokens_seen": 13670112, + "step": 13570 + }, + { + "epoch": 6.4002828854314, + "grad_norm": 0.27442434430122375, + "learning_rate": 1.72761403688294e-05, + "loss": 0.3311, + "num_input_tokens_seen": 13674464, + "step": 13575 + }, + { + "epoch": 6.402640264026402, + "grad_norm": 0.2508433163166046, + "learning_rate": 1.725657742431841e-05, + "loss": 0.3262, + "num_input_tokens_seen": 13679648, + "step": 13580 + }, + { + "epoch": 6.404997642621405, + "grad_norm": 0.44652315974235535, + "learning_rate": 1.7237019723127074e-05, + "loss": 0.2797, + "num_input_tokens_seen": 13684320, + "step": 13585 + }, + { + "epoch": 6.407355021216407, + "grad_norm": 0.2340131551027298, + "learning_rate": 1.721746727849852e-05, + "loss": 0.3182, + "num_input_tokens_seen": 13688576, + "step": 13590 + }, + { + "epoch": 6.40971239981141, + "grad_norm": 0.44439932703971863, + "learning_rate": 1.7197920103672345e-05, + "loss": 0.3523, + "num_input_tokens_seen": 13693760, + "step": 13595 + }, + { + "epoch": 6.412069778406412, + "grad_norm": 0.33920523524284363, + "learning_rate": 1.7178378211884565e-05, + "loss": 0.2686, + "num_input_tokens_seen": 13698112, + "step": 13600 + }, + { + "epoch": 6.414427157001414, + "grad_norm": 0.2694844603538513, + "learning_rate": 1.7158841616367622e-05, + "loss": 0.3096, + "num_input_tokens_seen": 13702528, + "step": 13605 + }, + { + "epoch": 6.416784535596417, + "grad_norm": 0.33870646357536316, + "learning_rate": 1.7139310330350365e-05, + "loss": 0.2577, + "num_input_tokens_seen": 13707520, + "step": 13610 + }, + { + "epoch": 6.419141914191419, + "grad_norm": 0.44731488823890686, + "learning_rate": 1.7119784367058066e-05, + "loss": 0.3198, + "num_input_tokens_seen": 13712672, + "step": 13615 + }, + { + "epoch": 6.421499292786422, + "grad_norm": 0.40697216987609863, + "learning_rate": 1.710026373971237e-05, + "loss": 0.3309, + "num_input_tokens_seen": 13717440, + "step": 13620 + }, + { + "epoch": 6.423856671381424, + "grad_norm": 0.3475494980812073, + "learning_rate": 1.7080748461531328e-05, + "loss": 0.3878, + "num_input_tokens_seen": 13722272, + "step": 13625 + }, + { + "epoch": 6.426214049976426, + "grad_norm": 0.3777853846549988, + "learning_rate": 1.706123854572935e-05, + "loss": 0.3556, + "num_input_tokens_seen": 13727040, + "step": 13630 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.28425711393356323, + "learning_rate": 1.704173400551724e-05, + "loss": 0.2891, + "num_input_tokens_seen": 13731872, + "step": 13635 + }, + { + "epoch": 6.430928807166431, + "grad_norm": 0.3980942964553833, + "learning_rate": 1.7022234854102133e-05, + "loss": 0.3441, + "num_input_tokens_seen": 13737216, + "step": 13640 + }, + { + "epoch": 6.433286185761434, + "grad_norm": 0.33328431844711304, + "learning_rate": 1.7002741104687537e-05, + "loss": 0.2806, + "num_input_tokens_seen": 13741888, + "step": 13645 + }, + { + "epoch": 6.435643564356436, + "grad_norm": 0.3649362325668335, + "learning_rate": 1.6983252770473287e-05, + "loss": 0.2545, + "num_input_tokens_seen": 13746944, + "step": 13650 + }, + { + "epoch": 6.438000942951438, + "grad_norm": 0.3229846656322479, + "learning_rate": 1.696376986465557e-05, + "loss": 0.3085, + "num_input_tokens_seen": 13752256, + "step": 13655 + }, + { + "epoch": 6.44035832154644, + "grad_norm": 0.3982031047344208, + "learning_rate": 1.6944292400426882e-05, + "loss": 0.3006, + "num_input_tokens_seen": 13757888, + "step": 13660 + }, + { + "epoch": 6.442715700141442, + "grad_norm": 0.19651490449905396, + "learning_rate": 1.6924820390976035e-05, + "loss": 0.3023, + "num_input_tokens_seen": 13762880, + "step": 13665 + }, + { + "epoch": 6.445073078736445, + "grad_norm": 0.30844682455062866, + "learning_rate": 1.6905353849488147e-05, + "loss": 0.2667, + "num_input_tokens_seen": 13768448, + "step": 13670 + }, + { + "epoch": 6.447430457331447, + "grad_norm": 0.5911639928817749, + "learning_rate": 1.6885892789144654e-05, + "loss": 0.3701, + "num_input_tokens_seen": 13772448, + "step": 13675 + }, + { + "epoch": 6.4497878359264496, + "grad_norm": 0.1861637681722641, + "learning_rate": 1.6866437223123243e-05, + "loss": 0.2474, + "num_input_tokens_seen": 13777632, + "step": 13680 + }, + { + "epoch": 6.452145214521452, + "grad_norm": 0.7001076936721802, + "learning_rate": 1.6846987164597927e-05, + "loss": 0.4521, + "num_input_tokens_seen": 13783296, + "step": 13685 + }, + { + "epoch": 6.454502593116454, + "grad_norm": 0.380281537771225, + "learning_rate": 1.6827542626738942e-05, + "loss": 0.2804, + "num_input_tokens_seen": 13788256, + "step": 13690 + }, + { + "epoch": 6.456859971711457, + "grad_norm": 0.33440959453582764, + "learning_rate": 1.6808103622712803e-05, + "loss": 0.3011, + "num_input_tokens_seen": 13794112, + "step": 13695 + }, + { + "epoch": 6.459217350306459, + "grad_norm": 0.8378137946128845, + "learning_rate": 1.6788670165682303e-05, + "loss": 0.4188, + "num_input_tokens_seen": 13798880, + "step": 13700 + }, + { + "epoch": 6.461574728901462, + "grad_norm": 0.5561883449554443, + "learning_rate": 1.6769242268806445e-05, + "loss": 0.3478, + "num_input_tokens_seen": 13802656, + "step": 13705 + }, + { + "epoch": 6.463932107496464, + "grad_norm": 0.3111601769924164, + "learning_rate": 1.674981994524049e-05, + "loss": 0.3159, + "num_input_tokens_seen": 13807328, + "step": 13710 + }, + { + "epoch": 6.466289486091466, + "grad_norm": 0.4705531597137451, + "learning_rate": 1.6730403208135907e-05, + "loss": 0.3306, + "num_input_tokens_seen": 13811872, + "step": 13715 + }, + { + "epoch": 6.468646864686469, + "grad_norm": 0.39321357011795044, + "learning_rate": 1.67109920706404e-05, + "loss": 0.3161, + "num_input_tokens_seen": 13817760, + "step": 13720 + }, + { + "epoch": 6.471004243281471, + "grad_norm": 0.35270699858665466, + "learning_rate": 1.6691586545897864e-05, + "loss": 0.3219, + "num_input_tokens_seen": 13823648, + "step": 13725 + }, + { + "epoch": 6.473361621876474, + "grad_norm": 0.3185389041900635, + "learning_rate": 1.667218664704841e-05, + "loss": 0.3368, + "num_input_tokens_seen": 13829696, + "step": 13730 + }, + { + "epoch": 6.475719000471476, + "grad_norm": 0.2980872392654419, + "learning_rate": 1.665279238722832e-05, + "loss": 0.3651, + "num_input_tokens_seen": 13835200, + "step": 13735 + }, + { + "epoch": 6.478076379066478, + "grad_norm": 0.28284111618995667, + "learning_rate": 1.6633403779570084e-05, + "loss": 0.3143, + "num_input_tokens_seen": 13839424, + "step": 13740 + }, + { + "epoch": 6.480433757661481, + "grad_norm": 0.35160455107688904, + "learning_rate": 1.6614020837202336e-05, + "loss": 0.3384, + "num_input_tokens_seen": 13843840, + "step": 13745 + }, + { + "epoch": 6.482791136256483, + "grad_norm": 0.34095999598503113, + "learning_rate": 1.6594643573249896e-05, + "loss": 0.3086, + "num_input_tokens_seen": 13848608, + "step": 13750 + }, + { + "epoch": 6.485148514851485, + "grad_norm": 0.2918975055217743, + "learning_rate": 1.6575272000833725e-05, + "loss": 0.3419, + "num_input_tokens_seen": 13853856, + "step": 13755 + }, + { + "epoch": 6.487505893446487, + "grad_norm": 0.3273079991340637, + "learning_rate": 1.6555906133070944e-05, + "loss": 0.2833, + "num_input_tokens_seen": 13858784, + "step": 13760 + }, + { + "epoch": 6.4898632720414895, + "grad_norm": 0.2670630216598511, + "learning_rate": 1.653654598307479e-05, + "loss": 0.2739, + "num_input_tokens_seen": 13863808, + "step": 13765 + }, + { + "epoch": 6.492220650636492, + "grad_norm": 0.44671204686164856, + "learning_rate": 1.651719156395466e-05, + "loss": 0.3589, + "num_input_tokens_seen": 13869312, + "step": 13770 + }, + { + "epoch": 6.494578029231494, + "grad_norm": 0.3236679136753082, + "learning_rate": 1.649784288881603e-05, + "loss": 0.3595, + "num_input_tokens_seen": 13875136, + "step": 13775 + }, + { + "epoch": 6.496935407826497, + "grad_norm": 0.5616161823272705, + "learning_rate": 1.6478499970760526e-05, + "loss": 0.3667, + "num_input_tokens_seen": 13879680, + "step": 13780 + }, + { + "epoch": 6.499292786421499, + "grad_norm": 0.3165639340877533, + "learning_rate": 1.6459162822885843e-05, + "loss": 0.3288, + "num_input_tokens_seen": 13884576, + "step": 13785 + }, + { + "epoch": 6.5016501650165015, + "grad_norm": 0.4279976487159729, + "learning_rate": 1.6439831458285797e-05, + "loss": 0.3118, + "num_input_tokens_seen": 13889216, + "step": 13790 + }, + { + "epoch": 6.503064592173503, + "eval_loss": 0.3314424753189087, + "eval_runtime": 25.5739, + "eval_samples_per_second": 36.873, + "eval_steps_per_second": 9.228, + "num_input_tokens_seen": 13892416, + "step": 13793 + }, + { + "epoch": 6.504007543611504, + "grad_norm": 0.39409810304641724, + "learning_rate": 1.642050589005027e-05, + "loss": 0.3113, + "num_input_tokens_seen": 13894432, + "step": 13795 + }, + { + "epoch": 6.506364922206506, + "grad_norm": 0.4059755206108093, + "learning_rate": 1.6401186131265217e-05, + "loss": 0.2835, + "num_input_tokens_seen": 13899936, + "step": 13800 + }, + { + "epoch": 6.508722300801509, + "grad_norm": 0.7752254605293274, + "learning_rate": 1.6381872195012677e-05, + "loss": 0.3412, + "num_input_tokens_seen": 13904960, + "step": 13805 + }, + { + "epoch": 6.511079679396511, + "grad_norm": 0.5065967440605164, + "learning_rate": 1.6362564094370722e-05, + "loss": 0.3386, + "num_input_tokens_seen": 13909728, + "step": 13810 + }, + { + "epoch": 6.5134370579915135, + "grad_norm": 0.3598061800003052, + "learning_rate": 1.6343261842413497e-05, + "loss": 0.3216, + "num_input_tokens_seen": 13914080, + "step": 13815 + }, + { + "epoch": 6.515794436586516, + "grad_norm": 0.3424120247364044, + "learning_rate": 1.632396545221117e-05, + "loss": 0.3057, + "num_input_tokens_seen": 13918272, + "step": 13820 + }, + { + "epoch": 6.518151815181518, + "grad_norm": 0.26625433564186096, + "learning_rate": 1.630467493682995e-05, + "loss": 0.3227, + "num_input_tokens_seen": 13923360, + "step": 13825 + }, + { + "epoch": 6.520509193776521, + "grad_norm": 0.35718241333961487, + "learning_rate": 1.628539030933206e-05, + "loss": 0.3083, + "num_input_tokens_seen": 13928160, + "step": 13830 + }, + { + "epoch": 6.522866572371523, + "grad_norm": 0.29637813568115234, + "learning_rate": 1.626611158277574e-05, + "loss": 0.3291, + "num_input_tokens_seen": 13933472, + "step": 13835 + }, + { + "epoch": 6.5252239509665255, + "grad_norm": 0.28173598647117615, + "learning_rate": 1.6246838770215233e-05, + "loss": 0.267, + "num_input_tokens_seen": 13938272, + "step": 13840 + }, + { + "epoch": 6.527581329561528, + "grad_norm": 0.3675142824649811, + "learning_rate": 1.622757188470078e-05, + "loss": 0.3285, + "num_input_tokens_seen": 13943584, + "step": 13845 + }, + { + "epoch": 6.52993870815653, + "grad_norm": 0.29500046372413635, + "learning_rate": 1.6208310939278606e-05, + "loss": 0.275, + "num_input_tokens_seen": 13948992, + "step": 13850 + }, + { + "epoch": 6.532296086751533, + "grad_norm": 0.31973564624786377, + "learning_rate": 1.6189055946990917e-05, + "loss": 0.3305, + "num_input_tokens_seen": 13953600, + "step": 13855 + }, + { + "epoch": 6.534653465346535, + "grad_norm": 0.35056570172309875, + "learning_rate": 1.616980692087588e-05, + "loss": 0.3862, + "num_input_tokens_seen": 13958400, + "step": 13860 + }, + { + "epoch": 6.537010843941537, + "grad_norm": 0.2441500425338745, + "learning_rate": 1.6150563873967632e-05, + "loss": 0.4036, + "num_input_tokens_seen": 13964608, + "step": 13865 + }, + { + "epoch": 6.539368222536539, + "grad_norm": 0.33192917704582214, + "learning_rate": 1.613132681929625e-05, + "loss": 0.3288, + "num_input_tokens_seen": 13968896, + "step": 13870 + }, + { + "epoch": 6.5417256011315414, + "grad_norm": 0.24509502947330475, + "learning_rate": 1.6112095769887776e-05, + "loss": 0.3117, + "num_input_tokens_seen": 13974272, + "step": 13875 + }, + { + "epoch": 6.544082979726544, + "grad_norm": 0.47126299142837524, + "learning_rate": 1.609287073876415e-05, + "loss": 0.3036, + "num_input_tokens_seen": 13979424, + "step": 13880 + }, + { + "epoch": 6.546440358321546, + "grad_norm": 0.4012770652770996, + "learning_rate": 1.6073651738943274e-05, + "loss": 0.3464, + "num_input_tokens_seen": 13983584, + "step": 13885 + }, + { + "epoch": 6.548797736916549, + "grad_norm": 0.36582309007644653, + "learning_rate": 1.6054438783438934e-05, + "loss": 0.2904, + "num_input_tokens_seen": 13988288, + "step": 13890 + }, + { + "epoch": 6.551155115511551, + "grad_norm": 0.3335961103439331, + "learning_rate": 1.6035231885260855e-05, + "loss": 0.2762, + "num_input_tokens_seen": 13992800, + "step": 13895 + }, + { + "epoch": 6.5535124941065535, + "grad_norm": 0.3092866539955139, + "learning_rate": 1.6016031057414623e-05, + "loss": 0.3171, + "num_input_tokens_seen": 13999936, + "step": 13900 + }, + { + "epoch": 6.555869872701556, + "grad_norm": 0.3454689383506775, + "learning_rate": 1.5996836312901754e-05, + "loss": 0.3328, + "num_input_tokens_seen": 14003968, + "step": 13905 + }, + { + "epoch": 6.558227251296558, + "grad_norm": 0.4176735281944275, + "learning_rate": 1.5977647664719618e-05, + "loss": 0.3857, + "num_input_tokens_seen": 14009120, + "step": 13910 + }, + { + "epoch": 6.560584629891561, + "grad_norm": 0.35964900255203247, + "learning_rate": 1.5958465125861456e-05, + "loss": 0.3058, + "num_input_tokens_seen": 14014080, + "step": 13915 + }, + { + "epoch": 6.562942008486563, + "grad_norm": 0.38936707377433777, + "learning_rate": 1.5939288709316387e-05, + "loss": 0.3192, + "num_input_tokens_seen": 14020128, + "step": 13920 + }, + { + "epoch": 6.5652993870815655, + "grad_norm": 0.36134427785873413, + "learning_rate": 1.5920118428069373e-05, + "loss": 0.3589, + "num_input_tokens_seen": 14024800, + "step": 13925 + }, + { + "epoch": 6.567656765676568, + "grad_norm": 0.738584041595459, + "learning_rate": 1.5900954295101234e-05, + "loss": 0.3187, + "num_input_tokens_seen": 14030496, + "step": 13930 + }, + { + "epoch": 6.57001414427157, + "grad_norm": 0.25616827607154846, + "learning_rate": 1.588179632338861e-05, + "loss": 0.3032, + "num_input_tokens_seen": 14035872, + "step": 13935 + }, + { + "epoch": 6.572371522866573, + "grad_norm": 0.23947547376155853, + "learning_rate": 1.586264452590398e-05, + "loss": 0.2885, + "num_input_tokens_seen": 14040416, + "step": 13940 + }, + { + "epoch": 6.574728901461575, + "grad_norm": 0.333304762840271, + "learning_rate": 1.5843498915615643e-05, + "loss": 0.295, + "num_input_tokens_seen": 14045760, + "step": 13945 + }, + { + "epoch": 6.5770862800565775, + "grad_norm": 0.4001898467540741, + "learning_rate": 1.5824359505487706e-05, + "loss": 0.3251, + "num_input_tokens_seen": 14051392, + "step": 13950 + }, + { + "epoch": 6.579443658651579, + "grad_norm": 0.2604105472564697, + "learning_rate": 1.580522630848007e-05, + "loss": 0.328, + "num_input_tokens_seen": 14055872, + "step": 13955 + }, + { + "epoch": 6.581801037246581, + "grad_norm": 0.3749571144580841, + "learning_rate": 1.5786099337548445e-05, + "loss": 0.3002, + "num_input_tokens_seen": 14060608, + "step": 13960 + }, + { + "epoch": 6.584158415841584, + "grad_norm": 0.3318006694316864, + "learning_rate": 1.576697860564431e-05, + "loss": 0.3641, + "num_input_tokens_seen": 14065600, + "step": 13965 + }, + { + "epoch": 6.586515794436586, + "grad_norm": 0.3471059799194336, + "learning_rate": 1.574786412571493e-05, + "loss": 0.3645, + "num_input_tokens_seen": 14071008, + "step": 13970 + }, + { + "epoch": 6.588873173031589, + "grad_norm": 0.32826003432273865, + "learning_rate": 1.5728755910703324e-05, + "loss": 0.3649, + "num_input_tokens_seen": 14075648, + "step": 13975 + }, + { + "epoch": 6.591230551626591, + "grad_norm": 0.5491892695426941, + "learning_rate": 1.570965397354829e-05, + "loss": 0.346, + "num_input_tokens_seen": 14080960, + "step": 13980 + }, + { + "epoch": 6.593587930221593, + "grad_norm": 0.24236169457435608, + "learning_rate": 1.5690558327184345e-05, + "loss": 0.3633, + "num_input_tokens_seen": 14085632, + "step": 13985 + }, + { + "epoch": 6.595945308816596, + "grad_norm": 0.3086824119091034, + "learning_rate": 1.5671468984541783e-05, + "loss": 0.332, + "num_input_tokens_seen": 14090400, + "step": 13990 + }, + { + "epoch": 6.598302687411598, + "grad_norm": 0.2802920937538147, + "learning_rate": 1.5652385958546595e-05, + "loss": 0.3198, + "num_input_tokens_seen": 14095072, + "step": 13995 + }, + { + "epoch": 6.600660066006601, + "grad_norm": 0.3053348958492279, + "learning_rate": 1.5633309262120515e-05, + "loss": 0.3413, + "num_input_tokens_seen": 14099616, + "step": 14000 + }, + { + "epoch": 6.603017444601603, + "grad_norm": 0.4641198217868805, + "learning_rate": 1.5614238908180985e-05, + "loss": 0.324, + "num_input_tokens_seen": 14104352, + "step": 14005 + }, + { + "epoch": 6.605374823196605, + "grad_norm": 0.3070061504840851, + "learning_rate": 1.5595174909641152e-05, + "loss": 0.3601, + "num_input_tokens_seen": 14109504, + "step": 14010 + }, + { + "epoch": 6.607732201791608, + "grad_norm": 0.4753396213054657, + "learning_rate": 1.557611727940986e-05, + "loss": 0.296, + "num_input_tokens_seen": 14114592, + "step": 14015 + }, + { + "epoch": 6.61008958038661, + "grad_norm": 0.3276534378528595, + "learning_rate": 1.5557066030391648e-05, + "loss": 0.3594, + "num_input_tokens_seen": 14118592, + "step": 14020 + }, + { + "epoch": 6.612446958981613, + "grad_norm": 0.5133419036865234, + "learning_rate": 1.553802117548672e-05, + "loss": 0.265, + "num_input_tokens_seen": 14122048, + "step": 14025 + }, + { + "epoch": 6.614804337576615, + "grad_norm": 0.3810881972312927, + "learning_rate": 1.5518982727590957e-05, + "loss": 0.3509, + "num_input_tokens_seen": 14128704, + "step": 14030 + }, + { + "epoch": 6.617161716171617, + "grad_norm": 0.44389140605926514, + "learning_rate": 1.549995069959591e-05, + "loss": 0.3591, + "num_input_tokens_seen": 14134432, + "step": 14035 + }, + { + "epoch": 6.61951909476662, + "grad_norm": 0.360857218503952, + "learning_rate": 1.5480925104388762e-05, + "loss": 0.3041, + "num_input_tokens_seen": 14141984, + "step": 14040 + }, + { + "epoch": 6.621876473361622, + "grad_norm": 0.7343931794166565, + "learning_rate": 1.546190595485237e-05, + "loss": 0.357, + "num_input_tokens_seen": 14147456, + "step": 14045 + }, + { + "epoch": 6.624233851956625, + "grad_norm": 0.3963930606842041, + "learning_rate": 1.5442893263865195e-05, + "loss": 0.3189, + "num_input_tokens_seen": 14152640, + "step": 14050 + }, + { + "epoch": 6.626591230551627, + "grad_norm": 0.3526712656021118, + "learning_rate": 1.542388704430135e-05, + "loss": 0.3758, + "num_input_tokens_seen": 14156288, + "step": 14055 + }, + { + "epoch": 6.628948609146629, + "grad_norm": 0.40391889214515686, + "learning_rate": 1.540488730903055e-05, + "loss": 0.3328, + "num_input_tokens_seen": 14162048, + "step": 14060 + }, + { + "epoch": 6.631305987741631, + "grad_norm": 0.5453811287879944, + "learning_rate": 1.5385894070918126e-05, + "loss": 0.3137, + "num_input_tokens_seen": 14166432, + "step": 14065 + }, + { + "epoch": 6.633663366336633, + "grad_norm": 0.4397848844528198, + "learning_rate": 1.536690734282501e-05, + "loss": 0.3564, + "num_input_tokens_seen": 14171040, + "step": 14070 + }, + { + "epoch": 6.636020744931636, + "grad_norm": 0.36837512254714966, + "learning_rate": 1.5347927137607722e-05, + "loss": 0.3083, + "num_input_tokens_seen": 14176224, + "step": 14075 + }, + { + "epoch": 6.638378123526638, + "grad_norm": 0.5474161505699158, + "learning_rate": 1.532895346811837e-05, + "loss": 0.3441, + "num_input_tokens_seen": 14181280, + "step": 14080 + }, + { + "epoch": 6.6407355021216405, + "grad_norm": 0.27918821573257446, + "learning_rate": 1.530998634720463e-05, + "loss": 0.3204, + "num_input_tokens_seen": 14185376, + "step": 14085 + }, + { + "epoch": 6.643092880716643, + "grad_norm": 0.3585335910320282, + "learning_rate": 1.529102578770975e-05, + "loss": 0.3308, + "num_input_tokens_seen": 14189600, + "step": 14090 + }, + { + "epoch": 6.645450259311645, + "grad_norm": 0.4113994240760803, + "learning_rate": 1.5272071802472536e-05, + "loss": 0.3011, + "num_input_tokens_seen": 14194656, + "step": 14095 + }, + { + "epoch": 6.647807637906648, + "grad_norm": 0.5103563666343689, + "learning_rate": 1.5253124404327329e-05, + "loss": 0.3543, + "num_input_tokens_seen": 14200128, + "step": 14100 + }, + { + "epoch": 6.65016501650165, + "grad_norm": 0.31684255599975586, + "learning_rate": 1.523418360610403e-05, + "loss": 0.3056, + "num_input_tokens_seen": 14205024, + "step": 14105 + }, + { + "epoch": 6.6525223950966526, + "grad_norm": 0.34376466274261475, + "learning_rate": 1.5215249420628056e-05, + "loss": 0.3244, + "num_input_tokens_seen": 14209088, + "step": 14110 + }, + { + "epoch": 6.654879773691655, + "grad_norm": 0.4356708526611328, + "learning_rate": 1.5196321860720352e-05, + "loss": 0.3362, + "num_input_tokens_seen": 14213600, + "step": 14115 + }, + { + "epoch": 6.657237152286657, + "grad_norm": 0.571159839630127, + "learning_rate": 1.5177400939197372e-05, + "loss": 0.3254, + "num_input_tokens_seen": 14218944, + "step": 14120 + }, + { + "epoch": 6.65959453088166, + "grad_norm": 0.3390493392944336, + "learning_rate": 1.5158486668871085e-05, + "loss": 0.2977, + "num_input_tokens_seen": 14223552, + "step": 14125 + }, + { + "epoch": 6.661951909476662, + "grad_norm": 0.49131378531455994, + "learning_rate": 1.5139579062548945e-05, + "loss": 0.2948, + "num_input_tokens_seen": 14229152, + "step": 14130 + }, + { + "epoch": 6.664309288071665, + "grad_norm": 0.30878782272338867, + "learning_rate": 1.5120678133033914e-05, + "loss": 0.3219, + "num_input_tokens_seen": 14234592, + "step": 14135 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.8284927010536194, + "learning_rate": 1.5101783893124386e-05, + "loss": 0.3578, + "num_input_tokens_seen": 14240416, + "step": 14140 + }, + { + "epoch": 6.669024045261669, + "grad_norm": 0.43800264596939087, + "learning_rate": 1.5082896355614279e-05, + "loss": 0.34, + "num_input_tokens_seen": 14244608, + "step": 14145 + }, + { + "epoch": 6.671381423856672, + "grad_norm": 0.3742195963859558, + "learning_rate": 1.506401553329294e-05, + "loss": 0.3078, + "num_input_tokens_seen": 14249888, + "step": 14150 + }, + { + "epoch": 6.673738802451673, + "grad_norm": 0.21710583567619324, + "learning_rate": 1.5045141438945187e-05, + "loss": 0.2353, + "num_input_tokens_seen": 14254976, + "step": 14155 + }, + { + "epoch": 6.676096181046676, + "grad_norm": 0.5349944829940796, + "learning_rate": 1.5026274085351266e-05, + "loss": 0.3119, + "num_input_tokens_seen": 14259264, + "step": 14160 + }, + { + "epoch": 6.678453559641678, + "grad_norm": 0.32801851630210876, + "learning_rate": 1.5007413485286875e-05, + "loss": 0.3281, + "num_input_tokens_seen": 14263872, + "step": 14165 + }, + { + "epoch": 6.6808109382366805, + "grad_norm": 0.4223589301109314, + "learning_rate": 1.4988559651523127e-05, + "loss": 0.3417, + "num_input_tokens_seen": 14269728, + "step": 14170 + }, + { + "epoch": 6.683168316831683, + "grad_norm": 0.379890114068985, + "learning_rate": 1.4969712596826558e-05, + "loss": 0.334, + "num_input_tokens_seen": 14274336, + "step": 14175 + }, + { + "epoch": 6.685525695426685, + "grad_norm": 0.3431597948074341, + "learning_rate": 1.4950872333959109e-05, + "loss": 0.3544, + "num_input_tokens_seen": 14279136, + "step": 14180 + }, + { + "epoch": 6.687883074021688, + "grad_norm": 0.5114656686782837, + "learning_rate": 1.4932038875678137e-05, + "loss": 0.2766, + "num_input_tokens_seen": 14283680, + "step": 14185 + }, + { + "epoch": 6.69024045261669, + "grad_norm": 0.43446752429008484, + "learning_rate": 1.4913212234736368e-05, + "loss": 0.3205, + "num_input_tokens_seen": 14289216, + "step": 14190 + }, + { + "epoch": 6.6925978312116925, + "grad_norm": 0.24920561909675598, + "learning_rate": 1.4894392423881932e-05, + "loss": 0.325, + "num_input_tokens_seen": 14293984, + "step": 14195 + }, + { + "epoch": 6.694955209806695, + "grad_norm": 0.6583231687545776, + "learning_rate": 1.4875579455858319e-05, + "loss": 0.3716, + "num_input_tokens_seen": 14298624, + "step": 14200 + }, + { + "epoch": 6.697312588401697, + "grad_norm": 0.22178348898887634, + "learning_rate": 1.4856773343404402e-05, + "loss": 0.2803, + "num_input_tokens_seen": 14304032, + "step": 14205 + }, + { + "epoch": 6.6996699669967, + "grad_norm": 0.3312886357307434, + "learning_rate": 1.483797409925439e-05, + "loss": 0.2934, + "num_input_tokens_seen": 14308064, + "step": 14210 + }, + { + "epoch": 6.702027345591702, + "grad_norm": 0.35571426153182983, + "learning_rate": 1.4819181736137866e-05, + "loss": 0.2829, + "num_input_tokens_seen": 14312064, + "step": 14215 + }, + { + "epoch": 6.7043847241867045, + "grad_norm": 0.45087501406669617, + "learning_rate": 1.4800396266779737e-05, + "loss": 0.3574, + "num_input_tokens_seen": 14316928, + "step": 14220 + }, + { + "epoch": 6.706742102781707, + "grad_norm": 0.2528354823589325, + "learning_rate": 1.4781617703900241e-05, + "loss": 0.3208, + "num_input_tokens_seen": 14322336, + "step": 14225 + }, + { + "epoch": 6.709099481376709, + "grad_norm": 0.4057137072086334, + "learning_rate": 1.4762846060214958e-05, + "loss": 0.3387, + "num_input_tokens_seen": 14327968, + "step": 14230 + }, + { + "epoch": 6.711456859971712, + "grad_norm": 0.5071261525154114, + "learning_rate": 1.4744081348434755e-05, + "loss": 0.4107, + "num_input_tokens_seen": 14332992, + "step": 14235 + }, + { + "epoch": 6.713814238566714, + "grad_norm": 0.3637199103832245, + "learning_rate": 1.4725323581265832e-05, + "loss": 0.3075, + "num_input_tokens_seen": 14337792, + "step": 14240 + }, + { + "epoch": 6.7161716171617165, + "grad_norm": 0.3062158226966858, + "learning_rate": 1.4706572771409677e-05, + "loss": 0.3396, + "num_input_tokens_seen": 14342720, + "step": 14245 + }, + { + "epoch": 6.718528995756719, + "grad_norm": 0.34442877769470215, + "learning_rate": 1.4687828931563048e-05, + "loss": 0.3509, + "num_input_tokens_seen": 14348000, + "step": 14250 + }, + { + "epoch": 6.720886374351721, + "grad_norm": 0.4582545757293701, + "learning_rate": 1.4669092074418004e-05, + "loss": 0.305, + "num_input_tokens_seen": 14355072, + "step": 14255 + }, + { + "epoch": 6.723243752946724, + "grad_norm": 0.29563120007514954, + "learning_rate": 1.4650362212661878e-05, + "loss": 0.3216, + "num_input_tokens_seen": 14360512, + "step": 14260 + }, + { + "epoch": 6.725601131541725, + "grad_norm": 0.5634151697158813, + "learning_rate": 1.4631639358977256e-05, + "loss": 0.3395, + "num_input_tokens_seen": 14364864, + "step": 14265 + }, + { + "epoch": 6.727958510136728, + "grad_norm": 0.3743756115436554, + "learning_rate": 1.4612923526041989e-05, + "loss": 0.3358, + "num_input_tokens_seen": 14369312, + "step": 14270 + }, + { + "epoch": 6.73031588873173, + "grad_norm": 0.23260489106178284, + "learning_rate": 1.459421472652916e-05, + "loss": 0.3466, + "num_input_tokens_seen": 14373920, + "step": 14275 + }, + { + "epoch": 6.732673267326732, + "grad_norm": 0.4544202983379364, + "learning_rate": 1.4575512973107094e-05, + "loss": 0.2698, + "num_input_tokens_seen": 14379072, + "step": 14280 + }, + { + "epoch": 6.735030645921735, + "grad_norm": 0.35763007402420044, + "learning_rate": 1.4556818278439366e-05, + "loss": 0.3697, + "num_input_tokens_seen": 14383520, + "step": 14285 + }, + { + "epoch": 6.737388024516737, + "grad_norm": 0.44352293014526367, + "learning_rate": 1.4538130655184729e-05, + "loss": 0.272, + "num_input_tokens_seen": 14388160, + "step": 14290 + }, + { + "epoch": 6.73974540311174, + "grad_norm": 0.568163275718689, + "learning_rate": 1.4519450115997183e-05, + "loss": 0.44, + "num_input_tokens_seen": 14392960, + "step": 14295 + }, + { + "epoch": 6.742102781706742, + "grad_norm": 0.3713829815387726, + "learning_rate": 1.4500776673525918e-05, + "loss": 0.3338, + "num_input_tokens_seen": 14400096, + "step": 14300 + }, + { + "epoch": 6.7444601603017444, + "grad_norm": 0.3022964596748352, + "learning_rate": 1.4482110340415334e-05, + "loss": 0.3578, + "num_input_tokens_seen": 14404096, + "step": 14305 + }, + { + "epoch": 6.746817538896747, + "grad_norm": 0.4769511818885803, + "learning_rate": 1.4463451129304984e-05, + "loss": 0.3231, + "num_input_tokens_seen": 14409184, + "step": 14310 + }, + { + "epoch": 6.749174917491749, + "grad_norm": 0.31801578402519226, + "learning_rate": 1.4444799052829621e-05, + "loss": 0.3177, + "num_input_tokens_seen": 14414112, + "step": 14315 + }, + { + "epoch": 6.751532296086752, + "grad_norm": 0.3036918044090271, + "learning_rate": 1.4426154123619173e-05, + "loss": 0.3359, + "num_input_tokens_seen": 14418976, + "step": 14320 + }, + { + "epoch": 6.753889674681754, + "grad_norm": 0.514356255531311, + "learning_rate": 1.4407516354298723e-05, + "loss": 0.333, + "num_input_tokens_seen": 14424992, + "step": 14325 + }, + { + "epoch": 6.7562470532767565, + "grad_norm": 0.5052562952041626, + "learning_rate": 1.4388885757488483e-05, + "loss": 0.3425, + "num_input_tokens_seen": 14431680, + "step": 14330 + }, + { + "epoch": 6.758604431871759, + "grad_norm": 0.3889715373516083, + "learning_rate": 1.4370262345803836e-05, + "loss": 0.3369, + "num_input_tokens_seen": 14435776, + "step": 14335 + }, + { + "epoch": 6.760961810466761, + "grad_norm": 0.3029053211212158, + "learning_rate": 1.4351646131855295e-05, + "loss": 0.3352, + "num_input_tokens_seen": 14440704, + "step": 14340 + }, + { + "epoch": 6.763319189061764, + "grad_norm": 0.3084380626678467, + "learning_rate": 1.4333037128248506e-05, + "loss": 0.331, + "num_input_tokens_seen": 14445792, + "step": 14345 + }, + { + "epoch": 6.765676567656766, + "grad_norm": 0.5339139699935913, + "learning_rate": 1.4314435347584195e-05, + "loss": 0.3593, + "num_input_tokens_seen": 14450144, + "step": 14350 + }, + { + "epoch": 6.768033946251768, + "grad_norm": 0.3757864832878113, + "learning_rate": 1.4295840802458244e-05, + "loss": 0.3283, + "num_input_tokens_seen": 14454592, + "step": 14355 + }, + { + "epoch": 6.77039132484677, + "grad_norm": 0.2993345260620117, + "learning_rate": 1.4277253505461613e-05, + "loss": 0.3137, + "num_input_tokens_seen": 14459360, + "step": 14360 + }, + { + "epoch": 6.772748703441772, + "grad_norm": 0.5479684472084045, + "learning_rate": 1.4258673469180344e-05, + "loss": 0.3532, + "num_input_tokens_seen": 14463840, + "step": 14365 + }, + { + "epoch": 6.775106082036775, + "grad_norm": 0.31135237216949463, + "learning_rate": 1.42401007061956e-05, + "loss": 0.3449, + "num_input_tokens_seen": 14469280, + "step": 14370 + }, + { + "epoch": 6.777463460631777, + "grad_norm": 0.37833327054977417, + "learning_rate": 1.4221535229083566e-05, + "loss": 0.3142, + "num_input_tokens_seen": 14473408, + "step": 14375 + }, + { + "epoch": 6.77982083922678, + "grad_norm": 0.32108187675476074, + "learning_rate": 1.4202977050415533e-05, + "loss": 0.2898, + "num_input_tokens_seen": 14478144, + "step": 14380 + }, + { + "epoch": 6.782178217821782, + "grad_norm": 0.30036792159080505, + "learning_rate": 1.418442618275784e-05, + "loss": 0.3578, + "num_input_tokens_seen": 14483008, + "step": 14385 + }, + { + "epoch": 6.784535596416784, + "grad_norm": 0.40315476059913635, + "learning_rate": 1.4165882638671885e-05, + "loss": 0.3079, + "num_input_tokens_seen": 14488032, + "step": 14390 + }, + { + "epoch": 6.786892975011787, + "grad_norm": 0.3606131672859192, + "learning_rate": 1.4147346430714075e-05, + "loss": 0.3875, + "num_input_tokens_seen": 14493856, + "step": 14395 + }, + { + "epoch": 6.789250353606789, + "grad_norm": 0.527923047542572, + "learning_rate": 1.4128817571435881e-05, + "loss": 0.3252, + "num_input_tokens_seen": 14498528, + "step": 14400 + }, + { + "epoch": 6.791607732201792, + "grad_norm": 0.35511839389801025, + "learning_rate": 1.4110296073383789e-05, + "loss": 0.3263, + "num_input_tokens_seen": 14503680, + "step": 14405 + }, + { + "epoch": 6.793965110796794, + "grad_norm": 0.5241124033927917, + "learning_rate": 1.4091781949099304e-05, + "loss": 0.3555, + "num_input_tokens_seen": 14509440, + "step": 14410 + }, + { + "epoch": 6.796322489391796, + "grad_norm": 0.38754698634147644, + "learning_rate": 1.407327521111892e-05, + "loss": 0.338, + "num_input_tokens_seen": 14514112, + "step": 14415 + }, + { + "epoch": 6.798679867986799, + "grad_norm": 0.44930174946784973, + "learning_rate": 1.4054775871974145e-05, + "loss": 0.3157, + "num_input_tokens_seen": 14519904, + "step": 14420 + }, + { + "epoch": 6.801037246581801, + "grad_norm": 0.29862967133522034, + "learning_rate": 1.403628394419148e-05, + "loss": 0.3291, + "num_input_tokens_seen": 14524576, + "step": 14425 + }, + { + "epoch": 6.803394625176804, + "grad_norm": 0.2546379268169403, + "learning_rate": 1.4017799440292412e-05, + "loss": 0.3127, + "num_input_tokens_seen": 14530528, + "step": 14430 + }, + { + "epoch": 6.805752003771806, + "grad_norm": 0.33612996339797974, + "learning_rate": 1.399932237279337e-05, + "loss": 0.3802, + "num_input_tokens_seen": 14535552, + "step": 14435 + }, + { + "epoch": 6.808109382366808, + "grad_norm": 0.5417976379394531, + "learning_rate": 1.3980852754205775e-05, + "loss": 0.3616, + "num_input_tokens_seen": 14539392, + "step": 14440 + }, + { + "epoch": 6.810466760961811, + "grad_norm": 0.3553178012371063, + "learning_rate": 1.3962390597036007e-05, + "loss": 0.2974, + "num_input_tokens_seen": 14545536, + "step": 14445 + }, + { + "epoch": 6.812824139556813, + "grad_norm": 0.3596811890602112, + "learning_rate": 1.3943935913785384e-05, + "loss": 0.3448, + "num_input_tokens_seen": 14549504, + "step": 14450 + }, + { + "epoch": 6.815181518151816, + "grad_norm": 0.3197573125362396, + "learning_rate": 1.392548871695015e-05, + "loss": 0.3384, + "num_input_tokens_seen": 14555136, + "step": 14455 + }, + { + "epoch": 6.817538896746818, + "grad_norm": 0.39731213450431824, + "learning_rate": 1.39070490190215e-05, + "loss": 0.3292, + "num_input_tokens_seen": 14559968, + "step": 14460 + }, + { + "epoch": 6.8198962753418195, + "grad_norm": 0.5743786692619324, + "learning_rate": 1.3888616832485546e-05, + "loss": 0.3477, + "num_input_tokens_seen": 14564320, + "step": 14465 + }, + { + "epoch": 6.822253653936822, + "grad_norm": 0.6999056339263916, + "learning_rate": 1.3870192169823326e-05, + "loss": 0.3805, + "num_input_tokens_seen": 14569376, + "step": 14470 + }, + { + "epoch": 6.824611032531824, + "grad_norm": 0.3123110234737396, + "learning_rate": 1.3851775043510753e-05, + "loss": 0.3009, + "num_input_tokens_seen": 14575328, + "step": 14475 + }, + { + "epoch": 6.826968411126827, + "grad_norm": 0.3741854429244995, + "learning_rate": 1.383336546601865e-05, + "loss": 0.2952, + "num_input_tokens_seen": 14580960, + "step": 14480 + }, + { + "epoch": 6.829325789721829, + "grad_norm": 0.33495351672172546, + "learning_rate": 1.3814963449812735e-05, + "loss": 0.2992, + "num_input_tokens_seen": 14587776, + "step": 14485 + }, + { + "epoch": 6.8316831683168315, + "grad_norm": 0.29375240206718445, + "learning_rate": 1.3796569007353608e-05, + "loss": 0.3469, + "num_input_tokens_seen": 14592992, + "step": 14490 + }, + { + "epoch": 6.834040546911834, + "grad_norm": 0.3692268133163452, + "learning_rate": 1.3778182151096741e-05, + "loss": 0.3588, + "num_input_tokens_seen": 14597536, + "step": 14495 + }, + { + "epoch": 6.836397925506836, + "grad_norm": 0.4370937943458557, + "learning_rate": 1.3759802893492452e-05, + "loss": 0.312, + "num_input_tokens_seen": 14602400, + "step": 14500 + }, + { + "epoch": 6.838755304101839, + "grad_norm": 0.5413361191749573, + "learning_rate": 1.374143124698593e-05, + "loss": 0.3558, + "num_input_tokens_seen": 14606112, + "step": 14505 + }, + { + "epoch": 6.841112682696841, + "grad_norm": 0.3263212740421295, + "learning_rate": 1.3723067224017205e-05, + "loss": 0.3301, + "num_input_tokens_seen": 14610400, + "step": 14510 + }, + { + "epoch": 6.8434700612918435, + "grad_norm": 0.3415868282318115, + "learning_rate": 1.3704710837021159e-05, + "loss": 0.3232, + "num_input_tokens_seen": 14615712, + "step": 14515 + }, + { + "epoch": 6.845827439886846, + "grad_norm": 0.7163748145103455, + "learning_rate": 1.3686362098427469e-05, + "loss": 0.4038, + "num_input_tokens_seen": 14620768, + "step": 14520 + }, + { + "epoch": 6.848184818481848, + "grad_norm": 0.32108721137046814, + "learning_rate": 1.366802102066066e-05, + "loss": 0.3317, + "num_input_tokens_seen": 14625408, + "step": 14525 + }, + { + "epoch": 6.850542197076851, + "grad_norm": 0.29913076758384705, + "learning_rate": 1.3649687616140067e-05, + "loss": 0.328, + "num_input_tokens_seen": 14630528, + "step": 14530 + }, + { + "epoch": 6.852899575671853, + "grad_norm": 0.5783029794692993, + "learning_rate": 1.3631361897279837e-05, + "loss": 0.3537, + "num_input_tokens_seen": 14635968, + "step": 14535 + }, + { + "epoch": 6.8552569542668556, + "grad_norm": 0.5079655647277832, + "learning_rate": 1.361304387648888e-05, + "loss": 0.34, + "num_input_tokens_seen": 14642272, + "step": 14540 + }, + { + "epoch": 6.857614332861858, + "grad_norm": 0.38587620854377747, + "learning_rate": 1.3594733566170926e-05, + "loss": 0.3299, + "num_input_tokens_seen": 14646976, + "step": 14545 + }, + { + "epoch": 6.85997171145686, + "grad_norm": 0.31006836891174316, + "learning_rate": 1.3576430978724471e-05, + "loss": 0.3219, + "num_input_tokens_seen": 14651488, + "step": 14550 + }, + { + "epoch": 6.862329090051862, + "grad_norm": 0.41964709758758545, + "learning_rate": 1.3558136126542797e-05, + "loss": 0.3601, + "num_input_tokens_seen": 14656352, + "step": 14555 + }, + { + "epoch": 6.864686468646864, + "grad_norm": 0.3732777535915375, + "learning_rate": 1.3539849022013911e-05, + "loss": 0.3167, + "num_input_tokens_seen": 14661536, + "step": 14560 + }, + { + "epoch": 6.867043847241867, + "grad_norm": 0.4187946021556854, + "learning_rate": 1.3521569677520612e-05, + "loss": 0.335, + "num_input_tokens_seen": 14666592, + "step": 14565 + }, + { + "epoch": 6.869401225836869, + "grad_norm": 0.5787064433097839, + "learning_rate": 1.350329810544043e-05, + "loss": 0.2907, + "num_input_tokens_seen": 14671456, + "step": 14570 + }, + { + "epoch": 6.8717586044318715, + "grad_norm": 0.3574872612953186, + "learning_rate": 1.3485034318145634e-05, + "loss": 0.3637, + "num_input_tokens_seen": 14677568, + "step": 14575 + }, + { + "epoch": 6.874115983026874, + "grad_norm": 0.2922697961330414, + "learning_rate": 1.346677832800321e-05, + "loss": 0.3049, + "num_input_tokens_seen": 14682400, + "step": 14580 + }, + { + "epoch": 6.876473361621876, + "grad_norm": 0.245726078748703, + "learning_rate": 1.3448530147374889e-05, + "loss": 0.3003, + "num_input_tokens_seen": 14688672, + "step": 14585 + }, + { + "epoch": 6.878830740216879, + "grad_norm": 0.5594576001167297, + "learning_rate": 1.3430289788617079e-05, + "loss": 0.3635, + "num_input_tokens_seen": 14693312, + "step": 14590 + }, + { + "epoch": 6.881188118811881, + "grad_norm": 0.3469880223274231, + "learning_rate": 1.3412057264080925e-05, + "loss": 0.3543, + "num_input_tokens_seen": 14698624, + "step": 14595 + }, + { + "epoch": 6.8835454974068835, + "grad_norm": 0.3962896764278412, + "learning_rate": 1.3393832586112259e-05, + "loss": 0.3515, + "num_input_tokens_seen": 14703584, + "step": 14600 + }, + { + "epoch": 6.885902876001886, + "grad_norm": 0.5199814438819885, + "learning_rate": 1.337561576705158e-05, + "loss": 0.3779, + "num_input_tokens_seen": 14707424, + "step": 14605 + }, + { + "epoch": 6.888260254596888, + "grad_norm": 0.5853482484817505, + "learning_rate": 1.3357406819234083e-05, + "loss": 0.3526, + "num_input_tokens_seen": 14712224, + "step": 14610 + }, + { + "epoch": 6.890617633191891, + "grad_norm": 0.3108634948730469, + "learning_rate": 1.3339205754989637e-05, + "loss": 0.2975, + "num_input_tokens_seen": 14717184, + "step": 14615 + }, + { + "epoch": 6.892975011786893, + "grad_norm": 0.2326122373342514, + "learning_rate": 1.3321012586642772e-05, + "loss": 0.3287, + "num_input_tokens_seen": 14721824, + "step": 14620 + }, + { + "epoch": 6.8953323903818955, + "grad_norm": 0.5490350723266602, + "learning_rate": 1.330282732651265e-05, + "loss": 0.3465, + "num_input_tokens_seen": 14726464, + "step": 14625 + }, + { + "epoch": 6.897689768976898, + "grad_norm": 0.33849433064460754, + "learning_rate": 1.3284649986913097e-05, + "loss": 0.3816, + "num_input_tokens_seen": 14731712, + "step": 14630 + }, + { + "epoch": 6.9000471475719, + "grad_norm": 0.3914676308631897, + "learning_rate": 1.3266480580152579e-05, + "loss": 0.3457, + "num_input_tokens_seen": 14736416, + "step": 14635 + }, + { + "epoch": 6.902404526166903, + "grad_norm": 0.3073917627334595, + "learning_rate": 1.3248319118534189e-05, + "loss": 0.3682, + "num_input_tokens_seen": 14741888, + "step": 14640 + }, + { + "epoch": 6.904761904761905, + "grad_norm": 0.5288712382316589, + "learning_rate": 1.3230165614355616e-05, + "loss": 0.3519, + "num_input_tokens_seen": 14747488, + "step": 14645 + }, + { + "epoch": 6.9071192833569075, + "grad_norm": 0.5284141898155212, + "learning_rate": 1.3212020079909191e-05, + "loss": 0.3517, + "num_input_tokens_seen": 14751680, + "step": 14650 + }, + { + "epoch": 6.90947666195191, + "grad_norm": 0.45652130246162415, + "learning_rate": 1.3193882527481835e-05, + "loss": 0.3079, + "num_input_tokens_seen": 14756320, + "step": 14655 + }, + { + "epoch": 6.911834040546912, + "grad_norm": 0.30714643001556396, + "learning_rate": 1.3175752969355076e-05, + "loss": 0.2878, + "num_input_tokens_seen": 14760544, + "step": 14660 + }, + { + "epoch": 6.914191419141914, + "grad_norm": 0.38597655296325684, + "learning_rate": 1.3157631417805006e-05, + "loss": 0.3499, + "num_input_tokens_seen": 14765856, + "step": 14665 + }, + { + "epoch": 6.916548797736916, + "grad_norm": 0.2317967414855957, + "learning_rate": 1.3139517885102306e-05, + "loss": 0.322, + "num_input_tokens_seen": 14770048, + "step": 14670 + }, + { + "epoch": 6.918906176331919, + "grad_norm": 0.4772786796092987, + "learning_rate": 1.3121412383512238e-05, + "loss": 0.3225, + "num_input_tokens_seen": 14774848, + "step": 14675 + }, + { + "epoch": 6.921263554926921, + "grad_norm": 0.26291194558143616, + "learning_rate": 1.3103314925294626e-05, + "loss": 0.3279, + "num_input_tokens_seen": 14779584, + "step": 14680 + }, + { + "epoch": 6.923620933521923, + "grad_norm": 0.7903621792793274, + "learning_rate": 1.308522552270382e-05, + "loss": 0.347, + "num_input_tokens_seen": 14783616, + "step": 14685 + }, + { + "epoch": 6.925978312116926, + "grad_norm": 0.268532395362854, + "learning_rate": 1.3067144187988745e-05, + "loss": 0.3144, + "num_input_tokens_seen": 14788928, + "step": 14690 + }, + { + "epoch": 6.928335690711928, + "grad_norm": 0.3641071617603302, + "learning_rate": 1.3049070933392857e-05, + "loss": 0.3355, + "num_input_tokens_seen": 14793792, + "step": 14695 + }, + { + "epoch": 6.930693069306931, + "grad_norm": 0.3105728030204773, + "learning_rate": 1.303100577115412e-05, + "loss": 0.2756, + "num_input_tokens_seen": 14799200, + "step": 14700 + }, + { + "epoch": 6.933050447901933, + "grad_norm": 0.29314345121383667, + "learning_rate": 1.3012948713505052e-05, + "loss": 0.3519, + "num_input_tokens_seen": 14804896, + "step": 14705 + }, + { + "epoch": 6.935407826496935, + "grad_norm": 0.2759866416454315, + "learning_rate": 1.2994899772672654e-05, + "loss": 0.3172, + "num_input_tokens_seen": 14808864, + "step": 14710 + }, + { + "epoch": 6.937765205091938, + "grad_norm": 0.33195534348487854, + "learning_rate": 1.297685896087844e-05, + "loss": 0.3129, + "num_input_tokens_seen": 14814688, + "step": 14715 + }, + { + "epoch": 6.94012258368694, + "grad_norm": 0.35591059923171997, + "learning_rate": 1.295882629033843e-05, + "loss": 0.3323, + "num_input_tokens_seen": 14819008, + "step": 14720 + }, + { + "epoch": 6.942479962281943, + "grad_norm": 0.4099147319793701, + "learning_rate": 1.2940801773263131e-05, + "loss": 0.3585, + "num_input_tokens_seen": 14824736, + "step": 14725 + }, + { + "epoch": 6.944837340876945, + "grad_norm": 0.2895975708961487, + "learning_rate": 1.2922785421857504e-05, + "loss": 0.3087, + "num_input_tokens_seen": 14829376, + "step": 14730 + }, + { + "epoch": 6.947194719471947, + "grad_norm": 0.3441219627857208, + "learning_rate": 1.2904777248320998e-05, + "loss": 0.3352, + "num_input_tokens_seen": 14834080, + "step": 14735 + }, + { + "epoch": 6.94955209806695, + "grad_norm": 0.3325943350791931, + "learning_rate": 1.2886777264847538e-05, + "loss": 0.323, + "num_input_tokens_seen": 14839680, + "step": 14740 + }, + { + "epoch": 6.951909476661952, + "grad_norm": 0.3643795847892761, + "learning_rate": 1.2868785483625494e-05, + "loss": 0.3062, + "num_input_tokens_seen": 14844224, + "step": 14745 + }, + { + "epoch": 6.954266855256955, + "grad_norm": 0.46132734417915344, + "learning_rate": 1.2850801916837658e-05, + "loss": 0.2791, + "num_input_tokens_seen": 14849216, + "step": 14750 + }, + { + "epoch": 6.956624233851956, + "grad_norm": 0.4637611210346222, + "learning_rate": 1.2832826576661285e-05, + "loss": 0.266, + "num_input_tokens_seen": 14854688, + "step": 14755 + }, + { + "epoch": 6.958981612446959, + "grad_norm": 0.5718017816543579, + "learning_rate": 1.2814859475268062e-05, + "loss": 0.3924, + "num_input_tokens_seen": 14859200, + "step": 14760 + }, + { + "epoch": 6.961338991041961, + "grad_norm": 0.3370095193386078, + "learning_rate": 1.2796900624824094e-05, + "loss": 0.3815, + "num_input_tokens_seen": 14865152, + "step": 14765 + }, + { + "epoch": 6.963696369636963, + "grad_norm": 0.43529778718948364, + "learning_rate": 1.2778950037489879e-05, + "loss": 0.3024, + "num_input_tokens_seen": 14870432, + "step": 14770 + }, + { + "epoch": 6.966053748231966, + "grad_norm": 0.5760348439216614, + "learning_rate": 1.2761007725420337e-05, + "loss": 0.3524, + "num_input_tokens_seen": 14875808, + "step": 14775 + }, + { + "epoch": 6.968411126826968, + "grad_norm": 0.6364084482192993, + "learning_rate": 1.274307370076479e-05, + "loss": 0.3535, + "num_input_tokens_seen": 14881088, + "step": 14780 + }, + { + "epoch": 6.970768505421971, + "grad_norm": 0.3995671272277832, + "learning_rate": 1.2725147975666948e-05, + "loss": 0.3596, + "num_input_tokens_seen": 14886528, + "step": 14785 + }, + { + "epoch": 6.973125884016973, + "grad_norm": 0.33659613132476807, + "learning_rate": 1.2707230562264876e-05, + "loss": 0.3548, + "num_input_tokens_seen": 14891008, + "step": 14790 + }, + { + "epoch": 6.975483262611975, + "grad_norm": 0.32549023628234863, + "learning_rate": 1.2689321472691035e-05, + "loss": 0.3396, + "num_input_tokens_seen": 14895712, + "step": 14795 + }, + { + "epoch": 6.977840641206978, + "grad_norm": 0.5040299296379089, + "learning_rate": 1.267142071907225e-05, + "loss": 0.3388, + "num_input_tokens_seen": 14901728, + "step": 14800 + }, + { + "epoch": 6.98019801980198, + "grad_norm": 0.4688813090324402, + "learning_rate": 1.2653528313529698e-05, + "loss": 0.3389, + "num_input_tokens_seen": 14908160, + "step": 14805 + }, + { + "epoch": 6.982555398396983, + "grad_norm": 0.4810577929019928, + "learning_rate": 1.2635644268178892e-05, + "loss": 0.3078, + "num_input_tokens_seen": 14914016, + "step": 14810 + }, + { + "epoch": 6.984912776991985, + "grad_norm": 0.3088354468345642, + "learning_rate": 1.261776859512969e-05, + "loss": 0.3275, + "num_input_tokens_seen": 14918400, + "step": 14815 + }, + { + "epoch": 6.987270155586987, + "grad_norm": 0.3649957478046417, + "learning_rate": 1.2599901306486284e-05, + "loss": 0.3385, + "num_input_tokens_seen": 14923168, + "step": 14820 + }, + { + "epoch": 6.98962753418199, + "grad_norm": 0.3446301519870758, + "learning_rate": 1.2582042414347189e-05, + "loss": 0.3475, + "num_input_tokens_seen": 14928064, + "step": 14825 + }, + { + "epoch": 6.991984912776992, + "grad_norm": 0.41667571663856506, + "learning_rate": 1.2564191930805247e-05, + "loss": 0.3206, + "num_input_tokens_seen": 14933728, + "step": 14830 + }, + { + "epoch": 6.994342291371995, + "grad_norm": 0.27974656224250793, + "learning_rate": 1.2546349867947574e-05, + "loss": 0.3035, + "num_input_tokens_seen": 14938944, + "step": 14835 + }, + { + "epoch": 6.996699669966997, + "grad_norm": 0.3494514524936676, + "learning_rate": 1.2528516237855609e-05, + "loss": 0.2777, + "num_input_tokens_seen": 14944736, + "step": 14840 + }, + { + "epoch": 6.999057048561999, + "grad_norm": 0.4232180416584015, + "learning_rate": 1.2510691052605072e-05, + "loss": 0.2346, + "num_input_tokens_seen": 14948800, + "step": 14845 + }, + { + "epoch": 7.001414427157002, + "grad_norm": 0.33981943130493164, + "learning_rate": 1.2492874324265982e-05, + "loss": 0.2944, + "num_input_tokens_seen": 14953344, + "step": 14850 + }, + { + "epoch": 7.003300330033003, + "eval_loss": 0.33054131269454956, + "eval_runtime": 25.6069, + "eval_samples_per_second": 36.826, + "eval_steps_per_second": 9.216, + "num_input_tokens_seen": 14957120, + "step": 14854 + }, + { + "epoch": 7.003771805752004, + "grad_norm": 0.30214184522628784, + "learning_rate": 1.2475066064902596e-05, + "loss": 0.3295, + "num_input_tokens_seen": 14958496, + "step": 14855 + }, + { + "epoch": 7.006129184347006, + "grad_norm": 0.16016559302806854, + "learning_rate": 1.245726628657347e-05, + "loss": 0.3679, + "num_input_tokens_seen": 14963616, + "step": 14860 + }, + { + "epoch": 7.008486562942008, + "grad_norm": 0.238665372133255, + "learning_rate": 1.2439475001331397e-05, + "loss": 0.2719, + "num_input_tokens_seen": 14968320, + "step": 14865 + }, + { + "epoch": 7.0108439415370105, + "grad_norm": 0.5855079889297485, + "learning_rate": 1.2421692221223442e-05, + "loss": 0.3292, + "num_input_tokens_seen": 14974016, + "step": 14870 + }, + { + "epoch": 7.013201320132013, + "grad_norm": 0.17007124423980713, + "learning_rate": 1.2403917958290872e-05, + "loss": 0.2657, + "num_input_tokens_seen": 14980416, + "step": 14875 + }, + { + "epoch": 7.015558698727015, + "grad_norm": 0.35480257868766785, + "learning_rate": 1.2386152224569224e-05, + "loss": 0.3234, + "num_input_tokens_seen": 14985536, + "step": 14880 + }, + { + "epoch": 7.017916077322018, + "grad_norm": 0.267171174287796, + "learning_rate": 1.2368395032088249e-05, + "loss": 0.309, + "num_input_tokens_seen": 14991648, + "step": 14885 + }, + { + "epoch": 7.02027345591702, + "grad_norm": 0.3632192611694336, + "learning_rate": 1.2350646392871912e-05, + "loss": 0.3578, + "num_input_tokens_seen": 14996640, + "step": 14890 + }, + { + "epoch": 7.0226308345120225, + "grad_norm": 0.4218800961971283, + "learning_rate": 1.233290631893837e-05, + "loss": 0.2999, + "num_input_tokens_seen": 15001120, + "step": 14895 + }, + { + "epoch": 7.024988213107025, + "grad_norm": 0.5258214473724365, + "learning_rate": 1.2315174822300008e-05, + "loss": 0.3493, + "num_input_tokens_seen": 15006496, + "step": 14900 + }, + { + "epoch": 7.027345591702027, + "grad_norm": 0.2810705006122589, + "learning_rate": 1.2297451914963385e-05, + "loss": 0.3138, + "num_input_tokens_seen": 15011520, + "step": 14905 + }, + { + "epoch": 7.02970297029703, + "grad_norm": 0.538074254989624, + "learning_rate": 1.2279737608929265e-05, + "loss": 0.41, + "num_input_tokens_seen": 15017152, + "step": 14910 + }, + { + "epoch": 7.032060348892032, + "grad_norm": 0.27023717761039734, + "learning_rate": 1.226203191619255e-05, + "loss": 0.3255, + "num_input_tokens_seen": 15021824, + "step": 14915 + }, + { + "epoch": 7.0344177274870345, + "grad_norm": 0.30404576659202576, + "learning_rate": 1.2244334848742357e-05, + "loss": 0.3535, + "num_input_tokens_seen": 15027968, + "step": 14920 + }, + { + "epoch": 7.036775106082037, + "grad_norm": 0.5245261192321777, + "learning_rate": 1.2226646418561907e-05, + "loss": 0.337, + "num_input_tokens_seen": 15032640, + "step": 14925 + }, + { + "epoch": 7.039132484677039, + "grad_norm": 0.3103489875793457, + "learning_rate": 1.2208966637628625e-05, + "loss": 0.3042, + "num_input_tokens_seen": 15037216, + "step": 14930 + }, + { + "epoch": 7.041489863272042, + "grad_norm": 0.5180718898773193, + "learning_rate": 1.2191295517914058e-05, + "loss": 0.3266, + "num_input_tokens_seen": 15041088, + "step": 14935 + }, + { + "epoch": 7.043847241867044, + "grad_norm": 0.33669427037239075, + "learning_rate": 1.2173633071383874e-05, + "loss": 0.3666, + "num_input_tokens_seen": 15045632, + "step": 14940 + }, + { + "epoch": 7.0462046204620465, + "grad_norm": 0.27321919798851013, + "learning_rate": 1.215597930999789e-05, + "loss": 0.3081, + "num_input_tokens_seen": 15049536, + "step": 14945 + }, + { + "epoch": 7.048561999057049, + "grad_norm": 0.37471091747283936, + "learning_rate": 1.2138334245710028e-05, + "loss": 0.3019, + "num_input_tokens_seen": 15053888, + "step": 14950 + }, + { + "epoch": 7.050919377652051, + "grad_norm": 0.25851067900657654, + "learning_rate": 1.2120697890468347e-05, + "loss": 0.3132, + "num_input_tokens_seen": 15059040, + "step": 14955 + }, + { + "epoch": 7.053276756247053, + "grad_norm": 0.341213196516037, + "learning_rate": 1.210307025621496e-05, + "loss": 0.3156, + "num_input_tokens_seen": 15063360, + "step": 14960 + }, + { + "epoch": 7.055634134842055, + "grad_norm": 0.31213754415512085, + "learning_rate": 1.2085451354886118e-05, + "loss": 0.3176, + "num_input_tokens_seen": 15070080, + "step": 14965 + }, + { + "epoch": 7.057991513437058, + "grad_norm": 0.32049667835235596, + "learning_rate": 1.2067841198412142e-05, + "loss": 0.3128, + "num_input_tokens_seen": 15075328, + "step": 14970 + }, + { + "epoch": 7.06034889203206, + "grad_norm": 0.1638793647289276, + "learning_rate": 1.2050239798717441e-05, + "loss": 0.3565, + "num_input_tokens_seen": 15080192, + "step": 14975 + }, + { + "epoch": 7.0627062706270625, + "grad_norm": 0.3380284309387207, + "learning_rate": 1.2032647167720473e-05, + "loss": 0.3362, + "num_input_tokens_seen": 15085856, + "step": 14980 + }, + { + "epoch": 7.065063649222065, + "grad_norm": 0.2886240482330322, + "learning_rate": 1.2015063317333777e-05, + "loss": 0.3246, + "num_input_tokens_seen": 15090432, + "step": 14985 + }, + { + "epoch": 7.067421027817067, + "grad_norm": 0.35855159163475037, + "learning_rate": 1.1997488259463946e-05, + "loss": 0.3906, + "num_input_tokens_seen": 15095296, + "step": 14990 + }, + { + "epoch": 7.06977840641207, + "grad_norm": 0.36274445056915283, + "learning_rate": 1.197992200601162e-05, + "loss": 0.3622, + "num_input_tokens_seen": 15100640, + "step": 14995 + }, + { + "epoch": 7.072135785007072, + "grad_norm": 0.5056880116462708, + "learning_rate": 1.1962364568871457e-05, + "loss": 0.3281, + "num_input_tokens_seen": 15104544, + "step": 15000 + }, + { + "epoch": 7.0744931636020745, + "grad_norm": 0.32302168011665344, + "learning_rate": 1.1944815959932169e-05, + "loss": 0.3154, + "num_input_tokens_seen": 15109312, + "step": 15005 + }, + { + "epoch": 7.076850542197077, + "grad_norm": 0.524681806564331, + "learning_rate": 1.1927276191076478e-05, + "loss": 0.3272, + "num_input_tokens_seen": 15114368, + "step": 15010 + }, + { + "epoch": 7.079207920792079, + "grad_norm": 0.4289802312850952, + "learning_rate": 1.1909745274181138e-05, + "loss": 0.2817, + "num_input_tokens_seen": 15120128, + "step": 15015 + }, + { + "epoch": 7.081565299387082, + "grad_norm": 0.26467448472976685, + "learning_rate": 1.1892223221116874e-05, + "loss": 0.3034, + "num_input_tokens_seen": 15126208, + "step": 15020 + }, + { + "epoch": 7.083922677982084, + "grad_norm": 0.5092832446098328, + "learning_rate": 1.1874710043748441e-05, + "loss": 0.3127, + "num_input_tokens_seen": 15131552, + "step": 15025 + }, + { + "epoch": 7.0862800565770865, + "grad_norm": 0.2819293737411499, + "learning_rate": 1.185720575393457e-05, + "loss": 0.3304, + "num_input_tokens_seen": 15137312, + "step": 15030 + }, + { + "epoch": 7.088637435172089, + "grad_norm": 0.5177387595176697, + "learning_rate": 1.183971036352799e-05, + "loss": 0.3325, + "num_input_tokens_seen": 15141536, + "step": 15035 + }, + { + "epoch": 7.090994813767091, + "grad_norm": 0.2732391655445099, + "learning_rate": 1.1822223884375375e-05, + "loss": 0.3497, + "num_input_tokens_seen": 15147232, + "step": 15040 + }, + { + "epoch": 7.093352192362094, + "grad_norm": 0.46593019366264343, + "learning_rate": 1.1804746328317377e-05, + "loss": 0.3519, + "num_input_tokens_seen": 15151680, + "step": 15045 + }, + { + "epoch": 7.095709570957096, + "grad_norm": 0.19685585796833038, + "learning_rate": 1.1787277707188616e-05, + "loss": 0.3518, + "num_input_tokens_seen": 15156032, + "step": 15050 + }, + { + "epoch": 7.0980669495520985, + "grad_norm": 0.3770833909511566, + "learning_rate": 1.1769818032817656e-05, + "loss": 0.326, + "num_input_tokens_seen": 15160544, + "step": 15055 + }, + { + "epoch": 7.1004243281471, + "grad_norm": 0.3013339638710022, + "learning_rate": 1.175236731702701e-05, + "loss": 0.3633, + "num_input_tokens_seen": 15166432, + "step": 15060 + }, + { + "epoch": 7.102781706742102, + "grad_norm": 0.30307504534721375, + "learning_rate": 1.1734925571633104e-05, + "loss": 0.3588, + "num_input_tokens_seen": 15171008, + "step": 15065 + }, + { + "epoch": 7.105139085337105, + "grad_norm": 0.4340272843837738, + "learning_rate": 1.1717492808446304e-05, + "loss": 0.3231, + "num_input_tokens_seen": 15176672, + "step": 15070 + }, + { + "epoch": 7.107496463932107, + "grad_norm": 0.4789365828037262, + "learning_rate": 1.17000690392709e-05, + "loss": 0.347, + "num_input_tokens_seen": 15180928, + "step": 15075 + }, + { + "epoch": 7.10985384252711, + "grad_norm": 0.347778856754303, + "learning_rate": 1.1682654275905089e-05, + "loss": 0.3403, + "num_input_tokens_seen": 15185824, + "step": 15080 + }, + { + "epoch": 7.112211221122112, + "grad_norm": 0.539143443107605, + "learning_rate": 1.1665248530140948e-05, + "loss": 0.3389, + "num_input_tokens_seen": 15190080, + "step": 15085 + }, + { + "epoch": 7.114568599717114, + "grad_norm": 0.5489917397499084, + "learning_rate": 1.164785181376448e-05, + "loss": 0.336, + "num_input_tokens_seen": 15194880, + "step": 15090 + }, + { + "epoch": 7.116925978312117, + "grad_norm": 0.378996878862381, + "learning_rate": 1.163046413855555e-05, + "loss": 0.323, + "num_input_tokens_seen": 15200800, + "step": 15095 + }, + { + "epoch": 7.119283356907119, + "grad_norm": 0.27630501985549927, + "learning_rate": 1.1613085516287928e-05, + "loss": 0.33, + "num_input_tokens_seen": 15205120, + "step": 15100 + }, + { + "epoch": 7.121640735502122, + "grad_norm": 0.2899111211299896, + "learning_rate": 1.1595715958729211e-05, + "loss": 0.3153, + "num_input_tokens_seen": 15210304, + "step": 15105 + }, + { + "epoch": 7.123998114097124, + "grad_norm": 0.48534873127937317, + "learning_rate": 1.1578355477640896e-05, + "loss": 0.3212, + "num_input_tokens_seen": 15214944, + "step": 15110 + }, + { + "epoch": 7.126355492692126, + "grad_norm": 0.5121934413909912, + "learning_rate": 1.156100408477832e-05, + "loss": 0.3508, + "num_input_tokens_seen": 15219552, + "step": 15115 + }, + { + "epoch": 7.128712871287129, + "grad_norm": 0.5455906987190247, + "learning_rate": 1.1543661791890682e-05, + "loss": 0.3291, + "num_input_tokens_seen": 15224032, + "step": 15120 + }, + { + "epoch": 7.131070249882131, + "grad_norm": 0.32319310307502747, + "learning_rate": 1.1526328610720977e-05, + "loss": 0.3204, + "num_input_tokens_seen": 15229888, + "step": 15125 + }, + { + "epoch": 7.133427628477134, + "grad_norm": 0.3680107593536377, + "learning_rate": 1.1509004553006072e-05, + "loss": 0.2946, + "num_input_tokens_seen": 15235392, + "step": 15130 + }, + { + "epoch": 7.135785007072136, + "grad_norm": 0.367899626493454, + "learning_rate": 1.149168963047664e-05, + "loss": 0.3275, + "num_input_tokens_seen": 15240928, + "step": 15135 + }, + { + "epoch": 7.138142385667138, + "grad_norm": 0.41260474920272827, + "learning_rate": 1.1474383854857182e-05, + "loss": 0.2909, + "num_input_tokens_seen": 15246624, + "step": 15140 + }, + { + "epoch": 7.140499764262141, + "grad_norm": 0.25025177001953125, + "learning_rate": 1.145708723786598e-05, + "loss": 0.3157, + "num_input_tokens_seen": 15251168, + "step": 15145 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.4798571467399597, + "learning_rate": 1.1439799791215122e-05, + "loss": 0.3802, + "num_input_tokens_seen": 15255680, + "step": 15150 + }, + { + "epoch": 7.145214521452146, + "grad_norm": 0.3766087293624878, + "learning_rate": 1.1422521526610499e-05, + "loss": 0.3502, + "num_input_tokens_seen": 15262656, + "step": 15155 + }, + { + "epoch": 7.147571900047147, + "grad_norm": 0.574913740158081, + "learning_rate": 1.140525245575179e-05, + "loss": 0.3678, + "num_input_tokens_seen": 15267328, + "step": 15160 + }, + { + "epoch": 7.1499292786421496, + "grad_norm": 0.3872351050376892, + "learning_rate": 1.138799259033242e-05, + "loss": 0.3249, + "num_input_tokens_seen": 15272992, + "step": 15165 + }, + { + "epoch": 7.152286657237152, + "grad_norm": 0.509607195854187, + "learning_rate": 1.1370741942039598e-05, + "loss": 0.3014, + "num_input_tokens_seen": 15280224, + "step": 15170 + }, + { + "epoch": 7.154644035832154, + "grad_norm": 0.36483219265937805, + "learning_rate": 1.1353500522554297e-05, + "loss": 0.3529, + "num_input_tokens_seen": 15284256, + "step": 15175 + }, + { + "epoch": 7.157001414427157, + "grad_norm": 0.35334232449531555, + "learning_rate": 1.1336268343551243e-05, + "loss": 0.3323, + "num_input_tokens_seen": 15288928, + "step": 15180 + }, + { + "epoch": 7.159358793022159, + "grad_norm": 0.432066410779953, + "learning_rate": 1.1319045416698876e-05, + "loss": 0.3424, + "num_input_tokens_seen": 15294528, + "step": 15185 + }, + { + "epoch": 7.161716171617162, + "grad_norm": 0.3207476735115051, + "learning_rate": 1.1301831753659405e-05, + "loss": 0.3522, + "num_input_tokens_seen": 15299776, + "step": 15190 + }, + { + "epoch": 7.164073550212164, + "grad_norm": 0.704848051071167, + "learning_rate": 1.1284627366088752e-05, + "loss": 0.3757, + "num_input_tokens_seen": 15304608, + "step": 15195 + }, + { + "epoch": 7.166430928807166, + "grad_norm": 0.30638253688812256, + "learning_rate": 1.1267432265636568e-05, + "loss": 0.3587, + "num_input_tokens_seen": 15308864, + "step": 15200 + }, + { + "epoch": 7.168788307402169, + "grad_norm": 0.5395008325576782, + "learning_rate": 1.1250246463946187e-05, + "loss": 0.3048, + "num_input_tokens_seen": 15315488, + "step": 15205 + }, + { + "epoch": 7.171145685997171, + "grad_norm": 0.4517122805118561, + "learning_rate": 1.1233069972654679e-05, + "loss": 0.291, + "num_input_tokens_seen": 15320704, + "step": 15210 + }, + { + "epoch": 7.173503064592174, + "grad_norm": 0.47409409284591675, + "learning_rate": 1.121590280339279e-05, + "loss": 0.3271, + "num_input_tokens_seen": 15325504, + "step": 15215 + }, + { + "epoch": 7.175860443187176, + "grad_norm": 0.49518683552742004, + "learning_rate": 1.1198744967784977e-05, + "loss": 0.3736, + "num_input_tokens_seen": 15329536, + "step": 15220 + }, + { + "epoch": 7.178217821782178, + "grad_norm": 0.47951197624206543, + "learning_rate": 1.1181596477449339e-05, + "loss": 0.3285, + "num_input_tokens_seen": 15334336, + "step": 15225 + }, + { + "epoch": 7.180575200377181, + "grad_norm": 0.47157853841781616, + "learning_rate": 1.1164457343997675e-05, + "loss": 0.3629, + "num_input_tokens_seen": 15338976, + "step": 15230 + }, + { + "epoch": 7.182932578972183, + "grad_norm": 0.37681353092193604, + "learning_rate": 1.114732757903545e-05, + "loss": 0.327, + "num_input_tokens_seen": 15344544, + "step": 15235 + }, + { + "epoch": 7.185289957567186, + "grad_norm": 0.3750675618648529, + "learning_rate": 1.1130207194161776e-05, + "loss": 0.3128, + "num_input_tokens_seen": 15350016, + "step": 15240 + }, + { + "epoch": 7.187647336162188, + "grad_norm": 0.3023500144481659, + "learning_rate": 1.1113096200969403e-05, + "loss": 0.3047, + "num_input_tokens_seen": 15354656, + "step": 15245 + }, + { + "epoch": 7.19000471475719, + "grad_norm": 0.5141637921333313, + "learning_rate": 1.1095994611044739e-05, + "loss": 0.3172, + "num_input_tokens_seen": 15360096, + "step": 15250 + }, + { + "epoch": 7.192362093352193, + "grad_norm": 0.27295610308647156, + "learning_rate": 1.107890243596782e-05, + "loss": 0.3773, + "num_input_tokens_seen": 15365344, + "step": 15255 + }, + { + "epoch": 7.194719471947194, + "grad_norm": 0.3028872311115265, + "learning_rate": 1.1061819687312313e-05, + "loss": 0.311, + "num_input_tokens_seen": 15370208, + "step": 15260 + }, + { + "epoch": 7.197076850542197, + "grad_norm": 0.22643029689788818, + "learning_rate": 1.104474637664549e-05, + "loss": 0.3168, + "num_input_tokens_seen": 15375488, + "step": 15265 + }, + { + "epoch": 7.199434229137199, + "grad_norm": 0.34034964442253113, + "learning_rate": 1.1027682515528223e-05, + "loss": 0.3633, + "num_input_tokens_seen": 15379808, + "step": 15270 + }, + { + "epoch": 7.2017916077322015, + "grad_norm": 0.44720232486724854, + "learning_rate": 1.1010628115515012e-05, + "loss": 0.345, + "num_input_tokens_seen": 15384544, + "step": 15275 + }, + { + "epoch": 7.204148986327204, + "grad_norm": 0.5060552954673767, + "learning_rate": 1.0993583188153933e-05, + "loss": 0.3157, + "num_input_tokens_seen": 15390304, + "step": 15280 + }, + { + "epoch": 7.206506364922206, + "grad_norm": 0.49779555201530457, + "learning_rate": 1.0976547744986667e-05, + "loss": 0.3286, + "num_input_tokens_seen": 15394848, + "step": 15285 + }, + { + "epoch": 7.208863743517209, + "grad_norm": 0.4615604281425476, + "learning_rate": 1.0959521797548439e-05, + "loss": 0.2597, + "num_input_tokens_seen": 15399392, + "step": 15290 + }, + { + "epoch": 7.211221122112211, + "grad_norm": 0.2604096531867981, + "learning_rate": 1.0942505357368072e-05, + "loss": 0.3337, + "num_input_tokens_seen": 15403680, + "step": 15295 + }, + { + "epoch": 7.2135785007072135, + "grad_norm": 0.41632333397865295, + "learning_rate": 1.0925498435967943e-05, + "loss": 0.3245, + "num_input_tokens_seen": 15408096, + "step": 15300 + }, + { + "epoch": 7.215935879302216, + "grad_norm": 0.4677616059780121, + "learning_rate": 1.0908501044863995e-05, + "loss": 0.3066, + "num_input_tokens_seen": 15412544, + "step": 15305 + }, + { + "epoch": 7.218293257897218, + "grad_norm": 0.3141442537307739, + "learning_rate": 1.0891513195565684e-05, + "loss": 0.3423, + "num_input_tokens_seen": 15418016, + "step": 15310 + }, + { + "epoch": 7.220650636492221, + "grad_norm": 0.34022316336631775, + "learning_rate": 1.087453489957604e-05, + "loss": 0.2847, + "num_input_tokens_seen": 15422880, + "step": 15315 + }, + { + "epoch": 7.223008015087223, + "grad_norm": 0.2646884024143219, + "learning_rate": 1.0857566168391608e-05, + "loss": 0.3024, + "num_input_tokens_seen": 15427264, + "step": 15320 + }, + { + "epoch": 7.2253653936822255, + "grad_norm": 0.29565659165382385, + "learning_rate": 1.0840607013502474e-05, + "loss": 0.3111, + "num_input_tokens_seen": 15432000, + "step": 15325 + }, + { + "epoch": 7.227722772277228, + "grad_norm": 0.2628195285797119, + "learning_rate": 1.0823657446392203e-05, + "loss": 0.3422, + "num_input_tokens_seen": 15436896, + "step": 15330 + }, + { + "epoch": 7.23008015087223, + "grad_norm": 0.427651584148407, + "learning_rate": 1.08067174785379e-05, + "loss": 0.3203, + "num_input_tokens_seen": 15441440, + "step": 15335 + }, + { + "epoch": 7.232437529467233, + "grad_norm": 0.4141589105129242, + "learning_rate": 1.0789787121410164e-05, + "loss": 0.3563, + "num_input_tokens_seen": 15446720, + "step": 15340 + }, + { + "epoch": 7.234794908062235, + "grad_norm": 0.34051549434661865, + "learning_rate": 1.0772866386473085e-05, + "loss": 0.2953, + "num_input_tokens_seen": 15451200, + "step": 15345 + }, + { + "epoch": 7.2371522866572375, + "grad_norm": 0.25300511717796326, + "learning_rate": 1.0755955285184222e-05, + "loss": 0.3006, + "num_input_tokens_seen": 15456992, + "step": 15350 + }, + { + "epoch": 7.23950966525224, + "grad_norm": 0.45980408787727356, + "learning_rate": 1.073905382899463e-05, + "loss": 0.2622, + "num_input_tokens_seen": 15461056, + "step": 15355 + }, + { + "epoch": 7.2418670438472414, + "grad_norm": 0.3823534846305847, + "learning_rate": 1.072216202934883e-05, + "loss": 0.2865, + "num_input_tokens_seen": 15466208, + "step": 15360 + }, + { + "epoch": 7.244224422442244, + "grad_norm": 0.2229447215795517, + "learning_rate": 1.0705279897684806e-05, + "loss": 0.2959, + "num_input_tokens_seen": 15470400, + "step": 15365 + }, + { + "epoch": 7.246581801037246, + "grad_norm": 0.35666608810424805, + "learning_rate": 1.0688407445433985e-05, + "loss": 0.2963, + "num_input_tokens_seen": 15475168, + "step": 15370 + }, + { + "epoch": 7.248939179632249, + "grad_norm": 0.45304569602012634, + "learning_rate": 1.0671544684021234e-05, + "loss": 0.3709, + "num_input_tokens_seen": 15479872, + "step": 15375 + }, + { + "epoch": 7.251296558227251, + "grad_norm": 0.29738759994506836, + "learning_rate": 1.0654691624864879e-05, + "loss": 0.4144, + "num_input_tokens_seen": 15485088, + "step": 15380 + }, + { + "epoch": 7.2536539368222535, + "grad_norm": 0.3698456585407257, + "learning_rate": 1.063784827937667e-05, + "loss": 0.3807, + "num_input_tokens_seen": 15490080, + "step": 15385 + }, + { + "epoch": 7.256011315417256, + "grad_norm": 0.4260103106498718, + "learning_rate": 1.0621014658961779e-05, + "loss": 0.3729, + "num_input_tokens_seen": 15495360, + "step": 15390 + }, + { + "epoch": 7.258368694012258, + "grad_norm": 0.31740233302116394, + "learning_rate": 1.0604190775018774e-05, + "loss": 0.2925, + "num_input_tokens_seen": 15499840, + "step": 15395 + }, + { + "epoch": 7.260726072607261, + "grad_norm": 0.3008975386619568, + "learning_rate": 1.0587376638939655e-05, + "loss": 0.3309, + "num_input_tokens_seen": 15504352, + "step": 15400 + }, + { + "epoch": 7.263083451202263, + "grad_norm": 0.29233190417289734, + "learning_rate": 1.0570572262109816e-05, + "loss": 0.2957, + "num_input_tokens_seen": 15509504, + "step": 15405 + }, + { + "epoch": 7.2654408297972655, + "grad_norm": 0.4194605350494385, + "learning_rate": 1.0553777655908046e-05, + "loss": 0.3307, + "num_input_tokens_seen": 15515296, + "step": 15410 + }, + { + "epoch": 7.267798208392268, + "grad_norm": 0.24644890427589417, + "learning_rate": 1.0536992831706492e-05, + "loss": 0.3281, + "num_input_tokens_seen": 15520384, + "step": 15415 + }, + { + "epoch": 7.27015558698727, + "grad_norm": 0.38662999868392944, + "learning_rate": 1.0520217800870705e-05, + "loss": 0.3085, + "num_input_tokens_seen": 15524960, + "step": 15420 + }, + { + "epoch": 7.272512965582273, + "grad_norm": 0.44328734278678894, + "learning_rate": 1.0503452574759598e-05, + "loss": 0.3928, + "num_input_tokens_seen": 15530720, + "step": 15425 + }, + { + "epoch": 7.274870344177275, + "grad_norm": 0.26144659519195557, + "learning_rate": 1.0486697164725451e-05, + "loss": 0.342, + "num_input_tokens_seen": 15534976, + "step": 15430 + }, + { + "epoch": 7.2772277227722775, + "grad_norm": 0.5030160546302795, + "learning_rate": 1.0469951582113873e-05, + "loss": 0.2948, + "num_input_tokens_seen": 15541376, + "step": 15435 + }, + { + "epoch": 7.27958510136728, + "grad_norm": 0.3080084025859833, + "learning_rate": 1.0453215838263842e-05, + "loss": 0.3211, + "num_input_tokens_seen": 15546720, + "step": 15440 + }, + { + "epoch": 7.281942479962282, + "grad_norm": 0.26246583461761475, + "learning_rate": 1.0436489944507668e-05, + "loss": 0.3331, + "num_input_tokens_seen": 15552320, + "step": 15445 + }, + { + "epoch": 7.284299858557285, + "grad_norm": 0.3930654525756836, + "learning_rate": 1.0419773912170996e-05, + "loss": 0.3115, + "num_input_tokens_seen": 15556128, + "step": 15450 + }, + { + "epoch": 7.286657237152287, + "grad_norm": 0.3174440860748291, + "learning_rate": 1.0403067752572773e-05, + "loss": 0.357, + "num_input_tokens_seen": 15560096, + "step": 15455 + }, + { + "epoch": 7.2890146157472895, + "grad_norm": 0.2814996540546417, + "learning_rate": 1.0386371477025283e-05, + "loss": 0.2893, + "num_input_tokens_seen": 15565824, + "step": 15460 + }, + { + "epoch": 7.291371994342291, + "grad_norm": 0.317548006772995, + "learning_rate": 1.0369685096834107e-05, + "loss": 0.328, + "num_input_tokens_seen": 15570944, + "step": 15465 + }, + { + "epoch": 7.293729372937293, + "grad_norm": 0.3217787742614746, + "learning_rate": 1.0353008623298138e-05, + "loss": 0.3224, + "num_input_tokens_seen": 15575968, + "step": 15470 + }, + { + "epoch": 7.296086751532296, + "grad_norm": 0.33116310834884644, + "learning_rate": 1.0336342067709536e-05, + "loss": 0.3142, + "num_input_tokens_seen": 15580832, + "step": 15475 + }, + { + "epoch": 7.298444130127298, + "grad_norm": 0.3363493084907532, + "learning_rate": 1.0319685441353766e-05, + "loss": 0.3409, + "num_input_tokens_seen": 15585152, + "step": 15480 + }, + { + "epoch": 7.300801508722301, + "grad_norm": 0.4849323630332947, + "learning_rate": 1.0303038755509572e-05, + "loss": 0.3377, + "num_input_tokens_seen": 15589952, + "step": 15485 + }, + { + "epoch": 7.303158887317303, + "grad_norm": 0.3657204508781433, + "learning_rate": 1.0286402021448943e-05, + "loss": 0.3246, + "num_input_tokens_seen": 15595968, + "step": 15490 + }, + { + "epoch": 7.305516265912305, + "grad_norm": 0.4564196467399597, + "learning_rate": 1.026977525043716e-05, + "loss": 0.3258, + "num_input_tokens_seen": 15601152, + "step": 15495 + }, + { + "epoch": 7.307873644507308, + "grad_norm": 0.3154785931110382, + "learning_rate": 1.0253158453732731e-05, + "loss": 0.2865, + "num_input_tokens_seen": 15605440, + "step": 15500 + }, + { + "epoch": 7.31023102310231, + "grad_norm": 0.39479321241378784, + "learning_rate": 1.0236551642587423e-05, + "loss": 0.3543, + "num_input_tokens_seen": 15610496, + "step": 15505 + }, + { + "epoch": 7.312588401697313, + "grad_norm": 0.3479604721069336, + "learning_rate": 1.0219954828246242e-05, + "loss": 0.312, + "num_input_tokens_seen": 15616160, + "step": 15510 + }, + { + "epoch": 7.314945780292315, + "grad_norm": 0.47019827365875244, + "learning_rate": 1.0203368021947436e-05, + "loss": 0.3162, + "num_input_tokens_seen": 15620960, + "step": 15515 + }, + { + "epoch": 7.317303158887317, + "grad_norm": 0.28838101029396057, + "learning_rate": 1.0186791234922437e-05, + "loss": 0.3043, + "num_input_tokens_seen": 15625600, + "step": 15520 + }, + { + "epoch": 7.31966053748232, + "grad_norm": 0.3189135491847992, + "learning_rate": 1.017022447839594e-05, + "loss": 0.3455, + "num_input_tokens_seen": 15630304, + "step": 15525 + }, + { + "epoch": 7.322017916077322, + "grad_norm": 0.314639151096344, + "learning_rate": 1.0153667763585816e-05, + "loss": 0.4388, + "num_input_tokens_seen": 15635232, + "step": 15530 + }, + { + "epoch": 7.324375294672325, + "grad_norm": 0.3356930613517761, + "learning_rate": 1.0137121101703162e-05, + "loss": 0.3341, + "num_input_tokens_seen": 15640576, + "step": 15535 + }, + { + "epoch": 7.326732673267327, + "grad_norm": 0.34736281633377075, + "learning_rate": 1.0120584503952232e-05, + "loss": 0.3323, + "num_input_tokens_seen": 15644640, + "step": 15540 + }, + { + "epoch": 7.329090051862329, + "grad_norm": 0.37126171588897705, + "learning_rate": 1.0104057981530501e-05, + "loss": 0.3168, + "num_input_tokens_seen": 15651296, + "step": 15545 + }, + { + "epoch": 7.331447430457332, + "grad_norm": 0.34549468755722046, + "learning_rate": 1.00875415456286e-05, + "loss": 0.3381, + "num_input_tokens_seen": 15656160, + "step": 15550 + }, + { + "epoch": 7.333804809052333, + "grad_norm": 0.32234421372413635, + "learning_rate": 1.0071035207430352e-05, + "loss": 0.3158, + "num_input_tokens_seen": 15661216, + "step": 15555 + }, + { + "epoch": 7.336162187647336, + "grad_norm": 0.3591618239879608, + "learning_rate": 1.0054538978112704e-05, + "loss": 0.3092, + "num_input_tokens_seen": 15667360, + "step": 15560 + }, + { + "epoch": 7.338519566242338, + "grad_norm": 0.2806774377822876, + "learning_rate": 1.0038052868845796e-05, + "loss": 0.3026, + "num_input_tokens_seen": 15671840, + "step": 15565 + }, + { + "epoch": 7.3408769448373405, + "grad_norm": 0.34529879689216614, + "learning_rate": 1.0021576890792897e-05, + "loss": 0.2991, + "num_input_tokens_seen": 15677312, + "step": 15570 + }, + { + "epoch": 7.343234323432343, + "grad_norm": 0.3528675436973572, + "learning_rate": 1.0005111055110434e-05, + "loss": 0.2993, + "num_input_tokens_seen": 15681408, + "step": 15575 + }, + { + "epoch": 7.345591702027345, + "grad_norm": 0.3795848488807678, + "learning_rate": 9.988655372947927e-06, + "loss": 0.3916, + "num_input_tokens_seen": 15685984, + "step": 15580 + }, + { + "epoch": 7.347949080622348, + "grad_norm": 0.32670822739601135, + "learning_rate": 9.97220985544806e-06, + "loss": 0.3053, + "num_input_tokens_seen": 15690400, + "step": 15585 + }, + { + "epoch": 7.35030645921735, + "grad_norm": 0.6589813828468323, + "learning_rate": 9.955774513746622e-06, + "loss": 0.3212, + "num_input_tokens_seen": 15695936, + "step": 15590 + }, + { + "epoch": 7.3526638378123526, + "grad_norm": 0.35491931438446045, + "learning_rate": 9.939349358972511e-06, + "loss": 0.2988, + "num_input_tokens_seen": 15701184, + "step": 15595 + }, + { + "epoch": 7.355021216407355, + "grad_norm": 0.9039114713668823, + "learning_rate": 9.922934402247724e-06, + "loss": 0.396, + "num_input_tokens_seen": 15705760, + "step": 15600 + }, + { + "epoch": 7.357378595002357, + "grad_norm": 0.31927984952926636, + "learning_rate": 9.90652965468734e-06, + "loss": 0.302, + "num_input_tokens_seen": 15710592, + "step": 15605 + }, + { + "epoch": 7.35973597359736, + "grad_norm": 0.5705206990242004, + "learning_rate": 9.890135127399555e-06, + "loss": 0.3431, + "num_input_tokens_seen": 15714912, + "step": 15610 + }, + { + "epoch": 7.362093352192362, + "grad_norm": 0.3302662670612335, + "learning_rate": 9.873750831485624e-06, + "loss": 0.3749, + "num_input_tokens_seen": 15720032, + "step": 15615 + }, + { + "epoch": 7.364450730787365, + "grad_norm": 0.3059212863445282, + "learning_rate": 9.85737677803989e-06, + "loss": 0.3419, + "num_input_tokens_seen": 15725184, + "step": 15620 + }, + { + "epoch": 7.366808109382367, + "grad_norm": 0.2747180461883545, + "learning_rate": 9.841012978149728e-06, + "loss": 0.2967, + "num_input_tokens_seen": 15730208, + "step": 15625 + }, + { + "epoch": 7.369165487977369, + "grad_norm": 0.3420795500278473, + "learning_rate": 9.824659442895604e-06, + "loss": 0.3573, + "num_input_tokens_seen": 15734944, + "step": 15630 + }, + { + "epoch": 7.371522866572372, + "grad_norm": 0.4310578405857086, + "learning_rate": 9.808316183351018e-06, + "loss": 0.3317, + "num_input_tokens_seen": 15739648, + "step": 15635 + }, + { + "epoch": 7.373880245167374, + "grad_norm": 0.752369225025177, + "learning_rate": 9.791983210582529e-06, + "loss": 0.4054, + "num_input_tokens_seen": 15744544, + "step": 15640 + }, + { + "epoch": 7.376237623762377, + "grad_norm": 0.29975229501724243, + "learning_rate": 9.775660535649695e-06, + "loss": 0.3152, + "num_input_tokens_seen": 15749344, + "step": 15645 + }, + { + "epoch": 7.378595002357379, + "grad_norm": 0.38850530982017517, + "learning_rate": 9.759348169605128e-06, + "loss": 0.3854, + "num_input_tokens_seen": 15755488, + "step": 15650 + }, + { + "epoch": 7.380952380952381, + "grad_norm": 0.2874525189399719, + "learning_rate": 9.743046123494462e-06, + "loss": 0.346, + "num_input_tokens_seen": 15760224, + "step": 15655 + }, + { + "epoch": 7.383309759547384, + "grad_norm": 0.3042932152748108, + "learning_rate": 9.72675440835634e-06, + "loss": 0.3535, + "num_input_tokens_seen": 15764512, + "step": 15660 + }, + { + "epoch": 7.385667138142385, + "grad_norm": 0.39394518733024597, + "learning_rate": 9.710473035222382e-06, + "loss": 0.3156, + "num_input_tokens_seen": 15769760, + "step": 15665 + }, + { + "epoch": 7.388024516737388, + "grad_norm": 0.24960654973983765, + "learning_rate": 9.694202015117243e-06, + "loss": 0.2986, + "num_input_tokens_seen": 15774656, + "step": 15670 + }, + { + "epoch": 7.39038189533239, + "grad_norm": 0.2929053008556366, + "learning_rate": 9.67794135905855e-06, + "loss": 0.341, + "num_input_tokens_seen": 15779008, + "step": 15675 + }, + { + "epoch": 7.3927392739273925, + "grad_norm": 0.5192704200744629, + "learning_rate": 9.661691078056925e-06, + "loss": 0.2999, + "num_input_tokens_seen": 15784032, + "step": 15680 + }, + { + "epoch": 7.395096652522395, + "grad_norm": 0.34116241335868835, + "learning_rate": 9.645451183115933e-06, + "loss": 0.3355, + "num_input_tokens_seen": 15789952, + "step": 15685 + }, + { + "epoch": 7.397454031117397, + "grad_norm": 0.5269842743873596, + "learning_rate": 9.62922168523214e-06, + "loss": 0.3681, + "num_input_tokens_seen": 15794048, + "step": 15690 + }, + { + "epoch": 7.3998114097124, + "grad_norm": 0.36820298433303833, + "learning_rate": 9.613002595395054e-06, + "loss": 0.2837, + "num_input_tokens_seen": 15799456, + "step": 15695 + }, + { + "epoch": 7.402168788307402, + "grad_norm": 0.3920098841190338, + "learning_rate": 9.596793924587155e-06, + "loss": 0.3118, + "num_input_tokens_seen": 15804576, + "step": 15700 + }, + { + "epoch": 7.4045261669024045, + "grad_norm": 0.28555789589881897, + "learning_rate": 9.580595683783836e-06, + "loss": 0.3008, + "num_input_tokens_seen": 15809952, + "step": 15705 + }, + { + "epoch": 7.406883545497407, + "grad_norm": 0.3346964120864868, + "learning_rate": 9.564407883953459e-06, + "loss": 0.3191, + "num_input_tokens_seen": 15814528, + "step": 15710 + }, + { + "epoch": 7.409240924092409, + "grad_norm": 0.4547036588191986, + "learning_rate": 9.548230536057284e-06, + "loss": 0.3728, + "num_input_tokens_seen": 15819488, + "step": 15715 + }, + { + "epoch": 7.411598302687412, + "grad_norm": 0.24236631393432617, + "learning_rate": 9.532063651049525e-06, + "loss": 0.324, + "num_input_tokens_seen": 15824160, + "step": 15720 + }, + { + "epoch": 7.413955681282414, + "grad_norm": 0.31395334005355835, + "learning_rate": 9.515907239877304e-06, + "loss": 0.3299, + "num_input_tokens_seen": 15828384, + "step": 15725 + }, + { + "epoch": 7.4163130598774165, + "grad_norm": 0.38305118680000305, + "learning_rate": 9.499761313480626e-06, + "loss": 0.382, + "num_input_tokens_seen": 15833920, + "step": 15730 + }, + { + "epoch": 7.418670438472419, + "grad_norm": 0.2957506477832794, + "learning_rate": 9.483625882792429e-06, + "loss": 0.3737, + "num_input_tokens_seen": 15838496, + "step": 15735 + }, + { + "epoch": 7.421027817067421, + "grad_norm": 0.7048094272613525, + "learning_rate": 9.467500958738526e-06, + "loss": 0.3124, + "num_input_tokens_seen": 15844096, + "step": 15740 + }, + { + "epoch": 7.423385195662424, + "grad_norm": 0.2885609269142151, + "learning_rate": 9.451386552237628e-06, + "loss": 0.373, + "num_input_tokens_seen": 15848704, + "step": 15745 + }, + { + "epoch": 7.425742574257426, + "grad_norm": 0.26103463768959045, + "learning_rate": 9.435282674201304e-06, + "loss": 0.3042, + "num_input_tokens_seen": 15853792, + "step": 15750 + }, + { + "epoch": 7.428099952852428, + "grad_norm": 0.45944857597351074, + "learning_rate": 9.419189335534012e-06, + "loss": 0.2851, + "num_input_tokens_seen": 15858848, + "step": 15755 + }, + { + "epoch": 7.43045733144743, + "grad_norm": 0.3705163896083832, + "learning_rate": 9.403106547133066e-06, + "loss": 0.3045, + "num_input_tokens_seen": 15864032, + "step": 15760 + }, + { + "epoch": 7.432814710042432, + "grad_norm": 0.44863200187683105, + "learning_rate": 9.387034319888647e-06, + "loss": 0.3276, + "num_input_tokens_seen": 15868832, + "step": 15765 + }, + { + "epoch": 7.435172088637435, + "grad_norm": 0.4739460349082947, + "learning_rate": 9.370972664683758e-06, + "loss": 0.2832, + "num_input_tokens_seen": 15873344, + "step": 15770 + }, + { + "epoch": 7.437529467232437, + "grad_norm": 0.2491324096918106, + "learning_rate": 9.354921592394269e-06, + "loss": 0.3555, + "num_input_tokens_seen": 15878368, + "step": 15775 + }, + { + "epoch": 7.43988684582744, + "grad_norm": 0.25602900981903076, + "learning_rate": 9.338881113888878e-06, + "loss": 0.2638, + "num_input_tokens_seen": 15884064, + "step": 15780 + }, + { + "epoch": 7.442244224422442, + "grad_norm": 0.29451489448547363, + "learning_rate": 9.322851240029113e-06, + "loss": 0.2683, + "num_input_tokens_seen": 15891200, + "step": 15785 + }, + { + "epoch": 7.4446016030174444, + "grad_norm": 0.37098219990730286, + "learning_rate": 9.306831981669299e-06, + "loss": 0.3161, + "num_input_tokens_seen": 15895584, + "step": 15790 + }, + { + "epoch": 7.446958981612447, + "grad_norm": 0.28426671028137207, + "learning_rate": 9.290823349656597e-06, + "loss": 0.2556, + "num_input_tokens_seen": 15900544, + "step": 15795 + }, + { + "epoch": 7.449316360207449, + "grad_norm": 0.9557251930236816, + "learning_rate": 9.274825354830965e-06, + "loss": 0.4071, + "num_input_tokens_seen": 15904640, + "step": 15800 + }, + { + "epoch": 7.451673738802452, + "grad_norm": 0.5906579494476318, + "learning_rate": 9.25883800802517e-06, + "loss": 0.3491, + "num_input_tokens_seen": 15910656, + "step": 15805 + }, + { + "epoch": 7.454031117397454, + "grad_norm": 0.2360227257013321, + "learning_rate": 9.242861320064739e-06, + "loss": 0.3391, + "num_input_tokens_seen": 15914912, + "step": 15810 + }, + { + "epoch": 7.4563884959924565, + "grad_norm": 0.4108346700668335, + "learning_rate": 9.226895301768001e-06, + "loss": 0.3403, + "num_input_tokens_seen": 15920256, + "step": 15815 + }, + { + "epoch": 7.458745874587459, + "grad_norm": 0.37793099880218506, + "learning_rate": 9.210939963946078e-06, + "loss": 0.3614, + "num_input_tokens_seen": 15924928, + "step": 15820 + }, + { + "epoch": 7.461103253182461, + "grad_norm": 0.3774031102657318, + "learning_rate": 9.194995317402816e-06, + "loss": 0.3844, + "num_input_tokens_seen": 15929792, + "step": 15825 + }, + { + "epoch": 7.463460631777464, + "grad_norm": 0.4257754683494568, + "learning_rate": 9.179061372934866e-06, + "loss": 0.294, + "num_input_tokens_seen": 15934976, + "step": 15830 + }, + { + "epoch": 7.465818010372466, + "grad_norm": 0.5410647988319397, + "learning_rate": 9.163138141331595e-06, + "loss": 0.2806, + "num_input_tokens_seen": 15939936, + "step": 15835 + }, + { + "epoch": 7.4681753889674685, + "grad_norm": 0.26654258370399475, + "learning_rate": 9.147225633375145e-06, + "loss": 0.2826, + "num_input_tokens_seen": 15943840, + "step": 15840 + }, + { + "epoch": 7.470532767562471, + "grad_norm": 0.31155863404273987, + "learning_rate": 9.13132385984038e-06, + "loss": 0.3362, + "num_input_tokens_seen": 15948800, + "step": 15845 + }, + { + "epoch": 7.472890146157473, + "grad_norm": 0.6090996861457825, + "learning_rate": 9.115432831494914e-06, + "loss": 0.2906, + "num_input_tokens_seen": 15952800, + "step": 15850 + }, + { + "epoch": 7.475247524752476, + "grad_norm": 0.6132972240447998, + "learning_rate": 9.09955255909905e-06, + "loss": 0.3653, + "num_input_tokens_seen": 15957472, + "step": 15855 + }, + { + "epoch": 7.477604903347478, + "grad_norm": 0.28980007767677307, + "learning_rate": 9.083683053405842e-06, + "loss": 0.3469, + "num_input_tokens_seen": 15963424, + "step": 15860 + }, + { + "epoch": 7.47996228194248, + "grad_norm": 0.276289165019989, + "learning_rate": 9.06782432516104e-06, + "loss": 0.3412, + "num_input_tokens_seen": 15968288, + "step": 15865 + }, + { + "epoch": 7.482319660537482, + "grad_norm": 0.3064233660697937, + "learning_rate": 9.051976385103103e-06, + "loss": 0.2894, + "num_input_tokens_seen": 15972416, + "step": 15870 + }, + { + "epoch": 7.484677039132484, + "grad_norm": 0.45487526059150696, + "learning_rate": 9.036139243963166e-06, + "loss": 0.3499, + "num_input_tokens_seen": 15977600, + "step": 15875 + }, + { + "epoch": 7.487034417727487, + "grad_norm": 0.44169023633003235, + "learning_rate": 9.02031291246507e-06, + "loss": 0.2976, + "num_input_tokens_seen": 15983744, + "step": 15880 + }, + { + "epoch": 7.489391796322489, + "grad_norm": 0.3548901081085205, + "learning_rate": 9.004497401325335e-06, + "loss": 0.325, + "num_input_tokens_seen": 15988800, + "step": 15885 + }, + { + "epoch": 7.491749174917492, + "grad_norm": 0.3656917214393616, + "learning_rate": 8.988692721253156e-06, + "loss": 0.3219, + "num_input_tokens_seen": 15994080, + "step": 15890 + }, + { + "epoch": 7.494106553512494, + "grad_norm": 0.2837616801261902, + "learning_rate": 8.972898882950373e-06, + "loss": 0.3167, + "num_input_tokens_seen": 15999328, + "step": 15895 + }, + { + "epoch": 7.496463932107496, + "grad_norm": 0.471523255109787, + "learning_rate": 8.957115897111509e-06, + "loss": 0.363, + "num_input_tokens_seen": 16005408, + "step": 15900 + }, + { + "epoch": 7.498821310702499, + "grad_norm": 0.3507046699523926, + "learning_rate": 8.941343774423729e-06, + "loss": 0.3511, + "num_input_tokens_seen": 16010592, + "step": 15905 + }, + { + "epoch": 7.501178689297501, + "grad_norm": 0.2967362701892853, + "learning_rate": 8.92558252556685e-06, + "loss": 0.3153, + "num_input_tokens_seen": 16015296, + "step": 15910 + }, + { + "epoch": 7.503536067892504, + "grad_norm": 0.36172568798065186, + "learning_rate": 8.909832161213306e-06, + "loss": 0.3672, + "num_input_tokens_seen": 16020704, + "step": 15915 + }, + { + "epoch": 7.503536067892504, + "eval_loss": 0.3301604390144348, + "eval_runtime": 25.6322, + "eval_samples_per_second": 36.79, + "eval_steps_per_second": 9.207, + "num_input_tokens_seen": 16020704, + "step": 15915 + }, + { + "epoch": 7.505893446487506, + "grad_norm": 0.28973332047462463, + "learning_rate": 8.894092692028178e-06, + "loss": 0.2806, + "num_input_tokens_seen": 16025728, + "step": 15920 + }, + { + "epoch": 7.508250825082508, + "grad_norm": 0.28122764825820923, + "learning_rate": 8.878364128669168e-06, + "loss": 0.3583, + "num_input_tokens_seen": 16031072, + "step": 15925 + }, + { + "epoch": 7.510608203677511, + "grad_norm": 0.35574033856391907, + "learning_rate": 8.862646481786594e-06, + "loss": 0.3171, + "num_input_tokens_seen": 16035040, + "step": 15930 + }, + { + "epoch": 7.512965582272513, + "grad_norm": 0.29882553219795227, + "learning_rate": 8.846939762023376e-06, + "loss": 0.3012, + "num_input_tokens_seen": 16041760, + "step": 15935 + }, + { + "epoch": 7.515322960867516, + "grad_norm": 0.4607921242713928, + "learning_rate": 8.831243980015025e-06, + "loss": 0.3413, + "num_input_tokens_seen": 16046560, + "step": 15940 + }, + { + "epoch": 7.517680339462518, + "grad_norm": 0.32841092348098755, + "learning_rate": 8.815559146389669e-06, + "loss": 0.332, + "num_input_tokens_seen": 16052704, + "step": 15945 + }, + { + "epoch": 7.52003771805752, + "grad_norm": 0.3114159107208252, + "learning_rate": 8.79988527176801e-06, + "loss": 0.3289, + "num_input_tokens_seen": 16057312, + "step": 15950 + }, + { + "epoch": 7.522395096652522, + "grad_norm": 0.34407588839530945, + "learning_rate": 8.78422236676334e-06, + "loss": 0.3087, + "num_input_tokens_seen": 16062048, + "step": 15955 + }, + { + "epoch": 7.524752475247524, + "grad_norm": 0.3134285509586334, + "learning_rate": 8.768570441981496e-06, + "loss": 0.3044, + "num_input_tokens_seen": 16066656, + "step": 15960 + }, + { + "epoch": 7.527109853842527, + "grad_norm": 0.4197703003883362, + "learning_rate": 8.75292950802091e-06, + "loss": 0.3006, + "num_input_tokens_seen": 16071264, + "step": 15965 + }, + { + "epoch": 7.529467232437529, + "grad_norm": 0.3225448429584503, + "learning_rate": 8.737299575472557e-06, + "loss": 0.3332, + "num_input_tokens_seen": 16077632, + "step": 15970 + }, + { + "epoch": 7.5318246110325315, + "grad_norm": 0.28977879881858826, + "learning_rate": 8.721680654919973e-06, + "loss": 0.3051, + "num_input_tokens_seen": 16082208, + "step": 15975 + }, + { + "epoch": 7.534181989627534, + "grad_norm": 0.35903656482696533, + "learning_rate": 8.706072756939212e-06, + "loss": 0.3179, + "num_input_tokens_seen": 16086464, + "step": 15980 + }, + { + "epoch": 7.536539368222536, + "grad_norm": 0.31578972935676575, + "learning_rate": 8.690475892098899e-06, + "loss": 0.3179, + "num_input_tokens_seen": 16092000, + "step": 15985 + }, + { + "epoch": 7.538896746817539, + "grad_norm": 0.45910340547561646, + "learning_rate": 8.67489007096016e-06, + "loss": 0.2896, + "num_input_tokens_seen": 16096352, + "step": 15990 + }, + { + "epoch": 7.541254125412541, + "grad_norm": 0.3206930458545685, + "learning_rate": 8.659315304076667e-06, + "loss": 0.322, + "num_input_tokens_seen": 16101664, + "step": 15995 + }, + { + "epoch": 7.5436115040075435, + "grad_norm": 0.32225412130355835, + "learning_rate": 8.643751601994577e-06, + "loss": 0.3286, + "num_input_tokens_seen": 16107040, + "step": 16000 + }, + { + "epoch": 7.545968882602546, + "grad_norm": 0.5523732900619507, + "learning_rate": 8.628198975252583e-06, + "loss": 0.3584, + "num_input_tokens_seen": 16111744, + "step": 16005 + }, + { + "epoch": 7.548326261197548, + "grad_norm": 0.4915218651294708, + "learning_rate": 8.612657434381862e-06, + "loss": 0.3365, + "num_input_tokens_seen": 16116512, + "step": 16010 + }, + { + "epoch": 7.550683639792551, + "grad_norm": 0.34282588958740234, + "learning_rate": 8.597126989906101e-06, + "loss": 0.3026, + "num_input_tokens_seen": 16121792, + "step": 16015 + }, + { + "epoch": 7.553041018387553, + "grad_norm": 0.5971064567565918, + "learning_rate": 8.581607652341445e-06, + "loss": 0.2954, + "num_input_tokens_seen": 16126816, + "step": 16020 + }, + { + "epoch": 7.5553983969825556, + "grad_norm": 0.35911184549331665, + "learning_rate": 8.566099432196542e-06, + "loss": 0.3745, + "num_input_tokens_seen": 16131744, + "step": 16025 + }, + { + "epoch": 7.557755775577558, + "grad_norm": 0.4531371593475342, + "learning_rate": 8.550602339972506e-06, + "loss": 0.3153, + "num_input_tokens_seen": 16136544, + "step": 16030 + }, + { + "epoch": 7.56011315417256, + "grad_norm": 0.37394288182258606, + "learning_rate": 8.53511638616292e-06, + "loss": 0.3336, + "num_input_tokens_seen": 16141888, + "step": 16035 + }, + { + "epoch": 7.562470532767563, + "grad_norm": 0.3946356773376465, + "learning_rate": 8.51964158125381e-06, + "loss": 0.3277, + "num_input_tokens_seen": 16146688, + "step": 16040 + }, + { + "epoch": 7.564827911362565, + "grad_norm": 0.3109710216522217, + "learning_rate": 8.504177935723673e-06, + "loss": 0.3999, + "num_input_tokens_seen": 16151424, + "step": 16045 + }, + { + "epoch": 7.567185289957568, + "grad_norm": 0.34260255098342896, + "learning_rate": 8.488725460043423e-06, + "loss": 0.3568, + "num_input_tokens_seen": 16156192, + "step": 16050 + }, + { + "epoch": 7.56954266855257, + "grad_norm": 0.2656083405017853, + "learning_rate": 8.47328416467644e-06, + "loss": 0.3209, + "num_input_tokens_seen": 16161120, + "step": 16055 + }, + { + "epoch": 7.571900047147572, + "grad_norm": 0.35109448432922363, + "learning_rate": 8.45785406007852e-06, + "loss": 0.3506, + "num_input_tokens_seen": 16166080, + "step": 16060 + }, + { + "epoch": 7.574257425742574, + "grad_norm": 0.42112618684768677, + "learning_rate": 8.442435156697873e-06, + "loss": 0.3365, + "num_input_tokens_seen": 16170912, + "step": 16065 + }, + { + "epoch": 7.576614804337576, + "grad_norm": 0.3349582850933075, + "learning_rate": 8.427027464975138e-06, + "loss": 0.3198, + "num_input_tokens_seen": 16176160, + "step": 16070 + }, + { + "epoch": 7.578972182932579, + "grad_norm": 0.4808773100376129, + "learning_rate": 8.411630995343356e-06, + "loss": 0.3255, + "num_input_tokens_seen": 16181728, + "step": 16075 + }, + { + "epoch": 7.581329561527581, + "grad_norm": 0.3616534173488617, + "learning_rate": 8.396245758227978e-06, + "loss": 0.3378, + "num_input_tokens_seen": 16186752, + "step": 16080 + }, + { + "epoch": 7.5836869401225835, + "grad_norm": 0.30823948979377747, + "learning_rate": 8.380871764046824e-06, + "loss": 0.2664, + "num_input_tokens_seen": 16191296, + "step": 16085 + }, + { + "epoch": 7.586044318717586, + "grad_norm": 0.30761563777923584, + "learning_rate": 8.365509023210134e-06, + "loss": 0.2744, + "num_input_tokens_seen": 16195680, + "step": 16090 + }, + { + "epoch": 7.588401697312588, + "grad_norm": 0.30414852499961853, + "learning_rate": 8.350157546120502e-06, + "loss": 0.3589, + "num_input_tokens_seen": 16201664, + "step": 16095 + }, + { + "epoch": 7.590759075907591, + "grad_norm": 0.4299394488334656, + "learning_rate": 8.334817343172919e-06, + "loss": 0.3913, + "num_input_tokens_seen": 16206496, + "step": 16100 + }, + { + "epoch": 7.593116454502593, + "grad_norm": 0.5420286655426025, + "learning_rate": 8.31948842475471e-06, + "loss": 0.2987, + "num_input_tokens_seen": 16212288, + "step": 16105 + }, + { + "epoch": 7.5954738330975955, + "grad_norm": 0.4594322741031647, + "learning_rate": 8.304170801245584e-06, + "loss": 0.3328, + "num_input_tokens_seen": 16217952, + "step": 16110 + }, + { + "epoch": 7.597831211692598, + "grad_norm": 0.27784463763237, + "learning_rate": 8.2888644830176e-06, + "loss": 0.3082, + "num_input_tokens_seen": 16222560, + "step": 16115 + }, + { + "epoch": 7.6001885902876, + "grad_norm": 0.3061700463294983, + "learning_rate": 8.273569480435162e-06, + "loss": 0.3862, + "num_input_tokens_seen": 16227232, + "step": 16120 + }, + { + "epoch": 7.602545968882603, + "grad_norm": 0.5580292344093323, + "learning_rate": 8.258285803854988e-06, + "loss": 0.3626, + "num_input_tokens_seen": 16232128, + "step": 16125 + }, + { + "epoch": 7.604903347477605, + "grad_norm": 0.33184677362442017, + "learning_rate": 8.243013463626153e-06, + "loss": 0.3471, + "num_input_tokens_seen": 16237120, + "step": 16130 + }, + { + "epoch": 7.6072607260726075, + "grad_norm": 0.7429593205451965, + "learning_rate": 8.227752470090055e-06, + "loss": 0.3491, + "num_input_tokens_seen": 16242080, + "step": 16135 + }, + { + "epoch": 7.60961810466761, + "grad_norm": 0.3751881718635559, + "learning_rate": 8.212502833580404e-06, + "loss": 0.323, + "num_input_tokens_seen": 16246368, + "step": 16140 + }, + { + "epoch": 7.611975483262612, + "grad_norm": 0.2851102650165558, + "learning_rate": 8.197264564423205e-06, + "loss": 0.307, + "num_input_tokens_seen": 16251584, + "step": 16145 + }, + { + "epoch": 7.614332861857615, + "grad_norm": 0.3517346978187561, + "learning_rate": 8.182037672936779e-06, + "loss": 0.34, + "num_input_tokens_seen": 16255552, + "step": 16150 + }, + { + "epoch": 7.616690240452616, + "grad_norm": 0.305789589881897, + "learning_rate": 8.166822169431754e-06, + "loss": 0.388, + "num_input_tokens_seen": 16260960, + "step": 16155 + }, + { + "epoch": 7.619047619047619, + "grad_norm": 0.3967553973197937, + "learning_rate": 8.151618064211041e-06, + "loss": 0.3487, + "num_input_tokens_seen": 16265728, + "step": 16160 + }, + { + "epoch": 7.621404997642621, + "grad_norm": 0.5279147624969482, + "learning_rate": 8.136425367569802e-06, + "loss": 0.3208, + "num_input_tokens_seen": 16269312, + "step": 16165 + }, + { + "epoch": 7.623762376237623, + "grad_norm": 0.309893935918808, + "learning_rate": 8.12124408979551e-06, + "loss": 0.3386, + "num_input_tokens_seen": 16273152, + "step": 16170 + }, + { + "epoch": 7.626119754832626, + "grad_norm": 0.33756783604621887, + "learning_rate": 8.106074241167899e-06, + "loss": 0.3405, + "num_input_tokens_seen": 16278112, + "step": 16175 + }, + { + "epoch": 7.628477133427628, + "grad_norm": 0.4630145728588104, + "learning_rate": 8.090915831958964e-06, + "loss": 0.3543, + "num_input_tokens_seen": 16282688, + "step": 16180 + }, + { + "epoch": 7.630834512022631, + "grad_norm": 0.7873532772064209, + "learning_rate": 8.075768872432935e-06, + "loss": 0.3431, + "num_input_tokens_seen": 16287136, + "step": 16185 + }, + { + "epoch": 7.633191890617633, + "grad_norm": 0.38178199529647827, + "learning_rate": 8.060633372846313e-06, + "loss": 0.2897, + "num_input_tokens_seen": 16292480, + "step": 16190 + }, + { + "epoch": 7.635549269212635, + "grad_norm": 0.2583377957344055, + "learning_rate": 8.045509343447829e-06, + "loss": 0.3098, + "num_input_tokens_seen": 16297728, + "step": 16195 + }, + { + "epoch": 7.637906647807638, + "grad_norm": 0.5093919038772583, + "learning_rate": 8.030396794478457e-06, + "loss": 0.3261, + "num_input_tokens_seen": 16301728, + "step": 16200 + }, + { + "epoch": 7.64026402640264, + "grad_norm": 0.312570720911026, + "learning_rate": 8.015295736171372e-06, + "loss": 0.3434, + "num_input_tokens_seen": 16307616, + "step": 16205 + }, + { + "epoch": 7.642621404997643, + "grad_norm": 0.3407736122608185, + "learning_rate": 8.00020617875199e-06, + "loss": 0.3068, + "num_input_tokens_seen": 16313056, + "step": 16210 + }, + { + "epoch": 7.644978783592645, + "grad_norm": 0.36394715309143066, + "learning_rate": 7.985128132437939e-06, + "loss": 0.3015, + "num_input_tokens_seen": 16317920, + "step": 16215 + }, + { + "epoch": 7.6473361621876474, + "grad_norm": 0.31012001633644104, + "learning_rate": 7.97006160743905e-06, + "loss": 0.3581, + "num_input_tokens_seen": 16322592, + "step": 16220 + }, + { + "epoch": 7.64969354078265, + "grad_norm": 0.34744587540626526, + "learning_rate": 7.955006613957338e-06, + "loss": 0.2778, + "num_input_tokens_seen": 16327648, + "step": 16225 + }, + { + "epoch": 7.652050919377652, + "grad_norm": 0.30076706409454346, + "learning_rate": 7.939963162187028e-06, + "loss": 0.3249, + "num_input_tokens_seen": 16332768, + "step": 16230 + }, + { + "epoch": 7.654408297972655, + "grad_norm": 0.28493964672088623, + "learning_rate": 7.924931262314526e-06, + "loss": 0.3945, + "num_input_tokens_seen": 16336576, + "step": 16235 + }, + { + "epoch": 7.656765676567657, + "grad_norm": 0.5569764971733093, + "learning_rate": 7.909910924518418e-06, + "loss": 0.3363, + "num_input_tokens_seen": 16341472, + "step": 16240 + }, + { + "epoch": 7.6591230551626595, + "grad_norm": 0.3569442927837372, + "learning_rate": 7.894902158969441e-06, + "loss": 0.302, + "num_input_tokens_seen": 16347264, + "step": 16245 + }, + { + "epoch": 7.661480433757662, + "grad_norm": 0.4172150194644928, + "learning_rate": 7.87990497583052e-06, + "loss": 0.2689, + "num_input_tokens_seen": 16351712, + "step": 16250 + }, + { + "epoch": 7.663837812352664, + "grad_norm": 0.3538292348384857, + "learning_rate": 7.864919385256727e-06, + "loss": 0.3121, + "num_input_tokens_seen": 16356512, + "step": 16255 + }, + { + "epoch": 7.666195190947667, + "grad_norm": 0.5528496503829956, + "learning_rate": 7.849945397395298e-06, + "loss": 0.3327, + "num_input_tokens_seen": 16362560, + "step": 16260 + }, + { + "epoch": 7.668552569542668, + "grad_norm": 0.5291309952735901, + "learning_rate": 7.834983022385579e-06, + "loss": 0.3169, + "num_input_tokens_seen": 16367520, + "step": 16265 + }, + { + "epoch": 7.670909948137671, + "grad_norm": 0.3337632417678833, + "learning_rate": 7.82003227035909e-06, + "loss": 0.331, + "num_input_tokens_seen": 16372800, + "step": 16270 + }, + { + "epoch": 7.673267326732673, + "grad_norm": 0.3663453161716461, + "learning_rate": 7.805093151439452e-06, + "loss": 0.3094, + "num_input_tokens_seen": 16378272, + "step": 16275 + }, + { + "epoch": 7.675624705327675, + "grad_norm": 0.40827274322509766, + "learning_rate": 7.790165675742427e-06, + "loss": 0.3395, + "num_input_tokens_seen": 16382944, + "step": 16280 + }, + { + "epoch": 7.677982083922678, + "grad_norm": 0.4750279188156128, + "learning_rate": 7.7752498533759e-06, + "loss": 0.2786, + "num_input_tokens_seen": 16387936, + "step": 16285 + }, + { + "epoch": 7.68033946251768, + "grad_norm": 0.3462674021720886, + "learning_rate": 7.760345694439825e-06, + "loss": 0.2863, + "num_input_tokens_seen": 16392896, + "step": 16290 + }, + { + "epoch": 7.682696841112683, + "grad_norm": 0.3808835446834564, + "learning_rate": 7.745453209026304e-06, + "loss": 0.3168, + "num_input_tokens_seen": 16398688, + "step": 16295 + }, + { + "epoch": 7.685054219707685, + "grad_norm": 0.3851817846298218, + "learning_rate": 7.730572407219516e-06, + "loss": 0.2892, + "num_input_tokens_seen": 16404032, + "step": 16300 + }, + { + "epoch": 7.687411598302687, + "grad_norm": 0.34424105286598206, + "learning_rate": 7.71570329909573e-06, + "loss": 0.3807, + "num_input_tokens_seen": 16408992, + "step": 16305 + }, + { + "epoch": 7.68976897689769, + "grad_norm": 0.407962441444397, + "learning_rate": 7.700845894723288e-06, + "loss": 0.334, + "num_input_tokens_seen": 16414688, + "step": 16310 + }, + { + "epoch": 7.692126355492692, + "grad_norm": 0.44072893261909485, + "learning_rate": 7.686000204162617e-06, + "loss": 0.2977, + "num_input_tokens_seen": 16419456, + "step": 16315 + }, + { + "epoch": 7.694483734087695, + "grad_norm": 0.5156105756759644, + "learning_rate": 7.67116623746621e-06, + "loss": 0.3111, + "num_input_tokens_seen": 16424160, + "step": 16320 + }, + { + "epoch": 7.696841112682697, + "grad_norm": 0.3827711343765259, + "learning_rate": 7.65634400467863e-06, + "loss": 0.3359, + "num_input_tokens_seen": 16429792, + "step": 16325 + }, + { + "epoch": 7.699198491277699, + "grad_norm": 0.29877614974975586, + "learning_rate": 7.641533515836475e-06, + "loss": 0.3771, + "num_input_tokens_seen": 16434976, + "step": 16330 + }, + { + "epoch": 7.701555869872702, + "grad_norm": 0.5289814472198486, + "learning_rate": 7.626734780968403e-06, + "loss": 0.3768, + "num_input_tokens_seen": 16440640, + "step": 16335 + }, + { + "epoch": 7.703913248467704, + "grad_norm": 0.4123634696006775, + "learning_rate": 7.611947810095116e-06, + "loss": 0.3575, + "num_input_tokens_seen": 16446144, + "step": 16340 + }, + { + "epoch": 7.706270627062707, + "grad_norm": 0.25889185070991516, + "learning_rate": 7.597172613229353e-06, + "loss": 0.3873, + "num_input_tokens_seen": 16450720, + "step": 16345 + }, + { + "epoch": 7.708628005657709, + "grad_norm": 0.29139596223831177, + "learning_rate": 7.582409200375854e-06, + "loss": 0.3736, + "num_input_tokens_seen": 16455904, + "step": 16350 + }, + { + "epoch": 7.7109853842527105, + "grad_norm": 0.32879725098609924, + "learning_rate": 7.567657581531412e-06, + "loss": 0.3179, + "num_input_tokens_seen": 16460832, + "step": 16355 + }, + { + "epoch": 7.713342762847713, + "grad_norm": 0.48755550384521484, + "learning_rate": 7.55291776668482e-06, + "loss": 0.3087, + "num_input_tokens_seen": 16465920, + "step": 16360 + }, + { + "epoch": 7.715700141442715, + "grad_norm": 0.4676958918571472, + "learning_rate": 7.538189765816883e-06, + "loss": 0.3168, + "num_input_tokens_seen": 16470432, + "step": 16365 + }, + { + "epoch": 7.718057520037718, + "grad_norm": 0.4097278416156769, + "learning_rate": 7.52347358890039e-06, + "loss": 0.3765, + "num_input_tokens_seen": 16474464, + "step": 16370 + }, + { + "epoch": 7.72041489863272, + "grad_norm": 0.4928228259086609, + "learning_rate": 7.508769245900146e-06, + "loss": 0.3665, + "num_input_tokens_seen": 16478848, + "step": 16375 + }, + { + "epoch": 7.7227722772277225, + "grad_norm": 0.4920629560947418, + "learning_rate": 7.4940767467729295e-06, + "loss": 0.3012, + "num_input_tokens_seen": 16483840, + "step": 16380 + }, + { + "epoch": 7.725129655822725, + "grad_norm": 0.6146871447563171, + "learning_rate": 7.4793961014675114e-06, + "loss": 0.3187, + "num_input_tokens_seen": 16489248, + "step": 16385 + }, + { + "epoch": 7.727487034417727, + "grad_norm": 0.3260592818260193, + "learning_rate": 7.464727319924619e-06, + "loss": 0.3261, + "num_input_tokens_seen": 16494912, + "step": 16390 + }, + { + "epoch": 7.72984441301273, + "grad_norm": 0.4595372676849365, + "learning_rate": 7.4500704120769486e-06, + "loss": 0.3058, + "num_input_tokens_seen": 16499520, + "step": 16395 + }, + { + "epoch": 7.732201791607732, + "grad_norm": 0.4930005371570587, + "learning_rate": 7.435425387849168e-06, + "loss": 0.3344, + "num_input_tokens_seen": 16503552, + "step": 16400 + }, + { + "epoch": 7.7345591702027345, + "grad_norm": 0.436563640832901, + "learning_rate": 7.420792257157893e-06, + "loss": 0.3267, + "num_input_tokens_seen": 16508320, + "step": 16405 + }, + { + "epoch": 7.736916548797737, + "grad_norm": 0.29794713854789734, + "learning_rate": 7.406171029911696e-06, + "loss": 0.2505, + "num_input_tokens_seen": 16513632, + "step": 16410 + }, + { + "epoch": 7.739273927392739, + "grad_norm": 0.346312016248703, + "learning_rate": 7.391561716011058e-06, + "loss": 0.3536, + "num_input_tokens_seen": 16519200, + "step": 16415 + }, + { + "epoch": 7.741631305987742, + "grad_norm": 0.47854116559028625, + "learning_rate": 7.376964325348426e-06, + "loss": 0.3149, + "num_input_tokens_seen": 16524352, + "step": 16420 + }, + { + "epoch": 7.743988684582744, + "grad_norm": 0.2663077116012573, + "learning_rate": 7.362378867808159e-06, + "loss": 0.3144, + "num_input_tokens_seen": 16529344, + "step": 16425 + }, + { + "epoch": 7.7463460631777465, + "grad_norm": 0.29397305846214294, + "learning_rate": 7.347805353266546e-06, + "loss": 0.3065, + "num_input_tokens_seen": 16535392, + "step": 16430 + }, + { + "epoch": 7.748703441772749, + "grad_norm": 0.29456695914268494, + "learning_rate": 7.333243791591765e-06, + "loss": 0.3158, + "num_input_tokens_seen": 16540192, + "step": 16435 + }, + { + "epoch": 7.751060820367751, + "grad_norm": 0.8351885676383972, + "learning_rate": 7.318694192643924e-06, + "loss": 0.3347, + "num_input_tokens_seen": 16546208, + "step": 16440 + }, + { + "epoch": 7.753418198962754, + "grad_norm": 0.3854067325592041, + "learning_rate": 7.304156566275022e-06, + "loss": 0.3489, + "num_input_tokens_seen": 16550848, + "step": 16445 + }, + { + "epoch": 7.755775577557756, + "grad_norm": 0.28967955708503723, + "learning_rate": 7.289630922328964e-06, + "loss": 0.3092, + "num_input_tokens_seen": 16555104, + "step": 16450 + }, + { + "epoch": 7.7581329561527586, + "grad_norm": 0.502993643283844, + "learning_rate": 7.275117270641507e-06, + "loss": 0.3175, + "num_input_tokens_seen": 16559488, + "step": 16455 + }, + { + "epoch": 7.760490334747761, + "grad_norm": 0.3048171401023865, + "learning_rate": 7.260615621040323e-06, + "loss": 0.313, + "num_input_tokens_seen": 16564256, + "step": 16460 + }, + { + "epoch": 7.7628477133427625, + "grad_norm": 0.34329304099082947, + "learning_rate": 7.246125983344942e-06, + "loss": 0.307, + "num_input_tokens_seen": 16568832, + "step": 16465 + }, + { + "epoch": 7.765205091937765, + "grad_norm": 0.3846833109855652, + "learning_rate": 7.23164836736677e-06, + "loss": 0.3338, + "num_input_tokens_seen": 16573888, + "step": 16470 + }, + { + "epoch": 7.767562470532767, + "grad_norm": 0.5502975583076477, + "learning_rate": 7.217182782909046e-06, + "loss": 0.3314, + "num_input_tokens_seen": 16578336, + "step": 16475 + }, + { + "epoch": 7.76991984912777, + "grad_norm": 0.6057628393173218, + "learning_rate": 7.202729239766892e-06, + "loss": 0.4107, + "num_input_tokens_seen": 16582400, + "step": 16480 + }, + { + "epoch": 7.772277227722772, + "grad_norm": 0.33964696526527405, + "learning_rate": 7.188287747727268e-06, + "loss": 0.2919, + "num_input_tokens_seen": 16587712, + "step": 16485 + }, + { + "epoch": 7.7746346063177745, + "grad_norm": 0.35622936487197876, + "learning_rate": 7.173858316568974e-06, + "loss": 0.3456, + "num_input_tokens_seen": 16592576, + "step": 16490 + }, + { + "epoch": 7.776991984912777, + "grad_norm": 0.26782462000846863, + "learning_rate": 7.159440956062627e-06, + "loss": 0.3396, + "num_input_tokens_seen": 16597344, + "step": 16495 + }, + { + "epoch": 7.779349363507779, + "grad_norm": 0.3533923029899597, + "learning_rate": 7.145035675970699e-06, + "loss": 0.3655, + "num_input_tokens_seen": 16601856, + "step": 16500 + }, + { + "epoch": 7.781706742102782, + "grad_norm": 0.346199631690979, + "learning_rate": 7.130642486047451e-06, + "loss": 0.3474, + "num_input_tokens_seen": 16606304, + "step": 16505 + }, + { + "epoch": 7.784064120697784, + "grad_norm": 0.426718145608902, + "learning_rate": 7.116261396038984e-06, + "loss": 0.3676, + "num_input_tokens_seen": 16611264, + "step": 16510 + }, + { + "epoch": 7.7864214992927865, + "grad_norm": 0.3232894539833069, + "learning_rate": 7.101892415683198e-06, + "loss": 0.2811, + "num_input_tokens_seen": 16616256, + "step": 16515 + }, + { + "epoch": 7.788778877887789, + "grad_norm": 0.520029604434967, + "learning_rate": 7.087535554709779e-06, + "loss": 0.3285, + "num_input_tokens_seen": 16621920, + "step": 16520 + }, + { + "epoch": 7.791136256482791, + "grad_norm": 0.3386048376560211, + "learning_rate": 7.073190822840223e-06, + "loss": 0.2891, + "num_input_tokens_seen": 16628448, + "step": 16525 + }, + { + "epoch": 7.793493635077794, + "grad_norm": 0.28611209988594055, + "learning_rate": 7.058858229787807e-06, + "loss": 0.4038, + "num_input_tokens_seen": 16634624, + "step": 16530 + }, + { + "epoch": 7.795851013672796, + "grad_norm": 0.40424591302871704, + "learning_rate": 7.044537785257602e-06, + "loss": 0.3695, + "num_input_tokens_seen": 16639712, + "step": 16535 + }, + { + "epoch": 7.7982083922677985, + "grad_norm": 0.3908936679363251, + "learning_rate": 7.030229498946417e-06, + "loss": 0.3533, + "num_input_tokens_seen": 16644800, + "step": 16540 + }, + { + "epoch": 7.800565770862801, + "grad_norm": 0.2916141748428345, + "learning_rate": 7.015933380542866e-06, + "loss": 0.355, + "num_input_tokens_seen": 16650240, + "step": 16545 + }, + { + "epoch": 7.802923149457803, + "grad_norm": 0.3572535216808319, + "learning_rate": 7.0016494397273035e-06, + "loss": 0.3399, + "num_input_tokens_seen": 16655552, + "step": 16550 + }, + { + "epoch": 7.805280528052805, + "grad_norm": 0.3558562994003296, + "learning_rate": 6.987377686171856e-06, + "loss": 0.292, + "num_input_tokens_seen": 16660096, + "step": 16555 + }, + { + "epoch": 7.807637906647807, + "grad_norm": 0.3009711802005768, + "learning_rate": 6.973118129540368e-06, + "loss": 0.3137, + "num_input_tokens_seen": 16665056, + "step": 16560 + }, + { + "epoch": 7.80999528524281, + "grad_norm": 0.30839428305625916, + "learning_rate": 6.958870779488447e-06, + "loss": 0.3446, + "num_input_tokens_seen": 16671072, + "step": 16565 + }, + { + "epoch": 7.812352663837812, + "grad_norm": 0.2922322154045105, + "learning_rate": 6.944635645663436e-06, + "loss": 0.3727, + "num_input_tokens_seen": 16676672, + "step": 16570 + }, + { + "epoch": 7.814710042432814, + "grad_norm": 0.33731353282928467, + "learning_rate": 6.930412737704406e-06, + "loss": 0.3113, + "num_input_tokens_seen": 16680608, + "step": 16575 + }, + { + "epoch": 7.817067421027817, + "grad_norm": 0.5650679469108582, + "learning_rate": 6.916202065242125e-06, + "loss": 0.324, + "num_input_tokens_seen": 16685664, + "step": 16580 + }, + { + "epoch": 7.819424799622819, + "grad_norm": 0.37445855140686035, + "learning_rate": 6.90200363789911e-06, + "loss": 0.3309, + "num_input_tokens_seen": 16690560, + "step": 16585 + }, + { + "epoch": 7.821782178217822, + "grad_norm": 0.3219990134239197, + "learning_rate": 6.887817465289565e-06, + "loss": 0.2956, + "num_input_tokens_seen": 16695648, + "step": 16590 + }, + { + "epoch": 7.824139556812824, + "grad_norm": 0.2802848219871521, + "learning_rate": 6.87364355701941e-06, + "loss": 0.3373, + "num_input_tokens_seen": 16700672, + "step": 16595 + }, + { + "epoch": 7.826496935407826, + "grad_norm": 0.2341926097869873, + "learning_rate": 6.859481922686239e-06, + "loss": 0.3271, + "num_input_tokens_seen": 16705792, + "step": 16600 + }, + { + "epoch": 7.828854314002829, + "grad_norm": 0.3459753394126892, + "learning_rate": 6.845332571879357e-06, + "loss": 0.3006, + "num_input_tokens_seen": 16711424, + "step": 16605 + }, + { + "epoch": 7.831211692597831, + "grad_norm": 0.45782071352005005, + "learning_rate": 6.83119551417975e-06, + "loss": 0.3522, + "num_input_tokens_seen": 16715968, + "step": 16610 + }, + { + "epoch": 7.833569071192834, + "grad_norm": 0.27867648005485535, + "learning_rate": 6.81707075916006e-06, + "loss": 0.3085, + "num_input_tokens_seen": 16720160, + "step": 16615 + }, + { + "epoch": 7.835926449787836, + "grad_norm": 0.3588520586490631, + "learning_rate": 6.802958316384622e-06, + "loss": 0.3671, + "num_input_tokens_seen": 16726496, + "step": 16620 + }, + { + "epoch": 7.838283828382838, + "grad_norm": 0.28360578417778015, + "learning_rate": 6.788858195409409e-06, + "loss": 0.272, + "num_input_tokens_seen": 16732352, + "step": 16625 + }, + { + "epoch": 7.840641206977841, + "grad_norm": 0.38078832626342773, + "learning_rate": 6.774770405782074e-06, + "loss": 0.3294, + "num_input_tokens_seen": 16738080, + "step": 16630 + }, + { + "epoch": 7.842998585572843, + "grad_norm": 0.3136480748653412, + "learning_rate": 6.7606949570419135e-06, + "loss": 0.3204, + "num_input_tokens_seen": 16743328, + "step": 16635 + }, + { + "epoch": 7.845355964167846, + "grad_norm": 0.5591611862182617, + "learning_rate": 6.746631858719868e-06, + "loss": 0.2987, + "num_input_tokens_seen": 16748992, + "step": 16640 + }, + { + "epoch": 7.847713342762848, + "grad_norm": 0.49708905816078186, + "learning_rate": 6.7325811203385e-06, + "loss": 0.3113, + "num_input_tokens_seen": 16754112, + "step": 16645 + }, + { + "epoch": 7.8500707213578504, + "grad_norm": 0.3774976432323456, + "learning_rate": 6.718542751412021e-06, + "loss": 0.382, + "num_input_tokens_seen": 16759616, + "step": 16650 + }, + { + "epoch": 7.852428099952853, + "grad_norm": 0.33892837166786194, + "learning_rate": 6.704516761446261e-06, + "loss": 0.3693, + "num_input_tokens_seen": 16764064, + "step": 16655 + }, + { + "epoch": 7.854785478547855, + "grad_norm": 0.3838329017162323, + "learning_rate": 6.690503159938674e-06, + "loss": 0.2909, + "num_input_tokens_seen": 16769760, + "step": 16660 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.30981171131134033, + "learning_rate": 6.676501956378306e-06, + "loss": 0.3853, + "num_input_tokens_seen": 16773984, + "step": 16665 + }, + { + "epoch": 7.859500235737859, + "grad_norm": 0.37483856081962585, + "learning_rate": 6.6625131602458255e-06, + "loss": 0.322, + "num_input_tokens_seen": 16780256, + "step": 16670 + }, + { + "epoch": 7.861857614332862, + "grad_norm": 0.5063418745994568, + "learning_rate": 6.648536781013495e-06, + "loss": 0.352, + "num_input_tokens_seen": 16784736, + "step": 16675 + }, + { + "epoch": 7.864214992927864, + "grad_norm": 0.27478671073913574, + "learning_rate": 6.63457282814518e-06, + "loss": 0.3221, + "num_input_tokens_seen": 16789504, + "step": 16680 + }, + { + "epoch": 7.866572371522866, + "grad_norm": 0.354343444108963, + "learning_rate": 6.620621311096303e-06, + "loss": 0.3298, + "num_input_tokens_seen": 16794208, + "step": 16685 + }, + { + "epoch": 7.868929750117869, + "grad_norm": 0.5083283185958862, + "learning_rate": 6.606682239313891e-06, + "loss": 0.2755, + "num_input_tokens_seen": 16799680, + "step": 16690 + }, + { + "epoch": 7.871287128712871, + "grad_norm": 0.5552966594696045, + "learning_rate": 6.592755622236535e-06, + "loss": 0.353, + "num_input_tokens_seen": 16804320, + "step": 16695 + }, + { + "epoch": 7.873644507307874, + "grad_norm": 0.32040563225746155, + "learning_rate": 6.578841469294403e-06, + "loss": 0.2981, + "num_input_tokens_seen": 16810240, + "step": 16700 + }, + { + "epoch": 7.876001885902876, + "grad_norm": 0.4411863386631012, + "learning_rate": 6.5649397899092e-06, + "loss": 0.279, + "num_input_tokens_seen": 16814880, + "step": 16705 + }, + { + "epoch": 7.878359264497878, + "grad_norm": 0.20792262256145477, + "learning_rate": 6.551050593494204e-06, + "loss": 0.3075, + "num_input_tokens_seen": 16819328, + "step": 16710 + }, + { + "epoch": 7.880716643092881, + "grad_norm": 0.6028140783309937, + "learning_rate": 6.537173889454237e-06, + "loss": 0.3411, + "num_input_tokens_seen": 16824064, + "step": 16715 + }, + { + "epoch": 7.883074021687883, + "grad_norm": 0.3714563548564911, + "learning_rate": 6.523309687185666e-06, + "loss": 0.3356, + "num_input_tokens_seen": 16828832, + "step": 16720 + }, + { + "epoch": 7.885431400282886, + "grad_norm": 0.2896938621997833, + "learning_rate": 6.509457996076382e-06, + "loss": 0.336, + "num_input_tokens_seen": 16833216, + "step": 16725 + }, + { + "epoch": 7.887788778877888, + "grad_norm": 0.5720784664154053, + "learning_rate": 6.495618825505797e-06, + "loss": 0.3679, + "num_input_tokens_seen": 16837856, + "step": 16730 + }, + { + "epoch": 7.89014615747289, + "grad_norm": 0.3093571364879608, + "learning_rate": 6.481792184844868e-06, + "loss": 0.311, + "num_input_tokens_seen": 16843872, + "step": 16735 + }, + { + "epoch": 7.892503536067893, + "grad_norm": 0.43999409675598145, + "learning_rate": 6.467978083456055e-06, + "loss": 0.3186, + "num_input_tokens_seen": 16849728, + "step": 16740 + }, + { + "epoch": 7.894860914662895, + "grad_norm": 0.3365636467933655, + "learning_rate": 6.454176530693337e-06, + "loss": 0.3416, + "num_input_tokens_seen": 16854592, + "step": 16745 + }, + { + "epoch": 7.897218293257898, + "grad_norm": 0.25665444135665894, + "learning_rate": 6.4403875359021735e-06, + "loss": 0.3029, + "num_input_tokens_seen": 16859808, + "step": 16750 + }, + { + "epoch": 7.899575671852899, + "grad_norm": 0.5127463936805725, + "learning_rate": 6.426611108419542e-06, + "loss": 0.3859, + "num_input_tokens_seen": 16864192, + "step": 16755 + }, + { + "epoch": 7.9019330504479015, + "grad_norm": 0.5148900747299194, + "learning_rate": 6.412847257573903e-06, + "loss": 0.3527, + "num_input_tokens_seen": 16869056, + "step": 16760 + }, + { + "epoch": 7.904290429042904, + "grad_norm": 0.3279362916946411, + "learning_rate": 6.399095992685208e-06, + "loss": 0.3096, + "num_input_tokens_seen": 16872672, + "step": 16765 + }, + { + "epoch": 7.906647807637906, + "grad_norm": 0.3201131224632263, + "learning_rate": 6.385357323064864e-06, + "loss": 0.3023, + "num_input_tokens_seen": 16877888, + "step": 16770 + }, + { + "epoch": 7.909005186232909, + "grad_norm": 0.3019546568393707, + "learning_rate": 6.371631258015772e-06, + "loss": 0.3372, + "num_input_tokens_seen": 16881984, + "step": 16775 + }, + { + "epoch": 7.911362564827911, + "grad_norm": 0.33926650881767273, + "learning_rate": 6.357917806832295e-06, + "loss": 0.3383, + "num_input_tokens_seen": 16886624, + "step": 16780 + }, + { + "epoch": 7.9137199434229135, + "grad_norm": 0.4246223270893097, + "learning_rate": 6.344216978800252e-06, + "loss": 0.3754, + "num_input_tokens_seen": 16892096, + "step": 16785 + }, + { + "epoch": 7.916077322017916, + "grad_norm": 0.31119653582572937, + "learning_rate": 6.3305287831968984e-06, + "loss": 0.3044, + "num_input_tokens_seen": 16896960, + "step": 16790 + }, + { + "epoch": 7.918434700612918, + "grad_norm": 0.4088454842567444, + "learning_rate": 6.316853229290962e-06, + "loss": 0.2483, + "num_input_tokens_seen": 16902272, + "step": 16795 + }, + { + "epoch": 7.920792079207921, + "grad_norm": 0.4186982810497284, + "learning_rate": 6.303190326342598e-06, + "loss": 0.3657, + "num_input_tokens_seen": 16907136, + "step": 16800 + }, + { + "epoch": 7.923149457802923, + "grad_norm": 0.4284222424030304, + "learning_rate": 6.289540083603399e-06, + "loss": 0.297, + "num_input_tokens_seen": 16911680, + "step": 16805 + }, + { + "epoch": 7.9255068363979255, + "grad_norm": 0.3311315178871155, + "learning_rate": 6.27590251031637e-06, + "loss": 0.3193, + "num_input_tokens_seen": 16916640, + "step": 16810 + }, + { + "epoch": 7.927864214992928, + "grad_norm": 0.3316720724105835, + "learning_rate": 6.262277615715955e-06, + "loss": 0.3114, + "num_input_tokens_seen": 16920672, + "step": 16815 + }, + { + "epoch": 7.93022159358793, + "grad_norm": 0.3180360794067383, + "learning_rate": 6.24866540902801e-06, + "loss": 0.3056, + "num_input_tokens_seen": 16925472, + "step": 16820 + }, + { + "epoch": 7.932578972182933, + "grad_norm": 0.47445639967918396, + "learning_rate": 6.2350658994698e-06, + "loss": 0.2996, + "num_input_tokens_seen": 16930240, + "step": 16825 + }, + { + "epoch": 7.934936350777935, + "grad_norm": 0.28366395831108093, + "learning_rate": 6.221479096249977e-06, + "loss": 0.2286, + "num_input_tokens_seen": 16935200, + "step": 16830 + }, + { + "epoch": 7.9372937293729375, + "grad_norm": 0.2979488670825958, + "learning_rate": 6.207905008568616e-06, + "loss": 0.3443, + "num_input_tokens_seen": 16941248, + "step": 16835 + }, + { + "epoch": 7.93965110796794, + "grad_norm": 0.27335068583488464, + "learning_rate": 6.194343645617148e-06, + "loss": 0.3202, + "num_input_tokens_seen": 16946976, + "step": 16840 + }, + { + "epoch": 7.942008486562942, + "grad_norm": 0.37503162026405334, + "learning_rate": 6.180795016578419e-06, + "loss": 0.3074, + "num_input_tokens_seen": 16951360, + "step": 16845 + }, + { + "epoch": 7.944365865157945, + "grad_norm": 0.4259468913078308, + "learning_rate": 6.167259130626646e-06, + "loss": 0.3314, + "num_input_tokens_seen": 16956512, + "step": 16850 + }, + { + "epoch": 7.946723243752947, + "grad_norm": 0.34768420457839966, + "learning_rate": 6.153735996927392e-06, + "loss": 0.3144, + "num_input_tokens_seen": 16961536, + "step": 16855 + }, + { + "epoch": 7.9490806223479495, + "grad_norm": 0.3751024603843689, + "learning_rate": 6.140225624637619e-06, + "loss": 0.3345, + "num_input_tokens_seen": 16967360, + "step": 16860 + }, + { + "epoch": 7.951438000942951, + "grad_norm": 0.38992732763290405, + "learning_rate": 6.1267280229056285e-06, + "loss": 0.3029, + "num_input_tokens_seen": 16972704, + "step": 16865 + }, + { + "epoch": 7.9537953795379535, + "grad_norm": 0.35736000537872314, + "learning_rate": 6.113243200871088e-06, + "loss": 0.3799, + "num_input_tokens_seen": 16977760, + "step": 16870 + }, + { + "epoch": 7.956152758132956, + "grad_norm": 0.25977909564971924, + "learning_rate": 6.099771167664986e-06, + "loss": 0.3253, + "num_input_tokens_seen": 16983552, + "step": 16875 + }, + { + "epoch": 7.958510136727958, + "grad_norm": 0.2537011504173279, + "learning_rate": 6.08631193240968e-06, + "loss": 0.3801, + "num_input_tokens_seen": 16988480, + "step": 16880 + }, + { + "epoch": 7.960867515322961, + "grad_norm": 0.2843030095100403, + "learning_rate": 6.072865504218845e-06, + "loss": 0.3196, + "num_input_tokens_seen": 16992928, + "step": 16885 + }, + { + "epoch": 7.963224893917963, + "grad_norm": 0.31189095973968506, + "learning_rate": 6.059431892197498e-06, + "loss": 0.3135, + "num_input_tokens_seen": 16997472, + "step": 16890 + }, + { + "epoch": 7.9655822725129655, + "grad_norm": 0.2764180302619934, + "learning_rate": 6.046011105441954e-06, + "loss": 0.2985, + "num_input_tokens_seen": 17002528, + "step": 16895 + }, + { + "epoch": 7.967939651107968, + "grad_norm": 0.3339079022407532, + "learning_rate": 6.032603153039862e-06, + "loss": 0.3511, + "num_input_tokens_seen": 17007648, + "step": 16900 + }, + { + "epoch": 7.97029702970297, + "grad_norm": 0.39781200885772705, + "learning_rate": 6.019208044070182e-06, + "loss": 0.3563, + "num_input_tokens_seen": 17013440, + "step": 16905 + }, + { + "epoch": 7.972654408297973, + "grad_norm": 0.2884520888328552, + "learning_rate": 6.005825787603175e-06, + "loss": 0.3394, + "num_input_tokens_seen": 17018048, + "step": 16910 + }, + { + "epoch": 7.975011786892975, + "grad_norm": 0.32464227080345154, + "learning_rate": 5.992456392700383e-06, + "loss": 0.2989, + "num_input_tokens_seen": 17024960, + "step": 16915 + }, + { + "epoch": 7.9773691654879775, + "grad_norm": 0.29590174555778503, + "learning_rate": 5.979099868414656e-06, + "loss": 0.2707, + "num_input_tokens_seen": 17031232, + "step": 16920 + }, + { + "epoch": 7.97972654408298, + "grad_norm": 0.34551674127578735, + "learning_rate": 5.965756223790128e-06, + "loss": 0.3516, + "num_input_tokens_seen": 17036512, + "step": 16925 + }, + { + "epoch": 7.982083922677982, + "grad_norm": 0.41611701250076294, + "learning_rate": 5.952425467862216e-06, + "loss": 0.3107, + "num_input_tokens_seen": 17042112, + "step": 16930 + }, + { + "epoch": 7.984441301272985, + "grad_norm": 0.2859736382961273, + "learning_rate": 5.939107609657585e-06, + "loss": 0.312, + "num_input_tokens_seen": 17048000, + "step": 16935 + }, + { + "epoch": 7.986798679867987, + "grad_norm": 0.34667500853538513, + "learning_rate": 5.9258026581941935e-06, + "loss": 0.251, + "num_input_tokens_seen": 17053280, + "step": 16940 + }, + { + "epoch": 7.9891560584629895, + "grad_norm": 0.31503963470458984, + "learning_rate": 5.912510622481249e-06, + "loss": 0.2882, + "num_input_tokens_seen": 17058784, + "step": 16945 + }, + { + "epoch": 7.991513437057992, + "grad_norm": 0.265587717294693, + "learning_rate": 5.899231511519221e-06, + "loss": 0.3439, + "num_input_tokens_seen": 17063136, + "step": 16950 + }, + { + "epoch": 7.993870815652993, + "grad_norm": 0.5838868021965027, + "learning_rate": 5.885965334299817e-06, + "loss": 0.3369, + "num_input_tokens_seen": 17069600, + "step": 16955 + }, + { + "epoch": 7.996228194247996, + "grad_norm": 0.5692241787910461, + "learning_rate": 5.872712099805983e-06, + "loss": 0.3044, + "num_input_tokens_seen": 17074400, + "step": 16960 + }, + { + "epoch": 7.998585572842998, + "grad_norm": 0.30314069986343384, + "learning_rate": 5.859471817011916e-06, + "loss": 0.3222, + "num_input_tokens_seen": 17079296, + "step": 16965 + }, + { + "epoch": 8.000942951438, + "grad_norm": 0.35860875248908997, + "learning_rate": 5.8462444948830326e-06, + "loss": 0.316, + "num_input_tokens_seen": 17085120, + "step": 16970 + }, + { + "epoch": 8.003300330033003, + "grad_norm": 0.456588476896286, + "learning_rate": 5.83303014237599e-06, + "loss": 0.2215, + "num_input_tokens_seen": 17089824, + "step": 16975 + }, + { + "epoch": 8.003771805752004, + "eval_loss": 0.32866472005844116, + "eval_runtime": 25.6303, + "eval_samples_per_second": 36.792, + "eval_steps_per_second": 9.208, + "num_input_tokens_seen": 17090912, + "step": 16976 + }, + { + "epoch": 8.005657708628005, + "grad_norm": 0.35685965418815613, + "learning_rate": 5.819828768438629e-06, + "loss": 0.2926, + "num_input_tokens_seen": 17094624, + "step": 16980 + }, + { + "epoch": 8.008015087223008, + "grad_norm": 0.40293458104133606, + "learning_rate": 5.806640382010034e-06, + "loss": 0.3142, + "num_input_tokens_seen": 17099520, + "step": 16985 + }, + { + "epoch": 8.01037246581801, + "grad_norm": 0.5850895047187805, + "learning_rate": 5.793464992020486e-06, + "loss": 0.3436, + "num_input_tokens_seen": 17104416, + "step": 16990 + }, + { + "epoch": 8.012729844413013, + "grad_norm": 0.5351223945617676, + "learning_rate": 5.780302607391467e-06, + "loss": 0.3524, + "num_input_tokens_seen": 17109440, + "step": 16995 + }, + { + "epoch": 8.015087223008015, + "grad_norm": 0.341953843832016, + "learning_rate": 5.767153237035636e-06, + "loss": 0.2703, + "num_input_tokens_seen": 17115104, + "step": 17000 + }, + { + "epoch": 8.017444601603017, + "grad_norm": 0.37736088037490845, + "learning_rate": 5.754016889856861e-06, + "loss": 0.3083, + "num_input_tokens_seen": 17120000, + "step": 17005 + }, + { + "epoch": 8.01980198019802, + "grad_norm": 0.45053866505622864, + "learning_rate": 5.740893574750186e-06, + "loss": 0.3351, + "num_input_tokens_seen": 17124864, + "step": 17010 + }, + { + "epoch": 8.022159358793022, + "grad_norm": 0.30681905150413513, + "learning_rate": 5.72778330060183e-06, + "loss": 0.3133, + "num_input_tokens_seen": 17129760, + "step": 17015 + }, + { + "epoch": 8.024516737388025, + "grad_norm": 0.38211026787757874, + "learning_rate": 5.7146860762891715e-06, + "loss": 0.3745, + "num_input_tokens_seen": 17134752, + "step": 17020 + }, + { + "epoch": 8.026874115983027, + "grad_norm": 0.3611219823360443, + "learning_rate": 5.701601910680765e-06, + "loss": 0.3293, + "num_input_tokens_seen": 17139872, + "step": 17025 + }, + { + "epoch": 8.02923149457803, + "grad_norm": 0.2907056510448456, + "learning_rate": 5.688530812636319e-06, + "loss": 0.4094, + "num_input_tokens_seen": 17145344, + "step": 17030 + }, + { + "epoch": 8.031588873173032, + "grad_norm": 0.5149962902069092, + "learning_rate": 5.6754727910067e-06, + "loss": 0.3624, + "num_input_tokens_seen": 17150336, + "step": 17035 + }, + { + "epoch": 8.033946251768034, + "grad_norm": 0.4181129038333893, + "learning_rate": 5.662427854633898e-06, + "loss": 0.2865, + "num_input_tokens_seen": 17155328, + "step": 17040 + }, + { + "epoch": 8.036303630363037, + "grad_norm": 0.7695322632789612, + "learning_rate": 5.649396012351069e-06, + "loss": 0.3779, + "num_input_tokens_seen": 17160128, + "step": 17045 + }, + { + "epoch": 8.038661008958039, + "grad_norm": 0.5267140865325928, + "learning_rate": 5.6363772729824825e-06, + "loss": 0.3272, + "num_input_tokens_seen": 17165728, + "step": 17050 + }, + { + "epoch": 8.041018387553041, + "grad_norm": 0.24467211961746216, + "learning_rate": 5.623371645343559e-06, + "loss": 0.306, + "num_input_tokens_seen": 17169952, + "step": 17055 + }, + { + "epoch": 8.043375766148044, + "grad_norm": 0.40063780546188354, + "learning_rate": 5.610379138240817e-06, + "loss": 0.3025, + "num_input_tokens_seen": 17175008, + "step": 17060 + }, + { + "epoch": 8.045733144743046, + "grad_norm": 0.2926301956176758, + "learning_rate": 5.597399760471891e-06, + "loss": 0.3141, + "num_input_tokens_seen": 17179552, + "step": 17065 + }, + { + "epoch": 8.048090523338049, + "grad_norm": 0.4842292070388794, + "learning_rate": 5.584433520825541e-06, + "loss": 0.3103, + "num_input_tokens_seen": 17184608, + "step": 17070 + }, + { + "epoch": 8.050447901933051, + "grad_norm": 0.3646676540374756, + "learning_rate": 5.5714804280816265e-06, + "loss": 0.3308, + "num_input_tokens_seen": 17190080, + "step": 17075 + }, + { + "epoch": 8.052805280528053, + "grad_norm": 0.34187421202659607, + "learning_rate": 5.558540491011105e-06, + "loss": 0.3232, + "num_input_tokens_seen": 17195136, + "step": 17080 + }, + { + "epoch": 8.055162659123056, + "grad_norm": 0.2955092489719391, + "learning_rate": 5.545613718376008e-06, + "loss": 0.3583, + "num_input_tokens_seen": 17201344, + "step": 17085 + }, + { + "epoch": 8.057520037718058, + "grad_norm": 0.3205440044403076, + "learning_rate": 5.532700118929479e-06, + "loss": 0.2708, + "num_input_tokens_seen": 17206016, + "step": 17090 + }, + { + "epoch": 8.05987741631306, + "grad_norm": 0.3461010456085205, + "learning_rate": 5.519799701415729e-06, + "loss": 0.3197, + "num_input_tokens_seen": 17211040, + "step": 17095 + }, + { + "epoch": 8.062234794908063, + "grad_norm": 0.403219997882843, + "learning_rate": 5.506912474570047e-06, + "loss": 0.3235, + "num_input_tokens_seen": 17216288, + "step": 17100 + }, + { + "epoch": 8.064592173503065, + "grad_norm": 0.5189611315727234, + "learning_rate": 5.494038447118779e-06, + "loss": 0.3514, + "num_input_tokens_seen": 17222880, + "step": 17105 + }, + { + "epoch": 8.066949552098066, + "grad_norm": 0.3311515152454376, + "learning_rate": 5.4811776277793436e-06, + "loss": 0.3608, + "num_input_tokens_seen": 17227392, + "step": 17110 + }, + { + "epoch": 8.069306930693068, + "grad_norm": 0.48935869336128235, + "learning_rate": 5.4683300252602185e-06, + "loss": 0.3047, + "num_input_tokens_seen": 17232512, + "step": 17115 + }, + { + "epoch": 8.07166430928807, + "grad_norm": 0.34990066289901733, + "learning_rate": 5.455495648260934e-06, + "loss": 0.2776, + "num_input_tokens_seen": 17237568, + "step": 17120 + }, + { + "epoch": 8.074021687883073, + "grad_norm": 0.29456937313079834, + "learning_rate": 5.442674505472037e-06, + "loss": 0.3316, + "num_input_tokens_seen": 17243808, + "step": 17125 + }, + { + "epoch": 8.076379066478076, + "grad_norm": 0.565832257270813, + "learning_rate": 5.429866605575152e-06, + "loss": 0.3217, + "num_input_tokens_seen": 17248256, + "step": 17130 + }, + { + "epoch": 8.078736445073078, + "grad_norm": 0.2748289406299591, + "learning_rate": 5.417071957242909e-06, + "loss": 0.2762, + "num_input_tokens_seen": 17252832, + "step": 17135 + }, + { + "epoch": 8.08109382366808, + "grad_norm": 0.444976806640625, + "learning_rate": 5.404290569138986e-06, + "loss": 0.33, + "num_input_tokens_seen": 17258560, + "step": 17140 + }, + { + "epoch": 8.083451202263083, + "grad_norm": 0.5306108593940735, + "learning_rate": 5.391522449918057e-06, + "loss": 0.3518, + "num_input_tokens_seen": 17262272, + "step": 17145 + }, + { + "epoch": 8.085808580858085, + "grad_norm": 0.619455099105835, + "learning_rate": 5.378767608225832e-06, + "loss": 0.3161, + "num_input_tokens_seen": 17267360, + "step": 17150 + }, + { + "epoch": 8.088165959453088, + "grad_norm": 0.387382835149765, + "learning_rate": 5.3660260526990216e-06, + "loss": 0.3387, + "num_input_tokens_seen": 17271360, + "step": 17155 + }, + { + "epoch": 8.09052333804809, + "grad_norm": 0.3484764099121094, + "learning_rate": 5.3532977919653476e-06, + "loss": 0.3286, + "num_input_tokens_seen": 17276640, + "step": 17160 + }, + { + "epoch": 8.092880716643092, + "grad_norm": 0.5247676968574524, + "learning_rate": 5.3405828346435125e-06, + "loss": 0.2984, + "num_input_tokens_seen": 17281216, + "step": 17165 + }, + { + "epoch": 8.095238095238095, + "grad_norm": 0.36675119400024414, + "learning_rate": 5.327881189343226e-06, + "loss": 0.3306, + "num_input_tokens_seen": 17286624, + "step": 17170 + }, + { + "epoch": 8.097595473833097, + "grad_norm": 0.3671422004699707, + "learning_rate": 5.315192864665186e-06, + "loss": 0.3387, + "num_input_tokens_seen": 17291744, + "step": 17175 + }, + { + "epoch": 8.0999528524281, + "grad_norm": 0.36710649728775024, + "learning_rate": 5.302517869201059e-06, + "loss": 0.3417, + "num_input_tokens_seen": 17297280, + "step": 17180 + }, + { + "epoch": 8.102310231023102, + "grad_norm": 0.5689657330513, + "learning_rate": 5.289856211533484e-06, + "loss": 0.291, + "num_input_tokens_seen": 17301760, + "step": 17185 + }, + { + "epoch": 8.104667609618105, + "grad_norm": 0.34640374779701233, + "learning_rate": 5.277207900236081e-06, + "loss": 0.3529, + "num_input_tokens_seen": 17306976, + "step": 17190 + }, + { + "epoch": 8.107024988213107, + "grad_norm": 0.3247329592704773, + "learning_rate": 5.264572943873425e-06, + "loss": 0.3169, + "num_input_tokens_seen": 17311680, + "step": 17195 + }, + { + "epoch": 8.10938236680811, + "grad_norm": 0.23989243805408478, + "learning_rate": 5.251951351001061e-06, + "loss": 0.3522, + "num_input_tokens_seen": 17316704, + "step": 17200 + }, + { + "epoch": 8.111739745403112, + "grad_norm": 0.3072831630706787, + "learning_rate": 5.2393431301654555e-06, + "loss": 0.3598, + "num_input_tokens_seen": 17321408, + "step": 17205 + }, + { + "epoch": 8.114097123998114, + "grad_norm": 0.4762112498283386, + "learning_rate": 5.226748289904049e-06, + "loss": 0.3008, + "num_input_tokens_seen": 17326464, + "step": 17210 + }, + { + "epoch": 8.116454502593117, + "grad_norm": 0.5502005219459534, + "learning_rate": 5.214166838745213e-06, + "loss": 0.3204, + "num_input_tokens_seen": 17332736, + "step": 17215 + }, + { + "epoch": 8.118811881188119, + "grad_norm": 0.6176158785820007, + "learning_rate": 5.201598785208256e-06, + "loss": 0.3765, + "num_input_tokens_seen": 17337216, + "step": 17220 + }, + { + "epoch": 8.121169259783121, + "grad_norm": 0.5152252912521362, + "learning_rate": 5.189044137803395e-06, + "loss": 0.3868, + "num_input_tokens_seen": 17341664, + "step": 17225 + }, + { + "epoch": 8.123526638378124, + "grad_norm": 0.4171876609325409, + "learning_rate": 5.1765029050317936e-06, + "loss": 0.3055, + "num_input_tokens_seen": 17346048, + "step": 17230 + }, + { + "epoch": 8.125884016973126, + "grad_norm": 0.43970686197280884, + "learning_rate": 5.163975095385523e-06, + "loss": 0.3482, + "num_input_tokens_seen": 17350624, + "step": 17235 + }, + { + "epoch": 8.128241395568129, + "grad_norm": 0.319980651140213, + "learning_rate": 5.151460717347567e-06, + "loss": 0.3026, + "num_input_tokens_seen": 17357088, + "step": 17240 + }, + { + "epoch": 8.130598774163131, + "grad_norm": 0.49196985363960266, + "learning_rate": 5.138959779391805e-06, + "loss": 0.3075, + "num_input_tokens_seen": 17361696, + "step": 17245 + }, + { + "epoch": 8.132956152758133, + "grad_norm": 0.31697699427604675, + "learning_rate": 5.126472289983023e-06, + "loss": 0.3133, + "num_input_tokens_seen": 17368320, + "step": 17250 + }, + { + "epoch": 8.135313531353136, + "grad_norm": 0.4642685651779175, + "learning_rate": 5.113998257576904e-06, + "loss": 0.3191, + "num_input_tokens_seen": 17374624, + "step": 17255 + }, + { + "epoch": 8.137670909948138, + "grad_norm": 0.45102715492248535, + "learning_rate": 5.101537690620023e-06, + "loss": 0.2761, + "num_input_tokens_seen": 17379360, + "step": 17260 + }, + { + "epoch": 8.14002828854314, + "grad_norm": 0.35363441705703735, + "learning_rate": 5.089090597549814e-06, + "loss": 0.355, + "num_input_tokens_seen": 17384128, + "step": 17265 + }, + { + "epoch": 8.142385667138143, + "grad_norm": 0.29856258630752563, + "learning_rate": 5.076656986794611e-06, + "loss": 0.3156, + "num_input_tokens_seen": 17388576, + "step": 17270 + }, + { + "epoch": 8.144743045733145, + "grad_norm": 0.4311378300189972, + "learning_rate": 5.064236866773608e-06, + "loss": 0.2861, + "num_input_tokens_seen": 17394432, + "step": 17275 + }, + { + "epoch": 8.147100424328148, + "grad_norm": 0.34359022974967957, + "learning_rate": 5.051830245896874e-06, + "loss": 0.2571, + "num_input_tokens_seen": 17401024, + "step": 17280 + }, + { + "epoch": 8.14945780292315, + "grad_norm": 0.4514734745025635, + "learning_rate": 5.0394371325653225e-06, + "loss": 0.3398, + "num_input_tokens_seen": 17406112, + "step": 17285 + }, + { + "epoch": 8.151815181518153, + "grad_norm": 0.4511820375919342, + "learning_rate": 5.027057535170723e-06, + "loss": 0.3628, + "num_input_tokens_seen": 17411616, + "step": 17290 + }, + { + "epoch": 8.154172560113155, + "grad_norm": 0.20096348226070404, + "learning_rate": 5.014691462095702e-06, + "loss": 0.3842, + "num_input_tokens_seen": 17416832, + "step": 17295 + }, + { + "epoch": 8.156529938708157, + "grad_norm": 0.46278148889541626, + "learning_rate": 5.002338921713726e-06, + "loss": 0.295, + "num_input_tokens_seen": 17422080, + "step": 17300 + }, + { + "epoch": 8.15888731730316, + "grad_norm": 0.30018144845962524, + "learning_rate": 4.989999922389102e-06, + "loss": 0.3271, + "num_input_tokens_seen": 17426912, + "step": 17305 + }, + { + "epoch": 8.16124469589816, + "grad_norm": 0.5845975875854492, + "learning_rate": 4.977674472476951e-06, + "loss": 0.3104, + "num_input_tokens_seen": 17433152, + "step": 17310 + }, + { + "epoch": 8.163602074493163, + "grad_norm": 0.387533038854599, + "learning_rate": 4.96536258032323e-06, + "loss": 0.3481, + "num_input_tokens_seen": 17438208, + "step": 17315 + }, + { + "epoch": 8.165959453088165, + "grad_norm": 0.7529004812240601, + "learning_rate": 4.953064254264722e-06, + "loss": 0.3685, + "num_input_tokens_seen": 17443328, + "step": 17320 + }, + { + "epoch": 8.168316831683168, + "grad_norm": 0.5977901816368103, + "learning_rate": 4.940779502629025e-06, + "loss": 0.361, + "num_input_tokens_seen": 17447840, + "step": 17325 + }, + { + "epoch": 8.17067421027817, + "grad_norm": 0.3405086100101471, + "learning_rate": 4.9285083337345215e-06, + "loss": 0.2984, + "num_input_tokens_seen": 17452544, + "step": 17330 + }, + { + "epoch": 8.173031588873172, + "grad_norm": 0.3672920763492584, + "learning_rate": 4.91625075589042e-06, + "loss": 0.316, + "num_input_tokens_seen": 17457824, + "step": 17335 + }, + { + "epoch": 8.175388967468175, + "grad_norm": 0.4817875325679779, + "learning_rate": 4.904006777396722e-06, + "loss": 0.2811, + "num_input_tokens_seen": 17462496, + "step": 17340 + }, + { + "epoch": 8.177746346063177, + "grad_norm": 0.3391585648059845, + "learning_rate": 4.891776406544221e-06, + "loss": 0.3267, + "num_input_tokens_seen": 17467200, + "step": 17345 + }, + { + "epoch": 8.18010372465818, + "grad_norm": 0.3331543803215027, + "learning_rate": 4.879559651614482e-06, + "loss": 0.3199, + "num_input_tokens_seen": 17471296, + "step": 17350 + }, + { + "epoch": 8.182461103253182, + "grad_norm": 0.2851455509662628, + "learning_rate": 4.867356520879871e-06, + "loss": 0.2986, + "num_input_tokens_seen": 17475904, + "step": 17355 + }, + { + "epoch": 8.184818481848184, + "grad_norm": 0.25383642315864563, + "learning_rate": 4.855167022603513e-06, + "loss": 0.3177, + "num_input_tokens_seen": 17480192, + "step": 17360 + }, + { + "epoch": 8.187175860443187, + "grad_norm": 0.48006707429885864, + "learning_rate": 4.842991165039318e-06, + "loss": 0.3273, + "num_input_tokens_seen": 17484288, + "step": 17365 + }, + { + "epoch": 8.18953323903819, + "grad_norm": 0.32892775535583496, + "learning_rate": 4.830828956431935e-06, + "loss": 0.3727, + "num_input_tokens_seen": 17488608, + "step": 17370 + }, + { + "epoch": 8.191890617633192, + "grad_norm": 0.4686095714569092, + "learning_rate": 4.8186804050167925e-06, + "loss": 0.3775, + "num_input_tokens_seen": 17493120, + "step": 17375 + }, + { + "epoch": 8.194247996228194, + "grad_norm": 0.5819273591041565, + "learning_rate": 4.806545519020067e-06, + "loss": 0.3206, + "num_input_tokens_seen": 17497376, + "step": 17380 + }, + { + "epoch": 8.196605374823196, + "grad_norm": 0.49685534834861755, + "learning_rate": 4.794424306658679e-06, + "loss": 0.3172, + "num_input_tokens_seen": 17503008, + "step": 17385 + }, + { + "epoch": 8.198962753418199, + "grad_norm": 0.4082929193973541, + "learning_rate": 4.782316776140283e-06, + "loss": 0.3035, + "num_input_tokens_seen": 17508992, + "step": 17390 + }, + { + "epoch": 8.201320132013201, + "grad_norm": 0.35519707202911377, + "learning_rate": 4.770222935663276e-06, + "loss": 0.3023, + "num_input_tokens_seen": 17514880, + "step": 17395 + }, + { + "epoch": 8.203677510608204, + "grad_norm": 0.43890392780303955, + "learning_rate": 4.7581427934168e-06, + "loss": 0.261, + "num_input_tokens_seen": 17519936, + "step": 17400 + }, + { + "epoch": 8.206034889203206, + "grad_norm": 0.34283167123794556, + "learning_rate": 4.746076357580687e-06, + "loss": 0.3024, + "num_input_tokens_seen": 17525408, + "step": 17405 + }, + { + "epoch": 8.208392267798208, + "grad_norm": 0.29993999004364014, + "learning_rate": 4.734023636325524e-06, + "loss": 0.3198, + "num_input_tokens_seen": 17531744, + "step": 17410 + }, + { + "epoch": 8.21074964639321, + "grad_norm": 0.4499262571334839, + "learning_rate": 4.721984637812579e-06, + "loss": 0.2765, + "num_input_tokens_seen": 17535936, + "step": 17415 + }, + { + "epoch": 8.213107024988213, + "grad_norm": 0.380076140165329, + "learning_rate": 4.709959370193853e-06, + "loss": 0.2963, + "num_input_tokens_seen": 17540032, + "step": 17420 + }, + { + "epoch": 8.215464403583216, + "grad_norm": 0.5051827430725098, + "learning_rate": 4.697947841612038e-06, + "loss": 0.3514, + "num_input_tokens_seen": 17544672, + "step": 17425 + }, + { + "epoch": 8.217821782178218, + "grad_norm": 0.4257768392562866, + "learning_rate": 4.685950060200531e-06, + "loss": 0.3504, + "num_input_tokens_seen": 17549920, + "step": 17430 + }, + { + "epoch": 8.22017916077322, + "grad_norm": 0.2882785201072693, + "learning_rate": 4.673966034083405e-06, + "loss": 0.3481, + "num_input_tokens_seen": 17554560, + "step": 17435 + }, + { + "epoch": 8.222536539368223, + "grad_norm": 0.2663715183734894, + "learning_rate": 4.6619957713754325e-06, + "loss": 0.2857, + "num_input_tokens_seen": 17559072, + "step": 17440 + }, + { + "epoch": 8.224893917963225, + "grad_norm": 0.541191816329956, + "learning_rate": 4.650039280182061e-06, + "loss": 0.3387, + "num_input_tokens_seen": 17563456, + "step": 17445 + }, + { + "epoch": 8.227251296558228, + "grad_norm": 0.4110221862792969, + "learning_rate": 4.638096568599423e-06, + "loss": 0.375, + "num_input_tokens_seen": 17567456, + "step": 17450 + }, + { + "epoch": 8.22960867515323, + "grad_norm": 0.31259697675704956, + "learning_rate": 4.626167644714297e-06, + "loss": 0.344, + "num_input_tokens_seen": 17572864, + "step": 17455 + }, + { + "epoch": 8.231966053748232, + "grad_norm": 0.2557911276817322, + "learning_rate": 4.614252516604145e-06, + "loss": 0.2793, + "num_input_tokens_seen": 17578528, + "step": 17460 + }, + { + "epoch": 8.234323432343235, + "grad_norm": 0.5552685260772705, + "learning_rate": 4.6023511923370845e-06, + "loss": 0.3813, + "num_input_tokens_seen": 17583744, + "step": 17465 + }, + { + "epoch": 8.236680810938237, + "grad_norm": 0.32720503211021423, + "learning_rate": 4.590463679971888e-06, + "loss": 0.2821, + "num_input_tokens_seen": 17588416, + "step": 17470 + }, + { + "epoch": 8.23903818953324, + "grad_norm": 0.3813861906528473, + "learning_rate": 4.578589987557957e-06, + "loss": 0.3681, + "num_input_tokens_seen": 17594336, + "step": 17475 + }, + { + "epoch": 8.241395568128242, + "grad_norm": 0.29881447553634644, + "learning_rate": 4.566730123135363e-06, + "loss": 0.3269, + "num_input_tokens_seen": 17598880, + "step": 17480 + }, + { + "epoch": 8.243752946723244, + "grad_norm": 0.26414138078689575, + "learning_rate": 4.554884094734793e-06, + "loss": 0.2972, + "num_input_tokens_seen": 17603488, + "step": 17485 + }, + { + "epoch": 8.246110325318247, + "grad_norm": 0.2968473434448242, + "learning_rate": 4.54305191037758e-06, + "loss": 0.3298, + "num_input_tokens_seen": 17609536, + "step": 17490 + }, + { + "epoch": 8.24846770391325, + "grad_norm": 0.5618106722831726, + "learning_rate": 4.531233578075667e-06, + "loss": 0.3819, + "num_input_tokens_seen": 17613952, + "step": 17495 + }, + { + "epoch": 8.250825082508252, + "grad_norm": 0.5707865953445435, + "learning_rate": 4.519429105831627e-06, + "loss": 0.3357, + "num_input_tokens_seen": 17618144, + "step": 17500 + }, + { + "epoch": 8.253182461103254, + "grad_norm": 0.7197121977806091, + "learning_rate": 4.507638501638647e-06, + "loss": 0.3494, + "num_input_tokens_seen": 17622880, + "step": 17505 + }, + { + "epoch": 8.255539839698255, + "grad_norm": 0.30899104475975037, + "learning_rate": 4.495861773480533e-06, + "loss": 0.3387, + "num_input_tokens_seen": 17627200, + "step": 17510 + }, + { + "epoch": 8.257897218293257, + "grad_norm": 0.355372816324234, + "learning_rate": 4.484098929331676e-06, + "loss": 0.4116, + "num_input_tokens_seen": 17636096, + "step": 17515 + }, + { + "epoch": 8.26025459688826, + "grad_norm": 0.31043779850006104, + "learning_rate": 4.472349977157069e-06, + "loss": 0.3265, + "num_input_tokens_seen": 17641792, + "step": 17520 + }, + { + "epoch": 8.262611975483262, + "grad_norm": 0.30822914838790894, + "learning_rate": 4.4606149249123126e-06, + "loss": 0.3069, + "num_input_tokens_seen": 17645920, + "step": 17525 + }, + { + "epoch": 8.264969354078264, + "grad_norm": 0.5292227268218994, + "learning_rate": 4.448893780543581e-06, + "loss": 0.2686, + "num_input_tokens_seen": 17651968, + "step": 17530 + }, + { + "epoch": 8.267326732673267, + "grad_norm": 0.5241310596466064, + "learning_rate": 4.4371865519876485e-06, + "loss": 0.2727, + "num_input_tokens_seen": 17657408, + "step": 17535 + }, + { + "epoch": 8.269684111268269, + "grad_norm": 0.46396520733833313, + "learning_rate": 4.425493247171841e-06, + "loss": 0.3154, + "num_input_tokens_seen": 17662752, + "step": 17540 + }, + { + "epoch": 8.272041489863271, + "grad_norm": 0.34460872411727905, + "learning_rate": 4.413813874014078e-06, + "loss": 0.3675, + "num_input_tokens_seen": 17667648, + "step": 17545 + }, + { + "epoch": 8.274398868458274, + "grad_norm": 0.2579828202724457, + "learning_rate": 4.402148440422832e-06, + "loss": 0.3716, + "num_input_tokens_seen": 17672096, + "step": 17550 + }, + { + "epoch": 8.276756247053276, + "grad_norm": 0.30365535616874695, + "learning_rate": 4.390496954297155e-06, + "loss": 0.3117, + "num_input_tokens_seen": 17678400, + "step": 17555 + }, + { + "epoch": 8.279113625648279, + "grad_norm": 0.7958579063415527, + "learning_rate": 4.378859423526621e-06, + "loss": 0.3628, + "num_input_tokens_seen": 17682752, + "step": 17560 + }, + { + "epoch": 8.281471004243281, + "grad_norm": 0.6551151275634766, + "learning_rate": 4.3672358559913915e-06, + "loss": 0.3372, + "num_input_tokens_seen": 17686720, + "step": 17565 + }, + { + "epoch": 8.283828382838283, + "grad_norm": 0.3937184810638428, + "learning_rate": 4.355626259562149e-06, + "loss": 0.3753, + "num_input_tokens_seen": 17691904, + "step": 17570 + }, + { + "epoch": 8.286185761433286, + "grad_norm": 0.3193525969982147, + "learning_rate": 4.344030642100133e-06, + "loss": 0.3, + "num_input_tokens_seen": 17697088, + "step": 17575 + }, + { + "epoch": 8.288543140028288, + "grad_norm": 0.4396529197692871, + "learning_rate": 4.332449011457093e-06, + "loss": 0.2904, + "num_input_tokens_seen": 17704480, + "step": 17580 + }, + { + "epoch": 8.29090051862329, + "grad_norm": 0.48475727438926697, + "learning_rate": 4.320881375475331e-06, + "loss": 0.3452, + "num_input_tokens_seen": 17709984, + "step": 17585 + }, + { + "epoch": 8.293257897218293, + "grad_norm": 0.487934947013855, + "learning_rate": 4.309327741987662e-06, + "loss": 0.3299, + "num_input_tokens_seen": 17715328, + "step": 17590 + }, + { + "epoch": 8.295615275813295, + "grad_norm": 0.26538750529289246, + "learning_rate": 4.297788118817431e-06, + "loss": 0.2874, + "num_input_tokens_seen": 17719776, + "step": 17595 + }, + { + "epoch": 8.297972654408298, + "grad_norm": 0.5085274577140808, + "learning_rate": 4.286262513778471e-06, + "loss": 0.3221, + "num_input_tokens_seen": 17725376, + "step": 17600 + }, + { + "epoch": 8.3003300330033, + "grad_norm": 0.6316096782684326, + "learning_rate": 4.274750934675145e-06, + "loss": 0.3056, + "num_input_tokens_seen": 17729600, + "step": 17605 + }, + { + "epoch": 8.302687411598303, + "grad_norm": 0.37157875299453735, + "learning_rate": 4.263253389302316e-06, + "loss": 0.3519, + "num_input_tokens_seen": 17733984, + "step": 17610 + }, + { + "epoch": 8.305044790193305, + "grad_norm": 0.5567635893821716, + "learning_rate": 4.2517698854453416e-06, + "loss": 0.3795, + "num_input_tokens_seen": 17738912, + "step": 17615 + }, + { + "epoch": 8.307402168788308, + "grad_norm": 0.23795512318611145, + "learning_rate": 4.240300430880062e-06, + "loss": 0.3106, + "num_input_tokens_seen": 17743136, + "step": 17620 + }, + { + "epoch": 8.30975954738331, + "grad_norm": 0.3639107942581177, + "learning_rate": 4.228845033372822e-06, + "loss": 0.3182, + "num_input_tokens_seen": 17748608, + "step": 17625 + }, + { + "epoch": 8.312116925978312, + "grad_norm": 0.36885231733322144, + "learning_rate": 4.217403700680428e-06, + "loss": 0.3816, + "num_input_tokens_seen": 17752576, + "step": 17630 + }, + { + "epoch": 8.314474304573315, + "grad_norm": 0.5088356137275696, + "learning_rate": 4.20597644055018e-06, + "loss": 0.329, + "num_input_tokens_seen": 17756864, + "step": 17635 + }, + { + "epoch": 8.316831683168317, + "grad_norm": 0.2803612947463989, + "learning_rate": 4.194563260719847e-06, + "loss": 0.3304, + "num_input_tokens_seen": 17761952, + "step": 17640 + }, + { + "epoch": 8.31918906176332, + "grad_norm": 0.333982914686203, + "learning_rate": 4.183164168917645e-06, + "loss": 0.3166, + "num_input_tokens_seen": 17765920, + "step": 17645 + }, + { + "epoch": 8.321546440358322, + "grad_norm": 0.5022452473640442, + "learning_rate": 4.171779172862273e-06, + "loss": 0.3292, + "num_input_tokens_seen": 17771744, + "step": 17650 + }, + { + "epoch": 8.323903818953324, + "grad_norm": 0.6004623770713806, + "learning_rate": 4.160408280262875e-06, + "loss": 0.2977, + "num_input_tokens_seen": 17776032, + "step": 17655 + }, + { + "epoch": 8.326261197548327, + "grad_norm": 0.2984929084777832, + "learning_rate": 4.149051498819056e-06, + "loss": 0.256, + "num_input_tokens_seen": 17781696, + "step": 17660 + }, + { + "epoch": 8.32861857614333, + "grad_norm": 0.2378278225660324, + "learning_rate": 4.137708836220841e-06, + "loss": 0.3103, + "num_input_tokens_seen": 17786688, + "step": 17665 + }, + { + "epoch": 8.330975954738332, + "grad_norm": 0.32961347699165344, + "learning_rate": 4.126380300148719e-06, + "loss": 0.3381, + "num_input_tokens_seen": 17790912, + "step": 17670 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.5583527088165283, + "learning_rate": 4.115065898273604e-06, + "loss": 0.3602, + "num_input_tokens_seen": 17796832, + "step": 17675 + }, + { + "epoch": 8.335690711928336, + "grad_norm": 0.4373195171356201, + "learning_rate": 4.1037656382568454e-06, + "loss": 0.307, + "num_input_tokens_seen": 17801280, + "step": 17680 + }, + { + "epoch": 8.338048090523339, + "grad_norm": 0.36339226365089417, + "learning_rate": 4.092479527750201e-06, + "loss": 0.3496, + "num_input_tokens_seen": 17805824, + "step": 17685 + }, + { + "epoch": 8.340405469118341, + "grad_norm": 0.272278368473053, + "learning_rate": 4.081207574395862e-06, + "loss": 0.2917, + "num_input_tokens_seen": 17811840, + "step": 17690 + }, + { + "epoch": 8.342762847713344, + "grad_norm": 0.2774339020252228, + "learning_rate": 4.0699497858264325e-06, + "loss": 0.3322, + "num_input_tokens_seen": 17816768, + "step": 17695 + }, + { + "epoch": 8.345120226308346, + "grad_norm": 0.48023244738578796, + "learning_rate": 4.058706169664927e-06, + "loss": 0.2661, + "num_input_tokens_seen": 17821824, + "step": 17700 + }, + { + "epoch": 8.347477604903348, + "grad_norm": 0.3014877736568451, + "learning_rate": 4.047476733524744e-06, + "loss": 0.2939, + "num_input_tokens_seen": 17826400, + "step": 17705 + }, + { + "epoch": 8.34983498349835, + "grad_norm": 0.5274595618247986, + "learning_rate": 4.036261485009702e-06, + "loss": 0.3198, + "num_input_tokens_seen": 17831264, + "step": 17710 + }, + { + "epoch": 8.352192362093351, + "grad_norm": 0.20052196085453033, + "learning_rate": 4.025060431714006e-06, + "loss": 0.3669, + "num_input_tokens_seen": 17835840, + "step": 17715 + }, + { + "epoch": 8.354549740688354, + "grad_norm": 0.31207555532455444, + "learning_rate": 4.0138735812222526e-06, + "loss": 0.3348, + "num_input_tokens_seen": 17839776, + "step": 17720 + }, + { + "epoch": 8.356907119283356, + "grad_norm": 0.3317849934101105, + "learning_rate": 4.002700941109408e-06, + "loss": 0.3331, + "num_input_tokens_seen": 17844416, + "step": 17725 + }, + { + "epoch": 8.359264497878359, + "grad_norm": 0.3193918764591217, + "learning_rate": 3.9915425189408275e-06, + "loss": 0.2822, + "num_input_tokens_seen": 17849024, + "step": 17730 + }, + { + "epoch": 8.361621876473361, + "grad_norm": 0.6048680543899536, + "learning_rate": 3.980398322272244e-06, + "loss": 0.3616, + "num_input_tokens_seen": 17853440, + "step": 17735 + }, + { + "epoch": 8.363979255068363, + "grad_norm": 0.3293170928955078, + "learning_rate": 3.969268358649739e-06, + "loss": 0.3261, + "num_input_tokens_seen": 17858432, + "step": 17740 + }, + { + "epoch": 8.366336633663366, + "grad_norm": 0.3066788613796234, + "learning_rate": 3.958152635609777e-06, + "loss": 0.3228, + "num_input_tokens_seen": 17864352, + "step": 17745 + }, + { + "epoch": 8.368694012258368, + "grad_norm": 0.28869396448135376, + "learning_rate": 3.9470511606791635e-06, + "loss": 0.346, + "num_input_tokens_seen": 17869024, + "step": 17750 + }, + { + "epoch": 8.37105139085337, + "grad_norm": 0.46381205320358276, + "learning_rate": 3.935963941375065e-06, + "loss": 0.3081, + "num_input_tokens_seen": 17874944, + "step": 17755 + }, + { + "epoch": 8.373408769448373, + "grad_norm": 0.31024080514907837, + "learning_rate": 3.9248909852049906e-06, + "loss": 0.3646, + "num_input_tokens_seen": 17879264, + "step": 17760 + }, + { + "epoch": 8.375766148043375, + "grad_norm": 0.36558425426483154, + "learning_rate": 3.913832299666806e-06, + "loss": 0.3434, + "num_input_tokens_seen": 17884960, + "step": 17765 + }, + { + "epoch": 8.378123526638378, + "grad_norm": 0.2895266115665436, + "learning_rate": 3.902787892248685e-06, + "loss": 0.3012, + "num_input_tokens_seen": 17889856, + "step": 17770 + }, + { + "epoch": 8.38048090523338, + "grad_norm": 0.2678624093532562, + "learning_rate": 3.891757770429161e-06, + "loss": 0.3434, + "num_input_tokens_seen": 17894240, + "step": 17775 + }, + { + "epoch": 8.382838283828383, + "grad_norm": 0.32715484499931335, + "learning_rate": 3.880741941677077e-06, + "loss": 0.3453, + "num_input_tokens_seen": 17899552, + "step": 17780 + }, + { + "epoch": 8.385195662423385, + "grad_norm": 0.6486188769340515, + "learning_rate": 3.869740413451614e-06, + "loss": 0.3415, + "num_input_tokens_seen": 17904512, + "step": 17785 + }, + { + "epoch": 8.387553041018387, + "grad_norm": 0.37031644582748413, + "learning_rate": 3.8587531932022445e-06, + "loss": 0.3383, + "num_input_tokens_seen": 17910016, + "step": 17790 + }, + { + "epoch": 8.38991041961339, + "grad_norm": 0.5641663670539856, + "learning_rate": 3.847780288368777e-06, + "loss": 0.3182, + "num_input_tokens_seen": 17914560, + "step": 17795 + }, + { + "epoch": 8.392267798208392, + "grad_norm": 0.5776022672653198, + "learning_rate": 3.836821706381313e-06, + "loss": 0.3659, + "num_input_tokens_seen": 17919456, + "step": 17800 + }, + { + "epoch": 8.394625176803395, + "grad_norm": 0.319052517414093, + "learning_rate": 3.825877454660268e-06, + "loss": 0.2959, + "num_input_tokens_seen": 17924352, + "step": 17805 + }, + { + "epoch": 8.396982555398397, + "grad_norm": 0.37939462065696716, + "learning_rate": 3.814947540616334e-06, + "loss": 0.3548, + "num_input_tokens_seen": 17929024, + "step": 17810 + }, + { + "epoch": 8.3993399339934, + "grad_norm": 0.2472912073135376, + "learning_rate": 3.8040319716505124e-06, + "loss": 0.2782, + "num_input_tokens_seen": 17933568, + "step": 17815 + }, + { + "epoch": 8.401697312588402, + "grad_norm": 0.27817007899284363, + "learning_rate": 3.7931307551540812e-06, + "loss": 0.3529, + "num_input_tokens_seen": 17938336, + "step": 17820 + }, + { + "epoch": 8.404054691183404, + "grad_norm": 0.5006629824638367, + "learning_rate": 3.782243898508614e-06, + "loss": 0.3223, + "num_input_tokens_seen": 17943808, + "step": 17825 + }, + { + "epoch": 8.406412069778407, + "grad_norm": 0.3821069002151489, + "learning_rate": 3.7713714090859386e-06, + "loss": 0.3773, + "num_input_tokens_seen": 17949536, + "step": 17830 + }, + { + "epoch": 8.408769448373409, + "grad_norm": 0.37042179703712463, + "learning_rate": 3.7605132942481662e-06, + "loss": 0.2766, + "num_input_tokens_seen": 17954560, + "step": 17835 + }, + { + "epoch": 8.411126826968411, + "grad_norm": 0.3937414586544037, + "learning_rate": 3.7496695613476823e-06, + "loss": 0.358, + "num_input_tokens_seen": 17960288, + "step": 17840 + }, + { + "epoch": 8.413484205563414, + "grad_norm": 0.46268153190612793, + "learning_rate": 3.738840217727127e-06, + "loss": 0.3178, + "num_input_tokens_seen": 17965056, + "step": 17845 + }, + { + "epoch": 8.415841584158416, + "grad_norm": 0.3559599220752716, + "learning_rate": 3.7280252707193903e-06, + "loss": 0.3147, + "num_input_tokens_seen": 17969728, + "step": 17850 + }, + { + "epoch": 8.418198962753419, + "grad_norm": 0.5835328102111816, + "learning_rate": 3.717224727647614e-06, + "loss": 0.3183, + "num_input_tokens_seen": 17975072, + "step": 17855 + }, + { + "epoch": 8.420556341348421, + "grad_norm": 0.31248167157173157, + "learning_rate": 3.7064385958252012e-06, + "loss": 0.3563, + "num_input_tokens_seen": 17980480, + "step": 17860 + }, + { + "epoch": 8.422913719943423, + "grad_norm": 0.35029903054237366, + "learning_rate": 3.6956668825557815e-06, + "loss": 0.3502, + "num_input_tokens_seen": 17985152, + "step": 17865 + }, + { + "epoch": 8.425271098538426, + "grad_norm": 0.3344484269618988, + "learning_rate": 3.6849095951332373e-06, + "loss": 0.3691, + "num_input_tokens_seen": 17991264, + "step": 17870 + }, + { + "epoch": 8.427628477133428, + "grad_norm": 0.4853408634662628, + "learning_rate": 3.6741667408416614e-06, + "loss": 0.3446, + "num_input_tokens_seen": 17995712, + "step": 17875 + }, + { + "epoch": 8.42998585572843, + "grad_norm": 0.5938094258308411, + "learning_rate": 3.6634383269553874e-06, + "loss": 0.3448, + "num_input_tokens_seen": 18001088, + "step": 17880 + }, + { + "epoch": 8.432343234323433, + "grad_norm": 0.8084452748298645, + "learning_rate": 3.652724360738971e-06, + "loss": 0.3582, + "num_input_tokens_seen": 18006368, + "step": 17885 + }, + { + "epoch": 8.434700612918435, + "grad_norm": 0.33699288964271545, + "learning_rate": 3.642024849447187e-06, + "loss": 0.3415, + "num_input_tokens_seen": 18012128, + "step": 17890 + }, + { + "epoch": 8.437057991513438, + "grad_norm": 0.3762529492378235, + "learning_rate": 3.6313398003250064e-06, + "loss": 0.356, + "num_input_tokens_seen": 18016384, + "step": 17895 + }, + { + "epoch": 8.43941537010844, + "grad_norm": 0.395506352186203, + "learning_rate": 3.6206692206076255e-06, + "loss": 0.3147, + "num_input_tokens_seen": 18021696, + "step": 17900 + }, + { + "epoch": 8.441772748703443, + "grad_norm": 0.33500561118125916, + "learning_rate": 3.6100131175204333e-06, + "loss": 0.2863, + "num_input_tokens_seen": 18026592, + "step": 17905 + }, + { + "epoch": 8.444130127298443, + "grad_norm": 0.312317430973053, + "learning_rate": 3.599371498279025e-06, + "loss": 0.2958, + "num_input_tokens_seen": 18032160, + "step": 17910 + }, + { + "epoch": 8.446487505893446, + "grad_norm": 0.29054850339889526, + "learning_rate": 3.58874437008917e-06, + "loss": 0.33, + "num_input_tokens_seen": 18036128, + "step": 17915 + }, + { + "epoch": 8.448844884488448, + "grad_norm": 0.38829362392425537, + "learning_rate": 3.578131740146845e-06, + "loss": 0.3314, + "num_input_tokens_seen": 18041152, + "step": 17920 + }, + { + "epoch": 8.45120226308345, + "grad_norm": 0.4446813464164734, + "learning_rate": 3.5675336156382023e-06, + "loss": 0.2743, + "num_input_tokens_seen": 18046144, + "step": 17925 + }, + { + "epoch": 8.453559641678453, + "grad_norm": 0.5054104328155518, + "learning_rate": 3.5569500037395733e-06, + "loss": 0.3214, + "num_input_tokens_seen": 18051200, + "step": 17930 + }, + { + "epoch": 8.455917020273455, + "grad_norm": 0.29914721846580505, + "learning_rate": 3.5463809116174556e-06, + "loss": 0.2839, + "num_input_tokens_seen": 18057344, + "step": 17935 + }, + { + "epoch": 8.458274398868458, + "grad_norm": 0.36763861775398254, + "learning_rate": 3.5358263464285226e-06, + "loss": 0.3656, + "num_input_tokens_seen": 18061216, + "step": 17940 + }, + { + "epoch": 8.46063177746346, + "grad_norm": 0.28668591380119324, + "learning_rate": 3.5252863153196062e-06, + "loss": 0.2723, + "num_input_tokens_seen": 18066432, + "step": 17945 + }, + { + "epoch": 8.462989156058462, + "grad_norm": 0.4951246678829193, + "learning_rate": 3.5147608254277126e-06, + "loss": 0.2949, + "num_input_tokens_seen": 18072512, + "step": 17950 + }, + { + "epoch": 8.465346534653465, + "grad_norm": 0.3184884488582611, + "learning_rate": 3.5042498838799678e-06, + "loss": 0.369, + "num_input_tokens_seen": 18077120, + "step": 17955 + }, + { + "epoch": 8.467703913248467, + "grad_norm": 0.29577913880348206, + "learning_rate": 3.4937534977936865e-06, + "loss": 0.3328, + "num_input_tokens_seen": 18082720, + "step": 17960 + }, + { + "epoch": 8.47006129184347, + "grad_norm": 0.5661014914512634, + "learning_rate": 3.483271674276292e-06, + "loss": 0.3064, + "num_input_tokens_seen": 18086848, + "step": 17965 + }, + { + "epoch": 8.472418670438472, + "grad_norm": 0.3360588252544403, + "learning_rate": 3.4728044204253685e-06, + "loss": 0.3553, + "num_input_tokens_seen": 18094144, + "step": 17970 + }, + { + "epoch": 8.474776049033474, + "grad_norm": 0.7587132453918457, + "learning_rate": 3.4623517433286374e-06, + "loss": 0.373, + "num_input_tokens_seen": 18099104, + "step": 17975 + }, + { + "epoch": 8.477133427628477, + "grad_norm": 0.5474273562431335, + "learning_rate": 3.4519136500639287e-06, + "loss": 0.3591, + "num_input_tokens_seen": 18103200, + "step": 17980 + }, + { + "epoch": 8.47949080622348, + "grad_norm": 0.449092298746109, + "learning_rate": 3.4414901476992167e-06, + "loss": 0.3177, + "num_input_tokens_seen": 18108384, + "step": 17985 + }, + { + "epoch": 8.481848184818482, + "grad_norm": 0.5629483461380005, + "learning_rate": 3.431081243292589e-06, + "loss": 0.3528, + "num_input_tokens_seen": 18113184, + "step": 17990 + }, + { + "epoch": 8.484205563413484, + "grad_norm": 0.34909963607788086, + "learning_rate": 3.420686943892254e-06, + "loss": 0.3478, + "num_input_tokens_seen": 18116992, + "step": 17995 + }, + { + "epoch": 8.486562942008486, + "grad_norm": 0.41712939739227295, + "learning_rate": 3.41030725653651e-06, + "loss": 0.3638, + "num_input_tokens_seen": 18122176, + "step": 18000 + }, + { + "epoch": 8.488920320603489, + "grad_norm": 0.5458019971847534, + "learning_rate": 3.399942188253788e-06, + "loss": 0.3663, + "num_input_tokens_seen": 18127264, + "step": 18005 + }, + { + "epoch": 8.491277699198491, + "grad_norm": 0.5391527414321899, + "learning_rate": 3.389591746062601e-06, + "loss": 0.3316, + "num_input_tokens_seen": 18132288, + "step": 18010 + }, + { + "epoch": 8.493635077793494, + "grad_norm": 0.34740474820137024, + "learning_rate": 3.379255936971576e-06, + "loss": 0.3015, + "num_input_tokens_seen": 18137440, + "step": 18015 + }, + { + "epoch": 8.495992456388496, + "grad_norm": 0.3015376925468445, + "learning_rate": 3.368934767979409e-06, + "loss": 0.3197, + "num_input_tokens_seen": 18142400, + "step": 18020 + }, + { + "epoch": 8.498349834983498, + "grad_norm": 0.476527601480484, + "learning_rate": 3.358628246074899e-06, + "loss": 0.3336, + "num_input_tokens_seen": 18146720, + "step": 18025 + }, + { + "epoch": 8.500707213578501, + "grad_norm": 0.41937607526779175, + "learning_rate": 3.3483363782369213e-06, + "loss": 0.2941, + "num_input_tokens_seen": 18151040, + "step": 18030 + }, + { + "epoch": 8.503064592173503, + "grad_norm": 0.3200032114982605, + "learning_rate": 3.3380591714344378e-06, + "loss": 0.3548, + "num_input_tokens_seen": 18155456, + "step": 18035 + }, + { + "epoch": 8.504007543611504, + "eval_loss": 0.3313177824020386, + "eval_runtime": 25.687, + "eval_samples_per_second": 36.711, + "eval_steps_per_second": 9.188, + "num_input_tokens_seen": 18157184, + "step": 18037 + }, + { + "epoch": 8.505421970768506, + "grad_norm": 0.3448517620563507, + "learning_rate": 3.3277966326264594e-06, + "loss": 0.263, + "num_input_tokens_seen": 18160640, + "step": 18040 + }, + { + "epoch": 8.507779349363508, + "grad_norm": 0.5609997510910034, + "learning_rate": 3.317548768762088e-06, + "loss": 0.3702, + "num_input_tokens_seen": 18166784, + "step": 18045 + }, + { + "epoch": 8.51013672795851, + "grad_norm": 0.3219829201698303, + "learning_rate": 3.307315586780482e-06, + "loss": 0.3631, + "num_input_tokens_seen": 18171648, + "step": 18050 + }, + { + "epoch": 8.512494106553513, + "grad_norm": 0.3629448711872101, + "learning_rate": 3.2970970936108625e-06, + "loss": 0.3068, + "num_input_tokens_seen": 18176384, + "step": 18055 + }, + { + "epoch": 8.514851485148515, + "grad_norm": 0.4977435767650604, + "learning_rate": 3.2868932961724856e-06, + "loss": 0.3202, + "num_input_tokens_seen": 18182272, + "step": 18060 + }, + { + "epoch": 8.517208863743518, + "grad_norm": 0.5378589034080505, + "learning_rate": 3.276704201374675e-06, + "loss": 0.3398, + "num_input_tokens_seen": 18186848, + "step": 18065 + }, + { + "epoch": 8.51956624233852, + "grad_norm": 0.2789531946182251, + "learning_rate": 3.266529816116795e-06, + "loss": 0.3141, + "num_input_tokens_seen": 18191104, + "step": 18070 + }, + { + "epoch": 8.521923620933523, + "grad_norm": 0.3454248309135437, + "learning_rate": 3.2563701472882513e-06, + "loss": 0.3162, + "num_input_tokens_seen": 18195936, + "step": 18075 + }, + { + "epoch": 8.524280999528525, + "grad_norm": 0.29752177000045776, + "learning_rate": 3.2462252017684797e-06, + "loss": 0.3474, + "num_input_tokens_seen": 18200960, + "step": 18080 + }, + { + "epoch": 8.526638378123527, + "grad_norm": 0.39959657192230225, + "learning_rate": 3.2360949864269407e-06, + "loss": 0.3022, + "num_input_tokens_seen": 18206880, + "step": 18085 + }, + { + "epoch": 8.52899575671853, + "grad_norm": 0.29399338364601135, + "learning_rate": 3.225979508123131e-06, + "loss": 0.3239, + "num_input_tokens_seen": 18211072, + "step": 18090 + }, + { + "epoch": 8.531353135313532, + "grad_norm": 0.2667314112186432, + "learning_rate": 3.215878773706568e-06, + "loss": 0.3259, + "num_input_tokens_seen": 18215392, + "step": 18095 + }, + { + "epoch": 8.533710513908535, + "grad_norm": 0.3537059426307678, + "learning_rate": 3.205792790016787e-06, + "loss": 0.3302, + "num_input_tokens_seen": 18221248, + "step": 18100 + }, + { + "epoch": 8.536067892503535, + "grad_norm": 0.36997097730636597, + "learning_rate": 3.195721563883322e-06, + "loss": 0.3266, + "num_input_tokens_seen": 18226816, + "step": 18105 + }, + { + "epoch": 8.53842527109854, + "grad_norm": 0.513827919960022, + "learning_rate": 3.185665102125726e-06, + "loss": 0.3306, + "num_input_tokens_seen": 18231488, + "step": 18110 + }, + { + "epoch": 8.54078264969354, + "grad_norm": 0.29873329401016235, + "learning_rate": 3.175623411553552e-06, + "loss": 0.2997, + "num_input_tokens_seen": 18236032, + "step": 18115 + }, + { + "epoch": 8.543140028288542, + "grad_norm": 0.6028712391853333, + "learning_rate": 3.1655964989663558e-06, + "loss": 0.3177, + "num_input_tokens_seen": 18240640, + "step": 18120 + }, + { + "epoch": 8.545497406883545, + "grad_norm": 0.3603895604610443, + "learning_rate": 3.155584371153672e-06, + "loss": 0.2903, + "num_input_tokens_seen": 18246848, + "step": 18125 + }, + { + "epoch": 8.547854785478547, + "grad_norm": 0.3362111449241638, + "learning_rate": 3.1455870348950366e-06, + "loss": 0.375, + "num_input_tokens_seen": 18252768, + "step": 18130 + }, + { + "epoch": 8.55021216407355, + "grad_norm": 0.5020606517791748, + "learning_rate": 3.1356044969599677e-06, + "loss": 0.3399, + "num_input_tokens_seen": 18257312, + "step": 18135 + }, + { + "epoch": 8.552569542668552, + "grad_norm": 0.4138076901435852, + "learning_rate": 3.1256367641079697e-06, + "loss": 0.3148, + "num_input_tokens_seen": 18263456, + "step": 18140 + }, + { + "epoch": 8.554926921263554, + "grad_norm": 0.37944087386131287, + "learning_rate": 3.115683843088499e-06, + "loss": 0.368, + "num_input_tokens_seen": 18268320, + "step": 18145 + }, + { + "epoch": 8.557284299858557, + "grad_norm": 0.4110528826713562, + "learning_rate": 3.105745740641003e-06, + "loss": 0.3682, + "num_input_tokens_seen": 18272256, + "step": 18150 + }, + { + "epoch": 8.55964167845356, + "grad_norm": 0.5018133521080017, + "learning_rate": 3.0958224634948933e-06, + "loss": 0.3428, + "num_input_tokens_seen": 18276512, + "step": 18155 + }, + { + "epoch": 8.561999057048562, + "grad_norm": 0.5439450740814209, + "learning_rate": 3.08591401836954e-06, + "loss": 0.2855, + "num_input_tokens_seen": 18282688, + "step": 18160 + }, + { + "epoch": 8.564356435643564, + "grad_norm": 0.4956980347633362, + "learning_rate": 3.076020411974262e-06, + "loss": 0.3221, + "num_input_tokens_seen": 18287584, + "step": 18165 + }, + { + "epoch": 8.566713814238566, + "grad_norm": 0.3639909625053406, + "learning_rate": 3.066141651008339e-06, + "loss": 0.3702, + "num_input_tokens_seen": 18292768, + "step": 18170 + }, + { + "epoch": 8.569071192833569, + "grad_norm": 0.39492085576057434, + "learning_rate": 3.0562777421609983e-06, + "loss": 0.3182, + "num_input_tokens_seen": 18297920, + "step": 18175 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.3102317452430725, + "learning_rate": 3.0464286921114167e-06, + "loss": 0.3457, + "num_input_tokens_seen": 18302304, + "step": 18180 + }, + { + "epoch": 8.573785950023574, + "grad_norm": 0.3565179705619812, + "learning_rate": 3.036594507528692e-06, + "loss": 0.3073, + "num_input_tokens_seen": 18307264, + "step": 18185 + }, + { + "epoch": 8.576143328618576, + "grad_norm": 0.35033199191093445, + "learning_rate": 3.0267751950718626e-06, + "loss": 0.3579, + "num_input_tokens_seen": 18312640, + "step": 18190 + }, + { + "epoch": 8.578500707213578, + "grad_norm": 0.28389957547187805, + "learning_rate": 3.016970761389906e-06, + "loss": 0.3311, + "num_input_tokens_seen": 18316928, + "step": 18195 + }, + { + "epoch": 8.58085808580858, + "grad_norm": 0.33735960721969604, + "learning_rate": 3.007181213121721e-06, + "loss": 0.3191, + "num_input_tokens_seen": 18323072, + "step": 18200 + }, + { + "epoch": 8.583215464403583, + "grad_norm": 0.317720890045166, + "learning_rate": 2.997406556896118e-06, + "loss": 0.3519, + "num_input_tokens_seen": 18327296, + "step": 18205 + }, + { + "epoch": 8.585572842998586, + "grad_norm": 0.34912803769111633, + "learning_rate": 2.9876467993318307e-06, + "loss": 0.3548, + "num_input_tokens_seen": 18331168, + "step": 18210 + }, + { + "epoch": 8.587930221593588, + "grad_norm": 0.5791960954666138, + "learning_rate": 2.977901947037509e-06, + "loss": 0.3525, + "num_input_tokens_seen": 18337248, + "step": 18215 + }, + { + "epoch": 8.59028760018859, + "grad_norm": 0.30809125304222107, + "learning_rate": 2.9681720066117053e-06, + "loss": 0.3249, + "num_input_tokens_seen": 18342240, + "step": 18220 + }, + { + "epoch": 8.592644978783593, + "grad_norm": 0.30662471055984497, + "learning_rate": 2.9584569846428658e-06, + "loss": 0.3568, + "num_input_tokens_seen": 18347040, + "step": 18225 + }, + { + "epoch": 8.595002357378595, + "grad_norm": 0.37053635716438293, + "learning_rate": 2.9487568877093474e-06, + "loss": 0.3206, + "num_input_tokens_seen": 18352320, + "step": 18230 + }, + { + "epoch": 8.597359735973598, + "grad_norm": 0.3869383931159973, + "learning_rate": 2.9390717223793997e-06, + "loss": 0.3915, + "num_input_tokens_seen": 18357280, + "step": 18235 + }, + { + "epoch": 8.5997171145686, + "grad_norm": 0.412178099155426, + "learning_rate": 2.9294014952111593e-06, + "loss": 0.3288, + "num_input_tokens_seen": 18361920, + "step": 18240 + }, + { + "epoch": 8.602074493163602, + "grad_norm": 0.6000946164131165, + "learning_rate": 2.9197462127526414e-06, + "loss": 0.3494, + "num_input_tokens_seen": 18366720, + "step": 18245 + }, + { + "epoch": 8.604431871758605, + "grad_norm": 0.44648146629333496, + "learning_rate": 2.910105881541747e-06, + "loss": 0.3368, + "num_input_tokens_seen": 18371776, + "step": 18250 + }, + { + "epoch": 8.606789250353607, + "grad_norm": 0.27151215076446533, + "learning_rate": 2.90048050810626e-06, + "loss": 0.3134, + "num_input_tokens_seen": 18376544, + "step": 18255 + }, + { + "epoch": 8.60914662894861, + "grad_norm": 0.33336547017097473, + "learning_rate": 2.8908700989638272e-06, + "loss": 0.3689, + "num_input_tokens_seen": 18380704, + "step": 18260 + }, + { + "epoch": 8.611504007543612, + "grad_norm": 0.35519737005233765, + "learning_rate": 2.8812746606219604e-06, + "loss": 0.3282, + "num_input_tokens_seen": 18384800, + "step": 18265 + }, + { + "epoch": 8.613861386138614, + "grad_norm": 0.3248938322067261, + "learning_rate": 2.87169419957804e-06, + "loss": 0.3339, + "num_input_tokens_seen": 18390848, + "step": 18270 + }, + { + "epoch": 8.616218764733617, + "grad_norm": 0.4250810742378235, + "learning_rate": 2.862128722319307e-06, + "loss": 0.2645, + "num_input_tokens_seen": 18396064, + "step": 18275 + }, + { + "epoch": 8.61857614332862, + "grad_norm": 0.3466561436653137, + "learning_rate": 2.852578235322853e-06, + "loss": 0.3389, + "num_input_tokens_seen": 18401696, + "step": 18280 + }, + { + "epoch": 8.620933521923622, + "grad_norm": 0.3402210474014282, + "learning_rate": 2.8430427450556103e-06, + "loss": 0.3338, + "num_input_tokens_seen": 18406752, + "step": 18285 + }, + { + "epoch": 8.623290900518624, + "grad_norm": 0.28136253356933594, + "learning_rate": 2.833522257974372e-06, + "loss": 0.3121, + "num_input_tokens_seen": 18411552, + "step": 18290 + }, + { + "epoch": 8.625648279113626, + "grad_norm": 0.29100286960601807, + "learning_rate": 2.824016780525765e-06, + "loss": 0.3108, + "num_input_tokens_seen": 18416256, + "step": 18295 + }, + { + "epoch": 8.628005657708629, + "grad_norm": 0.3558026850223541, + "learning_rate": 2.8145263191462544e-06, + "loss": 0.3544, + "num_input_tokens_seen": 18421440, + "step": 18300 + }, + { + "epoch": 8.630363036303631, + "grad_norm": 0.2933037579059601, + "learning_rate": 2.805050880262136e-06, + "loss": 0.3085, + "num_input_tokens_seen": 18426912, + "step": 18305 + }, + { + "epoch": 8.632720414898632, + "grad_norm": 0.31768426299095154, + "learning_rate": 2.795590470289522e-06, + "loss": 0.3472, + "num_input_tokens_seen": 18430624, + "step": 18310 + }, + { + "epoch": 8.635077793493634, + "grad_norm": 0.3946220874786377, + "learning_rate": 2.786145095634371e-06, + "loss": 0.3501, + "num_input_tokens_seen": 18436576, + "step": 18315 + }, + { + "epoch": 8.637435172088637, + "grad_norm": 0.2995656430721283, + "learning_rate": 2.7767147626924422e-06, + "loss": 0.3829, + "num_input_tokens_seen": 18441280, + "step": 18320 + }, + { + "epoch": 8.639792550683639, + "grad_norm": 0.39776772260665894, + "learning_rate": 2.767299477849325e-06, + "loss": 0.3048, + "num_input_tokens_seen": 18446464, + "step": 18325 + }, + { + "epoch": 8.642149929278641, + "grad_norm": 0.3829653263092041, + "learning_rate": 2.7578992474804026e-06, + "loss": 0.3482, + "num_input_tokens_seen": 18451840, + "step": 18330 + }, + { + "epoch": 8.644507307873644, + "grad_norm": 0.5275986194610596, + "learning_rate": 2.748514077950873e-06, + "loss": 0.3395, + "num_input_tokens_seen": 18456416, + "step": 18335 + }, + { + "epoch": 8.646864686468646, + "grad_norm": 0.45431792736053467, + "learning_rate": 2.7391439756157376e-06, + "loss": 0.319, + "num_input_tokens_seen": 18461312, + "step": 18340 + }, + { + "epoch": 8.649222065063649, + "grad_norm": 0.2876453697681427, + "learning_rate": 2.7297889468198e-06, + "loss": 0.3288, + "num_input_tokens_seen": 18465664, + "step": 18345 + }, + { + "epoch": 8.651579443658651, + "grad_norm": 0.3847603499889374, + "learning_rate": 2.7204489978976343e-06, + "loss": 0.3085, + "num_input_tokens_seen": 18471168, + "step": 18350 + }, + { + "epoch": 8.653936822253653, + "grad_norm": 0.32775068283081055, + "learning_rate": 2.711124135173629e-06, + "loss": 0.2807, + "num_input_tokens_seen": 18475552, + "step": 18355 + }, + { + "epoch": 8.656294200848656, + "grad_norm": 0.32978421449661255, + "learning_rate": 2.7018143649619467e-06, + "loss": 0.296, + "num_input_tokens_seen": 18481120, + "step": 18360 + }, + { + "epoch": 8.658651579443658, + "grad_norm": 0.27050837874412537, + "learning_rate": 2.6925196935665365e-06, + "loss": 0.2929, + "num_input_tokens_seen": 18485632, + "step": 18365 + }, + { + "epoch": 8.66100895803866, + "grad_norm": 0.5574162006378174, + "learning_rate": 2.6832401272811086e-06, + "loss": 0.3357, + "num_input_tokens_seen": 18490304, + "step": 18370 + }, + { + "epoch": 8.663366336633663, + "grad_norm": 0.3127480745315552, + "learning_rate": 2.673975672389156e-06, + "loss": 0.261, + "num_input_tokens_seen": 18496416, + "step": 18375 + }, + { + "epoch": 8.665723715228665, + "grad_norm": 0.3886825442314148, + "learning_rate": 2.664726335163942e-06, + "loss": 0.331, + "num_input_tokens_seen": 18503072, + "step": 18380 + }, + { + "epoch": 8.668081093823668, + "grad_norm": 0.4835066795349121, + "learning_rate": 2.655492121868494e-06, + "loss": 0.324, + "num_input_tokens_seen": 18508352, + "step": 18385 + }, + { + "epoch": 8.67043847241867, + "grad_norm": 0.365805983543396, + "learning_rate": 2.646273038755584e-06, + "loss": 0.3246, + "num_input_tokens_seen": 18512448, + "step": 18390 + }, + { + "epoch": 8.672795851013673, + "grad_norm": 0.4830385148525238, + "learning_rate": 2.637069092067751e-06, + "loss": 0.3161, + "num_input_tokens_seen": 18517856, + "step": 18395 + }, + { + "epoch": 8.675153229608675, + "grad_norm": 0.402506023645401, + "learning_rate": 2.627880288037285e-06, + "loss": 0.3501, + "num_input_tokens_seen": 18522560, + "step": 18400 + }, + { + "epoch": 8.677510608203677, + "grad_norm": 0.4017671048641205, + "learning_rate": 2.6187066328862226e-06, + "loss": 0.2979, + "num_input_tokens_seen": 18526784, + "step": 18405 + }, + { + "epoch": 8.67986798679868, + "grad_norm": 0.5744242072105408, + "learning_rate": 2.6095481328263394e-06, + "loss": 0.3541, + "num_input_tokens_seen": 18531104, + "step": 18410 + }, + { + "epoch": 8.682225365393682, + "grad_norm": 0.6821003556251526, + "learning_rate": 2.6004047940591385e-06, + "loss": 0.3481, + "num_input_tokens_seen": 18535520, + "step": 18415 + }, + { + "epoch": 8.684582743988685, + "grad_norm": 0.3557223677635193, + "learning_rate": 2.591276622775876e-06, + "loss": 0.2885, + "num_input_tokens_seen": 18541056, + "step": 18420 + }, + { + "epoch": 8.686940122583687, + "grad_norm": 0.3139263987541199, + "learning_rate": 2.582163625157527e-06, + "loss": 0.363, + "num_input_tokens_seen": 18546336, + "step": 18425 + }, + { + "epoch": 8.68929750117869, + "grad_norm": 0.3368074297904968, + "learning_rate": 2.5730658073748025e-06, + "loss": 0.358, + "num_input_tokens_seen": 18552544, + "step": 18430 + }, + { + "epoch": 8.691654879773692, + "grad_norm": 0.2925248146057129, + "learning_rate": 2.5639831755881166e-06, + "loss": 0.2844, + "num_input_tokens_seen": 18558016, + "step": 18435 + }, + { + "epoch": 8.694012258368694, + "grad_norm": 0.3587174117565155, + "learning_rate": 2.5549157359476104e-06, + "loss": 0.3283, + "num_input_tokens_seen": 18563040, + "step": 18440 + }, + { + "epoch": 8.696369636963697, + "grad_norm": 0.3904399871826172, + "learning_rate": 2.5458634945931424e-06, + "loss": 0.3489, + "num_input_tokens_seen": 18567584, + "step": 18445 + }, + { + "epoch": 8.698727015558699, + "grad_norm": 0.3062868118286133, + "learning_rate": 2.536826457654279e-06, + "loss": 0.2823, + "num_input_tokens_seen": 18573888, + "step": 18450 + }, + { + "epoch": 8.701084394153701, + "grad_norm": 0.3753114342689514, + "learning_rate": 2.52780463125028e-06, + "loss": 0.2859, + "num_input_tokens_seen": 18577952, + "step": 18455 + }, + { + "epoch": 8.703441772748704, + "grad_norm": 0.27439960837364197, + "learning_rate": 2.5187980214901126e-06, + "loss": 0.3184, + "num_input_tokens_seen": 18582048, + "step": 18460 + }, + { + "epoch": 8.705799151343706, + "grad_norm": 0.2366357445716858, + "learning_rate": 2.5098066344724473e-06, + "loss": 0.3196, + "num_input_tokens_seen": 18586912, + "step": 18465 + }, + { + "epoch": 8.708156529938709, + "grad_norm": 0.3346196413040161, + "learning_rate": 2.5008304762856427e-06, + "loss": 0.3517, + "num_input_tokens_seen": 18591904, + "step": 18470 + }, + { + "epoch": 8.710513908533711, + "grad_norm": 0.5587714910507202, + "learning_rate": 2.4918695530077307e-06, + "loss": 0.3327, + "num_input_tokens_seen": 18597056, + "step": 18475 + }, + { + "epoch": 8.712871287128714, + "grad_norm": 0.2572328746318817, + "learning_rate": 2.4829238707064496e-06, + "loss": 0.3278, + "num_input_tokens_seen": 18602240, + "step": 18480 + }, + { + "epoch": 8.715228665723716, + "grad_norm": 0.2736721932888031, + "learning_rate": 2.4739934354392035e-06, + "loss": 0.3095, + "num_input_tokens_seen": 18608800, + "step": 18485 + }, + { + "epoch": 8.717586044318718, + "grad_norm": 0.3152971863746643, + "learning_rate": 2.465078253253084e-06, + "loss": 0.2659, + "num_input_tokens_seen": 18613184, + "step": 18490 + }, + { + "epoch": 8.71994342291372, + "grad_norm": 0.3762631416320801, + "learning_rate": 2.4561783301848356e-06, + "loss": 0.3662, + "num_input_tokens_seen": 18617728, + "step": 18495 + }, + { + "epoch": 8.722300801508723, + "grad_norm": 0.4095257520675659, + "learning_rate": 2.4472936722608863e-06, + "loss": 0.3138, + "num_input_tokens_seen": 18622624, + "step": 18500 + }, + { + "epoch": 8.724658180103724, + "grad_norm": 0.6251070499420166, + "learning_rate": 2.438424285497326e-06, + "loss": 0.331, + "num_input_tokens_seen": 18627712, + "step": 18505 + }, + { + "epoch": 8.727015558698728, + "grad_norm": 0.4294745922088623, + "learning_rate": 2.429570175899901e-06, + "loss": 0.3008, + "num_input_tokens_seen": 18632928, + "step": 18510 + }, + { + "epoch": 8.729372937293729, + "grad_norm": 0.4235766530036926, + "learning_rate": 2.4207313494640035e-06, + "loss": 0.2677, + "num_input_tokens_seen": 18639264, + "step": 18515 + }, + { + "epoch": 8.731730315888731, + "grad_norm": 0.2820510268211365, + "learning_rate": 2.4119078121746935e-06, + "loss": 0.2838, + "num_input_tokens_seen": 18643776, + "step": 18520 + }, + { + "epoch": 8.734087694483733, + "grad_norm": 0.278257817029953, + "learning_rate": 2.4030995700066757e-06, + "loss": 0.3261, + "num_input_tokens_seen": 18648192, + "step": 18525 + }, + { + "epoch": 8.736445073078736, + "grad_norm": 0.32270893454551697, + "learning_rate": 2.394306628924281e-06, + "loss": 0.3204, + "num_input_tokens_seen": 18653120, + "step": 18530 + }, + { + "epoch": 8.738802451673738, + "grad_norm": 0.4641607999801636, + "learning_rate": 2.3855289948815027e-06, + "loss": 0.3095, + "num_input_tokens_seen": 18658208, + "step": 18535 + }, + { + "epoch": 8.74115983026874, + "grad_norm": 0.28847452998161316, + "learning_rate": 2.3767666738219506e-06, + "loss": 0.3216, + "num_input_tokens_seen": 18662944, + "step": 18540 + }, + { + "epoch": 8.743517208863743, + "grad_norm": 0.2656390070915222, + "learning_rate": 2.3680196716788756e-06, + "loss": 0.356, + "num_input_tokens_seen": 18668608, + "step": 18545 + }, + { + "epoch": 8.745874587458745, + "grad_norm": 0.44779065251350403, + "learning_rate": 2.3592879943751533e-06, + "loss": 0.2974, + "num_input_tokens_seen": 18673792, + "step": 18550 + }, + { + "epoch": 8.748231966053748, + "grad_norm": 0.3152596652507782, + "learning_rate": 2.3505716478232882e-06, + "loss": 0.3093, + "num_input_tokens_seen": 18678848, + "step": 18555 + }, + { + "epoch": 8.75058934464875, + "grad_norm": 0.32018932700157166, + "learning_rate": 2.341870637925386e-06, + "loss": 0.3294, + "num_input_tokens_seen": 18682944, + "step": 18560 + }, + { + "epoch": 8.752946723243753, + "grad_norm": 0.4182852804660797, + "learning_rate": 2.333184970573188e-06, + "loss": 0.3709, + "num_input_tokens_seen": 18688032, + "step": 18565 + }, + { + "epoch": 8.755304101838755, + "grad_norm": 0.349848210811615, + "learning_rate": 2.3245146516480355e-06, + "loss": 0.3282, + "num_input_tokens_seen": 18693760, + "step": 18570 + }, + { + "epoch": 8.757661480433757, + "grad_norm": 0.25174206495285034, + "learning_rate": 2.3158596870208824e-06, + "loss": 0.3838, + "num_input_tokens_seen": 18698464, + "step": 18575 + }, + { + "epoch": 8.76001885902876, + "grad_norm": 0.3062109351158142, + "learning_rate": 2.3072200825522777e-06, + "loss": 0.315, + "num_input_tokens_seen": 18704096, + "step": 18580 + }, + { + "epoch": 8.762376237623762, + "grad_norm": 0.3156066834926605, + "learning_rate": 2.298595844092377e-06, + "loss": 0.3159, + "num_input_tokens_seen": 18709408, + "step": 18585 + }, + { + "epoch": 8.764733616218765, + "grad_norm": 0.31350401043891907, + "learning_rate": 2.2899869774809264e-06, + "loss": 0.3484, + "num_input_tokens_seen": 18713920, + "step": 18590 + }, + { + "epoch": 8.767090994813767, + "grad_norm": 0.4825873374938965, + "learning_rate": 2.281393488547273e-06, + "loss": 0.2856, + "num_input_tokens_seen": 18717952, + "step": 18595 + }, + { + "epoch": 8.76944837340877, + "grad_norm": 0.25548267364501953, + "learning_rate": 2.2728153831103354e-06, + "loss": 0.305, + "num_input_tokens_seen": 18722208, + "step": 18600 + }, + { + "epoch": 8.771805752003772, + "grad_norm": 0.3235393464565277, + "learning_rate": 2.264252666978625e-06, + "loss": 0.3072, + "num_input_tokens_seen": 18726848, + "step": 18605 + }, + { + "epoch": 8.774163130598774, + "grad_norm": 0.3377801477909088, + "learning_rate": 2.255705345950232e-06, + "loss": 0.3587, + "num_input_tokens_seen": 18731808, + "step": 18610 + }, + { + "epoch": 8.776520509193777, + "grad_norm": 0.38971763849258423, + "learning_rate": 2.2471734258128292e-06, + "loss": 0.3408, + "num_input_tokens_seen": 18736128, + "step": 18615 + }, + { + "epoch": 8.778877887788779, + "grad_norm": 0.4036656618118286, + "learning_rate": 2.2386569123436424e-06, + "loss": 0.3209, + "num_input_tokens_seen": 18741024, + "step": 18620 + }, + { + "epoch": 8.781235266383781, + "grad_norm": 0.5691838264465332, + "learning_rate": 2.2301558113094785e-06, + "loss": 0.371, + "num_input_tokens_seen": 18747968, + "step": 18625 + }, + { + "epoch": 8.783592644978784, + "grad_norm": 0.2902474105358124, + "learning_rate": 2.2216701284667045e-06, + "loss": 0.3416, + "num_input_tokens_seen": 18752672, + "step": 18630 + }, + { + "epoch": 8.785950023573786, + "grad_norm": 0.29370206594467163, + "learning_rate": 2.213199869561258e-06, + "loss": 0.2801, + "num_input_tokens_seen": 18756928, + "step": 18635 + }, + { + "epoch": 8.788307402168789, + "grad_norm": 0.3616164028644562, + "learning_rate": 2.204745040328615e-06, + "loss": 0.3158, + "num_input_tokens_seen": 18760864, + "step": 18640 + }, + { + "epoch": 8.790664780763791, + "grad_norm": 0.33567407727241516, + "learning_rate": 2.196305646493807e-06, + "loss": 0.3274, + "num_input_tokens_seen": 18765664, + "step": 18645 + }, + { + "epoch": 8.793022159358793, + "grad_norm": 0.3489585816860199, + "learning_rate": 2.1878816937714236e-06, + "loss": 0.3143, + "num_input_tokens_seen": 18771456, + "step": 18650 + }, + { + "epoch": 8.795379537953796, + "grad_norm": 0.3396584689617157, + "learning_rate": 2.179473187865591e-06, + "loss": 0.3272, + "num_input_tokens_seen": 18776000, + "step": 18655 + }, + { + "epoch": 8.797736916548798, + "grad_norm": 0.33727601170539856, + "learning_rate": 2.1710801344699855e-06, + "loss": 0.2921, + "num_input_tokens_seen": 18780704, + "step": 18660 + }, + { + "epoch": 8.8000942951438, + "grad_norm": 0.4601404666900635, + "learning_rate": 2.1627025392678022e-06, + "loss": 0.2693, + "num_input_tokens_seen": 18786176, + "step": 18665 + }, + { + "epoch": 8.802451673738803, + "grad_norm": 0.3597533702850342, + "learning_rate": 2.1543404079317845e-06, + "loss": 0.315, + "num_input_tokens_seen": 18790400, + "step": 18670 + }, + { + "epoch": 8.804809052333805, + "grad_norm": 0.4348369538784027, + "learning_rate": 2.1459937461241973e-06, + "loss": 0.317, + "num_input_tokens_seen": 18795968, + "step": 18675 + }, + { + "epoch": 8.807166430928808, + "grad_norm": 0.3332652151584625, + "learning_rate": 2.137662559496842e-06, + "loss": 0.3729, + "num_input_tokens_seen": 18800960, + "step": 18680 + }, + { + "epoch": 8.80952380952381, + "grad_norm": 0.4174778461456299, + "learning_rate": 2.129346853691017e-06, + "loss": 0.2974, + "num_input_tokens_seen": 18805696, + "step": 18685 + }, + { + "epoch": 8.811881188118813, + "grad_norm": 0.3153471350669861, + "learning_rate": 2.1210466343375618e-06, + "loss": 0.2916, + "num_input_tokens_seen": 18810784, + "step": 18690 + }, + { + "epoch": 8.814238566713815, + "grad_norm": 0.5625483989715576, + "learning_rate": 2.1127619070568206e-06, + "loss": 0.3319, + "num_input_tokens_seen": 18816288, + "step": 18695 + }, + { + "epoch": 8.816595945308817, + "grad_norm": 0.3597591817378998, + "learning_rate": 2.1044926774586526e-06, + "loss": 0.3312, + "num_input_tokens_seen": 18820928, + "step": 18700 + }, + { + "epoch": 8.81895332390382, + "grad_norm": 0.41111668944358826, + "learning_rate": 2.096238951142407e-06, + "loss": 0.337, + "num_input_tokens_seen": 18824352, + "step": 18705 + }, + { + "epoch": 8.82131070249882, + "grad_norm": 0.34605905413627625, + "learning_rate": 2.0880007336969516e-06, + "loss": 0.3232, + "num_input_tokens_seen": 18828992, + "step": 18710 + }, + { + "epoch": 8.823668081093825, + "grad_norm": 0.5479110479354858, + "learning_rate": 2.0797780307006492e-06, + "loss": 0.305, + "num_input_tokens_seen": 18834656, + "step": 18715 + }, + { + "epoch": 8.826025459688825, + "grad_norm": 0.3719964623451233, + "learning_rate": 2.0715708477213604e-06, + "loss": 0.3078, + "num_input_tokens_seen": 18840096, + "step": 18720 + }, + { + "epoch": 8.828382838283828, + "grad_norm": 0.5847484469413757, + "learning_rate": 2.0633791903164224e-06, + "loss": 0.3776, + "num_input_tokens_seen": 18845472, + "step": 18725 + }, + { + "epoch": 8.83074021687883, + "grad_norm": 0.29480013251304626, + "learning_rate": 2.055203064032671e-06, + "loss": 0.3178, + "num_input_tokens_seen": 18849792, + "step": 18730 + }, + { + "epoch": 8.833097595473832, + "grad_norm": 0.24800333380699158, + "learning_rate": 2.0470424744064244e-06, + "loss": 0.3005, + "num_input_tokens_seen": 18854624, + "step": 18735 + }, + { + "epoch": 8.835454974068835, + "grad_norm": 0.4134012460708618, + "learning_rate": 2.0388974269634875e-06, + "loss": 0.3685, + "num_input_tokens_seen": 18859328, + "step": 18740 + }, + { + "epoch": 8.837812352663837, + "grad_norm": 0.2662082016468048, + "learning_rate": 2.0307679272191184e-06, + "loss": 0.2978, + "num_input_tokens_seen": 18864640, + "step": 18745 + }, + { + "epoch": 8.84016973125884, + "grad_norm": 0.5437020063400269, + "learning_rate": 2.022653980678077e-06, + "loss": 0.2849, + "num_input_tokens_seen": 18869376, + "step": 18750 + }, + { + "epoch": 8.842527109853842, + "grad_norm": 0.353022962808609, + "learning_rate": 2.0145555928345604e-06, + "loss": 0.263, + "num_input_tokens_seen": 18873920, + "step": 18755 + }, + { + "epoch": 8.844884488448844, + "grad_norm": 0.514758825302124, + "learning_rate": 2.0064727691722573e-06, + "loss": 0.3425, + "num_input_tokens_seen": 18879488, + "step": 18760 + }, + { + "epoch": 8.847241867043847, + "grad_norm": 0.33137786388397217, + "learning_rate": 1.9984055151643087e-06, + "loss": 0.3584, + "num_input_tokens_seen": 18884768, + "step": 18765 + }, + { + "epoch": 8.84959924563885, + "grad_norm": 0.307799756526947, + "learning_rate": 1.990353836273304e-06, + "loss": 0.3501, + "num_input_tokens_seen": 18888992, + "step": 18770 + }, + { + "epoch": 8.851956624233852, + "grad_norm": 0.3416897654533386, + "learning_rate": 1.9823177379512946e-06, + "loss": 0.3609, + "num_input_tokens_seen": 18893664, + "step": 18775 + }, + { + "epoch": 8.854314002828854, + "grad_norm": 0.3174048066139221, + "learning_rate": 1.9742972256397816e-06, + "loss": 0.3244, + "num_input_tokens_seen": 18899328, + "step": 18780 + }, + { + "epoch": 8.856671381423856, + "grad_norm": 0.3553679287433624, + "learning_rate": 1.966292304769715e-06, + "loss": 0.3064, + "num_input_tokens_seen": 18904320, + "step": 18785 + }, + { + "epoch": 8.859028760018859, + "grad_norm": 0.3170500695705414, + "learning_rate": 1.9583029807614726e-06, + "loss": 0.2903, + "num_input_tokens_seen": 18909056, + "step": 18790 + }, + { + "epoch": 8.861386138613861, + "grad_norm": 0.3604758083820343, + "learning_rate": 1.950329259024891e-06, + "loss": 0.3035, + "num_input_tokens_seen": 18913536, + "step": 18795 + }, + { + "epoch": 8.863743517208864, + "grad_norm": 0.4642994701862335, + "learning_rate": 1.942371144959229e-06, + "loss": 0.3376, + "num_input_tokens_seen": 18919456, + "step": 18800 + }, + { + "epoch": 8.866100895803866, + "grad_norm": 0.37015822529792786, + "learning_rate": 1.9344286439531872e-06, + "loss": 0.3915, + "num_input_tokens_seen": 18924512, + "step": 18805 + }, + { + "epoch": 8.868458274398868, + "grad_norm": 0.45508861541748047, + "learning_rate": 1.9265017613848774e-06, + "loss": 0.3436, + "num_input_tokens_seen": 18929344, + "step": 18810 + }, + { + "epoch": 8.87081565299387, + "grad_norm": 0.44829970598220825, + "learning_rate": 1.9185905026218503e-06, + "loss": 0.3148, + "num_input_tokens_seen": 18934560, + "step": 18815 + }, + { + "epoch": 8.873173031588873, + "grad_norm": 0.35154709219932556, + "learning_rate": 1.910694873021074e-06, + "loss": 0.3281, + "num_input_tokens_seen": 18940256, + "step": 18820 + }, + { + "epoch": 8.875530410183876, + "grad_norm": 0.3262569308280945, + "learning_rate": 1.902814877928938e-06, + "loss": 0.2821, + "num_input_tokens_seen": 18946272, + "step": 18825 + }, + { + "epoch": 8.877887788778878, + "grad_norm": 0.3997124433517456, + "learning_rate": 1.8949505226812243e-06, + "loss": 0.3731, + "num_input_tokens_seen": 18952576, + "step": 18830 + }, + { + "epoch": 8.88024516737388, + "grad_norm": 0.459866464138031, + "learning_rate": 1.8871018126031536e-06, + "loss": 0.3308, + "num_input_tokens_seen": 18957344, + "step": 18835 + }, + { + "epoch": 8.882602545968883, + "grad_norm": 0.35579854249954224, + "learning_rate": 1.8792687530093305e-06, + "loss": 0.2611, + "num_input_tokens_seen": 18962144, + "step": 18840 + }, + { + "epoch": 8.884959924563885, + "grad_norm": 0.4513177275657654, + "learning_rate": 1.8714513492037815e-06, + "loss": 0.2715, + "num_input_tokens_seen": 18966816, + "step": 18845 + }, + { + "epoch": 8.887317303158888, + "grad_norm": 0.21915116906166077, + "learning_rate": 1.8636496064799085e-06, + "loss": 0.2842, + "num_input_tokens_seen": 18970496, + "step": 18850 + }, + { + "epoch": 8.88967468175389, + "grad_norm": 0.3701836168766022, + "learning_rate": 1.8558635301205274e-06, + "loss": 0.3738, + "num_input_tokens_seen": 18975264, + "step": 18855 + }, + { + "epoch": 8.892032060348892, + "grad_norm": 0.31976747512817383, + "learning_rate": 1.8480931253978455e-06, + "loss": 0.3627, + "num_input_tokens_seen": 18979808, + "step": 18860 + }, + { + "epoch": 8.894389438943895, + "grad_norm": 0.3718668520450592, + "learning_rate": 1.8403383975734435e-06, + "loss": 0.3368, + "num_input_tokens_seen": 18984832, + "step": 18865 + }, + { + "epoch": 8.896746817538897, + "grad_norm": 0.359388530254364, + "learning_rate": 1.8325993518983038e-06, + "loss": 0.3087, + "num_input_tokens_seen": 18990752, + "step": 18870 + }, + { + "epoch": 8.8991041961339, + "grad_norm": 0.40700119733810425, + "learning_rate": 1.8248759936127762e-06, + "loss": 0.3295, + "num_input_tokens_seen": 18995616, + "step": 18875 + }, + { + "epoch": 8.901461574728902, + "grad_norm": 0.2543441951274872, + "learning_rate": 1.8171683279465946e-06, + "loss": 0.3015, + "num_input_tokens_seen": 19000320, + "step": 18880 + }, + { + "epoch": 8.903818953323904, + "grad_norm": 0.40551888942718506, + "learning_rate": 1.8094763601188696e-06, + "loss": 0.2918, + "num_input_tokens_seen": 19007360, + "step": 18885 + }, + { + "epoch": 8.906176331918907, + "grad_norm": 0.510479748249054, + "learning_rate": 1.8018000953380825e-06, + "loss": 0.3565, + "num_input_tokens_seen": 19012576, + "step": 18890 + }, + { + "epoch": 8.90853371051391, + "grad_norm": 0.4733826816082001, + "learning_rate": 1.7941395388020699e-06, + "loss": 0.3035, + "num_input_tokens_seen": 19017344, + "step": 18895 + }, + { + "epoch": 8.910891089108912, + "grad_norm": 0.31911134719848633, + "learning_rate": 1.7864946956980433e-06, + "loss": 0.2648, + "num_input_tokens_seen": 19022016, + "step": 18900 + }, + { + "epoch": 8.913248467703912, + "grad_norm": 0.3912449777126312, + "learning_rate": 1.7788655712025732e-06, + "loss": 0.3475, + "num_input_tokens_seen": 19026912, + "step": 18905 + }, + { + "epoch": 8.915605846298917, + "grad_norm": 0.3482230603694916, + "learning_rate": 1.7712521704815882e-06, + "loss": 0.268, + "num_input_tokens_seen": 19033184, + "step": 18910 + }, + { + "epoch": 8.917963224893917, + "grad_norm": 0.371263712644577, + "learning_rate": 1.7636544986903535e-06, + "loss": 0.3266, + "num_input_tokens_seen": 19037664, + "step": 18915 + }, + { + "epoch": 8.92032060348892, + "grad_norm": 0.47324231266975403, + "learning_rate": 1.7560725609735068e-06, + "loss": 0.3145, + "num_input_tokens_seen": 19042784, + "step": 18920 + }, + { + "epoch": 8.922677982083922, + "grad_norm": 0.5577282309532166, + "learning_rate": 1.748506362465016e-06, + "loss": 0.3979, + "num_input_tokens_seen": 19047424, + "step": 18925 + }, + { + "epoch": 8.925035360678924, + "grad_norm": 0.264773428440094, + "learning_rate": 1.7409559082882054e-06, + "loss": 0.32, + "num_input_tokens_seen": 19052512, + "step": 18930 + }, + { + "epoch": 8.927392739273927, + "grad_norm": 0.30884629487991333, + "learning_rate": 1.7334212035557156e-06, + "loss": 0.333, + "num_input_tokens_seen": 19056992, + "step": 18935 + }, + { + "epoch": 8.92975011786893, + "grad_norm": 0.4067171514034271, + "learning_rate": 1.7259022533695462e-06, + "loss": 0.322, + "num_input_tokens_seen": 19061984, + "step": 18940 + }, + { + "epoch": 8.932107496463932, + "grad_norm": 0.5169681906700134, + "learning_rate": 1.7183990628210162e-06, + "loss": 0.3449, + "num_input_tokens_seen": 19066592, + "step": 18945 + }, + { + "epoch": 8.934464875058934, + "grad_norm": 0.3765588700771332, + "learning_rate": 1.7109116369907806e-06, + "loss": 0.32, + "num_input_tokens_seen": 19072128, + "step": 18950 + }, + { + "epoch": 8.936822253653936, + "grad_norm": 0.30323806405067444, + "learning_rate": 1.7034399809488095e-06, + "loss": 0.3095, + "num_input_tokens_seen": 19077056, + "step": 18955 + }, + { + "epoch": 8.939179632248939, + "grad_norm": 0.8725069165229797, + "learning_rate": 1.6959840997544024e-06, + "loss": 0.3283, + "num_input_tokens_seen": 19082048, + "step": 18960 + }, + { + "epoch": 8.941537010843941, + "grad_norm": 0.35965675115585327, + "learning_rate": 1.6885439984561767e-06, + "loss": 0.3099, + "num_input_tokens_seen": 19086912, + "step": 18965 + }, + { + "epoch": 8.943894389438944, + "grad_norm": 0.2748062014579773, + "learning_rate": 1.6811196820920694e-06, + "loss": 0.2753, + "num_input_tokens_seen": 19090816, + "step": 18970 + }, + { + "epoch": 8.946251768033946, + "grad_norm": 0.5412883162498474, + "learning_rate": 1.6737111556893142e-06, + "loss": 0.3429, + "num_input_tokens_seen": 19096224, + "step": 18975 + }, + { + "epoch": 8.948609146628948, + "grad_norm": 0.5248409509658813, + "learning_rate": 1.6663184242644625e-06, + "loss": 0.3327, + "num_input_tokens_seen": 19100352, + "step": 18980 + }, + { + "epoch": 8.95096652522395, + "grad_norm": 0.4300454556941986, + "learning_rate": 1.658941492823371e-06, + "loss": 0.2821, + "num_input_tokens_seen": 19104480, + "step": 18985 + }, + { + "epoch": 8.953323903818953, + "grad_norm": 0.3019610643386841, + "learning_rate": 1.6515803663611968e-06, + "loss": 0.3959, + "num_input_tokens_seen": 19109152, + "step": 18990 + }, + { + "epoch": 8.955681282413956, + "grad_norm": 0.4451536238193512, + "learning_rate": 1.6442350498623971e-06, + "loss": 0.3357, + "num_input_tokens_seen": 19116512, + "step": 18995 + }, + { + "epoch": 8.958038661008958, + "grad_norm": 0.36028459668159485, + "learning_rate": 1.6369055483007185e-06, + "loss": 0.3361, + "num_input_tokens_seen": 19121920, + "step": 19000 + }, + { + "epoch": 8.96039603960396, + "grad_norm": 0.5607564449310303, + "learning_rate": 1.629591866639199e-06, + "loss": 0.3488, + "num_input_tokens_seen": 19126144, + "step": 19005 + }, + { + "epoch": 8.962753418198963, + "grad_norm": 0.48121070861816406, + "learning_rate": 1.622294009830172e-06, + "loss": 0.3317, + "num_input_tokens_seen": 19131648, + "step": 19010 + }, + { + "epoch": 8.965110796793965, + "grad_norm": 0.29074782133102417, + "learning_rate": 1.615011982815251e-06, + "loss": 0.2911, + "num_input_tokens_seen": 19136768, + "step": 19015 + }, + { + "epoch": 8.967468175388968, + "grad_norm": 0.328279048204422, + "learning_rate": 1.6077457905253251e-06, + "loss": 0.33, + "num_input_tokens_seen": 19141792, + "step": 19020 + }, + { + "epoch": 8.96982555398397, + "grad_norm": 0.3304401934146881, + "learning_rate": 1.600495437880567e-06, + "loss": 0.3274, + "num_input_tokens_seen": 19146720, + "step": 19025 + }, + { + "epoch": 8.972182932578972, + "grad_norm": 0.42159608006477356, + "learning_rate": 1.5932609297904244e-06, + "loss": 0.3305, + "num_input_tokens_seen": 19151776, + "step": 19030 + }, + { + "epoch": 8.974540311173975, + "grad_norm": 0.5018957257270813, + "learning_rate": 1.5860422711536177e-06, + "loss": 0.3394, + "num_input_tokens_seen": 19157856, + "step": 19035 + }, + { + "epoch": 8.976897689768977, + "grad_norm": 0.3585177958011627, + "learning_rate": 1.578839466858123e-06, + "loss": 0.366, + "num_input_tokens_seen": 19162752, + "step": 19040 + }, + { + "epoch": 8.97925506836398, + "grad_norm": 0.32145509123802185, + "learning_rate": 1.5716525217811973e-06, + "loss": 0.3512, + "num_input_tokens_seen": 19166944, + "step": 19045 + }, + { + "epoch": 8.981612446958982, + "grad_norm": 0.2790268361568451, + "learning_rate": 1.5644814407893505e-06, + "loss": 0.353, + "num_input_tokens_seen": 19172064, + "step": 19050 + }, + { + "epoch": 8.983969825553984, + "grad_norm": 0.31004106998443604, + "learning_rate": 1.5573262287383512e-06, + "loss": 0.3579, + "num_input_tokens_seen": 19177216, + "step": 19055 + }, + { + "epoch": 8.986327204148987, + "grad_norm": 0.4231646656990051, + "learning_rate": 1.5501868904732209e-06, + "loss": 0.3212, + "num_input_tokens_seen": 19182560, + "step": 19060 + }, + { + "epoch": 8.98868458274399, + "grad_norm": 0.33081990480422974, + "learning_rate": 1.5430634308282343e-06, + "loss": 0.3627, + "num_input_tokens_seen": 19187520, + "step": 19065 + }, + { + "epoch": 8.991041961338992, + "grad_norm": 0.34339869022369385, + "learning_rate": 1.5359558546269166e-06, + "loss": 0.3458, + "num_input_tokens_seen": 19192160, + "step": 19070 + }, + { + "epoch": 8.993399339933994, + "grad_norm": 0.4063813090324402, + "learning_rate": 1.5288641666820397e-06, + "loss": 0.3963, + "num_input_tokens_seen": 19196832, + "step": 19075 + }, + { + "epoch": 8.995756718528996, + "grad_norm": 0.36011940240859985, + "learning_rate": 1.5217883717956016e-06, + "loss": 0.3383, + "num_input_tokens_seen": 19202272, + "step": 19080 + }, + { + "epoch": 8.998114097123999, + "grad_norm": 0.3164633810520172, + "learning_rate": 1.5147284747588613e-06, + "loss": 0.3185, + "num_input_tokens_seen": 19207616, + "step": 19085 + }, + { + "epoch": 9.000471475719001, + "grad_norm": 0.33721303939819336, + "learning_rate": 1.5076844803522922e-06, + "loss": 0.316, + "num_input_tokens_seen": 19212512, + "step": 19090 + }, + { + "epoch": 9.002828854314004, + "grad_norm": 0.3576095998287201, + "learning_rate": 1.5006563933456119e-06, + "loss": 0.3066, + "num_input_tokens_seen": 19219552, + "step": 19095 + }, + { + "epoch": 9.004243281471004, + "eval_loss": 0.3299795985221863, + "eval_runtime": 25.6644, + "eval_samples_per_second": 36.743, + "eval_steps_per_second": 9.196, + "num_input_tokens_seen": 19222688, + "step": 19098 + }, + { + "epoch": 9.005186232909006, + "grad_norm": 0.3912183344364166, + "learning_rate": 1.4936442184977666e-06, + "loss": 0.3891, + "num_input_tokens_seen": 19224320, + "step": 19100 + }, + { + "epoch": 9.007543611504008, + "grad_norm": 0.30490967631340027, + "learning_rate": 1.4866479605569195e-06, + "loss": 0.2818, + "num_input_tokens_seen": 19229536, + "step": 19105 + }, + { + "epoch": 9.009900990099009, + "grad_norm": 0.5142773389816284, + "learning_rate": 1.479667624260464e-06, + "loss": 0.3376, + "num_input_tokens_seen": 19233920, + "step": 19110 + }, + { + "epoch": 9.012258368694011, + "grad_norm": 0.42091652750968933, + "learning_rate": 1.4727032143350112e-06, + "loss": 0.3383, + "num_input_tokens_seen": 19238944, + "step": 19115 + }, + { + "epoch": 9.014615747289014, + "grad_norm": 0.3299631178379059, + "learning_rate": 1.4657547354963857e-06, + "loss": 0.3079, + "num_input_tokens_seen": 19243744, + "step": 19120 + }, + { + "epoch": 9.016973125884016, + "grad_norm": 0.342649906873703, + "learning_rate": 1.4588221924496215e-06, + "loss": 0.2919, + "num_input_tokens_seen": 19248608, + "step": 19125 + }, + { + "epoch": 9.019330504479019, + "grad_norm": 0.307315856218338, + "learning_rate": 1.451905589888966e-06, + "loss": 0.3236, + "num_input_tokens_seen": 19253152, + "step": 19130 + }, + { + "epoch": 9.021687883074021, + "grad_norm": 0.28188058733940125, + "learning_rate": 1.445004932497876e-06, + "loss": 0.3392, + "num_input_tokens_seen": 19258112, + "step": 19135 + }, + { + "epoch": 9.024045261669023, + "grad_norm": 0.5676253437995911, + "learning_rate": 1.4381202249490106e-06, + "loss": 0.3349, + "num_input_tokens_seen": 19262624, + "step": 19140 + }, + { + "epoch": 9.026402640264026, + "grad_norm": 0.37386375665664673, + "learning_rate": 1.4312514719042187e-06, + "loss": 0.3452, + "num_input_tokens_seen": 19267360, + "step": 19145 + }, + { + "epoch": 9.028760018859028, + "grad_norm": 0.4669724702835083, + "learning_rate": 1.4243986780145547e-06, + "loss": 0.3529, + "num_input_tokens_seen": 19272608, + "step": 19150 + }, + { + "epoch": 9.03111739745403, + "grad_norm": 0.2910473644733429, + "learning_rate": 1.4175618479202678e-06, + "loss": 0.32, + "num_input_tokens_seen": 19276448, + "step": 19155 + }, + { + "epoch": 9.033474776049033, + "grad_norm": 0.4252482056617737, + "learning_rate": 1.410740986250797e-06, + "loss": 0.3229, + "num_input_tokens_seen": 19280832, + "step": 19160 + }, + { + "epoch": 9.035832154644035, + "grad_norm": 0.37456122040748596, + "learning_rate": 1.4039360976247584e-06, + "loss": 0.372, + "num_input_tokens_seen": 19285824, + "step": 19165 + }, + { + "epoch": 9.038189533239038, + "grad_norm": 0.4118141233921051, + "learning_rate": 1.3971471866499663e-06, + "loss": 0.3585, + "num_input_tokens_seen": 19291424, + "step": 19170 + }, + { + "epoch": 9.04054691183404, + "grad_norm": 0.5505111217498779, + "learning_rate": 1.3903742579234075e-06, + "loss": 0.3338, + "num_input_tokens_seen": 19297024, + "step": 19175 + }, + { + "epoch": 9.042904290429043, + "grad_norm": 0.29031988978385925, + "learning_rate": 1.383617316031255e-06, + "loss": 0.354, + "num_input_tokens_seen": 19301440, + "step": 19180 + }, + { + "epoch": 9.045261669024045, + "grad_norm": 0.42857638001441956, + "learning_rate": 1.3768763655488403e-06, + "loss": 0.3394, + "num_input_tokens_seen": 19306848, + "step": 19185 + }, + { + "epoch": 9.047619047619047, + "grad_norm": 0.4315056800842285, + "learning_rate": 1.370151411040685e-06, + "loss": 0.2697, + "num_input_tokens_seen": 19312992, + "step": 19190 + }, + { + "epoch": 9.04997642621405, + "grad_norm": 0.5250687003135681, + "learning_rate": 1.363442457060468e-06, + "loss": 0.3231, + "num_input_tokens_seen": 19320544, + "step": 19195 + }, + { + "epoch": 9.052333804809052, + "grad_norm": 0.4764079749584198, + "learning_rate": 1.3567495081510451e-06, + "loss": 0.3599, + "num_input_tokens_seen": 19325088, + "step": 19200 + }, + { + "epoch": 9.054691183404055, + "grad_norm": 0.5130615830421448, + "learning_rate": 1.3500725688444127e-06, + "loss": 0.351, + "num_input_tokens_seen": 19328768, + "step": 19205 + }, + { + "epoch": 9.057048561999057, + "grad_norm": 0.3239733874797821, + "learning_rate": 1.343411643661746e-06, + "loss": 0.3018, + "num_input_tokens_seen": 19334144, + "step": 19210 + }, + { + "epoch": 9.05940594059406, + "grad_norm": 0.5132869482040405, + "learning_rate": 1.3367667371133734e-06, + "loss": 0.3855, + "num_input_tokens_seen": 19339552, + "step": 19215 + }, + { + "epoch": 9.061763319189062, + "grad_norm": 0.3396722972393036, + "learning_rate": 1.3301378536987763e-06, + "loss": 0.3147, + "num_input_tokens_seen": 19344928, + "step": 19220 + }, + { + "epoch": 9.064120697784064, + "grad_norm": 0.4985547363758087, + "learning_rate": 1.3235249979065777e-06, + "loss": 0.351, + "num_input_tokens_seen": 19349856, + "step": 19225 + }, + { + "epoch": 9.066478076379067, + "grad_norm": 0.49147331714630127, + "learning_rate": 1.3169281742145572e-06, + "loss": 0.3, + "num_input_tokens_seen": 19354368, + "step": 19230 + }, + { + "epoch": 9.068835454974069, + "grad_norm": 0.35282039642333984, + "learning_rate": 1.310347387089636e-06, + "loss": 0.3141, + "num_input_tokens_seen": 19360096, + "step": 19235 + }, + { + "epoch": 9.071192833569071, + "grad_norm": 0.3060775399208069, + "learning_rate": 1.30378264098788e-06, + "loss": 0.3178, + "num_input_tokens_seen": 19364544, + "step": 19240 + }, + { + "epoch": 9.073550212164074, + "grad_norm": 0.38504934310913086, + "learning_rate": 1.2972339403544781e-06, + "loss": 0.3069, + "num_input_tokens_seen": 19369728, + "step": 19245 + }, + { + "epoch": 9.075907590759076, + "grad_norm": 0.3272925615310669, + "learning_rate": 1.2907012896237774e-06, + "loss": 0.3608, + "num_input_tokens_seen": 19375392, + "step": 19250 + }, + { + "epoch": 9.078264969354079, + "grad_norm": 0.31953269243240356, + "learning_rate": 1.2841846932192369e-06, + "loss": 0.3545, + "num_input_tokens_seen": 19380704, + "step": 19255 + }, + { + "epoch": 9.080622347949081, + "grad_norm": 0.49354588985443115, + "learning_rate": 1.2776841555534603e-06, + "loss": 0.3058, + "num_input_tokens_seen": 19384992, + "step": 19260 + }, + { + "epoch": 9.082979726544083, + "grad_norm": 0.5663245916366577, + "learning_rate": 1.2711996810281596e-06, + "loss": 0.3168, + "num_input_tokens_seen": 19390112, + "step": 19265 + }, + { + "epoch": 9.085337105139086, + "grad_norm": 0.3138241767883301, + "learning_rate": 1.2647312740341893e-06, + "loss": 0.3823, + "num_input_tokens_seen": 19395360, + "step": 19270 + }, + { + "epoch": 9.087694483734088, + "grad_norm": 0.5460134744644165, + "learning_rate": 1.2582789389515127e-06, + "loss": 0.3965, + "num_input_tokens_seen": 19399712, + "step": 19275 + }, + { + "epoch": 9.09005186232909, + "grad_norm": 0.3443089723587036, + "learning_rate": 1.251842680149215e-06, + "loss": 0.3201, + "num_input_tokens_seen": 19404960, + "step": 19280 + }, + { + "epoch": 9.092409240924093, + "grad_norm": 0.2994457185268402, + "learning_rate": 1.245422501985488e-06, + "loss": 0.3313, + "num_input_tokens_seen": 19410944, + "step": 19285 + }, + { + "epoch": 9.094766619519095, + "grad_norm": 0.4411541521549225, + "learning_rate": 1.2390184088076434e-06, + "loss": 0.3061, + "num_input_tokens_seen": 19416480, + "step": 19290 + }, + { + "epoch": 9.097123998114098, + "grad_norm": 0.30127274990081787, + "learning_rate": 1.2326304049520953e-06, + "loss": 0.2922, + "num_input_tokens_seen": 19420960, + "step": 19295 + }, + { + "epoch": 9.0994813767091, + "grad_norm": 0.30823785066604614, + "learning_rate": 1.2262584947443728e-06, + "loss": 0.393, + "num_input_tokens_seen": 19426112, + "step": 19300 + }, + { + "epoch": 9.101838755304103, + "grad_norm": 0.3077658414840698, + "learning_rate": 1.219902682499091e-06, + "loss": 0.3267, + "num_input_tokens_seen": 19430592, + "step": 19305 + }, + { + "epoch": 9.104196133899103, + "grad_norm": 0.2446441799402237, + "learning_rate": 1.2135629725199826e-06, + "loss": 0.2865, + "num_input_tokens_seen": 19435168, + "step": 19310 + }, + { + "epoch": 9.106553512494106, + "grad_norm": 0.36410436034202576, + "learning_rate": 1.2072393690998635e-06, + "loss": 0.3201, + "num_input_tokens_seen": 19440672, + "step": 19315 + }, + { + "epoch": 9.108910891089108, + "grad_norm": 0.5657645463943481, + "learning_rate": 1.2009318765206472e-06, + "loss": 0.2994, + "num_input_tokens_seen": 19444992, + "step": 19320 + }, + { + "epoch": 9.11126826968411, + "grad_norm": 0.4410147964954376, + "learning_rate": 1.1946404990533455e-06, + "loss": 0.2635, + "num_input_tokens_seen": 19451040, + "step": 19325 + }, + { + "epoch": 9.113625648279113, + "grad_norm": 0.3306174874305725, + "learning_rate": 1.1883652409580447e-06, + "loss": 0.3407, + "num_input_tokens_seen": 19455328, + "step": 19330 + }, + { + "epoch": 9.115983026874115, + "grad_norm": 0.3731156587600708, + "learning_rate": 1.1821061064839266e-06, + "loss": 0.348, + "num_input_tokens_seen": 19462176, + "step": 19335 + }, + { + "epoch": 9.118340405469118, + "grad_norm": 0.4803495407104492, + "learning_rate": 1.175863099869251e-06, + "loss": 0.297, + "num_input_tokens_seen": 19467360, + "step": 19340 + }, + { + "epoch": 9.12069778406412, + "grad_norm": 0.3474481999874115, + "learning_rate": 1.1696362253413613e-06, + "loss": 0.3321, + "num_input_tokens_seen": 19473376, + "step": 19345 + }, + { + "epoch": 9.123055162659123, + "grad_norm": 0.23895660042762756, + "learning_rate": 1.163425487116665e-06, + "loss": 0.3376, + "num_input_tokens_seen": 19477408, + "step": 19350 + }, + { + "epoch": 9.125412541254125, + "grad_norm": 0.39934810996055603, + "learning_rate": 1.1572308894006623e-06, + "loss": 0.2919, + "num_input_tokens_seen": 19482656, + "step": 19355 + }, + { + "epoch": 9.127769919849127, + "grad_norm": 0.3922344446182251, + "learning_rate": 1.151052436387906e-06, + "loss": 0.3715, + "num_input_tokens_seen": 19488416, + "step": 19360 + }, + { + "epoch": 9.13012729844413, + "grad_norm": 0.40150728821754456, + "learning_rate": 1.144890132262033e-06, + "loss": 0.3159, + "num_input_tokens_seen": 19494144, + "step": 19365 + }, + { + "epoch": 9.132484677039132, + "grad_norm": 0.33839911222457886, + "learning_rate": 1.138743981195728e-06, + "loss": 0.3183, + "num_input_tokens_seen": 19498944, + "step": 19370 + }, + { + "epoch": 9.134842055634135, + "grad_norm": 0.3879467248916626, + "learning_rate": 1.1326139873507502e-06, + "loss": 0.3139, + "num_input_tokens_seen": 19504448, + "step": 19375 + }, + { + "epoch": 9.137199434229137, + "grad_norm": 0.3390980362892151, + "learning_rate": 1.1265001548779158e-06, + "loss": 0.338, + "num_input_tokens_seen": 19510176, + "step": 19380 + }, + { + "epoch": 9.13955681282414, + "grad_norm": 0.3444470465183258, + "learning_rate": 1.1204024879171022e-06, + "loss": 0.308, + "num_input_tokens_seen": 19514688, + "step": 19385 + }, + { + "epoch": 9.141914191419142, + "grad_norm": 0.36181917786598206, + "learning_rate": 1.114320990597223e-06, + "loss": 0.3351, + "num_input_tokens_seen": 19519328, + "step": 19390 + }, + { + "epoch": 9.144271570014144, + "grad_norm": 0.34358689188957214, + "learning_rate": 1.108255667036262e-06, + "loss": 0.3797, + "num_input_tokens_seen": 19524992, + "step": 19395 + }, + { + "epoch": 9.146628948609147, + "grad_norm": 0.2568299472332001, + "learning_rate": 1.1022065213412453e-06, + "loss": 0.3027, + "num_input_tokens_seen": 19530752, + "step": 19400 + }, + { + "epoch": 9.148986327204149, + "grad_norm": 0.2770223915576935, + "learning_rate": 1.096173557608246e-06, + "loss": 0.3347, + "num_input_tokens_seen": 19536704, + "step": 19405 + }, + { + "epoch": 9.151343705799151, + "grad_norm": 0.3244384229183197, + "learning_rate": 1.0901567799223661e-06, + "loss": 0.2934, + "num_input_tokens_seen": 19540352, + "step": 19410 + }, + { + "epoch": 9.153701084394154, + "grad_norm": 0.314353883266449, + "learning_rate": 1.0841561923577688e-06, + "loss": 0.3239, + "num_input_tokens_seen": 19544640, + "step": 19415 + }, + { + "epoch": 9.156058462989156, + "grad_norm": 0.4885445535182953, + "learning_rate": 1.0781717989776396e-06, + "loss": 0.3016, + "num_input_tokens_seen": 19549216, + "step": 19420 + }, + { + "epoch": 9.158415841584159, + "grad_norm": 0.29705512523651123, + "learning_rate": 1.072203603834207e-06, + "loss": 0.3284, + "num_input_tokens_seen": 19553408, + "step": 19425 + }, + { + "epoch": 9.160773220179161, + "grad_norm": 0.3843209743499756, + "learning_rate": 1.066251610968727e-06, + "loss": 0.28, + "num_input_tokens_seen": 19558752, + "step": 19430 + }, + { + "epoch": 9.163130598774163, + "grad_norm": 0.38210049271583557, + "learning_rate": 1.0603158244114785e-06, + "loss": 0.2688, + "num_input_tokens_seen": 19563840, + "step": 19435 + }, + { + "epoch": 9.165487977369166, + "grad_norm": 0.2776324152946472, + "learning_rate": 1.054396248181777e-06, + "loss": 0.3197, + "num_input_tokens_seen": 19568576, + "step": 19440 + }, + { + "epoch": 9.167845355964168, + "grad_norm": 0.6064152121543884, + "learning_rate": 1.048492886287955e-06, + "loss": 0.3324, + "num_input_tokens_seen": 19573504, + "step": 19445 + }, + { + "epoch": 9.17020273455917, + "grad_norm": 0.3545464873313904, + "learning_rate": 1.042605742727376e-06, + "loss": 0.3113, + "num_input_tokens_seen": 19578176, + "step": 19450 + }, + { + "epoch": 9.172560113154173, + "grad_norm": 0.23433852195739746, + "learning_rate": 1.0367348214864042e-06, + "loss": 0.2937, + "num_input_tokens_seen": 19582432, + "step": 19455 + }, + { + "epoch": 9.174917491749175, + "grad_norm": 0.3967834711074829, + "learning_rate": 1.0308801265404316e-06, + "loss": 0.3399, + "num_input_tokens_seen": 19587936, + "step": 19460 + }, + { + "epoch": 9.177274870344178, + "grad_norm": 0.3833472430706024, + "learning_rate": 1.0250416618538594e-06, + "loss": 0.3138, + "num_input_tokens_seen": 19592704, + "step": 19465 + }, + { + "epoch": 9.17963224893918, + "grad_norm": 0.41279277205467224, + "learning_rate": 1.0192194313801056e-06, + "loss": 0.3425, + "num_input_tokens_seen": 19597440, + "step": 19470 + }, + { + "epoch": 9.181989627534183, + "grad_norm": 0.6314656734466553, + "learning_rate": 1.0134134390615806e-06, + "loss": 0.3638, + "num_input_tokens_seen": 19603488, + "step": 19475 + }, + { + "epoch": 9.184347006129185, + "grad_norm": 0.3976409435272217, + "learning_rate": 1.007623688829709e-06, + "loss": 0.3736, + "num_input_tokens_seen": 19609088, + "step": 19480 + }, + { + "epoch": 9.186704384724187, + "grad_norm": 0.3238728642463684, + "learning_rate": 1.0018501846049188e-06, + "loss": 0.3085, + "num_input_tokens_seen": 19613856, + "step": 19485 + }, + { + "epoch": 9.18906176331919, + "grad_norm": 0.4124813973903656, + "learning_rate": 9.960929302966382e-07, + "loss": 0.3203, + "num_input_tokens_seen": 19619648, + "step": 19490 + }, + { + "epoch": 9.191419141914192, + "grad_norm": 0.2791059911251068, + "learning_rate": 9.90351929803282e-07, + "loss": 0.2634, + "num_input_tokens_seen": 19624320, + "step": 19495 + }, + { + "epoch": 9.193776520509195, + "grad_norm": 0.4663216471672058, + "learning_rate": 9.846271870122685e-07, + "loss": 0.307, + "num_input_tokens_seen": 19628832, + "step": 19500 + }, + { + "epoch": 9.196133899104197, + "grad_norm": 0.521487295627594, + "learning_rate": 9.789187058000028e-07, + "loss": 0.3266, + "num_input_tokens_seen": 19634464, + "step": 19505 + }, + { + "epoch": 9.198491277699198, + "grad_norm": 0.4656079411506653, + "learning_rate": 9.732264900318866e-07, + "loss": 0.2863, + "num_input_tokens_seen": 19640384, + "step": 19510 + }, + { + "epoch": 9.2008486562942, + "grad_norm": 0.3510400354862213, + "learning_rate": 9.675505435622955e-07, + "loss": 0.3059, + "num_input_tokens_seen": 19644864, + "step": 19515 + }, + { + "epoch": 9.203206034889202, + "grad_norm": 0.2997593283653259, + "learning_rate": 9.618908702345942e-07, + "loss": 0.3235, + "num_input_tokens_seen": 19650784, + "step": 19520 + }, + { + "epoch": 9.205563413484205, + "grad_norm": 0.3029867112636566, + "learning_rate": 9.562474738811334e-07, + "loss": 0.332, + "num_input_tokens_seen": 19656032, + "step": 19525 + }, + { + "epoch": 9.207920792079207, + "grad_norm": 0.3947512209415436, + "learning_rate": 9.50620358323237e-07, + "loss": 0.307, + "num_input_tokens_seen": 19661120, + "step": 19530 + }, + { + "epoch": 9.21027817067421, + "grad_norm": 0.3999944031238556, + "learning_rate": 9.450095273712046e-07, + "loss": 0.3333, + "num_input_tokens_seen": 19665792, + "step": 19535 + }, + { + "epoch": 9.212635549269212, + "grad_norm": 0.42093321681022644, + "learning_rate": 9.39414984824305e-07, + "loss": 0.3378, + "num_input_tokens_seen": 19671712, + "step": 19540 + }, + { + "epoch": 9.214992927864214, + "grad_norm": 0.3434015214443207, + "learning_rate": 9.33836734470786e-07, + "loss": 0.3189, + "num_input_tokens_seen": 19677120, + "step": 19545 + }, + { + "epoch": 9.217350306459217, + "grad_norm": 0.32441946864128113, + "learning_rate": 9.282747800878622e-07, + "loss": 0.3218, + "num_input_tokens_seen": 19682112, + "step": 19550 + }, + { + "epoch": 9.21970768505422, + "grad_norm": 0.5309189558029175, + "learning_rate": 9.227291254417097e-07, + "loss": 0.3971, + "num_input_tokens_seen": 19688288, + "step": 19555 + }, + { + "epoch": 9.222065063649222, + "grad_norm": 0.37717753648757935, + "learning_rate": 9.171997742874666e-07, + "loss": 0.3176, + "num_input_tokens_seen": 19693216, + "step": 19560 + }, + { + "epoch": 9.224422442244224, + "grad_norm": 0.6419678330421448, + "learning_rate": 9.11686730369235e-07, + "loss": 0.3355, + "num_input_tokens_seen": 19697504, + "step": 19565 + }, + { + "epoch": 9.226779820839226, + "grad_norm": 0.4945491552352905, + "learning_rate": 9.061899974200761e-07, + "loss": 0.3345, + "num_input_tokens_seen": 19702560, + "step": 19570 + }, + { + "epoch": 9.229137199434229, + "grad_norm": 0.34571900963783264, + "learning_rate": 9.007095791620012e-07, + "loss": 0.2903, + "num_input_tokens_seen": 19706304, + "step": 19575 + }, + { + "epoch": 9.231494578029231, + "grad_norm": 0.390760213136673, + "learning_rate": 8.952454793059783e-07, + "loss": 0.3119, + "num_input_tokens_seen": 19713216, + "step": 19580 + }, + { + "epoch": 9.233851956624234, + "grad_norm": 0.2611690163612366, + "learning_rate": 8.897977015519227e-07, + "loss": 0.3101, + "num_input_tokens_seen": 19718592, + "step": 19585 + }, + { + "epoch": 9.236209335219236, + "grad_norm": 0.4644429683685303, + "learning_rate": 8.843662495887028e-07, + "loss": 0.292, + "num_input_tokens_seen": 19725472, + "step": 19590 + }, + { + "epoch": 9.238566713814238, + "grad_norm": 0.5907584428787231, + "learning_rate": 8.78951127094127e-07, + "loss": 0.3655, + "num_input_tokens_seen": 19729056, + "step": 19595 + }, + { + "epoch": 9.24092409240924, + "grad_norm": 0.6274378299713135, + "learning_rate": 8.73552337734948e-07, + "loss": 0.3846, + "num_input_tokens_seen": 19734816, + "step": 19600 + }, + { + "epoch": 9.243281471004243, + "grad_norm": 0.5372101664543152, + "learning_rate": 8.681698851668585e-07, + "loss": 0.3166, + "num_input_tokens_seen": 19739584, + "step": 19605 + }, + { + "epoch": 9.245638849599246, + "grad_norm": 0.34144189953804016, + "learning_rate": 8.628037730344901e-07, + "loss": 0.3344, + "num_input_tokens_seen": 19744416, + "step": 19610 + }, + { + "epoch": 9.247996228194248, + "grad_norm": 0.5203116536140442, + "learning_rate": 8.574540049714142e-07, + "loss": 0.3711, + "num_input_tokens_seen": 19749184, + "step": 19615 + }, + { + "epoch": 9.25035360678925, + "grad_norm": 0.2837945520877838, + "learning_rate": 8.521205846001218e-07, + "loss": 0.2936, + "num_input_tokens_seen": 19753888, + "step": 19620 + }, + { + "epoch": 9.252710985384253, + "grad_norm": 0.3297234773635864, + "learning_rate": 8.468035155320464e-07, + "loss": 0.3199, + "num_input_tokens_seen": 19758368, + "step": 19625 + }, + { + "epoch": 9.255068363979255, + "grad_norm": 0.33203092217445374, + "learning_rate": 8.415028013675469e-07, + "loss": 0.3563, + "num_input_tokens_seen": 19762432, + "step": 19630 + }, + { + "epoch": 9.257425742574258, + "grad_norm": 0.3250002861022949, + "learning_rate": 8.362184456959104e-07, + "loss": 0.334, + "num_input_tokens_seen": 19766784, + "step": 19635 + }, + { + "epoch": 9.25978312116926, + "grad_norm": 0.5116686224937439, + "learning_rate": 8.30950452095336e-07, + "loss": 0.3383, + "num_input_tokens_seen": 19771264, + "step": 19640 + }, + { + "epoch": 9.262140499764262, + "grad_norm": 0.3679901361465454, + "learning_rate": 8.256988241329533e-07, + "loss": 0.3328, + "num_input_tokens_seen": 19776320, + "step": 19645 + }, + { + "epoch": 9.264497878359265, + "grad_norm": 0.3189883232116699, + "learning_rate": 8.204635653648124e-07, + "loss": 0.332, + "num_input_tokens_seen": 19781696, + "step": 19650 + }, + { + "epoch": 9.266855256954267, + "grad_norm": 0.35792243480682373, + "learning_rate": 8.152446793358692e-07, + "loss": 0.3262, + "num_input_tokens_seen": 19786400, + "step": 19655 + }, + { + "epoch": 9.26921263554927, + "grad_norm": 0.3391929864883423, + "learning_rate": 8.100421695800026e-07, + "loss": 0.3608, + "num_input_tokens_seen": 19791072, + "step": 19660 + }, + { + "epoch": 9.271570014144272, + "grad_norm": 0.5488930344581604, + "learning_rate": 8.048560396199944e-07, + "loss": 0.3461, + "num_input_tokens_seen": 19796032, + "step": 19665 + }, + { + "epoch": 9.273927392739274, + "grad_norm": 0.3227047920227051, + "learning_rate": 7.996862929675441e-07, + "loss": 0.3476, + "num_input_tokens_seen": 19801024, + "step": 19670 + }, + { + "epoch": 9.276284771334277, + "grad_norm": 0.5094165205955505, + "learning_rate": 7.945329331232487e-07, + "loss": 0.2915, + "num_input_tokens_seen": 19805696, + "step": 19675 + }, + { + "epoch": 9.27864214992928, + "grad_norm": 0.34403514862060547, + "learning_rate": 7.893959635766196e-07, + "loss": 0.3308, + "num_input_tokens_seen": 19810144, + "step": 19680 + }, + { + "epoch": 9.280999528524282, + "grad_norm": 0.39046770334243774, + "learning_rate": 7.842753878060576e-07, + "loss": 0.2652, + "num_input_tokens_seen": 19814848, + "step": 19685 + }, + { + "epoch": 9.283356907119284, + "grad_norm": 0.45756590366363525, + "learning_rate": 7.7917120927887e-07, + "loss": 0.3196, + "num_input_tokens_seen": 19820384, + "step": 19690 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.2794291377067566, + "learning_rate": 7.740834314512585e-07, + "loss": 0.2764, + "num_input_tokens_seen": 19824960, + "step": 19695 + }, + { + "epoch": 9.288071664309289, + "grad_norm": 0.3083910346031189, + "learning_rate": 7.690120577683285e-07, + "loss": 0.319, + "num_input_tokens_seen": 19830720, + "step": 19700 + }, + { + "epoch": 9.290429042904291, + "grad_norm": 0.47940585017204285, + "learning_rate": 7.639570916640582e-07, + "loss": 0.294, + "num_input_tokens_seen": 19836480, + "step": 19705 + }, + { + "epoch": 9.292786421499294, + "grad_norm": 0.3410528898239136, + "learning_rate": 7.589185365613344e-07, + "loss": 0.3052, + "num_input_tokens_seen": 19841184, + "step": 19710 + }, + { + "epoch": 9.295143800094294, + "grad_norm": 0.30818575620651245, + "learning_rate": 7.538963958719198e-07, + "loss": 0.3707, + "num_input_tokens_seen": 19845984, + "step": 19715 + }, + { + "epoch": 9.297501178689297, + "grad_norm": 0.376822292804718, + "learning_rate": 7.488906729964745e-07, + "loss": 0.3463, + "num_input_tokens_seen": 19850304, + "step": 19720 + }, + { + "epoch": 9.299858557284299, + "grad_norm": 0.38290461897850037, + "learning_rate": 7.439013713245263e-07, + "loss": 0.3908, + "num_input_tokens_seen": 19855552, + "step": 19725 + }, + { + "epoch": 9.302215935879302, + "grad_norm": 0.46869346499443054, + "learning_rate": 7.38928494234492e-07, + "loss": 0.2886, + "num_input_tokens_seen": 19862176, + "step": 19730 + }, + { + "epoch": 9.304573314474304, + "grad_norm": 0.4866323173046112, + "learning_rate": 7.339720450936699e-07, + "loss": 0.3099, + "num_input_tokens_seen": 19867008, + "step": 19735 + }, + { + "epoch": 9.306930693069306, + "grad_norm": 0.37008407711982727, + "learning_rate": 7.290320272582308e-07, + "loss": 0.3095, + "num_input_tokens_seen": 19872640, + "step": 19740 + }, + { + "epoch": 9.309288071664309, + "grad_norm": 0.38785460591316223, + "learning_rate": 7.241084440732127e-07, + "loss": 0.3463, + "num_input_tokens_seen": 19878656, + "step": 19745 + }, + { + "epoch": 9.311645450259311, + "grad_norm": 0.332159161567688, + "learning_rate": 7.192012988725377e-07, + "loss": 0.32, + "num_input_tokens_seen": 19882624, + "step": 19750 + }, + { + "epoch": 9.314002828854314, + "grad_norm": 0.42304185032844543, + "learning_rate": 7.143105949789896e-07, + "loss": 0.3233, + "num_input_tokens_seen": 19888096, + "step": 19755 + }, + { + "epoch": 9.316360207449316, + "grad_norm": 0.40368568897247314, + "learning_rate": 7.094363357042222e-07, + "loss": 0.3538, + "num_input_tokens_seen": 19892480, + "step": 19760 + }, + { + "epoch": 9.318717586044318, + "grad_norm": 0.29401305317878723, + "learning_rate": 7.045785243487507e-07, + "loss": 0.3336, + "num_input_tokens_seen": 19898272, + "step": 19765 + }, + { + "epoch": 9.32107496463932, + "grad_norm": 0.33411070704460144, + "learning_rate": 6.997371642019523e-07, + "loss": 0.363, + "num_input_tokens_seen": 19903168, + "step": 19770 + }, + { + "epoch": 9.323432343234323, + "grad_norm": 0.46504607796669006, + "learning_rate": 6.949122585420714e-07, + "loss": 0.3171, + "num_input_tokens_seen": 19908032, + "step": 19775 + }, + { + "epoch": 9.325789721829326, + "grad_norm": 0.2754053473472595, + "learning_rate": 6.901038106362029e-07, + "loss": 0.3399, + "num_input_tokens_seen": 19912160, + "step": 19780 + }, + { + "epoch": 9.328147100424328, + "grad_norm": 0.3021303117275238, + "learning_rate": 6.853118237403088e-07, + "loss": 0.3421, + "num_input_tokens_seen": 19916640, + "step": 19785 + }, + { + "epoch": 9.33050447901933, + "grad_norm": 0.3260660767555237, + "learning_rate": 6.805363010991855e-07, + "loss": 0.2717, + "num_input_tokens_seen": 19921824, + "step": 19790 + }, + { + "epoch": 9.332861857614333, + "grad_norm": 0.349576473236084, + "learning_rate": 6.75777245946499e-07, + "loss": 0.2925, + "num_input_tokens_seen": 19927104, + "step": 19795 + }, + { + "epoch": 9.335219236209335, + "grad_norm": 0.5064194798469543, + "learning_rate": 6.710346615047602e-07, + "loss": 0.3457, + "num_input_tokens_seen": 19932800, + "step": 19800 + }, + { + "epoch": 9.337576614804338, + "grad_norm": 0.3223877251148224, + "learning_rate": 6.6630855098532e-07, + "loss": 0.2976, + "num_input_tokens_seen": 19937696, + "step": 19805 + }, + { + "epoch": 9.33993399339934, + "grad_norm": 0.3465731739997864, + "learning_rate": 6.615989175883825e-07, + "loss": 0.3496, + "num_input_tokens_seen": 19941472, + "step": 19810 + }, + { + "epoch": 9.342291371994342, + "grad_norm": 0.39273154735565186, + "learning_rate": 6.56905764502988e-07, + "loss": 0.3453, + "num_input_tokens_seen": 19946336, + "step": 19815 + }, + { + "epoch": 9.344648750589345, + "grad_norm": 0.25108274817466736, + "learning_rate": 6.522290949070253e-07, + "loss": 0.3197, + "num_input_tokens_seen": 19951584, + "step": 19820 + }, + { + "epoch": 9.347006129184347, + "grad_norm": 0.4181373715400696, + "learning_rate": 6.475689119672168e-07, + "loss": 0.2628, + "num_input_tokens_seen": 19956192, + "step": 19825 + }, + { + "epoch": 9.34936350777935, + "grad_norm": 0.5063548684120178, + "learning_rate": 6.429252188391188e-07, + "loss": 0.3531, + "num_input_tokens_seen": 19961472, + "step": 19830 + }, + { + "epoch": 9.351720886374352, + "grad_norm": 0.4337232708930969, + "learning_rate": 6.382980186671273e-07, + "loss": 0.307, + "num_input_tokens_seen": 19966016, + "step": 19835 + }, + { + "epoch": 9.354078264969354, + "grad_norm": 0.2953597605228424, + "learning_rate": 6.336873145844635e-07, + "loss": 0.3361, + "num_input_tokens_seen": 19970560, + "step": 19840 + }, + { + "epoch": 9.356435643564357, + "grad_norm": 0.3100830912590027, + "learning_rate": 6.290931097131914e-07, + "loss": 0.3554, + "num_input_tokens_seen": 19974592, + "step": 19845 + }, + { + "epoch": 9.35879302215936, + "grad_norm": 0.36986684799194336, + "learning_rate": 6.245154071641862e-07, + "loss": 0.4032, + "num_input_tokens_seen": 19979712, + "step": 19850 + }, + { + "epoch": 9.361150400754362, + "grad_norm": 0.3304552435874939, + "learning_rate": 6.199542100371574e-07, + "loss": 0.3685, + "num_input_tokens_seen": 19984416, + "step": 19855 + }, + { + "epoch": 9.363507779349364, + "grad_norm": 0.2311861515045166, + "learning_rate": 6.154095214206429e-07, + "loss": 0.3435, + "num_input_tokens_seen": 19988576, + "step": 19860 + }, + { + "epoch": 9.365865157944366, + "grad_norm": 0.514200747013092, + "learning_rate": 6.108813443919976e-07, + "loss": 0.2991, + "num_input_tokens_seen": 19993696, + "step": 19865 + }, + { + "epoch": 9.368222536539369, + "grad_norm": 0.4111751616001129, + "learning_rate": 6.063696820173881e-07, + "loss": 0.3578, + "num_input_tokens_seen": 19999872, + "step": 19870 + }, + { + "epoch": 9.370579915134371, + "grad_norm": 0.337167352437973, + "learning_rate": 6.018745373518153e-07, + "loss": 0.3439, + "num_input_tokens_seen": 20004896, + "step": 19875 + }, + { + "epoch": 9.372937293729374, + "grad_norm": 0.42717283964157104, + "learning_rate": 5.97395913439075e-07, + "loss": 0.3222, + "num_input_tokens_seen": 20009088, + "step": 19880 + }, + { + "epoch": 9.375294672324376, + "grad_norm": 0.6778307557106018, + "learning_rate": 5.92933813311794e-07, + "loss": 0.3636, + "num_input_tokens_seen": 20013856, + "step": 19885 + }, + { + "epoch": 9.377652050919378, + "grad_norm": 0.2735232412815094, + "learning_rate": 5.884882399914027e-07, + "loss": 0.335, + "num_input_tokens_seen": 20018400, + "step": 19890 + }, + { + "epoch": 9.38000942951438, + "grad_norm": 0.6942339539527893, + "learning_rate": 5.840591964881375e-07, + "loss": 0.3584, + "num_input_tokens_seen": 20023264, + "step": 19895 + }, + { + "epoch": 9.382366808109383, + "grad_norm": 0.38265982270240784, + "learning_rate": 5.796466858010496e-07, + "loss": 0.3097, + "num_input_tokens_seen": 20029696, + "step": 19900 + }, + { + "epoch": 9.384724186704386, + "grad_norm": 0.32580745220184326, + "learning_rate": 5.752507109179905e-07, + "loss": 0.368, + "num_input_tokens_seen": 20033952, + "step": 19905 + }, + { + "epoch": 9.387081565299386, + "grad_norm": 0.43999388813972473, + "learning_rate": 5.708712748156153e-07, + "loss": 0.364, + "num_input_tokens_seen": 20039104, + "step": 19910 + }, + { + "epoch": 9.389438943894389, + "grad_norm": 0.34926727414131165, + "learning_rate": 5.665083804593824e-07, + "loss": 0.3291, + "num_input_tokens_seen": 20043296, + "step": 19915 + }, + { + "epoch": 9.391796322489391, + "grad_norm": 0.6332018971443176, + "learning_rate": 5.621620308035425e-07, + "loss": 0.3598, + "num_input_tokens_seen": 20048128, + "step": 19920 + }, + { + "epoch": 9.394153701084393, + "grad_norm": 0.3116321563720703, + "learning_rate": 5.578322287911525e-07, + "loss": 0.3095, + "num_input_tokens_seen": 20053664, + "step": 19925 + }, + { + "epoch": 9.396511079679396, + "grad_norm": 0.416912317276001, + "learning_rate": 5.535189773540617e-07, + "loss": 0.3461, + "num_input_tokens_seen": 20060064, + "step": 19930 + }, + { + "epoch": 9.398868458274398, + "grad_norm": 0.540388286113739, + "learning_rate": 5.492222794129059e-07, + "loss": 0.3261, + "num_input_tokens_seen": 20064800, + "step": 19935 + }, + { + "epoch": 9.4012258368694, + "grad_norm": 0.3919592499732971, + "learning_rate": 5.449421378771192e-07, + "loss": 0.3489, + "num_input_tokens_seen": 20070496, + "step": 19940 + }, + { + "epoch": 9.403583215464403, + "grad_norm": 0.3276906907558441, + "learning_rate": 5.406785556449245e-07, + "loss": 0.2832, + "num_input_tokens_seen": 20075840, + "step": 19945 + }, + { + "epoch": 9.405940594059405, + "grad_norm": 0.527314305305481, + "learning_rate": 5.364315356033295e-07, + "loss": 0.3272, + "num_input_tokens_seen": 20082816, + "step": 19950 + }, + { + "epoch": 9.408297972654408, + "grad_norm": 0.28281137347221375, + "learning_rate": 5.322010806281252e-07, + "loss": 0.3101, + "num_input_tokens_seen": 20087712, + "step": 19955 + }, + { + "epoch": 9.41065535124941, + "grad_norm": 0.5527560114860535, + "learning_rate": 5.27987193583887e-07, + "loss": 0.3171, + "num_input_tokens_seen": 20092256, + "step": 19960 + }, + { + "epoch": 9.413012729844413, + "grad_norm": 0.34110090136528015, + "learning_rate": 5.237898773239797e-07, + "loss": 0.3609, + "num_input_tokens_seen": 20097408, + "step": 19965 + }, + { + "epoch": 9.415370108439415, + "grad_norm": 0.5143293738365173, + "learning_rate": 5.196091346905352e-07, + "loss": 0.2802, + "num_input_tokens_seen": 20102048, + "step": 19970 + }, + { + "epoch": 9.417727487034417, + "grad_norm": 0.3512914776802063, + "learning_rate": 5.154449685144696e-07, + "loss": 0.308, + "num_input_tokens_seen": 20107200, + "step": 19975 + }, + { + "epoch": 9.42008486562942, + "grad_norm": 0.34040358662605286, + "learning_rate": 5.11297381615472e-07, + "loss": 0.3384, + "num_input_tokens_seen": 20111392, + "step": 19980 + }, + { + "epoch": 9.422442244224422, + "grad_norm": 0.5023015737533569, + "learning_rate": 5.071663768020096e-07, + "loss": 0.2853, + "num_input_tokens_seen": 20116000, + "step": 19985 + }, + { + "epoch": 9.424799622819425, + "grad_norm": 0.36851766705513, + "learning_rate": 5.030519568713143e-07, + "loss": 0.302, + "num_input_tokens_seen": 20122432, + "step": 19990 + }, + { + "epoch": 9.427157001414427, + "grad_norm": 0.31272566318511963, + "learning_rate": 4.989541246093993e-07, + "loss": 0.2931, + "num_input_tokens_seen": 20126432, + "step": 19995 + }, + { + "epoch": 9.42951438000943, + "grad_norm": 0.26063844561576843, + "learning_rate": 4.948728827910254e-07, + "loss": 0.3839, + "num_input_tokens_seen": 20131648, + "step": 20000 + }, + { + "epoch": 9.431871758604432, + "grad_norm": 0.5750132203102112, + "learning_rate": 4.908082341797432e-07, + "loss": 0.2994, + "num_input_tokens_seen": 20135968, + "step": 20005 + }, + { + "epoch": 9.434229137199434, + "grad_norm": 0.45753979682922363, + "learning_rate": 4.86760181527851e-07, + "loss": 0.2697, + "num_input_tokens_seen": 20140896, + "step": 20010 + }, + { + "epoch": 9.436586515794437, + "grad_norm": 0.36710235476493835, + "learning_rate": 4.827287275764203e-07, + "loss": 0.3523, + "num_input_tokens_seen": 20145952, + "step": 20015 + }, + { + "epoch": 9.438943894389439, + "grad_norm": 0.4169948399066925, + "learning_rate": 4.787138750552727e-07, + "loss": 0.3314, + "num_input_tokens_seen": 20151712, + "step": 20020 + }, + { + "epoch": 9.441301272984441, + "grad_norm": 0.30519232153892517, + "learning_rate": 4.74715626682995e-07, + "loss": 0.2746, + "num_input_tokens_seen": 20156320, + "step": 20025 + }, + { + "epoch": 9.443658651579444, + "grad_norm": 0.4536895155906677, + "learning_rate": 4.707339851669268e-07, + "loss": 0.3347, + "num_input_tokens_seen": 20161408, + "step": 20030 + }, + { + "epoch": 9.446016030174446, + "grad_norm": 0.36530593037605286, + "learning_rate": 4.667689532031727e-07, + "loss": 0.3027, + "num_input_tokens_seen": 20166976, + "step": 20035 + }, + { + "epoch": 9.448373408769449, + "grad_norm": 0.4724484980106354, + "learning_rate": 4.628205334765767e-07, + "loss": 0.3269, + "num_input_tokens_seen": 20171328, + "step": 20040 + }, + { + "epoch": 9.450730787364451, + "grad_norm": 0.35357582569122314, + "learning_rate": 4.5888872866073906e-07, + "loss": 0.2873, + "num_input_tokens_seen": 20176256, + "step": 20045 + }, + { + "epoch": 9.453088165959453, + "grad_norm": 0.30860981345176697, + "learning_rate": 4.549735414180162e-07, + "loss": 0.2816, + "num_input_tokens_seen": 20182080, + "step": 20050 + }, + { + "epoch": 9.455445544554456, + "grad_norm": 0.2404826134443283, + "learning_rate": 4.5107497439950687e-07, + "loss": 0.3013, + "num_input_tokens_seen": 20187168, + "step": 20055 + }, + { + "epoch": 9.457802923149458, + "grad_norm": 0.49507254362106323, + "learning_rate": 4.4719303024504944e-07, + "loss": 0.3125, + "num_input_tokens_seen": 20192480, + "step": 20060 + }, + { + "epoch": 9.46016030174446, + "grad_norm": 0.32374879717826843, + "learning_rate": 4.433277115832385e-07, + "loss": 0.3534, + "num_input_tokens_seen": 20197920, + "step": 20065 + }, + { + "epoch": 9.462517680339463, + "grad_norm": 0.4585314989089966, + "learning_rate": 4.394790210314026e-07, + "loss": 0.2689, + "num_input_tokens_seen": 20203008, + "step": 20070 + }, + { + "epoch": 9.464875058934465, + "grad_norm": 0.315044105052948, + "learning_rate": 4.3564696119561543e-07, + "loss": 0.3242, + "num_input_tokens_seen": 20207680, + "step": 20075 + }, + { + "epoch": 9.467232437529468, + "grad_norm": 0.5221394896507263, + "learning_rate": 4.318315346706847e-07, + "loss": 0.3414, + "num_input_tokens_seen": 20212544, + "step": 20080 + }, + { + "epoch": 9.46958981612447, + "grad_norm": 0.46531781554222107, + "learning_rate": 4.280327440401577e-07, + "loss": 0.2732, + "num_input_tokens_seen": 20216608, + "step": 20085 + }, + { + "epoch": 9.471947194719473, + "grad_norm": 0.3133058249950409, + "learning_rate": 4.242505918763212e-07, + "loss": 0.3295, + "num_input_tokens_seen": 20220768, + "step": 20090 + }, + { + "epoch": 9.474304573314475, + "grad_norm": 0.1855202168226242, + "learning_rate": 4.204850807401933e-07, + "loss": 0.3746, + "num_input_tokens_seen": 20225728, + "step": 20095 + }, + { + "epoch": 9.476661951909477, + "grad_norm": 0.47849348187446594, + "learning_rate": 4.16736213181515e-07, + "loss": 0.3681, + "num_input_tokens_seen": 20230304, + "step": 20100 + }, + { + "epoch": 9.47901933050448, + "grad_norm": 0.31048452854156494, + "learning_rate": 4.1300399173876957e-07, + "loss": 0.2801, + "num_input_tokens_seen": 20235360, + "step": 20105 + }, + { + "epoch": 9.481376709099482, + "grad_norm": 0.2580253481864929, + "learning_rate": 4.092884189391605e-07, + "loss": 0.3148, + "num_input_tokens_seen": 20240608, + "step": 20110 + }, + { + "epoch": 9.483734087694483, + "grad_norm": 0.791926920413971, + "learning_rate": 4.0558949729862517e-07, + "loss": 0.3928, + "num_input_tokens_seen": 20246112, + "step": 20115 + }, + { + "epoch": 9.486091466289485, + "grad_norm": 0.3531147539615631, + "learning_rate": 4.0190722932182126e-07, + "loss": 0.2983, + "num_input_tokens_seen": 20251104, + "step": 20120 + }, + { + "epoch": 9.488448844884488, + "grad_norm": 0.30406898260116577, + "learning_rate": 3.9824161750212917e-07, + "loss": 0.2868, + "num_input_tokens_seen": 20256288, + "step": 20125 + }, + { + "epoch": 9.49080622347949, + "grad_norm": 0.332427978515625, + "learning_rate": 3.9459266432164966e-07, + "loss": 0.3104, + "num_input_tokens_seen": 20262048, + "step": 20130 + }, + { + "epoch": 9.493163602074493, + "grad_norm": 0.54879230260849, + "learning_rate": 3.90960372251209e-07, + "loss": 0.419, + "num_input_tokens_seen": 20266912, + "step": 20135 + }, + { + "epoch": 9.495520980669495, + "grad_norm": 0.3894733190536499, + "learning_rate": 3.873447437503508e-07, + "loss": 0.3731, + "num_input_tokens_seen": 20272800, + "step": 20140 + }, + { + "epoch": 9.497878359264497, + "grad_norm": 0.2721690535545349, + "learning_rate": 3.837457812673306e-07, + "loss": 0.3405, + "num_input_tokens_seen": 20277440, + "step": 20145 + }, + { + "epoch": 9.5002357378595, + "grad_norm": 0.39979004859924316, + "learning_rate": 3.8016348723911855e-07, + "loss": 0.3482, + "num_input_tokens_seen": 20282624, + "step": 20150 + }, + { + "epoch": 9.502593116454502, + "grad_norm": 0.4052923321723938, + "learning_rate": 3.7659786409140476e-07, + "loss": 0.3451, + "num_input_tokens_seen": 20286848, + "step": 20155 + }, + { + "epoch": 9.504479019330505, + "eval_loss": 0.32978886365890503, + "eval_runtime": 25.6507, + "eval_samples_per_second": 36.763, + "eval_steps_per_second": 9.201, + "num_input_tokens_seen": 20290176, + "step": 20159 + }, + { + "epoch": 9.504950495049505, + "grad_norm": 0.4892962574958801, + "learning_rate": 3.730489142385857e-07, + "loss": 0.332, + "num_input_tokens_seen": 20291104, + "step": 20160 + }, + { + "epoch": 9.507307873644507, + "grad_norm": 0.5537700057029724, + "learning_rate": 3.695166400837669e-07, + "loss": 0.3198, + "num_input_tokens_seen": 20295392, + "step": 20165 + }, + { + "epoch": 9.50966525223951, + "grad_norm": 0.30443286895751953, + "learning_rate": 3.6600104401876834e-07, + "loss": 0.3431, + "num_input_tokens_seen": 20300192, + "step": 20170 + }, + { + "epoch": 9.512022630834512, + "grad_norm": 0.3540840148925781, + "learning_rate": 3.6250212842410814e-07, + "loss": 0.3251, + "num_input_tokens_seen": 20304672, + "step": 20175 + }, + { + "epoch": 9.514380009429514, + "grad_norm": 0.25884178280830383, + "learning_rate": 3.590198956690216e-07, + "loss": 0.3622, + "num_input_tokens_seen": 20309408, + "step": 20180 + }, + { + "epoch": 9.516737388024517, + "grad_norm": 0.3451556861400604, + "learning_rate": 3.555543481114337e-07, + "loss": 0.2888, + "num_input_tokens_seen": 20316320, + "step": 20185 + }, + { + "epoch": 9.519094766619519, + "grad_norm": 0.4892306327819824, + "learning_rate": 3.521054880979785e-07, + "loss": 0.3651, + "num_input_tokens_seen": 20321248, + "step": 20190 + }, + { + "epoch": 9.521452145214521, + "grad_norm": 0.31061282753944397, + "learning_rate": 3.486733179639906e-07, + "loss": 0.3454, + "num_input_tokens_seen": 20326944, + "step": 20195 + }, + { + "epoch": 9.523809523809524, + "grad_norm": 0.8194427490234375, + "learning_rate": 3.452578400335027e-07, + "loss": 0.3416, + "num_input_tokens_seen": 20332096, + "step": 20200 + }, + { + "epoch": 9.526166902404526, + "grad_norm": 0.29292863607406616, + "learning_rate": 3.4185905661924534e-07, + "loss": 0.3481, + "num_input_tokens_seen": 20337216, + "step": 20205 + }, + { + "epoch": 9.528524280999529, + "grad_norm": 0.8830149173736572, + "learning_rate": 3.3847697002264147e-07, + "loss": 0.3569, + "num_input_tokens_seen": 20342336, + "step": 20210 + }, + { + "epoch": 9.530881659594531, + "grad_norm": 0.34554174542427063, + "learning_rate": 3.351115825338119e-07, + "loss": 0.3023, + "num_input_tokens_seen": 20347616, + "step": 20215 + }, + { + "epoch": 9.533239038189533, + "grad_norm": 0.33198532462120056, + "learning_rate": 3.3176289643157e-07, + "loss": 0.3543, + "num_input_tokens_seen": 20352096, + "step": 20220 + }, + { + "epoch": 9.535596416784536, + "grad_norm": 0.5302532911300659, + "learning_rate": 3.28430913983413e-07, + "loss": 0.2687, + "num_input_tokens_seen": 20357376, + "step": 20225 + }, + { + "epoch": 9.537953795379538, + "grad_norm": 0.510170042514801, + "learning_rate": 3.2511563744553884e-07, + "loss": 0.2984, + "num_input_tokens_seen": 20361824, + "step": 20230 + }, + { + "epoch": 9.54031117397454, + "grad_norm": 0.28183552622795105, + "learning_rate": 3.218170690628214e-07, + "loss": 0.3402, + "num_input_tokens_seen": 20366176, + "step": 20235 + }, + { + "epoch": 9.542668552569543, + "grad_norm": 0.56882643699646, + "learning_rate": 3.18535211068835e-07, + "loss": 0.3714, + "num_input_tokens_seen": 20372352, + "step": 20240 + }, + { + "epoch": 9.545025931164545, + "grad_norm": 0.30213436484336853, + "learning_rate": 3.152700656858243e-07, + "loss": 0.3038, + "num_input_tokens_seen": 20377216, + "step": 20245 + }, + { + "epoch": 9.547383309759548, + "grad_norm": 0.46022218465805054, + "learning_rate": 3.1202163512472905e-07, + "loss": 0.3052, + "num_input_tokens_seen": 20382368, + "step": 20250 + }, + { + "epoch": 9.54974068835455, + "grad_norm": 0.3031485974788666, + "learning_rate": 3.087899215851592e-07, + "loss": 0.2772, + "num_input_tokens_seen": 20387072, + "step": 20255 + }, + { + "epoch": 9.552098066949553, + "grad_norm": 0.4850974380970001, + "learning_rate": 3.055749272554198e-07, + "loss": 0.3252, + "num_input_tokens_seen": 20392256, + "step": 20260 + }, + { + "epoch": 9.554455445544555, + "grad_norm": 0.36888912320137024, + "learning_rate": 3.0237665431247784e-07, + "loss": 0.34, + "num_input_tokens_seen": 20396960, + "step": 20265 + }, + { + "epoch": 9.556812824139557, + "grad_norm": 0.3386296331882477, + "learning_rate": 2.9919510492199267e-07, + "loss": 0.3027, + "num_input_tokens_seen": 20401472, + "step": 20270 + }, + { + "epoch": 9.55917020273456, + "grad_norm": 0.43162551522254944, + "learning_rate": 2.960302812382909e-07, + "loss": 0.3046, + "num_input_tokens_seen": 20406144, + "step": 20275 + }, + { + "epoch": 9.561527581329562, + "grad_norm": 0.3018546402454376, + "learning_rate": 2.928821854043778e-07, + "loss": 0.3186, + "num_input_tokens_seen": 20410816, + "step": 20280 + }, + { + "epoch": 9.563884959924565, + "grad_norm": 0.33654868602752686, + "learning_rate": 2.8975081955192605e-07, + "loss": 0.3148, + "num_input_tokens_seen": 20415712, + "step": 20285 + }, + { + "epoch": 9.566242338519567, + "grad_norm": 0.3981882929801941, + "learning_rate": 2.8663618580128947e-07, + "loss": 0.3277, + "num_input_tokens_seen": 20420704, + "step": 20290 + }, + { + "epoch": 9.56859971711457, + "grad_norm": 0.5166996717453003, + "learning_rate": 2.8353828626148107e-07, + "loss": 0.3613, + "num_input_tokens_seen": 20425504, + "step": 20295 + }, + { + "epoch": 9.570957095709572, + "grad_norm": 0.5607509613037109, + "learning_rate": 2.80457123030195e-07, + "loss": 0.3327, + "num_input_tokens_seen": 20430656, + "step": 20300 + }, + { + "epoch": 9.573314474304574, + "grad_norm": 0.36272698640823364, + "learning_rate": 2.7739269819377633e-07, + "loss": 0.3347, + "num_input_tokens_seen": 20435680, + "step": 20305 + }, + { + "epoch": 9.575671852899575, + "grad_norm": 0.3483540117740631, + "learning_rate": 2.743450138272513e-07, + "loss": 0.3742, + "num_input_tokens_seen": 20440512, + "step": 20310 + }, + { + "epoch": 9.578029231494579, + "grad_norm": 0.35037335753440857, + "learning_rate": 2.713140719943025e-07, + "loss": 0.3521, + "num_input_tokens_seen": 20445472, + "step": 20315 + }, + { + "epoch": 9.58038661008958, + "grad_norm": 0.5643270015716553, + "learning_rate": 2.682998747472826e-07, + "loss": 0.3109, + "num_input_tokens_seen": 20451232, + "step": 20320 + }, + { + "epoch": 9.582743988684582, + "grad_norm": 0.522584855556488, + "learning_rate": 2.653024241271951e-07, + "loss": 0.2647, + "num_input_tokens_seen": 20456032, + "step": 20325 + }, + { + "epoch": 9.585101367279584, + "grad_norm": 0.5793828964233398, + "learning_rate": 2.6232172216371086e-07, + "loss": 0.4058, + "num_input_tokens_seen": 20461056, + "step": 20330 + }, + { + "epoch": 9.587458745874587, + "grad_norm": 0.2709656357765198, + "learning_rate": 2.5935777087515987e-07, + "loss": 0.3103, + "num_input_tokens_seen": 20466144, + "step": 20335 + }, + { + "epoch": 9.58981612446959, + "grad_norm": 0.365656316280365, + "learning_rate": 2.5641057226853116e-07, + "loss": 0.3152, + "num_input_tokens_seen": 20472192, + "step": 20340 + }, + { + "epoch": 9.592173503064592, + "grad_norm": 0.41086238622665405, + "learning_rate": 2.5348012833946445e-07, + "loss": 0.3283, + "num_input_tokens_seen": 20476992, + "step": 20345 + }, + { + "epoch": 9.594530881659594, + "grad_norm": 0.6162238717079163, + "learning_rate": 2.505664410722558e-07, + "loss": 0.3001, + "num_input_tokens_seen": 20482848, + "step": 20350 + }, + { + "epoch": 9.596888260254596, + "grad_norm": 0.30845361948013306, + "learning_rate": 2.4766951243985756e-07, + "loss": 0.3292, + "num_input_tokens_seen": 20488000, + "step": 20355 + }, + { + "epoch": 9.599245638849599, + "grad_norm": 0.3838736116886139, + "learning_rate": 2.447893444038757e-07, + "loss": 0.3302, + "num_input_tokens_seen": 20492864, + "step": 20360 + }, + { + "epoch": 9.601603017444601, + "grad_norm": 0.41369399428367615, + "learning_rate": 2.4192593891456395e-07, + "loss": 0.3472, + "num_input_tokens_seen": 20497216, + "step": 20365 + }, + { + "epoch": 9.603960396039604, + "grad_norm": 0.2867957055568695, + "learning_rate": 2.390792979108214e-07, + "loss": 0.2974, + "num_input_tokens_seen": 20502560, + "step": 20370 + }, + { + "epoch": 9.606317774634606, + "grad_norm": 0.2849068343639374, + "learning_rate": 2.362494233202034e-07, + "loss": 0.2823, + "num_input_tokens_seen": 20508000, + "step": 20375 + }, + { + "epoch": 9.608675153229608, + "grad_norm": 0.5936785340309143, + "learning_rate": 2.3343631705890766e-07, + "loss": 0.3536, + "num_input_tokens_seen": 20514432, + "step": 20380 + }, + { + "epoch": 9.61103253182461, + "grad_norm": 0.3667358160018921, + "learning_rate": 2.3063998103177998e-07, + "loss": 0.2743, + "num_input_tokens_seen": 20519872, + "step": 20385 + }, + { + "epoch": 9.613389910419613, + "grad_norm": 0.48266172409057617, + "learning_rate": 2.2786041713230565e-07, + "loss": 0.3492, + "num_input_tokens_seen": 20523872, + "step": 20390 + }, + { + "epoch": 9.615747289014616, + "grad_norm": 0.5167723894119263, + "learning_rate": 2.2509762724262085e-07, + "loss": 0.2843, + "num_input_tokens_seen": 20528320, + "step": 20395 + }, + { + "epoch": 9.618104667609618, + "grad_norm": 0.39916980266571045, + "learning_rate": 2.2235161323349573e-07, + "loss": 0.3231, + "num_input_tokens_seen": 20534016, + "step": 20400 + }, + { + "epoch": 9.62046204620462, + "grad_norm": 0.5470597147941589, + "learning_rate": 2.196223769643485e-07, + "loss": 0.3169, + "num_input_tokens_seen": 20539008, + "step": 20405 + }, + { + "epoch": 9.622819424799623, + "grad_norm": 0.3035525977611542, + "learning_rate": 2.1690992028322866e-07, + "loss": 0.3324, + "num_input_tokens_seen": 20544416, + "step": 20410 + }, + { + "epoch": 9.625176803394625, + "grad_norm": 0.36219412088394165, + "learning_rate": 2.1421424502683086e-07, + "loss": 0.3486, + "num_input_tokens_seen": 20549664, + "step": 20415 + }, + { + "epoch": 9.627534181989628, + "grad_norm": 0.3856527805328369, + "learning_rate": 2.1153535302047832e-07, + "loss": 0.3373, + "num_input_tokens_seen": 20554848, + "step": 20420 + }, + { + "epoch": 9.62989156058463, + "grad_norm": 0.3961987793445587, + "learning_rate": 2.0887324607813952e-07, + "loss": 0.3206, + "num_input_tokens_seen": 20559840, + "step": 20425 + }, + { + "epoch": 9.632248939179632, + "grad_norm": 0.2920820116996765, + "learning_rate": 2.0622792600241135e-07, + "loss": 0.3144, + "num_input_tokens_seen": 20563840, + "step": 20430 + }, + { + "epoch": 9.634606317774635, + "grad_norm": 0.37943047285079956, + "learning_rate": 2.0359939458452216e-07, + "loss": 0.3151, + "num_input_tokens_seen": 20568448, + "step": 20435 + }, + { + "epoch": 9.636963696369637, + "grad_norm": 0.3529983162879944, + "learning_rate": 2.0098765360433703e-07, + "loss": 0.3421, + "num_input_tokens_seen": 20573824, + "step": 20440 + }, + { + "epoch": 9.63932107496464, + "grad_norm": 0.5454068779945374, + "learning_rate": 1.9839270483034966e-07, + "loss": 0.3247, + "num_input_tokens_seen": 20577984, + "step": 20445 + }, + { + "epoch": 9.641678453559642, + "grad_norm": 0.3366587162017822, + "learning_rate": 1.9581455001968506e-07, + "loss": 0.2998, + "num_input_tokens_seen": 20583520, + "step": 20450 + }, + { + "epoch": 9.644035832154644, + "grad_norm": 0.4007675349712372, + "learning_rate": 1.9325319091808847e-07, + "loss": 0.3139, + "num_input_tokens_seen": 20587840, + "step": 20455 + }, + { + "epoch": 9.646393210749647, + "grad_norm": 0.45961111783981323, + "learning_rate": 1.9070862925994194e-07, + "loss": 0.338, + "num_input_tokens_seen": 20592480, + "step": 20460 + }, + { + "epoch": 9.64875058934465, + "grad_norm": 0.6025760173797607, + "learning_rate": 1.8818086676825052e-07, + "loss": 0.3435, + "num_input_tokens_seen": 20596096, + "step": 20465 + }, + { + "epoch": 9.651107967939652, + "grad_norm": 0.30061835050582886, + "learning_rate": 1.8566990515464232e-07, + "loss": 0.2951, + "num_input_tokens_seen": 20600928, + "step": 20470 + }, + { + "epoch": 9.653465346534654, + "grad_norm": 0.3335823714733124, + "learning_rate": 1.8317574611936839e-07, + "loss": 0.3601, + "num_input_tokens_seen": 20605024, + "step": 20475 + }, + { + "epoch": 9.655822725129656, + "grad_norm": 0.36324650049209595, + "learning_rate": 1.8069839135130827e-07, + "loss": 0.3254, + "num_input_tokens_seen": 20610560, + "step": 20480 + }, + { + "epoch": 9.658180103724659, + "grad_norm": 0.458371639251709, + "learning_rate": 1.7823784252795073e-07, + "loss": 0.2418, + "num_input_tokens_seen": 20616608, + "step": 20485 + }, + { + "epoch": 9.660537482319661, + "grad_norm": 0.368862509727478, + "learning_rate": 1.757941013154213e-07, + "loss": 0.3721, + "num_input_tokens_seen": 20622240, + "step": 20490 + }, + { + "epoch": 9.662894860914664, + "grad_norm": 0.5275381207466125, + "learning_rate": 1.733671693684491e-07, + "loss": 0.3318, + "num_input_tokens_seen": 20627584, + "step": 20495 + }, + { + "epoch": 9.665252239509666, + "grad_norm": 0.46304842829704285, + "learning_rate": 1.7095704833038907e-07, + "loss": 0.2861, + "num_input_tokens_seen": 20632480, + "step": 20500 + }, + { + "epoch": 9.667609618104667, + "grad_norm": 0.371692419052124, + "learning_rate": 1.685637398332135e-07, + "loss": 0.3244, + "num_input_tokens_seen": 20637408, + "step": 20505 + }, + { + "epoch": 9.66996699669967, + "grad_norm": 0.2796646058559418, + "learning_rate": 1.6618724549750387e-07, + "loss": 0.2909, + "num_input_tokens_seen": 20643456, + "step": 20510 + }, + { + "epoch": 9.672324375294671, + "grad_norm": 0.322055459022522, + "learning_rate": 1.638275669324646e-07, + "loss": 0.3467, + "num_input_tokens_seen": 20648224, + "step": 20515 + }, + { + "epoch": 9.674681753889674, + "grad_norm": 0.20663391053676605, + "learning_rate": 1.6148470573590925e-07, + "loss": 0.3343, + "num_input_tokens_seen": 20653824, + "step": 20520 + }, + { + "epoch": 9.677039132484676, + "grad_norm": 0.49540260434150696, + "learning_rate": 1.5915866349426323e-07, + "loss": 0.3187, + "num_input_tokens_seen": 20658048, + "step": 20525 + }, + { + "epoch": 9.679396511079679, + "grad_norm": 0.3787500560283661, + "learning_rate": 1.5684944178256388e-07, + "loss": 0.2712, + "num_input_tokens_seen": 20663744, + "step": 20530 + }, + { + "epoch": 9.681753889674681, + "grad_norm": 0.5903692245483398, + "learning_rate": 1.5455704216446044e-07, + "loss": 0.3526, + "num_input_tokens_seen": 20668992, + "step": 20535 + }, + { + "epoch": 9.684111268269683, + "grad_norm": 0.480255663394928, + "learning_rate": 1.522814661922084e-07, + "loss": 0.3376, + "num_input_tokens_seen": 20673632, + "step": 20540 + }, + { + "epoch": 9.686468646864686, + "grad_norm": 0.37938931584358215, + "learning_rate": 1.5002271540667523e-07, + "loss": 0.3323, + "num_input_tokens_seen": 20678912, + "step": 20545 + }, + { + "epoch": 9.688826025459688, + "grad_norm": 0.4175063669681549, + "learning_rate": 1.4778079133733468e-07, + "loss": 0.226, + "num_input_tokens_seen": 20683680, + "step": 20550 + }, + { + "epoch": 9.69118340405469, + "grad_norm": 0.2938574254512787, + "learning_rate": 1.4555569550226133e-07, + "loss": 0.324, + "num_input_tokens_seen": 20688128, + "step": 20555 + }, + { + "epoch": 9.693540782649693, + "grad_norm": 0.4368872046470642, + "learning_rate": 1.4334742940814162e-07, + "loss": 0.3399, + "num_input_tokens_seen": 20693856, + "step": 20560 + }, + { + "epoch": 9.695898161244696, + "grad_norm": 0.5178336501121521, + "learning_rate": 1.4115599455026273e-07, + "loss": 0.3378, + "num_input_tokens_seen": 20698688, + "step": 20565 + }, + { + "epoch": 9.698255539839698, + "grad_norm": 0.3831962049007416, + "learning_rate": 1.389813924125155e-07, + "loss": 0.2956, + "num_input_tokens_seen": 20703840, + "step": 20570 + }, + { + "epoch": 9.7006129184347, + "grad_norm": 0.34491682052612305, + "learning_rate": 1.368236244673915e-07, + "loss": 0.2751, + "num_input_tokens_seen": 20708576, + "step": 20575 + }, + { + "epoch": 9.702970297029703, + "grad_norm": 0.3386102616786957, + "learning_rate": 1.3468269217598585e-07, + "loss": 0.3192, + "num_input_tokens_seen": 20713120, + "step": 20580 + }, + { + "epoch": 9.705327675624705, + "grad_norm": 0.7704822421073914, + "learning_rate": 1.3255859698799168e-07, + "loss": 0.3197, + "num_input_tokens_seen": 20717984, + "step": 20585 + }, + { + "epoch": 9.707685054219708, + "grad_norm": 0.39014557003974915, + "learning_rate": 1.304513403417029e-07, + "loss": 0.3116, + "num_input_tokens_seen": 20723648, + "step": 20590 + }, + { + "epoch": 9.71004243281471, + "grad_norm": 0.3856606185436249, + "learning_rate": 1.283609236640143e-07, + "loss": 0.3243, + "num_input_tokens_seen": 20728000, + "step": 20595 + }, + { + "epoch": 9.712399811409712, + "grad_norm": 0.4707925319671631, + "learning_rate": 1.262873483704047e-07, + "loss": 0.316, + "num_input_tokens_seen": 20733152, + "step": 20600 + }, + { + "epoch": 9.714757190004715, + "grad_norm": 0.30386170744895935, + "learning_rate": 1.2423061586496477e-07, + "loss": 0.3429, + "num_input_tokens_seen": 20738944, + "step": 20605 + }, + { + "epoch": 9.717114568599717, + "grad_norm": 0.5035497546195984, + "learning_rate": 1.221907275403722e-07, + "loss": 0.3153, + "num_input_tokens_seen": 20744192, + "step": 20610 + }, + { + "epoch": 9.71947194719472, + "grad_norm": 0.3752072751522064, + "learning_rate": 1.201676847779054e-07, + "loss": 0.3375, + "num_input_tokens_seen": 20748864, + "step": 20615 + }, + { + "epoch": 9.721829325789722, + "grad_norm": 0.46195924282073975, + "learning_rate": 1.1816148894742418e-07, + "loss": 0.29, + "num_input_tokens_seen": 20754144, + "step": 20620 + }, + { + "epoch": 9.724186704384724, + "grad_norm": 0.2967844009399414, + "learning_rate": 1.1617214140738908e-07, + "loss": 0.309, + "num_input_tokens_seen": 20758592, + "step": 20625 + }, + { + "epoch": 9.726544082979727, + "grad_norm": 0.24614794552326202, + "learning_rate": 1.141996435048559e-07, + "loss": 0.3538, + "num_input_tokens_seen": 20763520, + "step": 20630 + }, + { + "epoch": 9.72890146157473, + "grad_norm": 0.2953421175479889, + "learning_rate": 1.1224399657546458e-07, + "loss": 0.3489, + "num_input_tokens_seen": 20768480, + "step": 20635 + }, + { + "epoch": 9.731258840169732, + "grad_norm": 0.47379574179649353, + "learning_rate": 1.1030520194344473e-07, + "loss": 0.2798, + "num_input_tokens_seen": 20773760, + "step": 20640 + }, + { + "epoch": 9.733616218764734, + "grad_norm": 0.3553076982498169, + "learning_rate": 1.0838326092161844e-07, + "loss": 0.3388, + "num_input_tokens_seen": 20777696, + "step": 20645 + }, + { + "epoch": 9.735973597359736, + "grad_norm": 0.22322848439216614, + "learning_rate": 1.064781748113891e-07, + "loss": 0.283, + "num_input_tokens_seen": 20781728, + "step": 20650 + }, + { + "epoch": 9.738330975954739, + "grad_norm": 0.7463592886924744, + "learning_rate": 1.0458994490275543e-07, + "loss": 0.3663, + "num_input_tokens_seen": 20787200, + "step": 20655 + }, + { + "epoch": 9.740688354549741, + "grad_norm": 0.33379119634628296, + "learning_rate": 1.0271857247430017e-07, + "loss": 0.3827, + "num_input_tokens_seen": 20791296, + "step": 20660 + }, + { + "epoch": 9.743045733144744, + "grad_norm": 0.2867121994495392, + "learning_rate": 1.0086405879318473e-07, + "loss": 0.3355, + "num_input_tokens_seen": 20795456, + "step": 20665 + }, + { + "epoch": 9.745403111739746, + "grad_norm": 0.2812403440475464, + "learning_rate": 9.902640511516292e-08, + "loss": 0.3713, + "num_input_tokens_seen": 20799648, + "step": 20670 + }, + { + "epoch": 9.747760490334748, + "grad_norm": 0.3582909405231476, + "learning_rate": 9.720561268456718e-08, + "loss": 0.3354, + "num_input_tokens_seen": 20804480, + "step": 20675 + }, + { + "epoch": 9.75011786892975, + "grad_norm": 0.3565206825733185, + "learning_rate": 9.540168273431682e-08, + "loss": 0.3736, + "num_input_tokens_seen": 20810016, + "step": 20680 + }, + { + "epoch": 9.752475247524753, + "grad_norm": 0.3160330057144165, + "learning_rate": 9.361461648590697e-08, + "loss": 0.3482, + "num_input_tokens_seen": 20814976, + "step": 20685 + }, + { + "epoch": 9.754832626119756, + "grad_norm": 0.4971728026866913, + "learning_rate": 9.184441514942243e-08, + "loss": 0.337, + "num_input_tokens_seen": 20819104, + "step": 20690 + }, + { + "epoch": 9.757190004714758, + "grad_norm": 0.5466611981391907, + "learning_rate": 9.009107992351828e-08, + "loss": 0.3462, + "num_input_tokens_seen": 20823904, + "step": 20695 + }, + { + "epoch": 9.75954738330976, + "grad_norm": 0.5114795565605164, + "learning_rate": 8.835461199543649e-08, + "loss": 0.3573, + "num_input_tokens_seen": 20830048, + "step": 20700 + }, + { + "epoch": 9.761904761904763, + "grad_norm": 0.4165806472301483, + "learning_rate": 8.66350125409976e-08, + "loss": 0.3844, + "num_input_tokens_seen": 20834688, + "step": 20705 + }, + { + "epoch": 9.764262140499763, + "grad_norm": 0.8737223148345947, + "learning_rate": 8.493228272459242e-08, + "loss": 0.3647, + "num_input_tokens_seen": 20839040, + "step": 20710 + }, + { + "epoch": 9.766619519094768, + "grad_norm": 0.5627273321151733, + "learning_rate": 8.324642369919588e-08, + "loss": 0.2956, + "num_input_tokens_seen": 20843840, + "step": 20715 + }, + { + "epoch": 9.768976897689768, + "grad_norm": 0.3333481550216675, + "learning_rate": 8.157743660635875e-08, + "loss": 0.3413, + "num_input_tokens_seen": 20848512, + "step": 20720 + }, + { + "epoch": 9.77133427628477, + "grad_norm": 0.3072681725025177, + "learning_rate": 7.992532257620478e-08, + "loss": 0.3101, + "num_input_tokens_seen": 20853120, + "step": 20725 + }, + { + "epoch": 9.773691654879773, + "grad_norm": 0.3383781611919403, + "learning_rate": 7.829008272743077e-08, + "loss": 0.3208, + "num_input_tokens_seen": 20857536, + "step": 20730 + }, + { + "epoch": 9.776049033474775, + "grad_norm": 0.5953621864318848, + "learning_rate": 7.667171816731489e-08, + "loss": 0.3203, + "num_input_tokens_seen": 20862688, + "step": 20735 + }, + { + "epoch": 9.778406412069778, + "grad_norm": 0.4941369891166687, + "learning_rate": 7.507022999169999e-08, + "loss": 0.346, + "num_input_tokens_seen": 20867712, + "step": 20740 + }, + { + "epoch": 9.78076379066478, + "grad_norm": 0.2427927553653717, + "learning_rate": 7.348561928500752e-08, + "loss": 0.3177, + "num_input_tokens_seen": 20872544, + "step": 20745 + }, + { + "epoch": 9.783121169259783, + "grad_norm": 0.49167513847351074, + "learning_rate": 7.19178871202264e-08, + "loss": 0.2969, + "num_input_tokens_seen": 20877504, + "step": 20750 + }, + { + "epoch": 9.785478547854785, + "grad_norm": 0.44819021224975586, + "learning_rate": 7.036703455891858e-08, + "loss": 0.294, + "num_input_tokens_seen": 20882656, + "step": 20755 + }, + { + "epoch": 9.787835926449787, + "grad_norm": 0.2664033770561218, + "learning_rate": 6.883306265121625e-08, + "loss": 0.337, + "num_input_tokens_seen": 20887136, + "step": 20760 + }, + { + "epoch": 9.79019330504479, + "grad_norm": 0.3609359562397003, + "learning_rate": 6.731597243581911e-08, + "loss": 0.3283, + "num_input_tokens_seen": 20891488, + "step": 20765 + }, + { + "epoch": 9.792550683639792, + "grad_norm": 0.793319046497345, + "learning_rate": 6.581576494000264e-08, + "loss": 0.3423, + "num_input_tokens_seen": 20896992, + "step": 20770 + }, + { + "epoch": 9.794908062234795, + "grad_norm": 0.4787183701992035, + "learning_rate": 6.43324411795987e-08, + "loss": 0.3464, + "num_input_tokens_seen": 20901504, + "step": 20775 + }, + { + "epoch": 9.797265440829797, + "grad_norm": 0.5470705628395081, + "learning_rate": 6.286600215902049e-08, + "loss": 0.3223, + "num_input_tokens_seen": 20906880, + "step": 20780 + }, + { + "epoch": 9.7996228194248, + "grad_norm": 0.3881804943084717, + "learning_rate": 6.141644887123487e-08, + "loss": 0.351, + "num_input_tokens_seen": 20911776, + "step": 20785 + }, + { + "epoch": 9.801980198019802, + "grad_norm": 0.47209295630455017, + "learning_rate": 5.998378229778446e-08, + "loss": 0.3694, + "num_input_tokens_seen": 20917376, + "step": 20790 + }, + { + "epoch": 9.804337576614804, + "grad_norm": 0.3681820333003998, + "learning_rate": 5.8568003408770996e-08, + "loss": 0.389, + "num_input_tokens_seen": 20922048, + "step": 20795 + }, + { + "epoch": 9.806694955209807, + "grad_norm": 0.37363988161087036, + "learning_rate": 5.716911316286655e-08, + "loss": 0.33, + "num_input_tokens_seen": 20927008, + "step": 20800 + }, + { + "epoch": 9.809052333804809, + "grad_norm": 0.28159308433532715, + "learning_rate": 5.578711250730506e-08, + "loss": 0.3283, + "num_input_tokens_seen": 20930944, + "step": 20805 + }, + { + "epoch": 9.811409712399811, + "grad_norm": 0.4914868175983429, + "learning_rate": 5.4422002377879645e-08, + "loss": 0.3366, + "num_input_tokens_seen": 20935296, + "step": 20810 + }, + { + "epoch": 9.813767090994814, + "grad_norm": 0.49394771456718445, + "learning_rate": 5.3073783698950885e-08, + "loss": 0.2694, + "num_input_tokens_seen": 20940064, + "step": 20815 + }, + { + "epoch": 9.816124469589816, + "grad_norm": 0.3337688446044922, + "learning_rate": 5.174245738344408e-08, + "loss": 0.361, + "num_input_tokens_seen": 20944064, + "step": 20820 + }, + { + "epoch": 9.818481848184819, + "grad_norm": 0.23761023581027985, + "learning_rate": 5.042802433283811e-08, + "loss": 0.3211, + "num_input_tokens_seen": 20948672, + "step": 20825 + }, + { + "epoch": 9.820839226779821, + "grad_norm": 0.5067458152770996, + "learning_rate": 4.91304854371849e-08, + "loss": 0.3126, + "num_input_tokens_seen": 20953696, + "step": 20830 + }, + { + "epoch": 9.823196605374823, + "grad_norm": 0.34861132502555847, + "learning_rate": 4.784984157508166e-08, + "loss": 0.3496, + "num_input_tokens_seen": 20959296, + "step": 20835 + }, + { + "epoch": 9.825553983969826, + "grad_norm": 0.35926273465156555, + "learning_rate": 4.658609361369859e-08, + "loss": 0.3404, + "num_input_tokens_seen": 20964480, + "step": 20840 + }, + { + "epoch": 9.827911362564828, + "grad_norm": 0.32105201482772827, + "learning_rate": 4.533924240875953e-08, + "loss": 0.3871, + "num_input_tokens_seen": 20969376, + "step": 20845 + }, + { + "epoch": 9.83026874115983, + "grad_norm": 0.5165014266967773, + "learning_rate": 4.410928880454468e-08, + "loss": 0.3445, + "num_input_tokens_seen": 20975648, + "step": 20850 + }, + { + "epoch": 9.832626119754833, + "grad_norm": 0.5435631275177002, + "learning_rate": 4.2896233633896165e-08, + "loss": 0.3423, + "num_input_tokens_seen": 20981632, + "step": 20855 + }, + { + "epoch": 9.834983498349835, + "grad_norm": 0.3271990418434143, + "learning_rate": 4.170007771821527e-08, + "loss": 0.2868, + "num_input_tokens_seen": 20987584, + "step": 20860 + }, + { + "epoch": 9.837340876944838, + "grad_norm": 0.36093294620513916, + "learning_rate": 4.052082186745409e-08, + "loss": 0.3531, + "num_input_tokens_seen": 20992256, + "step": 20865 + }, + { + "epoch": 9.83969825553984, + "grad_norm": 0.27724772691726685, + "learning_rate": 3.9358466880126674e-08, + "loss": 0.3414, + "num_input_tokens_seen": 20997504, + "step": 20870 + }, + { + "epoch": 9.842055634134843, + "grad_norm": 0.3829445540904999, + "learning_rate": 3.821301354329787e-08, + "loss": 0.333, + "num_input_tokens_seen": 21002752, + "step": 20875 + }, + { + "epoch": 9.844413012729845, + "grad_norm": 0.44164058566093445, + "learning_rate": 3.7084462632594465e-08, + "loss": 0.2725, + "num_input_tokens_seen": 21008608, + "step": 20880 + }, + { + "epoch": 9.846770391324847, + "grad_norm": 0.28281909227371216, + "learning_rate": 3.597281491219129e-08, + "loss": 0.3262, + "num_input_tokens_seen": 21012928, + "step": 20885 + }, + { + "epoch": 9.84912776991985, + "grad_norm": 0.5596720576286316, + "learning_rate": 3.487807113482511e-08, + "loss": 0.3604, + "num_input_tokens_seen": 21018080, + "step": 20890 + }, + { + "epoch": 9.851485148514852, + "grad_norm": 0.3861311674118042, + "learning_rate": 3.3800232041777954e-08, + "loss": 0.3134, + "num_input_tokens_seen": 21022112, + "step": 20895 + }, + { + "epoch": 9.853842527109855, + "grad_norm": 0.49770310521125793, + "learning_rate": 3.2739298362888246e-08, + "loss": 0.3121, + "num_input_tokens_seen": 21027712, + "step": 20900 + }, + { + "epoch": 9.856199905704855, + "grad_norm": 0.279004842042923, + "learning_rate": 3.1695270816553546e-08, + "loss": 0.3572, + "num_input_tokens_seen": 21032384, + "step": 20905 + }, + { + "epoch": 9.85855728429986, + "grad_norm": 0.3949665129184723, + "learning_rate": 3.066815010971391e-08, + "loss": 0.348, + "num_input_tokens_seen": 21037824, + "step": 20910 + }, + { + "epoch": 9.86091466289486, + "grad_norm": 0.42473092675209045, + "learning_rate": 2.9657936937865782e-08, + "loss": 0.2702, + "num_input_tokens_seen": 21042816, + "step": 20915 + }, + { + "epoch": 9.863272041489862, + "grad_norm": 0.5335753560066223, + "learning_rate": 2.86646319850592e-08, + "loss": 0.2773, + "num_input_tokens_seen": 21047520, + "step": 20920 + }, + { + "epoch": 9.865629420084865, + "grad_norm": 0.35911786556243896, + "learning_rate": 2.768823592389225e-08, + "loss": 0.3455, + "num_input_tokens_seen": 21052512, + "step": 20925 + }, + { + "epoch": 9.867986798679867, + "grad_norm": 0.5150962471961975, + "learning_rate": 2.6728749415511066e-08, + "loss": 0.3351, + "num_input_tokens_seen": 21057952, + "step": 20930 + }, + { + "epoch": 9.87034417727487, + "grad_norm": 0.2666049599647522, + "learning_rate": 2.5786173109620948e-08, + "loss": 0.3334, + "num_input_tokens_seen": 21063200, + "step": 20935 + }, + { + "epoch": 9.872701555869872, + "grad_norm": 0.5516017079353333, + "learning_rate": 2.4860507644464126e-08, + "loss": 0.301, + "num_input_tokens_seen": 21069024, + "step": 20940 + }, + { + "epoch": 9.875058934464874, + "grad_norm": 0.31771397590637207, + "learning_rate": 2.395175364684199e-08, + "loss": 0.299, + "num_input_tokens_seen": 21073408, + "step": 20945 + }, + { + "epoch": 9.877416313059877, + "grad_norm": 0.3365183472633362, + "learning_rate": 2.305991173209843e-08, + "loss": 0.3514, + "num_input_tokens_seen": 21078688, + "step": 20950 + }, + { + "epoch": 9.87977369165488, + "grad_norm": 0.32431215047836304, + "learning_rate": 2.2184982504130926e-08, + "loss": 0.3276, + "num_input_tokens_seen": 21082912, + "step": 20955 + }, + { + "epoch": 9.882131070249882, + "grad_norm": 0.25610828399658203, + "learning_rate": 2.1326966555379468e-08, + "loss": 0.2838, + "num_input_tokens_seen": 21088192, + "step": 20960 + }, + { + "epoch": 9.884488448844884, + "grad_norm": 0.29381000995635986, + "learning_rate": 2.0485864466837645e-08, + "loss": 0.3673, + "num_input_tokens_seen": 21092768, + "step": 20965 + }, + { + "epoch": 9.886845827439886, + "grad_norm": 0.3301774859428406, + "learning_rate": 1.9661676808038763e-08, + "loss": 0.3534, + "num_input_tokens_seen": 21097888, + "step": 20970 + }, + { + "epoch": 9.889203206034889, + "grad_norm": 0.2865719795227051, + "learning_rate": 1.8854404137069738e-08, + "loss": 0.3053, + "num_input_tokens_seen": 21102400, + "step": 20975 + }, + { + "epoch": 9.891560584629891, + "grad_norm": 0.5295373201370239, + "learning_rate": 1.8064047000557193e-08, + "loss": 0.2478, + "num_input_tokens_seen": 21108160, + "step": 20980 + }, + { + "epoch": 9.893917963224894, + "grad_norm": 0.37825217843055725, + "learning_rate": 1.729060593368137e-08, + "loss": 0.2866, + "num_input_tokens_seen": 21113184, + "step": 20985 + }, + { + "epoch": 9.896275341819896, + "grad_norm": 0.31483179330825806, + "learning_rate": 1.653408146016222e-08, + "loss": 0.3231, + "num_input_tokens_seen": 21118208, + "step": 20990 + }, + { + "epoch": 9.898632720414899, + "grad_norm": 0.7945747971534729, + "learning_rate": 1.5794474092267753e-08, + "loss": 0.3198, + "num_input_tokens_seen": 21123488, + "step": 20995 + }, + { + "epoch": 9.900990099009901, + "grad_norm": 0.397238165140152, + "learning_rate": 1.507178433080847e-08, + "loss": 0.2892, + "num_input_tokens_seen": 21128800, + "step": 21000 + }, + { + "epoch": 9.903347477604903, + "grad_norm": 0.5282673835754395, + "learning_rate": 1.4366012665140149e-08, + "loss": 0.2661, + "num_input_tokens_seen": 21132352, + "step": 21005 + }, + { + "epoch": 9.905704856199906, + "grad_norm": 0.3132556974887848, + "learning_rate": 1.3677159573163844e-08, + "loss": 0.3496, + "num_input_tokens_seen": 21139040, + "step": 21010 + }, + { + "epoch": 9.908062234794908, + "grad_norm": 0.320646196603775, + "learning_rate": 1.3005225521325881e-08, + "loss": 0.3129, + "num_input_tokens_seen": 21143840, + "step": 21015 + }, + { + "epoch": 9.91041961338991, + "grad_norm": 0.27793097496032715, + "learning_rate": 1.2350210964612308e-08, + "loss": 0.2627, + "num_input_tokens_seen": 21149664, + "step": 21020 + }, + { + "epoch": 9.912776991984913, + "grad_norm": 0.34693294763565063, + "learning_rate": 1.1712116346557222e-08, + "loss": 0.31, + "num_input_tokens_seen": 21156064, + "step": 21025 + }, + { + "epoch": 9.915134370579915, + "grad_norm": 0.3009720742702484, + "learning_rate": 1.1090942099228895e-08, + "loss": 0.3133, + "num_input_tokens_seen": 21160928, + "step": 21030 + }, + { + "epoch": 9.917491749174918, + "grad_norm": 0.3236214518547058, + "learning_rate": 1.0486688643251974e-08, + "loss": 0.3323, + "num_input_tokens_seen": 21165248, + "step": 21035 + }, + { + "epoch": 9.91984912776992, + "grad_norm": 0.35960477590560913, + "learning_rate": 9.899356387779724e-09, + "loss": 0.3219, + "num_input_tokens_seen": 21170112, + "step": 21040 + }, + { + "epoch": 9.922206506364923, + "grad_norm": 0.3876228928565979, + "learning_rate": 9.328945730519012e-09, + "loss": 0.3102, + "num_input_tokens_seen": 21175584, + "step": 21045 + }, + { + "epoch": 9.924563884959925, + "grad_norm": 0.36101001501083374, + "learning_rate": 8.775457057708102e-09, + "loss": 0.3287, + "num_input_tokens_seen": 21179744, + "step": 21050 + }, + { + "epoch": 9.926921263554927, + "grad_norm": 0.3316519260406494, + "learning_rate": 8.238890744136084e-09, + "loss": 0.3487, + "num_input_tokens_seen": 21184704, + "step": 21055 + }, + { + "epoch": 9.92927864214993, + "grad_norm": 0.40789639949798584, + "learning_rate": 7.71924715312622e-09, + "loss": 0.3394, + "num_input_tokens_seen": 21189664, + "step": 21060 + }, + { + "epoch": 9.931636020744932, + "grad_norm": 0.3217167258262634, + "learning_rate": 7.216526636547039e-09, + "loss": 0.3094, + "num_input_tokens_seen": 21194240, + "step": 21065 + }, + { + "epoch": 9.933993399339935, + "grad_norm": 0.37959516048431396, + "learning_rate": 6.730729534804025e-09, + "loss": 0.3551, + "num_input_tokens_seen": 21199552, + "step": 21070 + }, + { + "epoch": 9.936350777934937, + "grad_norm": 0.30707448720932007, + "learning_rate": 6.261856176850711e-09, + "loss": 0.3329, + "num_input_tokens_seen": 21203232, + "step": 21075 + }, + { + "epoch": 9.93870815652994, + "grad_norm": 0.35258832573890686, + "learning_rate": 5.809906880174798e-09, + "loss": 0.3065, + "num_input_tokens_seen": 21209696, + "step": 21080 + }, + { + "epoch": 9.941065535124942, + "grad_norm": 0.47105392813682556, + "learning_rate": 5.374881950803712e-09, + "loss": 0.3348, + "num_input_tokens_seen": 21214688, + "step": 21085 + }, + { + "epoch": 9.943422913719944, + "grad_norm": 0.3740828335285187, + "learning_rate": 4.956781683310152e-09, + "loss": 0.3285, + "num_input_tokens_seen": 21221184, + "step": 21090 + }, + { + "epoch": 9.945780292314947, + "grad_norm": 0.31674885749816895, + "learning_rate": 4.555606360798215e-09, + "loss": 0.3072, + "num_input_tokens_seen": 21225408, + "step": 21095 + }, + { + "epoch": 9.948137670909949, + "grad_norm": 0.3569197654724121, + "learning_rate": 4.171356254920045e-09, + "loss": 0.3194, + "num_input_tokens_seen": 21230400, + "step": 21100 + }, + { + "epoch": 9.950495049504951, + "grad_norm": 0.4137776792049408, + "learning_rate": 3.804031625864735e-09, + "loss": 0.3172, + "num_input_tokens_seen": 21234752, + "step": 21105 + }, + { + "epoch": 9.952852428099952, + "grad_norm": 0.7074018120765686, + "learning_rate": 3.453632722358324e-09, + "loss": 0.3453, + "num_input_tokens_seen": 21240192, + "step": 21110 + }, + { + "epoch": 9.955209806694956, + "grad_norm": 0.35192224383354187, + "learning_rate": 3.1201597816638006e-09, + "loss": 0.3439, + "num_input_tokens_seen": 21244608, + "step": 21115 + }, + { + "epoch": 9.957567185289957, + "grad_norm": 0.7644410729408264, + "learning_rate": 2.8036130295922004e-09, + "loss": 0.3861, + "num_input_tokens_seen": 21251680, + "step": 21120 + }, + { + "epoch": 9.95992456388496, + "grad_norm": 0.455217570066452, + "learning_rate": 2.5039926804831803e-09, + "loss": 0.3405, + "num_input_tokens_seen": 21257600, + "step": 21125 + }, + { + "epoch": 9.962281942479962, + "grad_norm": 0.4991098642349243, + "learning_rate": 2.2212989372188964e-09, + "loss": 0.3087, + "num_input_tokens_seen": 21262336, + "step": 21130 + }, + { + "epoch": 9.964639321074964, + "grad_norm": 0.452227383852005, + "learning_rate": 1.955531991224002e-09, + "loss": 0.306, + "num_input_tokens_seen": 21267136, + "step": 21135 + }, + { + "epoch": 9.966996699669966, + "grad_norm": 0.4429827630519867, + "learning_rate": 1.7066920224573236e-09, + "loss": 0.2891, + "num_input_tokens_seen": 21271872, + "step": 21140 + }, + { + "epoch": 9.969354078264969, + "grad_norm": 0.3449242413043976, + "learning_rate": 1.4747791994118575e-09, + "loss": 0.298, + "num_input_tokens_seen": 21277056, + "step": 21145 + }, + { + "epoch": 9.971711456859971, + "grad_norm": 0.32945263385772705, + "learning_rate": 1.2597936791286514e-09, + "loss": 0.337, + "num_input_tokens_seen": 21281888, + "step": 21150 + }, + { + "epoch": 9.974068835454974, + "grad_norm": 0.5306524634361267, + "learning_rate": 1.061735607177372e-09, + "loss": 0.2993, + "num_input_tokens_seen": 21288128, + "step": 21155 + }, + { + "epoch": 9.976426214049976, + "grad_norm": 0.2950415015220642, + "learning_rate": 8.806051176729612e-10, + "loss": 0.3316, + "num_input_tokens_seen": 21293280, + "step": 21160 + }, + { + "epoch": 9.978783592644978, + "grad_norm": 0.3454124629497528, + "learning_rate": 7.164023332617564e-10, + "loss": 0.2848, + "num_input_tokens_seen": 21298880, + "step": 21165 + }, + { + "epoch": 9.98114097123998, + "grad_norm": 0.4946621060371399, + "learning_rate": 5.691273651325935e-10, + "loss": 0.2927, + "num_input_tokens_seen": 21304480, + "step": 21170 + }, + { + "epoch": 9.983498349834983, + "grad_norm": 0.404501348733902, + "learning_rate": 4.3878031300847997e-10, + "loss": 0.3214, + "num_input_tokens_seen": 21309120, + "step": 21175 + }, + { + "epoch": 9.985855728429986, + "grad_norm": 0.5248972773551941, + "learning_rate": 3.253612651521465e-10, + "loss": 0.3304, + "num_input_tokens_seen": 21314048, + "step": 21180 + }, + { + "epoch": 9.988213107024988, + "grad_norm": 0.3731975555419922, + "learning_rate": 2.2887029836327067e-10, + "loss": 0.3329, + "num_input_tokens_seen": 21319552, + "step": 21185 + }, + { + "epoch": 9.99057048561999, + "grad_norm": 0.2725750803947449, + "learning_rate": 1.4930747798125312e-10, + "loss": 0.3012, + "num_input_tokens_seen": 21323808, + "step": 21190 + }, + { + "epoch": 9.992927864214993, + "grad_norm": 0.3214316666126251, + "learning_rate": 8.66728578768905e-11, + "loss": 0.3043, + "num_input_tokens_seen": 21329056, + "step": 21195 + }, + { + "epoch": 9.995285242809995, + "grad_norm": 0.24733874201774597, + "learning_rate": 4.096648046347795e-11, + "loss": 0.3316, + "num_input_tokens_seen": 21333280, + "step": 21200 + }, + { + "epoch": 9.997642621404998, + "grad_norm": 0.3036029040813446, + "learning_rate": 1.218837669125783e-11, + "loss": 0.3499, + "num_input_tokens_seen": 21337632, + "step": 21205 + }, + { + "epoch": 10.0, + "grad_norm": 0.3289010524749756, + "learning_rate": 3.3856604464421736e-13, + "loss": 0.3516, + "num_input_tokens_seen": 21342336, + "step": 21210 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 21342336, + "step": 21210, + "total_flos": 9.610357440270828e+17, + "train_loss": 0.8144663356093516, + "train_runtime": 5530.2917, + "train_samples_per_second": 15.341, + "train_steps_per_second": 3.835 + } + ], + "logging_steps": 5, + "max_steps": 21210, + "num_input_tokens_seen": 21342336, + "num_train_epochs": 10, + "save_steps": 1061, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.610357440270828e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}