{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4930747922437675, "eval_steps": 3001, "global_step": 18000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013850415512465374, "grad_norm": 18.132577896118164, "learning_rate": 2.5688073394495415e-06, "loss": 3.1562, "step": 10 }, { "epoch": 0.002770083102493075, "grad_norm": 10.339975357055664, "learning_rate": 6.238532110091744e-06, "loss": 2.5596, "step": 20 }, { "epoch": 0.004155124653739612, "grad_norm": 8.46363639831543, "learning_rate": 9.908256880733946e-06, "loss": 2.3108, "step": 30 }, { "epoch": 0.00554016620498615, "grad_norm": 10.257675170898438, "learning_rate": 1.3577981651376149e-05, "loss": 2.1665, "step": 40 }, { "epoch": 0.006925207756232687, "grad_norm": 10.614598274230957, "learning_rate": 1.724770642201835e-05, "loss": 1.9422, "step": 50 }, { "epoch": 0.008310249307479225, "grad_norm": 10.544795989990234, "learning_rate": 2.091743119266055e-05, "loss": 1.7234, "step": 60 }, { "epoch": 0.009695290858725761, "grad_norm": 9.736770629882812, "learning_rate": 2.4587155963302752e-05, "loss": 1.7008, "step": 70 }, { "epoch": 0.0110803324099723, "grad_norm": 8.845556259155273, "learning_rate": 2.8256880733944954e-05, "loss": 1.6215, "step": 80 }, { "epoch": 0.012465373961218837, "grad_norm": 12.72878360748291, "learning_rate": 3.1926605504587156e-05, "loss": 1.5234, "step": 90 }, { "epoch": 0.013850415512465374, "grad_norm": 9.1851167678833, "learning_rate": 3.559633027522936e-05, "loss": 1.5045, "step": 100 }, { "epoch": 0.015235457063711912, "grad_norm": 12.70969009399414, "learning_rate": 3.926605504587156e-05, "loss": 1.5202, "step": 110 }, { "epoch": 0.01662049861495845, "grad_norm": 13.66281509399414, "learning_rate": 3.998515150109044e-05, "loss": 1.4786, "step": 120 }, { "epoch": 0.018005540166204988, "grad_norm": 13.012584686279297, "learning_rate": 3.996659087745348e-05, "loss": 1.4217, "step": 130 }, { "epoch": 0.019390581717451522, "grad_norm": 8.268105506896973, "learning_rate": 3.994803025381653e-05, "loss": 1.3395, "step": 140 }, { "epoch": 0.02077562326869806, "grad_norm": 10.73968505859375, "learning_rate": 3.9929469630179577e-05, "loss": 1.3918, "step": 150 }, { "epoch": 0.0221606648199446, "grad_norm": 15.68938159942627, "learning_rate": 3.991090900654263e-05, "loss": 1.3772, "step": 160 }, { "epoch": 0.023545706371191136, "grad_norm": 8.91905689239502, "learning_rate": 3.989234838290567e-05, "loss": 1.3547, "step": 170 }, { "epoch": 0.024930747922437674, "grad_norm": 8.650906562805176, "learning_rate": 3.9873787759268715e-05, "loss": 1.2924, "step": 180 }, { "epoch": 0.02631578947368421, "grad_norm": 6.8491291999816895, "learning_rate": 3.985522713563176e-05, "loss": 1.2698, "step": 190 }, { "epoch": 0.027700831024930747, "grad_norm": 12.829854011535645, "learning_rate": 3.983666651199481e-05, "loss": 1.2712, "step": 200 }, { "epoch": 0.029085872576177285, "grad_norm": 7.840148448944092, "learning_rate": 3.981810588835785e-05, "loss": 1.2306, "step": 210 }, { "epoch": 0.030470914127423823, "grad_norm": 9.273849487304688, "learning_rate": 3.97995452647209e-05, "loss": 1.1315, "step": 220 }, { "epoch": 0.03185595567867036, "grad_norm": 6.926727771759033, "learning_rate": 3.978098464108395e-05, "loss": 1.2234, "step": 230 }, { "epoch": 0.0332409972299169, "grad_norm": 8.453941345214844, "learning_rate": 3.9762424017446984e-05, "loss": 1.1558, "step": 240 }, { "epoch": 0.03462603878116344, "grad_norm": 12.281208992004395, "learning_rate": 3.9743863393810035e-05, "loss": 1.2207, "step": 250 }, { "epoch": 0.036011080332409975, "grad_norm": 8.216794967651367, "learning_rate": 3.972530277017308e-05, "loss": 1.1615, "step": 260 }, { "epoch": 0.037396121883656507, "grad_norm": 8.111953735351562, "learning_rate": 3.970674214653613e-05, "loss": 1.0734, "step": 270 }, { "epoch": 0.038781163434903045, "grad_norm": 25.132801055908203, "learning_rate": 3.968818152289917e-05, "loss": 1.1208, "step": 280 }, { "epoch": 0.04016620498614958, "grad_norm": 8.570955276489258, "learning_rate": 3.966962089926222e-05, "loss": 1.1871, "step": 290 }, { "epoch": 0.04155124653739612, "grad_norm": 6.715633869171143, "learning_rate": 3.965106027562526e-05, "loss": 1.0505, "step": 300 }, { "epoch": 0.04293628808864266, "grad_norm": 9.195822715759277, "learning_rate": 3.963249965198831e-05, "loss": 1.132, "step": 310 }, { "epoch": 0.0443213296398892, "grad_norm": 13.249361991882324, "learning_rate": 3.9613939028351355e-05, "loss": 1.1381, "step": 320 }, { "epoch": 0.045706371191135735, "grad_norm": 10.22148323059082, "learning_rate": 3.95953784047144e-05, "loss": 1.03, "step": 330 }, { "epoch": 0.04709141274238227, "grad_norm": 6.477139949798584, "learning_rate": 3.957681778107745e-05, "loss": 1.0697, "step": 340 }, { "epoch": 0.04847645429362881, "grad_norm": 7.295064449310303, "learning_rate": 3.9558257157440493e-05, "loss": 1.1033, "step": 350 }, { "epoch": 0.04986149584487535, "grad_norm": 7.153372287750244, "learning_rate": 3.953969653380354e-05, "loss": 1.0929, "step": 360 }, { "epoch": 0.05124653739612189, "grad_norm": 7.3654608726501465, "learning_rate": 3.952113591016658e-05, "loss": 1.0833, "step": 370 }, { "epoch": 0.05263157894736842, "grad_norm": 6.621361255645752, "learning_rate": 3.950257528652963e-05, "loss": 1.132, "step": 380 }, { "epoch": 0.054016620498614956, "grad_norm": 8.671921730041504, "learning_rate": 3.9484014662892675e-05, "loss": 0.9377, "step": 390 }, { "epoch": 0.055401662049861494, "grad_norm": 5.83281135559082, "learning_rate": 3.9465454039255726e-05, "loss": 1.077, "step": 400 }, { "epoch": 0.05678670360110803, "grad_norm": 10.525577545166016, "learning_rate": 3.944689341561877e-05, "loss": 1.0157, "step": 410 }, { "epoch": 0.05817174515235457, "grad_norm": 8.713621139526367, "learning_rate": 3.9428332791981814e-05, "loss": 0.9943, "step": 420 }, { "epoch": 0.05955678670360111, "grad_norm": 7.146033763885498, "learning_rate": 3.940977216834486e-05, "loss": 1.0711, "step": 430 }, { "epoch": 0.060941828254847646, "grad_norm": 5.721404075622559, "learning_rate": 3.93912115447079e-05, "loss": 1.0587, "step": 440 }, { "epoch": 0.062326869806094184, "grad_norm": 6.381560802459717, "learning_rate": 3.937265092107095e-05, "loss": 0.9532, "step": 450 }, { "epoch": 0.06371191135734072, "grad_norm": 6.270388603210449, "learning_rate": 3.9354090297433996e-05, "loss": 0.9633, "step": 460 }, { "epoch": 0.06509695290858726, "grad_norm": 14.950815200805664, "learning_rate": 3.9335529673797046e-05, "loss": 1.0124, "step": 470 }, { "epoch": 0.0664819944598338, "grad_norm": 10.147126197814941, "learning_rate": 3.931696905016009e-05, "loss": 0.9639, "step": 480 }, { "epoch": 0.06786703601108034, "grad_norm": 6.738188743591309, "learning_rate": 3.9298408426523134e-05, "loss": 0.8883, "step": 490 }, { "epoch": 0.06925207756232687, "grad_norm": 6.236662864685059, "learning_rate": 3.927984780288618e-05, "loss": 0.978, "step": 500 }, { "epoch": 0.07063711911357341, "grad_norm": 6.446779251098633, "learning_rate": 3.926128717924923e-05, "loss": 0.9513, "step": 510 }, { "epoch": 0.07202216066481995, "grad_norm": 5.965839385986328, "learning_rate": 3.924272655561227e-05, "loss": 0.9512, "step": 520 }, { "epoch": 0.07340720221606649, "grad_norm": 6.4189863204956055, "learning_rate": 3.9224165931975316e-05, "loss": 0.9166, "step": 530 }, { "epoch": 0.07479224376731301, "grad_norm": 6.521060943603516, "learning_rate": 3.9205605308338366e-05, "loss": 0.9101, "step": 540 }, { "epoch": 0.07617728531855955, "grad_norm": 6.578867435455322, "learning_rate": 3.918704468470141e-05, "loss": 0.89, "step": 550 }, { "epoch": 0.07756232686980609, "grad_norm": 5.1601080894470215, "learning_rate": 3.9168484061064454e-05, "loss": 0.9313, "step": 560 }, { "epoch": 0.07894736842105263, "grad_norm": 6.995227336883545, "learning_rate": 3.91499234374275e-05, "loss": 0.8137, "step": 570 }, { "epoch": 0.08033240997229917, "grad_norm": 8.542673110961914, "learning_rate": 3.913136281379055e-05, "loss": 0.843, "step": 580 }, { "epoch": 0.0817174515235457, "grad_norm": 7.692107677459717, "learning_rate": 3.911280219015359e-05, "loss": 0.8752, "step": 590 }, { "epoch": 0.08310249307479224, "grad_norm": 7.9079060554504395, "learning_rate": 3.909424156651664e-05, "loss": 0.9327, "step": 600 }, { "epoch": 0.08448753462603878, "grad_norm": 5.869115829467773, "learning_rate": 3.907568094287969e-05, "loss": 0.878, "step": 610 }, { "epoch": 0.08587257617728532, "grad_norm": 8.590213775634766, "learning_rate": 3.905712031924273e-05, "loss": 0.8299, "step": 620 }, { "epoch": 0.08725761772853186, "grad_norm": 5.390839099884033, "learning_rate": 3.9038559695605774e-05, "loss": 0.8466, "step": 630 }, { "epoch": 0.0886426592797784, "grad_norm": 9.3699312210083, "learning_rate": 3.901999907196882e-05, "loss": 0.8318, "step": 640 }, { "epoch": 0.09002770083102493, "grad_norm": 4.842573642730713, "learning_rate": 3.900143844833187e-05, "loss": 0.8058, "step": 650 }, { "epoch": 0.09141274238227147, "grad_norm": 6.638500690460205, "learning_rate": 3.898287782469491e-05, "loss": 0.925, "step": 660 }, { "epoch": 0.09279778393351801, "grad_norm": 5.404094219207764, "learning_rate": 3.8964317201057956e-05, "loss": 0.9217, "step": 670 }, { "epoch": 0.09418282548476455, "grad_norm": 6.114800453186035, "learning_rate": 3.8945756577421e-05, "loss": 0.8202, "step": 680 }, { "epoch": 0.09556786703601108, "grad_norm": 9.157390594482422, "learning_rate": 3.892719595378405e-05, "loss": 0.8351, "step": 690 }, { "epoch": 0.09695290858725762, "grad_norm": 4.4271697998046875, "learning_rate": 3.8908635330147095e-05, "loss": 0.8524, "step": 700 }, { "epoch": 0.09833795013850416, "grad_norm": 6.291593551635742, "learning_rate": 3.8890074706510145e-05, "loss": 0.7766, "step": 710 }, { "epoch": 0.0997229916897507, "grad_norm": 5.622344017028809, "learning_rate": 3.887151408287319e-05, "loss": 0.8793, "step": 720 }, { "epoch": 0.10110803324099724, "grad_norm": 5.384952545166016, "learning_rate": 3.885295345923623e-05, "loss": 0.8128, "step": 730 }, { "epoch": 0.10249307479224377, "grad_norm": 5.869991302490234, "learning_rate": 3.8834392835599277e-05, "loss": 0.7979, "step": 740 }, { "epoch": 0.1038781163434903, "grad_norm": 15.408794403076172, "learning_rate": 3.881583221196232e-05, "loss": 0.8147, "step": 750 }, { "epoch": 0.10526315789473684, "grad_norm": 3.851168394088745, "learning_rate": 3.879727158832537e-05, "loss": 0.8531, "step": 760 }, { "epoch": 0.10664819944598337, "grad_norm": 7.8020124435424805, "learning_rate": 3.8778710964688415e-05, "loss": 0.762, "step": 770 }, { "epoch": 0.10803324099722991, "grad_norm": 6.013789653778076, "learning_rate": 3.8760150341051465e-05, "loss": 0.9078, "step": 780 }, { "epoch": 0.10941828254847645, "grad_norm": 10.16032600402832, "learning_rate": 3.874158971741451e-05, "loss": 0.7734, "step": 790 }, { "epoch": 0.11080332409972299, "grad_norm": 5.860641956329346, "learning_rate": 3.872302909377755e-05, "loss": 0.7048, "step": 800 }, { "epoch": 0.11218836565096953, "grad_norm": 5.945785045623779, "learning_rate": 3.87044684701406e-05, "loss": 0.7713, "step": 810 }, { "epoch": 0.11357340720221606, "grad_norm": 7.668937683105469, "learning_rate": 3.868590784650365e-05, "loss": 0.8446, "step": 820 }, { "epoch": 0.1149584487534626, "grad_norm": 4.74345064163208, "learning_rate": 3.866734722286669e-05, "loss": 0.7783, "step": 830 }, { "epoch": 0.11634349030470914, "grad_norm": 6.158203601837158, "learning_rate": 3.8648786599229735e-05, "loss": 0.7757, "step": 840 }, { "epoch": 0.11772853185595568, "grad_norm": 6.251650333404541, "learning_rate": 3.8630225975592786e-05, "loss": 0.7444, "step": 850 }, { "epoch": 0.11911357340720222, "grad_norm": 6.265425682067871, "learning_rate": 3.861166535195583e-05, "loss": 0.7332, "step": 860 }, { "epoch": 0.12049861495844875, "grad_norm": 6.330787658691406, "learning_rate": 3.859310472831887e-05, "loss": 0.8099, "step": 870 }, { "epoch": 0.12188365650969529, "grad_norm": 5.528647422790527, "learning_rate": 3.857454410468192e-05, "loss": 0.7554, "step": 880 }, { "epoch": 0.12326869806094183, "grad_norm": 4.549752712249756, "learning_rate": 3.855598348104497e-05, "loss": 0.8289, "step": 890 }, { "epoch": 0.12465373961218837, "grad_norm": 6.695212364196777, "learning_rate": 3.853742285740801e-05, "loss": 0.8219, "step": 900 }, { "epoch": 0.1260387811634349, "grad_norm": 8.377049446105957, "learning_rate": 3.851886223377106e-05, "loss": 0.7509, "step": 910 }, { "epoch": 0.12742382271468145, "grad_norm": 3.6747989654541016, "learning_rate": 3.8500301610134106e-05, "loss": 0.7237, "step": 920 }, { "epoch": 0.12880886426592797, "grad_norm": 5.396711826324463, "learning_rate": 3.848174098649715e-05, "loss": 0.7875, "step": 930 }, { "epoch": 0.13019390581717452, "grad_norm": 5.941190719604492, "learning_rate": 3.8463180362860193e-05, "loss": 0.8429, "step": 940 }, { "epoch": 0.13157894736842105, "grad_norm": 6.800902843475342, "learning_rate": 3.844461973922324e-05, "loss": 0.722, "step": 950 }, { "epoch": 0.1329639889196676, "grad_norm": 5.411956787109375, "learning_rate": 3.842605911558629e-05, "loss": 0.6992, "step": 960 }, { "epoch": 0.13434903047091412, "grad_norm": 5.536626815795898, "learning_rate": 3.840749849194933e-05, "loss": 0.7314, "step": 970 }, { "epoch": 0.13573407202216067, "grad_norm": 6.304145812988281, "learning_rate": 3.838893786831238e-05, "loss": 0.7551, "step": 980 }, { "epoch": 0.1371191135734072, "grad_norm": 5.245241641998291, "learning_rate": 3.8370377244675426e-05, "loss": 0.7268, "step": 990 }, { "epoch": 0.13850415512465375, "grad_norm": 4.621682643890381, "learning_rate": 3.835181662103847e-05, "loss": 0.7104, "step": 1000 }, { "epoch": 0.13988919667590027, "grad_norm": 4.966935634613037, "learning_rate": 3.8333255997401514e-05, "loss": 0.7683, "step": 1010 }, { "epoch": 0.14127423822714683, "grad_norm": 5.239284992218018, "learning_rate": 3.8314695373764564e-05, "loss": 0.692, "step": 1020 }, { "epoch": 0.14265927977839335, "grad_norm": 5.201038360595703, "learning_rate": 3.829613475012761e-05, "loss": 0.7262, "step": 1030 }, { "epoch": 0.1440443213296399, "grad_norm": 5.287726879119873, "learning_rate": 3.827757412649065e-05, "loss": 0.7775, "step": 1040 }, { "epoch": 0.14542936288088643, "grad_norm": 5.782080173492432, "learning_rate": 3.8259013502853696e-05, "loss": 0.68, "step": 1050 }, { "epoch": 0.14681440443213298, "grad_norm": 4.6837239265441895, "learning_rate": 3.824045287921674e-05, "loss": 0.6769, "step": 1060 }, { "epoch": 0.1481994459833795, "grad_norm": 4.747815132141113, "learning_rate": 3.822189225557979e-05, "loss": 0.7367, "step": 1070 }, { "epoch": 0.14958448753462603, "grad_norm": 6.258734226226807, "learning_rate": 3.8203331631942834e-05, "loss": 0.7328, "step": 1080 }, { "epoch": 0.15096952908587258, "grad_norm": 6.083340644836426, "learning_rate": 3.8184771008305885e-05, "loss": 0.7175, "step": 1090 }, { "epoch": 0.1523545706371191, "grad_norm": 5.685976028442383, "learning_rate": 3.816621038466893e-05, "loss": 0.7737, "step": 1100 }, { "epoch": 0.15373961218836565, "grad_norm": 5.695319175720215, "learning_rate": 3.814764976103197e-05, "loss": 0.6561, "step": 1110 }, { "epoch": 0.15512465373961218, "grad_norm": 4.47851037979126, "learning_rate": 3.8129089137395016e-05, "loss": 0.7342, "step": 1120 }, { "epoch": 0.15650969529085873, "grad_norm": 5.210443019866943, "learning_rate": 3.8110528513758067e-05, "loss": 0.761, "step": 1130 }, { "epoch": 0.15789473684210525, "grad_norm": 4.6967549324035645, "learning_rate": 3.809196789012111e-05, "loss": 0.7034, "step": 1140 }, { "epoch": 0.1592797783933518, "grad_norm": 5.27488899230957, "learning_rate": 3.807340726648416e-05, "loss": 0.7862, "step": 1150 }, { "epoch": 0.16066481994459833, "grad_norm": 6.890642166137695, "learning_rate": 3.8054846642847205e-05, "loss": 0.6824, "step": 1160 }, { "epoch": 0.16204986149584488, "grad_norm": 4.336601734161377, "learning_rate": 3.803628601921025e-05, "loss": 0.6637, "step": 1170 }, { "epoch": 0.1634349030470914, "grad_norm": 7.999341011047363, "learning_rate": 3.801772539557329e-05, "loss": 0.6404, "step": 1180 }, { "epoch": 0.16481994459833796, "grad_norm": 5.302224636077881, "learning_rate": 3.7999164771936336e-05, "loss": 0.7179, "step": 1190 }, { "epoch": 0.16620498614958448, "grad_norm": 4.466766834259033, "learning_rate": 3.798060414829939e-05, "loss": 0.7234, "step": 1200 }, { "epoch": 0.16759002770083103, "grad_norm": 7.038808345794678, "learning_rate": 3.796204352466243e-05, "loss": 0.6967, "step": 1210 }, { "epoch": 0.16897506925207756, "grad_norm": 5.036026477813721, "learning_rate": 3.794348290102548e-05, "loss": 0.6566, "step": 1220 }, { "epoch": 0.1703601108033241, "grad_norm": 3.9233360290527344, "learning_rate": 3.7924922277388525e-05, "loss": 0.6905, "step": 1230 }, { "epoch": 0.17174515235457063, "grad_norm": 4.825267791748047, "learning_rate": 3.790636165375157e-05, "loss": 0.7097, "step": 1240 }, { "epoch": 0.1731301939058172, "grad_norm": 5.921151638031006, "learning_rate": 3.788780103011461e-05, "loss": 0.7085, "step": 1250 }, { "epoch": 0.1745152354570637, "grad_norm": 6.008309841156006, "learning_rate": 3.7869240406477656e-05, "loss": 0.7447, "step": 1260 }, { "epoch": 0.17590027700831026, "grad_norm": 9.49063491821289, "learning_rate": 3.785067978284071e-05, "loss": 0.5783, "step": 1270 }, { "epoch": 0.1772853185595568, "grad_norm": 8.459501266479492, "learning_rate": 3.783211915920375e-05, "loss": 0.6649, "step": 1280 }, { "epoch": 0.1786703601108033, "grad_norm": 4.825805187225342, "learning_rate": 3.78135585355668e-05, "loss": 0.7529, "step": 1290 }, { "epoch": 0.18005540166204986, "grad_norm": 5.760547161102295, "learning_rate": 3.7794997911929845e-05, "loss": 0.7133, "step": 1300 }, { "epoch": 0.1814404432132964, "grad_norm": 4.8905158042907715, "learning_rate": 3.777643728829289e-05, "loss": 0.6426, "step": 1310 }, { "epoch": 0.18282548476454294, "grad_norm": 3.2545783519744873, "learning_rate": 3.775787666465593e-05, "loss": 0.6756, "step": 1320 }, { "epoch": 0.18421052631578946, "grad_norm": 5.443678855895996, "learning_rate": 3.7739316041018983e-05, "loss": 0.7079, "step": 1330 }, { "epoch": 0.18559556786703602, "grad_norm": 4.882148742675781, "learning_rate": 3.772075541738203e-05, "loss": 0.6906, "step": 1340 }, { "epoch": 0.18698060941828254, "grad_norm": 4.647951126098633, "learning_rate": 3.770219479374508e-05, "loss": 0.7146, "step": 1350 }, { "epoch": 0.1883656509695291, "grad_norm": 4.474496841430664, "learning_rate": 3.768363417010812e-05, "loss": 0.6606, "step": 1360 }, { "epoch": 0.18975069252077562, "grad_norm": 4.751135349273682, "learning_rate": 3.7665073546471165e-05, "loss": 0.6824, "step": 1370 }, { "epoch": 0.19113573407202217, "grad_norm": 7.56292724609375, "learning_rate": 3.764651292283421e-05, "loss": 0.6646, "step": 1380 }, { "epoch": 0.1925207756232687, "grad_norm": 3.877157688140869, "learning_rate": 3.762795229919725e-05, "loss": 0.6486, "step": 1390 }, { "epoch": 0.19390581717451524, "grad_norm": 3.8777120113372803, "learning_rate": 3.7609391675560304e-05, "loss": 0.6452, "step": 1400 }, { "epoch": 0.19529085872576177, "grad_norm": 11.175263404846191, "learning_rate": 3.759083105192335e-05, "loss": 0.6786, "step": 1410 }, { "epoch": 0.19667590027700832, "grad_norm": 4.816016674041748, "learning_rate": 3.75722704282864e-05, "loss": 0.6549, "step": 1420 }, { "epoch": 0.19806094182825484, "grad_norm": 4.739681720733643, "learning_rate": 3.755370980464944e-05, "loss": 0.7098, "step": 1430 }, { "epoch": 0.1994459833795014, "grad_norm": 3.974384307861328, "learning_rate": 3.7535149181012486e-05, "loss": 0.6228, "step": 1440 }, { "epoch": 0.20083102493074792, "grad_norm": 5.216143608093262, "learning_rate": 3.751658855737553e-05, "loss": 0.6322, "step": 1450 }, { "epoch": 0.20221606648199447, "grad_norm": 4.246068954467773, "learning_rate": 3.749802793373858e-05, "loss": 0.6919, "step": 1460 }, { "epoch": 0.203601108033241, "grad_norm": 5.880276203155518, "learning_rate": 3.7479467310101624e-05, "loss": 0.6664, "step": 1470 }, { "epoch": 0.20498614958448755, "grad_norm": 4.310441493988037, "learning_rate": 3.746090668646467e-05, "loss": 0.666, "step": 1480 }, { "epoch": 0.20637119113573407, "grad_norm": 4.8572893142700195, "learning_rate": 3.744234606282771e-05, "loss": 0.5718, "step": 1490 }, { "epoch": 0.2077562326869806, "grad_norm": 4.827873229980469, "learning_rate": 3.7423785439190755e-05, "loss": 0.643, "step": 1500 }, { "epoch": 0.20914127423822715, "grad_norm": 4.610620021820068, "learning_rate": 3.7405224815553806e-05, "loss": 0.6318, "step": 1510 }, { "epoch": 0.21052631578947367, "grad_norm": 3.8572442531585693, "learning_rate": 3.738666419191685e-05, "loss": 0.6384, "step": 1520 }, { "epoch": 0.21191135734072022, "grad_norm": 5.45397424697876, "learning_rate": 3.73681035682799e-05, "loss": 0.6112, "step": 1530 }, { "epoch": 0.21329639889196675, "grad_norm": 4.176355361938477, "learning_rate": 3.7349542944642944e-05, "loss": 0.6084, "step": 1540 }, { "epoch": 0.2146814404432133, "grad_norm": 4.451131343841553, "learning_rate": 3.733098232100599e-05, "loss": 0.6443, "step": 1550 }, { "epoch": 0.21606648199445982, "grad_norm": 4.111794471740723, "learning_rate": 3.731242169736903e-05, "loss": 0.5832, "step": 1560 }, { "epoch": 0.21745152354570638, "grad_norm": 5.375326156616211, "learning_rate": 3.7293861073732076e-05, "loss": 0.6261, "step": 1570 }, { "epoch": 0.2188365650969529, "grad_norm": 3.982163429260254, "learning_rate": 3.7275300450095126e-05, "loss": 0.6045, "step": 1580 }, { "epoch": 0.22022160664819945, "grad_norm": 4.093824863433838, "learning_rate": 3.725859588882187e-05, "loss": 0.6789, "step": 1590 }, { "epoch": 0.22160664819944598, "grad_norm": 4.912144660949707, "learning_rate": 3.724003526518491e-05, "loss": 0.6261, "step": 1600 }, { "epoch": 0.22299168975069253, "grad_norm": 7.267054080963135, "learning_rate": 3.7221474641547957e-05, "loss": 0.6439, "step": 1610 }, { "epoch": 0.22437673130193905, "grad_norm": 7.550549507141113, "learning_rate": 3.7202914017911e-05, "loss": 0.6169, "step": 1620 }, { "epoch": 0.2257617728531856, "grad_norm": 5.615678310394287, "learning_rate": 3.718435339427405e-05, "loss": 0.6302, "step": 1630 }, { "epoch": 0.22714681440443213, "grad_norm": 5.005547046661377, "learning_rate": 3.7165792770637095e-05, "loss": 0.6026, "step": 1640 }, { "epoch": 0.22853185595567868, "grad_norm": 4.575570106506348, "learning_rate": 3.7147232147000145e-05, "loss": 0.5985, "step": 1650 }, { "epoch": 0.2299168975069252, "grad_norm": 4.003867149353027, "learning_rate": 3.712867152336319e-05, "loss": 0.6355, "step": 1660 }, { "epoch": 0.23130193905817176, "grad_norm": 4.371513843536377, "learning_rate": 3.711011089972623e-05, "loss": 0.6858, "step": 1670 }, { "epoch": 0.23268698060941828, "grad_norm": 4.9826765060424805, "learning_rate": 3.709155027608928e-05, "loss": 0.5826, "step": 1680 }, { "epoch": 0.23407202216066483, "grad_norm": 5.753824710845947, "learning_rate": 3.707298965245233e-05, "loss": 0.6877, "step": 1690 }, { "epoch": 0.23545706371191136, "grad_norm": 5.006950378417969, "learning_rate": 3.705442902881537e-05, "loss": 0.6843, "step": 1700 }, { "epoch": 0.23684210526315788, "grad_norm": 4.370176792144775, "learning_rate": 3.703586840517842e-05, "loss": 0.6099, "step": 1710 }, { "epoch": 0.23822714681440443, "grad_norm": 3.8350753784179688, "learning_rate": 3.7017307781541466e-05, "loss": 0.6045, "step": 1720 }, { "epoch": 0.23961218836565096, "grad_norm": 4.00664758682251, "learning_rate": 3.699874715790451e-05, "loss": 0.6468, "step": 1730 }, { "epoch": 0.2409972299168975, "grad_norm": 4.4113030433654785, "learning_rate": 3.698018653426755e-05, "loss": 0.6667, "step": 1740 }, { "epoch": 0.24238227146814403, "grad_norm": 6.2188639640808105, "learning_rate": 3.69616259106306e-05, "loss": 0.5793, "step": 1750 }, { "epoch": 0.24376731301939059, "grad_norm": 5.410155773162842, "learning_rate": 3.694306528699365e-05, "loss": 0.5879, "step": 1760 }, { "epoch": 0.2451523545706371, "grad_norm": 4.4489521980285645, "learning_rate": 3.692450466335669e-05, "loss": 0.6499, "step": 1770 }, { "epoch": 0.24653739612188366, "grad_norm": 5.712794303894043, "learning_rate": 3.690594403971974e-05, "loss": 0.6009, "step": 1780 }, { "epoch": 0.24792243767313019, "grad_norm": 4.065003395080566, "learning_rate": 3.6887383416082786e-05, "loss": 0.6676, "step": 1790 }, { "epoch": 0.24930747922437674, "grad_norm": 4.225168228149414, "learning_rate": 3.686882279244583e-05, "loss": 0.6011, "step": 1800 }, { "epoch": 0.25069252077562326, "grad_norm": 3.180039167404175, "learning_rate": 3.6850262168808873e-05, "loss": 0.5549, "step": 1810 }, { "epoch": 0.2520775623268698, "grad_norm": 3.799586772918701, "learning_rate": 3.6831701545171924e-05, "loss": 0.6039, "step": 1820 }, { "epoch": 0.25346260387811637, "grad_norm": 4.4404425621032715, "learning_rate": 3.681314092153497e-05, "loss": 0.5592, "step": 1830 }, { "epoch": 0.2548476454293629, "grad_norm": 5.383872985839844, "learning_rate": 3.679458029789801e-05, "loss": 0.6932, "step": 1840 }, { "epoch": 0.2562326869806094, "grad_norm": 4.271496295928955, "learning_rate": 3.6776019674261055e-05, "loss": 0.5506, "step": 1850 }, { "epoch": 0.25761772853185594, "grad_norm": 4.5665202140808105, "learning_rate": 3.67574590506241e-05, "loss": 0.6214, "step": 1860 }, { "epoch": 0.2590027700831025, "grad_norm": 5.333662986755371, "learning_rate": 3.673889842698715e-05, "loss": 0.6171, "step": 1870 }, { "epoch": 0.26038781163434904, "grad_norm": 3.2647597789764404, "learning_rate": 3.6720337803350194e-05, "loss": 0.5626, "step": 1880 }, { "epoch": 0.26177285318559557, "grad_norm": 11.73008918762207, "learning_rate": 3.6701777179713244e-05, "loss": 0.5816, "step": 1890 }, { "epoch": 0.2631578947368421, "grad_norm": 4.770632266998291, "learning_rate": 3.668321655607629e-05, "loss": 0.5818, "step": 1900 }, { "epoch": 0.26454293628808867, "grad_norm": 3.688140869140625, "learning_rate": 3.666465593243933e-05, "loss": 0.5898, "step": 1910 }, { "epoch": 0.2659279778393352, "grad_norm": 6.2547736167907715, "learning_rate": 3.6646095308802376e-05, "loss": 0.6211, "step": 1920 }, { "epoch": 0.2673130193905817, "grad_norm": 5.1031389236450195, "learning_rate": 3.6627534685165426e-05, "loss": 0.6125, "step": 1930 }, { "epoch": 0.26869806094182824, "grad_norm": 4.455575942993164, "learning_rate": 3.660897406152847e-05, "loss": 0.5901, "step": 1940 }, { "epoch": 0.27008310249307477, "grad_norm": 5.132863998413086, "learning_rate": 3.6590413437891514e-05, "loss": 0.582, "step": 1950 }, { "epoch": 0.27146814404432135, "grad_norm": 3.6740236282348633, "learning_rate": 3.6571852814254565e-05, "loss": 0.5668, "step": 1960 }, { "epoch": 0.27285318559556787, "grad_norm": 4.9556989669799805, "learning_rate": 3.655329219061761e-05, "loss": 0.497, "step": 1970 }, { "epoch": 0.2742382271468144, "grad_norm": 5.816377639770508, "learning_rate": 3.653473156698065e-05, "loss": 0.5759, "step": 1980 }, { "epoch": 0.2756232686980609, "grad_norm": 11.042893409729004, "learning_rate": 3.6516170943343696e-05, "loss": 0.5747, "step": 1990 }, { "epoch": 0.2770083102493075, "grad_norm": 5.5561699867248535, "learning_rate": 3.6497610319706747e-05, "loss": 0.6001, "step": 2000 }, { "epoch": 0.278393351800554, "grad_norm": 3.443624496459961, "learning_rate": 3.647904969606979e-05, "loss": 0.5558, "step": 2010 }, { "epoch": 0.27977839335180055, "grad_norm": 3.77988862991333, "learning_rate": 3.646048907243284e-05, "loss": 0.4779, "step": 2020 }, { "epoch": 0.28116343490304707, "grad_norm": 4.237990379333496, "learning_rate": 3.6441928448795885e-05, "loss": 0.5643, "step": 2030 }, { "epoch": 0.28254847645429365, "grad_norm": 5.618165016174316, "learning_rate": 3.642336782515893e-05, "loss": 0.6431, "step": 2040 }, { "epoch": 0.2839335180055402, "grad_norm": 4.02553129196167, "learning_rate": 3.640480720152197e-05, "loss": 0.5379, "step": 2050 }, { "epoch": 0.2853185595567867, "grad_norm": 4.569252967834473, "learning_rate": 3.6386246577885016e-05, "loss": 0.5608, "step": 2060 }, { "epoch": 0.2867036011080332, "grad_norm": 4.142032146453857, "learning_rate": 3.636768595424807e-05, "loss": 0.572, "step": 2070 }, { "epoch": 0.2880886426592798, "grad_norm": 6.030134677886963, "learning_rate": 3.634912533061111e-05, "loss": 0.6114, "step": 2080 }, { "epoch": 0.2894736842105263, "grad_norm": 5.247105598449707, "learning_rate": 3.633056470697416e-05, "loss": 0.5943, "step": 2090 }, { "epoch": 0.29085872576177285, "grad_norm": 3.4325268268585205, "learning_rate": 3.6312004083337205e-05, "loss": 0.5745, "step": 2100 }, { "epoch": 0.2922437673130194, "grad_norm": 3.9712328910827637, "learning_rate": 3.629344345970025e-05, "loss": 0.5947, "step": 2110 }, { "epoch": 0.29362880886426596, "grad_norm": 4.028194427490234, "learning_rate": 3.627488283606329e-05, "loss": 0.5505, "step": 2120 }, { "epoch": 0.2950138504155125, "grad_norm": 3.4895143508911133, "learning_rate": 3.625632221242634e-05, "loss": 0.5593, "step": 2130 }, { "epoch": 0.296398891966759, "grad_norm": 3.2017526626586914, "learning_rate": 3.623776158878939e-05, "loss": 0.5276, "step": 2140 }, { "epoch": 0.29778393351800553, "grad_norm": 3.9487078189849854, "learning_rate": 3.621920096515243e-05, "loss": 0.5495, "step": 2150 }, { "epoch": 0.29916897506925205, "grad_norm": 4.558566570281982, "learning_rate": 3.620064034151548e-05, "loss": 0.6063, "step": 2160 }, { "epoch": 0.30055401662049863, "grad_norm": 5.271570682525635, "learning_rate": 3.6182079717878525e-05, "loss": 0.5808, "step": 2170 }, { "epoch": 0.30193905817174516, "grad_norm": 4.145854949951172, "learning_rate": 3.616351909424157e-05, "loss": 0.5319, "step": 2180 }, { "epoch": 0.3033240997229917, "grad_norm": 5.569437503814697, "learning_rate": 3.614495847060461e-05, "loss": 0.5293, "step": 2190 }, { "epoch": 0.3047091412742382, "grad_norm": 5.583808422088623, "learning_rate": 3.6126397846967663e-05, "loss": 0.5285, "step": 2200 }, { "epoch": 0.3060941828254848, "grad_norm": 5.36161470413208, "learning_rate": 3.610783722333071e-05, "loss": 0.5507, "step": 2210 }, { "epoch": 0.3074792243767313, "grad_norm": 4.53624963760376, "learning_rate": 3.608927659969375e-05, "loss": 0.529, "step": 2220 }, { "epoch": 0.30886426592797783, "grad_norm": 5.9227705001831055, "learning_rate": 3.6070715976056795e-05, "loss": 0.4959, "step": 2230 }, { "epoch": 0.31024930747922436, "grad_norm": 3.500408887863159, "learning_rate": 3.6052155352419845e-05, "loss": 0.5269, "step": 2240 }, { "epoch": 0.31163434903047094, "grad_norm": 3.2887611389160156, "learning_rate": 3.603359472878289e-05, "loss": 0.5743, "step": 2250 }, { "epoch": 0.31301939058171746, "grad_norm": 4.520106792449951, "learning_rate": 3.601503410514593e-05, "loss": 0.5698, "step": 2260 }, { "epoch": 0.314404432132964, "grad_norm": 4.752847671508789, "learning_rate": 3.5996473481508984e-05, "loss": 0.6082, "step": 2270 }, { "epoch": 0.3157894736842105, "grad_norm": 3.0846593379974365, "learning_rate": 3.597791285787203e-05, "loss": 0.5231, "step": 2280 }, { "epoch": 0.3171745152354571, "grad_norm": 3.945099115371704, "learning_rate": 3.595935223423507e-05, "loss": 0.552, "step": 2290 }, { "epoch": 0.3185595567867036, "grad_norm": 3.7080931663513184, "learning_rate": 3.5940791610598115e-05, "loss": 0.5954, "step": 2300 }, { "epoch": 0.31994459833795014, "grad_norm": 3.6161038875579834, "learning_rate": 3.5922230986961166e-05, "loss": 0.5742, "step": 2310 }, { "epoch": 0.32132963988919666, "grad_norm": 3.662464141845703, "learning_rate": 3.590367036332421e-05, "loss": 0.5134, "step": 2320 }, { "epoch": 0.32271468144044324, "grad_norm": 5.78975248336792, "learning_rate": 3.588510973968726e-05, "loss": 0.6286, "step": 2330 }, { "epoch": 0.32409972299168976, "grad_norm": 4.353717803955078, "learning_rate": 3.5866549116050304e-05, "loss": 0.5182, "step": 2340 }, { "epoch": 0.3254847645429363, "grad_norm": 3.810673952102661, "learning_rate": 3.584798849241335e-05, "loss": 0.4705, "step": 2350 }, { "epoch": 0.3268698060941828, "grad_norm": 3.3044183254241943, "learning_rate": 3.582942786877639e-05, "loss": 0.5736, "step": 2360 }, { "epoch": 0.32825484764542934, "grad_norm": 5.248436450958252, "learning_rate": 3.5810867245139435e-05, "loss": 0.5238, "step": 2370 }, { "epoch": 0.3296398891966759, "grad_norm": 4.443650245666504, "learning_rate": 3.5792306621502486e-05, "loss": 0.5366, "step": 2380 }, { "epoch": 0.33102493074792244, "grad_norm": 4.204538345336914, "learning_rate": 3.577374599786553e-05, "loss": 0.4978, "step": 2390 }, { "epoch": 0.33240997229916897, "grad_norm": 3.603652000427246, "learning_rate": 3.575518537422858e-05, "loss": 0.5319, "step": 2400 }, { "epoch": 0.3337950138504155, "grad_norm": 4.066627025604248, "learning_rate": 3.5736624750591624e-05, "loss": 0.4795, "step": 2410 }, { "epoch": 0.33518005540166207, "grad_norm": 7.520932197570801, "learning_rate": 3.571806412695467e-05, "loss": 0.4946, "step": 2420 }, { "epoch": 0.3365650969529086, "grad_norm": 4.000723361968994, "learning_rate": 3.569950350331771e-05, "loss": 0.5143, "step": 2430 }, { "epoch": 0.3379501385041551, "grad_norm": 3.362517833709717, "learning_rate": 3.568094287968076e-05, "loss": 0.5311, "step": 2440 }, { "epoch": 0.33933518005540164, "grad_norm": 4.851598262786865, "learning_rate": 3.5662382256043806e-05, "loss": 0.503, "step": 2450 }, { "epoch": 0.3407202216066482, "grad_norm": 2.6380112171173096, "learning_rate": 3.564382163240685e-05, "loss": 0.5267, "step": 2460 }, { "epoch": 0.34210526315789475, "grad_norm": 4.395262241363525, "learning_rate": 3.56252610087699e-05, "loss": 0.5091, "step": 2470 }, { "epoch": 0.34349030470914127, "grad_norm": 5.229985237121582, "learning_rate": 3.5606700385132944e-05, "loss": 0.5397, "step": 2480 }, { "epoch": 0.3448753462603878, "grad_norm": 3.8562421798706055, "learning_rate": 3.558813976149599e-05, "loss": 0.5188, "step": 2490 }, { "epoch": 0.3462603878116344, "grad_norm": 11.592336654663086, "learning_rate": 3.556957913785903e-05, "loss": 0.5095, "step": 2500 }, { "epoch": 0.3476454293628809, "grad_norm": 3.571937322616577, "learning_rate": 3.555101851422208e-05, "loss": 0.6358, "step": 2510 }, { "epoch": 0.3490304709141274, "grad_norm": 3.7377965450286865, "learning_rate": 3.5532457890585126e-05, "loss": 0.568, "step": 2520 }, { "epoch": 0.35041551246537395, "grad_norm": 3.339667320251465, "learning_rate": 3.551389726694818e-05, "loss": 0.4745, "step": 2530 }, { "epoch": 0.3518005540166205, "grad_norm": 4.469910621643066, "learning_rate": 3.549533664331122e-05, "loss": 0.5272, "step": 2540 }, { "epoch": 0.35318559556786705, "grad_norm": 3.5588207244873047, "learning_rate": 3.5476776019674265e-05, "loss": 0.5189, "step": 2550 }, { "epoch": 0.3545706371191136, "grad_norm": 4.583835124969482, "learning_rate": 3.545821539603731e-05, "loss": 0.6198, "step": 2560 }, { "epoch": 0.3559556786703601, "grad_norm": 4.148881912231445, "learning_rate": 3.543965477240035e-05, "loss": 0.5521, "step": 2570 }, { "epoch": 0.3573407202216066, "grad_norm": 11.703550338745117, "learning_rate": 3.54210941487634e-05, "loss": 0.4996, "step": 2580 }, { "epoch": 0.3587257617728532, "grad_norm": 3.770169734954834, "learning_rate": 3.5402533525126447e-05, "loss": 0.5395, "step": 2590 }, { "epoch": 0.3601108033240997, "grad_norm": 4.923569679260254, "learning_rate": 3.53839729014895e-05, "loss": 0.4904, "step": 2600 }, { "epoch": 0.36149584487534625, "grad_norm": 2.8726134300231934, "learning_rate": 3.536541227785254e-05, "loss": 0.5093, "step": 2610 }, { "epoch": 0.3628808864265928, "grad_norm": 2.879404306411743, "learning_rate": 3.5346851654215585e-05, "loss": 0.5385, "step": 2620 }, { "epoch": 0.36426592797783935, "grad_norm": 2.7815260887145996, "learning_rate": 3.532829103057863e-05, "loss": 0.5266, "step": 2630 }, { "epoch": 0.3656509695290859, "grad_norm": 3.5711984634399414, "learning_rate": 3.530973040694168e-05, "loss": 0.5257, "step": 2640 }, { "epoch": 0.3670360110803324, "grad_norm": 2.760270118713379, "learning_rate": 3.529116978330472e-05, "loss": 0.5074, "step": 2650 }, { "epoch": 0.3684210526315789, "grad_norm": 2.6999144554138184, "learning_rate": 3.527260915966777e-05, "loss": 0.5, "step": 2660 }, { "epoch": 0.3698060941828255, "grad_norm": 3.2955033779144287, "learning_rate": 3.525404853603081e-05, "loss": 0.478, "step": 2670 }, { "epoch": 0.37119113573407203, "grad_norm": 6.846369743347168, "learning_rate": 3.5235487912393854e-05, "loss": 0.5314, "step": 2680 }, { "epoch": 0.37257617728531855, "grad_norm": 3.0586764812469482, "learning_rate": 3.5216927288756905e-05, "loss": 0.4788, "step": 2690 }, { "epoch": 0.3739612188365651, "grad_norm": 5.029668807983398, "learning_rate": 3.519836666511995e-05, "loss": 0.4966, "step": 2700 }, { "epoch": 0.37534626038781166, "grad_norm": 6.83918571472168, "learning_rate": 3.5179806041483e-05, "loss": 0.5342, "step": 2710 }, { "epoch": 0.3767313019390582, "grad_norm": 3.3956825733184814, "learning_rate": 3.516124541784604e-05, "loss": 0.5528, "step": 2720 }, { "epoch": 0.3781163434903047, "grad_norm": 3.8964269161224365, "learning_rate": 3.514268479420909e-05, "loss": 0.4569, "step": 2730 }, { "epoch": 0.37950138504155123, "grad_norm": 3.9531123638153076, "learning_rate": 3.512412417057213e-05, "loss": 0.4998, "step": 2740 }, { "epoch": 0.3808864265927978, "grad_norm": 4.969144344329834, "learning_rate": 3.510556354693518e-05, "loss": 0.4915, "step": 2750 }, { "epoch": 0.38227146814404434, "grad_norm": 7.940041542053223, "learning_rate": 3.5087002923298225e-05, "loss": 0.5062, "step": 2760 }, { "epoch": 0.38365650969529086, "grad_norm": 4.755401134490967, "learning_rate": 3.506844229966127e-05, "loss": 0.4752, "step": 2770 }, { "epoch": 0.3850415512465374, "grad_norm": 6.793601989746094, "learning_rate": 3.504988167602432e-05, "loss": 0.449, "step": 2780 }, { "epoch": 0.3864265927977839, "grad_norm": 3.899365186691284, "learning_rate": 3.5031321052387363e-05, "loss": 0.4956, "step": 2790 }, { "epoch": 0.3878116343490305, "grad_norm": 4.510800838470459, "learning_rate": 3.501276042875041e-05, "loss": 0.5161, "step": 2800 }, { "epoch": 0.389196675900277, "grad_norm": 3.8182857036590576, "learning_rate": 3.499419980511345e-05, "loss": 0.5209, "step": 2810 }, { "epoch": 0.39058171745152354, "grad_norm": 4.2780256271362305, "learning_rate": 3.49756391814765e-05, "loss": 0.5402, "step": 2820 }, { "epoch": 0.39196675900277006, "grad_norm": 3.448150157928467, "learning_rate": 3.4957078557839545e-05, "loss": 0.504, "step": 2830 }, { "epoch": 0.39335180055401664, "grad_norm": 4.190673828125, "learning_rate": 3.4938517934202596e-05, "loss": 0.5799, "step": 2840 }, { "epoch": 0.39473684210526316, "grad_norm": 4.70761775970459, "learning_rate": 3.491995731056564e-05, "loss": 0.4654, "step": 2850 }, { "epoch": 0.3961218836565097, "grad_norm": 4.735783576965332, "learning_rate": 3.4901396686928684e-05, "loss": 0.4908, "step": 2860 }, { "epoch": 0.3975069252077562, "grad_norm": 6.540755271911621, "learning_rate": 3.488283606329173e-05, "loss": 0.4903, "step": 2870 }, { "epoch": 0.3988919667590028, "grad_norm": 4.559645175933838, "learning_rate": 3.486427543965477e-05, "loss": 0.5602, "step": 2880 }, { "epoch": 0.4002770083102493, "grad_norm": 4.629283428192139, "learning_rate": 3.484571481601782e-05, "loss": 0.5174, "step": 2890 }, { "epoch": 0.40166204986149584, "grad_norm": 3.9410197734832764, "learning_rate": 3.4827154192380866e-05, "loss": 0.5041, "step": 2900 }, { "epoch": 0.40304709141274236, "grad_norm": 4.21798849105835, "learning_rate": 3.481044963110761e-05, "loss": 0.5106, "step": 2910 }, { "epoch": 0.40443213296398894, "grad_norm": 4.3796257972717285, "learning_rate": 3.479188900747065e-05, "loss": 0.4752, "step": 2920 }, { "epoch": 0.40581717451523547, "grad_norm": 3.2095866203308105, "learning_rate": 3.4773328383833696e-05, "loss": 0.4748, "step": 2930 }, { "epoch": 0.407202216066482, "grad_norm": 6.470020771026611, "learning_rate": 3.475476776019675e-05, "loss": 0.5938, "step": 2940 }, { "epoch": 0.4085872576177285, "grad_norm": 4.654620170593262, "learning_rate": 3.473620713655979e-05, "loss": 0.5489, "step": 2950 }, { "epoch": 0.4099722991689751, "grad_norm": 3.7245383262634277, "learning_rate": 3.471764651292284e-05, "loss": 0.4662, "step": 2960 }, { "epoch": 0.4113573407202216, "grad_norm": 4.06242036819458, "learning_rate": 3.4699085889285885e-05, "loss": 0.4416, "step": 2970 }, { "epoch": 0.41274238227146814, "grad_norm": 3.3347277641296387, "learning_rate": 3.468052526564893e-05, "loss": 0.5031, "step": 2980 }, { "epoch": 0.41412742382271467, "grad_norm": 4.123641490936279, "learning_rate": 3.466196464201197e-05, "loss": 0.5363, "step": 2990 }, { "epoch": 0.4155124653739612, "grad_norm": 4.5999016761779785, "learning_rate": 3.464340401837502e-05, "loss": 0.4584, "step": 3000 }, { "epoch": 0.41565096952908587, "eval_loss": 0.48571890592575073, "eval_runtime": 1418.1173, "eval_samples_per_second": 6.431, "eval_steps_per_second": 0.804, "step": 3001 }, { "epoch": 0.4168975069252078, "grad_norm": 5.3403496742248535, "learning_rate": 3.462484339473807e-05, "loss": 0.5159, "step": 3010 }, { "epoch": 0.4182825484764543, "grad_norm": 4.382096290588379, "learning_rate": 3.46081388334648e-05, "loss": 0.5077, "step": 3020 }, { "epoch": 0.4196675900277008, "grad_norm": 3.537562131881714, "learning_rate": 3.4589578209827854e-05, "loss": 0.4805, "step": 3030 }, { "epoch": 0.42105263157894735, "grad_norm": 2.426811456680298, "learning_rate": 3.45710175861909e-05, "loss": 0.4675, "step": 3040 }, { "epoch": 0.4224376731301939, "grad_norm": 4.311849117279053, "learning_rate": 3.455245696255395e-05, "loss": 0.5579, "step": 3050 }, { "epoch": 0.42382271468144045, "grad_norm": 3.4808189868927, "learning_rate": 3.453389633891699e-05, "loss": 0.4591, "step": 3060 }, { "epoch": 0.425207756232687, "grad_norm": 4.2031049728393555, "learning_rate": 3.4515335715280036e-05, "loss": 0.5279, "step": 3070 }, { "epoch": 0.4265927977839335, "grad_norm": 4.175496578216553, "learning_rate": 3.449677509164308e-05, "loss": 0.4731, "step": 3080 }, { "epoch": 0.4279778393351801, "grad_norm": 3.436108350753784, "learning_rate": 3.447821446800612e-05, "loss": 0.4754, "step": 3090 }, { "epoch": 0.4293628808864266, "grad_norm": 6.296165943145752, "learning_rate": 3.4459653844369174e-05, "loss": 0.4821, "step": 3100 }, { "epoch": 0.4307479224376731, "grad_norm": 3.948596239089966, "learning_rate": 3.444109322073222e-05, "loss": 0.5217, "step": 3110 }, { "epoch": 0.43213296398891965, "grad_norm": 3.046654224395752, "learning_rate": 3.442253259709527e-05, "loss": 0.4889, "step": 3120 }, { "epoch": 0.43351800554016623, "grad_norm": 2.8600590229034424, "learning_rate": 3.440397197345831e-05, "loss": 0.4593, "step": 3130 }, { "epoch": 0.43490304709141275, "grad_norm": 3.5031280517578125, "learning_rate": 3.4385411349821356e-05, "loss": 0.4989, "step": 3140 }, { "epoch": 0.4362880886426593, "grad_norm": 4.562175750732422, "learning_rate": 3.43668507261844e-05, "loss": 0.5269, "step": 3150 }, { "epoch": 0.4376731301939058, "grad_norm": 3.6332297325134277, "learning_rate": 3.434829010254745e-05, "loss": 0.4136, "step": 3160 }, { "epoch": 0.4390581717451524, "grad_norm": 3.7745444774627686, "learning_rate": 3.4329729478910494e-05, "loss": 0.4953, "step": 3170 }, { "epoch": 0.4404432132963989, "grad_norm": 3.368767499923706, "learning_rate": 3.431116885527354e-05, "loss": 0.4436, "step": 3180 }, { "epoch": 0.44182825484764543, "grad_norm": 3.215421676635742, "learning_rate": 3.429260823163659e-05, "loss": 0.5105, "step": 3190 }, { "epoch": 0.44321329639889195, "grad_norm": 4.738856315612793, "learning_rate": 3.427404760799963e-05, "loss": 0.5586, "step": 3200 }, { "epoch": 0.4445983379501385, "grad_norm": 10.23610782623291, "learning_rate": 3.4255486984362676e-05, "loss": 0.5084, "step": 3210 }, { "epoch": 0.44598337950138506, "grad_norm": 6.943126678466797, "learning_rate": 3.423692636072572e-05, "loss": 0.4687, "step": 3220 }, { "epoch": 0.4473684210526316, "grad_norm": 3.2041103839874268, "learning_rate": 3.421836573708877e-05, "loss": 0.4768, "step": 3230 }, { "epoch": 0.4487534626038781, "grad_norm": 7.946971893310547, "learning_rate": 3.4199805113451814e-05, "loss": 0.5148, "step": 3240 }, { "epoch": 0.45013850415512463, "grad_norm": 5.417855739593506, "learning_rate": 3.4181244489814865e-05, "loss": 0.5197, "step": 3250 }, { "epoch": 0.4515235457063712, "grad_norm": 3.584216833114624, "learning_rate": 3.416268386617791e-05, "loss": 0.4556, "step": 3260 }, { "epoch": 0.45290858725761773, "grad_norm": 3.537973165512085, "learning_rate": 3.414412324254095e-05, "loss": 0.4912, "step": 3270 }, { "epoch": 0.45429362880886426, "grad_norm": 3.106688976287842, "learning_rate": 3.4125562618903996e-05, "loss": 0.4669, "step": 3280 }, { "epoch": 0.4556786703601108, "grad_norm": 4.34760046005249, "learning_rate": 3.410700199526704e-05, "loss": 0.4704, "step": 3290 }, { "epoch": 0.45706371191135736, "grad_norm": 3.2222983837127686, "learning_rate": 3.408844137163009e-05, "loss": 0.4986, "step": 3300 }, { "epoch": 0.4584487534626039, "grad_norm": 3.6748571395874023, "learning_rate": 3.4069880747993135e-05, "loss": 0.4944, "step": 3310 }, { "epoch": 0.4598337950138504, "grad_norm": 2.6829590797424316, "learning_rate": 3.4051320124356185e-05, "loss": 0.4449, "step": 3320 }, { "epoch": 0.46121883656509693, "grad_norm": 2.984292984008789, "learning_rate": 3.403275950071923e-05, "loss": 0.4484, "step": 3330 }, { "epoch": 0.4626038781163435, "grad_norm": 4.855510234832764, "learning_rate": 3.401419887708227e-05, "loss": 0.5278, "step": 3340 }, { "epoch": 0.46398891966759004, "grad_norm": 3.2982306480407715, "learning_rate": 3.3995638253445317e-05, "loss": 0.5044, "step": 3350 }, { "epoch": 0.46537396121883656, "grad_norm": 6.512004852294922, "learning_rate": 3.397707762980837e-05, "loss": 0.471, "step": 3360 }, { "epoch": 0.4667590027700831, "grad_norm": 4.388622760772705, "learning_rate": 3.395851700617141e-05, "loss": 0.4689, "step": 3370 }, { "epoch": 0.46814404432132967, "grad_norm": 3.218029022216797, "learning_rate": 3.3939956382534455e-05, "loss": 0.4975, "step": 3380 }, { "epoch": 0.4695290858725762, "grad_norm": 3.2648587226867676, "learning_rate": 3.39213957588975e-05, "loss": 0.4542, "step": 3390 }, { "epoch": 0.4709141274238227, "grad_norm": 4.079712390899658, "learning_rate": 3.390283513526054e-05, "loss": 0.4281, "step": 3400 }, { "epoch": 0.47229916897506924, "grad_norm": 4.063242435455322, "learning_rate": 3.388427451162359e-05, "loss": 0.5375, "step": 3410 }, { "epoch": 0.47368421052631576, "grad_norm": 3.1259000301361084, "learning_rate": 3.386571388798664e-05, "loss": 0.4897, "step": 3420 }, { "epoch": 0.47506925207756234, "grad_norm": 6.046876430511475, "learning_rate": 3.384715326434969e-05, "loss": 0.4871, "step": 3430 }, { "epoch": 0.47645429362880887, "grad_norm": 23.784345626831055, "learning_rate": 3.382859264071273e-05, "loss": 0.4712, "step": 3440 }, { "epoch": 0.4778393351800554, "grad_norm": 4.167202472686768, "learning_rate": 3.3810032017075775e-05, "loss": 0.4637, "step": 3450 }, { "epoch": 0.4792243767313019, "grad_norm": 7.40032434463501, "learning_rate": 3.379147139343882e-05, "loss": 0.4799, "step": 3460 }, { "epoch": 0.4806094182825485, "grad_norm": 3.7741355895996094, "learning_rate": 3.377291076980187e-05, "loss": 0.513, "step": 3470 }, { "epoch": 0.481994459833795, "grad_norm": 2.4384195804595947, "learning_rate": 3.375435014616491e-05, "loss": 0.501, "step": 3480 }, { "epoch": 0.48337950138504154, "grad_norm": 5.508775234222412, "learning_rate": 3.3735789522527964e-05, "loss": 0.4425, "step": 3490 }, { "epoch": 0.48476454293628807, "grad_norm": 3.220346212387085, "learning_rate": 3.371722889889101e-05, "loss": 0.4754, "step": 3500 }, { "epoch": 0.48614958448753465, "grad_norm": 3.7024574279785156, "learning_rate": 3.369866827525405e-05, "loss": 0.4953, "step": 3510 }, { "epoch": 0.48753462603878117, "grad_norm": 5.731636047363281, "learning_rate": 3.3680107651617095e-05, "loss": 0.4893, "step": 3520 }, { "epoch": 0.4889196675900277, "grad_norm": 2.938441038131714, "learning_rate": 3.366154702798014e-05, "loss": 0.4364, "step": 3530 }, { "epoch": 0.4903047091412742, "grad_norm": 3.5388107299804688, "learning_rate": 3.364298640434319e-05, "loss": 0.4975, "step": 3540 }, { "epoch": 0.4916897506925208, "grad_norm": 4.601080894470215, "learning_rate": 3.3624425780706233e-05, "loss": 0.5368, "step": 3550 }, { "epoch": 0.4930747922437673, "grad_norm": 6.017910480499268, "learning_rate": 3.3605865157069284e-05, "loss": 0.4677, "step": 3560 }, { "epoch": 0.49445983379501385, "grad_norm": 4.040421962738037, "learning_rate": 3.358730453343233e-05, "loss": 0.4467, "step": 3570 }, { "epoch": 0.49584487534626037, "grad_norm": 4.375626087188721, "learning_rate": 3.356874390979537e-05, "loss": 0.4409, "step": 3580 }, { "epoch": 0.49722991689750695, "grad_norm": 3.4123740196228027, "learning_rate": 3.3550183286158415e-05, "loss": 0.4369, "step": 3590 }, { "epoch": 0.4986149584487535, "grad_norm": 3.231276750564575, "learning_rate": 3.353162266252146e-05, "loss": 0.4736, "step": 3600 }, { "epoch": 0.5, "grad_norm": 6.275581359863281, "learning_rate": 3.351306203888451e-05, "loss": 0.464, "step": 3610 }, { "epoch": 0.5013850415512465, "grad_norm": 4.812581539154053, "learning_rate": 3.3494501415247554e-05, "loss": 0.4516, "step": 3620 }, { "epoch": 0.502770083102493, "grad_norm": 9.834177017211914, "learning_rate": 3.3475940791610604e-05, "loss": 0.4231, "step": 3630 }, { "epoch": 0.5041551246537396, "grad_norm": 3.000638246536255, "learning_rate": 3.345738016797365e-05, "loss": 0.3877, "step": 3640 }, { "epoch": 0.5055401662049861, "grad_norm": 3.8207039833068848, "learning_rate": 3.343881954433669e-05, "loss": 0.4568, "step": 3650 }, { "epoch": 0.5069252077562327, "grad_norm": 4.181671142578125, "learning_rate": 3.3420258920699736e-05, "loss": 0.458, "step": 3660 }, { "epoch": 0.5083102493074793, "grad_norm": 3.0442757606506348, "learning_rate": 3.3401698297062786e-05, "loss": 0.4284, "step": 3670 }, { "epoch": 0.5096952908587258, "grad_norm": 4.325800895690918, "learning_rate": 3.338313767342583e-05, "loss": 0.4374, "step": 3680 }, { "epoch": 0.5110803324099723, "grad_norm": 4.549467086791992, "learning_rate": 3.336457704978888e-05, "loss": 0.4454, "step": 3690 }, { "epoch": 0.5124653739612188, "grad_norm": 3.2602922916412354, "learning_rate": 3.3346016426151924e-05, "loss": 0.4707, "step": 3700 }, { "epoch": 0.5138504155124654, "grad_norm": 2.9979166984558105, "learning_rate": 3.332745580251497e-05, "loss": 0.4444, "step": 3710 }, { "epoch": 0.5152354570637119, "grad_norm": 3.0722646713256836, "learning_rate": 3.330889517887801e-05, "loss": 0.4034, "step": 3720 }, { "epoch": 0.5166204986149584, "grad_norm": 3.6399686336517334, "learning_rate": 3.3290334555241056e-05, "loss": 0.4298, "step": 3730 }, { "epoch": 0.518005540166205, "grad_norm": 3.675416946411133, "learning_rate": 3.3271773931604107e-05, "loss": 0.4135, "step": 3740 }, { "epoch": 0.5193905817174516, "grad_norm": 4.051141738891602, "learning_rate": 3.325321330796715e-05, "loss": 0.4872, "step": 3750 }, { "epoch": 0.5207756232686981, "grad_norm": 3.5294289588928223, "learning_rate": 3.32346526843302e-05, "loss": 0.4141, "step": 3760 }, { "epoch": 0.5221606648199446, "grad_norm": 4.430352687835693, "learning_rate": 3.321609206069324e-05, "loss": 0.4086, "step": 3770 }, { "epoch": 0.5235457063711911, "grad_norm": 3.236701011657715, "learning_rate": 3.319753143705629e-05, "loss": 0.4745, "step": 3780 }, { "epoch": 0.5249307479224377, "grad_norm": 3.0971288681030273, "learning_rate": 3.317897081341933e-05, "loss": 0.4963, "step": 3790 }, { "epoch": 0.5263157894736842, "grad_norm": 4.079372406005859, "learning_rate": 3.316041018978238e-05, "loss": 0.4894, "step": 3800 }, { "epoch": 0.5277008310249307, "grad_norm": 4.282620906829834, "learning_rate": 3.314184956614543e-05, "loss": 0.4807, "step": 3810 }, { "epoch": 0.5290858725761773, "grad_norm": 2.9405722618103027, "learning_rate": 3.312328894250847e-05, "loss": 0.4863, "step": 3820 }, { "epoch": 0.5304709141274239, "grad_norm": 2.777052402496338, "learning_rate": 3.3104728318871514e-05, "loss": 0.4326, "step": 3830 }, { "epoch": 0.5318559556786704, "grad_norm": 2.2695729732513428, "learning_rate": 3.308616769523456e-05, "loss": 0.4694, "step": 3840 }, { "epoch": 0.5332409972299169, "grad_norm": 3.705563545227051, "learning_rate": 3.306760707159761e-05, "loss": 0.4308, "step": 3850 }, { "epoch": 0.5346260387811634, "grad_norm": 3.7749478816986084, "learning_rate": 3.304904644796065e-05, "loss": 0.4174, "step": 3860 }, { "epoch": 0.53601108033241, "grad_norm": 4.059981822967529, "learning_rate": 3.30304858243237e-05, "loss": 0.4492, "step": 3870 }, { "epoch": 0.5373961218836565, "grad_norm": 3.0971269607543945, "learning_rate": 3.301192520068675e-05, "loss": 0.4399, "step": 3880 }, { "epoch": 0.538781163434903, "grad_norm": 4.655627250671387, "learning_rate": 3.299336457704979e-05, "loss": 0.4115, "step": 3890 }, { "epoch": 0.5401662049861495, "grad_norm": 4.1396684646606445, "learning_rate": 3.2974803953412835e-05, "loss": 0.4738, "step": 3900 }, { "epoch": 0.5415512465373962, "grad_norm": 3.7863826751708984, "learning_rate": 3.295624332977588e-05, "loss": 0.474, "step": 3910 }, { "epoch": 0.5429362880886427, "grad_norm": 2.854278087615967, "learning_rate": 3.293768270613893e-05, "loss": 0.4478, "step": 3920 }, { "epoch": 0.5443213296398892, "grad_norm": 5.139938831329346, "learning_rate": 3.291912208250197e-05, "loss": 0.484, "step": 3930 }, { "epoch": 0.5457063711911357, "grad_norm": 3.8379576206207275, "learning_rate": 3.290056145886502e-05, "loss": 0.4665, "step": 3940 }, { "epoch": 0.5470914127423823, "grad_norm": 3.043219566345215, "learning_rate": 3.288200083522807e-05, "loss": 0.484, "step": 3950 }, { "epoch": 0.5484764542936288, "grad_norm": 3.708510398864746, "learning_rate": 3.286344021159111e-05, "loss": 0.4471, "step": 3960 }, { "epoch": 0.5498614958448753, "grad_norm": 5.9408159255981445, "learning_rate": 3.2844879587954155e-05, "loss": 0.4637, "step": 3970 }, { "epoch": 0.5512465373961218, "grad_norm": 2.6901965141296387, "learning_rate": 3.2826318964317205e-05, "loss": 0.4472, "step": 3980 }, { "epoch": 0.5526315789473685, "grad_norm": 3.6067564487457275, "learning_rate": 3.280775834068025e-05, "loss": 0.4264, "step": 3990 }, { "epoch": 0.554016620498615, "grad_norm": 4.2996344566345215, "learning_rate": 3.27891977170433e-05, "loss": 0.4591, "step": 4000 }, { "epoch": 0.5554016620498615, "grad_norm": 3.5069291591644287, "learning_rate": 3.2770637093406344e-05, "loss": 0.4149, "step": 4010 }, { "epoch": 0.556786703601108, "grad_norm": 3.3348257541656494, "learning_rate": 3.275207646976939e-05, "loss": 0.4734, "step": 4020 }, { "epoch": 0.5581717451523546, "grad_norm": 3.276700258255005, "learning_rate": 3.273351584613243e-05, "loss": 0.4616, "step": 4030 }, { "epoch": 0.5595567867036011, "grad_norm": 2.2809834480285645, "learning_rate": 3.2714955222495475e-05, "loss": 0.4587, "step": 4040 }, { "epoch": 0.5609418282548476, "grad_norm": 4.095324993133545, "learning_rate": 3.2696394598858526e-05, "loss": 0.441, "step": 4050 }, { "epoch": 0.5623268698060941, "grad_norm": 3.506425619125366, "learning_rate": 3.267783397522157e-05, "loss": 0.4117, "step": 4060 }, { "epoch": 0.5637119113573407, "grad_norm": 2.7167859077453613, "learning_rate": 3.265927335158462e-05, "loss": 0.4472, "step": 4070 }, { "epoch": 0.5650969529085873, "grad_norm": 2.6138830184936523, "learning_rate": 3.2640712727947664e-05, "loss": 0.4344, "step": 4080 }, { "epoch": 0.5664819944598338, "grad_norm": 4.395142078399658, "learning_rate": 3.262215210431071e-05, "loss": 0.3822, "step": 4090 }, { "epoch": 0.5678670360110804, "grad_norm": 4.22299861907959, "learning_rate": 3.260359148067375e-05, "loss": 0.439, "step": 4100 }, { "epoch": 0.5692520775623269, "grad_norm": 2.483031988143921, "learning_rate": 3.25850308570368e-05, "loss": 0.4168, "step": 4110 }, { "epoch": 0.5706371191135734, "grad_norm": 5.224299430847168, "learning_rate": 3.2566470233399846e-05, "loss": 0.4622, "step": 4120 }, { "epoch": 0.5720221606648199, "grad_norm": 4.899472713470459, "learning_rate": 3.254790960976289e-05, "loss": 0.4427, "step": 4130 }, { "epoch": 0.5734072022160664, "grad_norm": 4.368985652923584, "learning_rate": 3.252934898612594e-05, "loss": 0.4067, "step": 4140 }, { "epoch": 0.574792243767313, "grad_norm": 2.419369697570801, "learning_rate": 3.2510788362488984e-05, "loss": 0.4096, "step": 4150 }, { "epoch": 0.5761772853185596, "grad_norm": 3.5503952503204346, "learning_rate": 3.249222773885203e-05, "loss": 0.4385, "step": 4160 }, { "epoch": 0.5775623268698061, "grad_norm": 4.396968841552734, "learning_rate": 3.247366711521507e-05, "loss": 0.4686, "step": 4170 }, { "epoch": 0.5789473684210527, "grad_norm": 2.777759313583374, "learning_rate": 3.245510649157812e-05, "loss": 0.4215, "step": 4180 }, { "epoch": 0.5803324099722992, "grad_norm": 5.332396030426025, "learning_rate": 3.2436545867941166e-05, "loss": 0.4416, "step": 4190 }, { "epoch": 0.5817174515235457, "grad_norm": 2.5784120559692383, "learning_rate": 3.241798524430421e-05, "loss": 0.425, "step": 4200 }, { "epoch": 0.5831024930747922, "grad_norm": 2.983954668045044, "learning_rate": 3.2399424620667254e-05, "loss": 0.4213, "step": 4210 }, { "epoch": 0.5844875346260388, "grad_norm": 3.8476195335388184, "learning_rate": 3.23808639970303e-05, "loss": 0.4716, "step": 4220 }, { "epoch": 0.5858725761772853, "grad_norm": 12.473455429077148, "learning_rate": 3.236230337339335e-05, "loss": 0.428, "step": 4230 }, { "epoch": 0.5872576177285319, "grad_norm": 2.7237050533294678, "learning_rate": 3.234374274975639e-05, "loss": 0.4505, "step": 4240 }, { "epoch": 0.5886426592797784, "grad_norm": 3.166870594024658, "learning_rate": 3.232518212611944e-05, "loss": 0.4082, "step": 4250 }, { "epoch": 0.590027700831025, "grad_norm": 2.7044925689697266, "learning_rate": 3.2306621502482486e-05, "loss": 0.451, "step": 4260 }, { "epoch": 0.5914127423822715, "grad_norm": 2.526685953140259, "learning_rate": 3.228806087884553e-05, "loss": 0.3867, "step": 4270 }, { "epoch": 0.592797783933518, "grad_norm": 3.4184718132019043, "learning_rate": 3.2269500255208574e-05, "loss": 0.4196, "step": 4280 }, { "epoch": 0.5941828254847645, "grad_norm": 2.575986862182617, "learning_rate": 3.2250939631571625e-05, "loss": 0.4277, "step": 4290 }, { "epoch": 0.5955678670360111, "grad_norm": 8.845891952514648, "learning_rate": 3.223237900793467e-05, "loss": 0.4054, "step": 4300 }, { "epoch": 0.5969529085872576, "grad_norm": 2.8799660205841064, "learning_rate": 3.221381838429772e-05, "loss": 0.4524, "step": 4310 }, { "epoch": 0.5983379501385041, "grad_norm": 2.95517897605896, "learning_rate": 3.219525776066076e-05, "loss": 0.4075, "step": 4320 }, { "epoch": 0.5997229916897507, "grad_norm": 3.06547212600708, "learning_rate": 3.2176697137023807e-05, "loss": 0.4467, "step": 4330 }, { "epoch": 0.6011080332409973, "grad_norm": 4.401691913604736, "learning_rate": 3.215813651338685e-05, "loss": 0.4717, "step": 4340 }, { "epoch": 0.6024930747922438, "grad_norm": 3.3205084800720215, "learning_rate": 3.2139575889749894e-05, "loss": 0.4815, "step": 4350 }, { "epoch": 0.6038781163434903, "grad_norm": 4.638674736022949, "learning_rate": 3.2121015266112945e-05, "loss": 0.4042, "step": 4360 }, { "epoch": 0.6052631578947368, "grad_norm": 2.9574596881866455, "learning_rate": 3.210245464247599e-05, "loss": 0.4469, "step": 4370 }, { "epoch": 0.6066481994459834, "grad_norm": 4.905550956726074, "learning_rate": 3.208389401883904e-05, "loss": 0.4386, "step": 4380 }, { "epoch": 0.6080332409972299, "grad_norm": 4.671908855438232, "learning_rate": 3.206533339520208e-05, "loss": 0.4219, "step": 4390 }, { "epoch": 0.6094182825484764, "grad_norm": 3.3245506286621094, "learning_rate": 3.204677277156513e-05, "loss": 0.4523, "step": 4400 }, { "epoch": 0.610803324099723, "grad_norm": 4.6899285316467285, "learning_rate": 3.202821214792817e-05, "loss": 0.4662, "step": 4410 }, { "epoch": 0.6121883656509696, "grad_norm": 3.6640164852142334, "learning_rate": 3.200965152429122e-05, "loss": 0.3351, "step": 4420 }, { "epoch": 0.6135734072022161, "grad_norm": 3.060845375061035, "learning_rate": 3.1991090900654265e-05, "loss": 0.3734, "step": 4430 }, { "epoch": 0.6149584487534626, "grad_norm": 3.4755430221557617, "learning_rate": 3.197253027701731e-05, "loss": 0.4375, "step": 4440 }, { "epoch": 0.6163434903047091, "grad_norm": 3.331393241882324, "learning_rate": 3.195396965338036e-05, "loss": 0.4491, "step": 4450 }, { "epoch": 0.6177285318559557, "grad_norm": 3.3888213634490967, "learning_rate": 3.19354090297434e-05, "loss": 0.3774, "step": 4460 }, { "epoch": 0.6191135734072022, "grad_norm": 3.0479416847229004, "learning_rate": 3.191684840610645e-05, "loss": 0.3667, "step": 4470 }, { "epoch": 0.6204986149584487, "grad_norm": 3.2111737728118896, "learning_rate": 3.189828778246949e-05, "loss": 0.4133, "step": 4480 }, { "epoch": 0.6218836565096952, "grad_norm": 3.8854033946990967, "learning_rate": 3.187972715883254e-05, "loss": 0.4047, "step": 4490 }, { "epoch": 0.6232686980609419, "grad_norm": 2.769989252090454, "learning_rate": 3.1861166535195585e-05, "loss": 0.4135, "step": 4500 }, { "epoch": 0.6246537396121884, "grad_norm": 3.127634048461914, "learning_rate": 3.1842605911558636e-05, "loss": 0.4154, "step": 4510 }, { "epoch": 0.6260387811634349, "grad_norm": 3.8508613109588623, "learning_rate": 3.182404528792168e-05, "loss": 0.4187, "step": 4520 }, { "epoch": 0.6274238227146814, "grad_norm": 3.4920480251312256, "learning_rate": 3.1805484664284723e-05, "loss": 0.4257, "step": 4530 }, { "epoch": 0.628808864265928, "grad_norm": 1.9404661655426025, "learning_rate": 3.178692404064777e-05, "loss": 0.4067, "step": 4540 }, { "epoch": 0.6301939058171745, "grad_norm": 2.8970749378204346, "learning_rate": 3.176836341701081e-05, "loss": 0.4321, "step": 4550 }, { "epoch": 0.631578947368421, "grad_norm": 2.96341609954834, "learning_rate": 3.174980279337386e-05, "loss": 0.4049, "step": 4560 }, { "epoch": 0.6329639889196675, "grad_norm": 2.9328839778900146, "learning_rate": 3.1731242169736905e-05, "loss": 0.3811, "step": 4570 }, { "epoch": 0.6343490304709142, "grad_norm": 3.4064273834228516, "learning_rate": 3.171268154609995e-05, "loss": 0.4501, "step": 4580 }, { "epoch": 0.6357340720221607, "grad_norm": 4.2957682609558105, "learning_rate": 3.169597698482669e-05, "loss": 0.4221, "step": 4590 }, { "epoch": 0.6371191135734072, "grad_norm": 2.247894763946533, "learning_rate": 3.1677416361189736e-05, "loss": 0.391, "step": 4600 }, { "epoch": 0.6385041551246537, "grad_norm": 3.282683849334717, "learning_rate": 3.1658855737552786e-05, "loss": 0.4178, "step": 4610 }, { "epoch": 0.6398891966759003, "grad_norm": 7.642999172210693, "learning_rate": 3.164029511391583e-05, "loss": 0.4455, "step": 4620 }, { "epoch": 0.6412742382271468, "grad_norm": 2.312368392944336, "learning_rate": 3.1621734490278874e-05, "loss": 0.4496, "step": 4630 }, { "epoch": 0.6426592797783933, "grad_norm": 3.5621540546417236, "learning_rate": 3.160317386664192e-05, "loss": 0.4543, "step": 4640 }, { "epoch": 0.6440443213296398, "grad_norm": 3.406416416168213, "learning_rate": 3.158461324300497e-05, "loss": 0.455, "step": 4650 }, { "epoch": 0.6454293628808865, "grad_norm": 3.192976236343384, "learning_rate": 3.156605261936801e-05, "loss": 0.4025, "step": 4660 }, { "epoch": 0.646814404432133, "grad_norm": 5.069347858428955, "learning_rate": 3.154749199573106e-05, "loss": 0.3751, "step": 4670 }, { "epoch": 0.6481994459833795, "grad_norm": 4.459850311279297, "learning_rate": 3.152893137209411e-05, "loss": 0.4012, "step": 4680 }, { "epoch": 0.649584487534626, "grad_norm": 2.7644102573394775, "learning_rate": 3.151037074845715e-05, "loss": 0.3464, "step": 4690 }, { "epoch": 0.6509695290858726, "grad_norm": 3.0112130641937256, "learning_rate": 3.1491810124820194e-05, "loss": 0.4453, "step": 4700 }, { "epoch": 0.6523545706371191, "grad_norm": 4.661027908325195, "learning_rate": 3.147324950118324e-05, "loss": 0.4233, "step": 4710 }, { "epoch": 0.6537396121883656, "grad_norm": 5.308691501617432, "learning_rate": 3.145468887754629e-05, "loss": 0.4013, "step": 4720 }, { "epoch": 0.6551246537396122, "grad_norm": 3.739926815032959, "learning_rate": 3.143612825390933e-05, "loss": 0.3827, "step": 4730 }, { "epoch": 0.6565096952908587, "grad_norm": 3.5572757720947266, "learning_rate": 3.141756763027238e-05, "loss": 0.3911, "step": 4740 }, { "epoch": 0.6578947368421053, "grad_norm": 3.1638436317443848, "learning_rate": 3.139900700663543e-05, "loss": 0.3802, "step": 4750 }, { "epoch": 0.6592797783933518, "grad_norm": 3.4495441913604736, "learning_rate": 3.138044638299847e-05, "loss": 0.4217, "step": 4760 }, { "epoch": 0.6606648199445984, "grad_norm": 3.4912328720092773, "learning_rate": 3.1361885759361515e-05, "loss": 0.3968, "step": 4770 }, { "epoch": 0.6620498614958449, "grad_norm": 4.580535888671875, "learning_rate": 3.1343325135724565e-05, "loss": 0.4025, "step": 4780 }, { "epoch": 0.6634349030470914, "grad_norm": 2.982192039489746, "learning_rate": 3.132476451208761e-05, "loss": 0.4258, "step": 4790 }, { "epoch": 0.6648199445983379, "grad_norm": 3.1720199584960938, "learning_rate": 3.130620388845065e-05, "loss": 0.4045, "step": 4800 }, { "epoch": 0.6662049861495845, "grad_norm": 3.1427557468414307, "learning_rate": 3.12876432648137e-05, "loss": 0.4073, "step": 4810 }, { "epoch": 0.667590027700831, "grad_norm": 3.666034698486328, "learning_rate": 3.126908264117675e-05, "loss": 0.4056, "step": 4820 }, { "epoch": 0.6689750692520776, "grad_norm": 4.027162075042725, "learning_rate": 3.125052201753979e-05, "loss": 0.3917, "step": 4830 }, { "epoch": 0.6703601108033241, "grad_norm": 4.736123561859131, "learning_rate": 3.1231961393902835e-05, "loss": 0.4332, "step": 4840 }, { "epoch": 0.6717451523545707, "grad_norm": 3.1902873516082764, "learning_rate": 3.1213400770265885e-05, "loss": 0.4597, "step": 4850 }, { "epoch": 0.6731301939058172, "grad_norm": 3.289825916290283, "learning_rate": 3.119484014662893e-05, "loss": 0.4102, "step": 4860 }, { "epoch": 0.6745152354570637, "grad_norm": 3.381176471710205, "learning_rate": 3.117627952299198e-05, "loss": 0.3899, "step": 4870 }, { "epoch": 0.6759002770083102, "grad_norm": 3.6667535305023193, "learning_rate": 3.1157718899355024e-05, "loss": 0.3948, "step": 4880 }, { "epoch": 0.6772853185595568, "grad_norm": 4.411403656005859, "learning_rate": 3.113915827571807e-05, "loss": 0.396, "step": 4890 }, { "epoch": 0.6786703601108033, "grad_norm": 3.952453136444092, "learning_rate": 3.112059765208111e-05, "loss": 0.3817, "step": 4900 }, { "epoch": 0.6800554016620498, "grad_norm": 4.688000679016113, "learning_rate": 3.1102037028444155e-05, "loss": 0.4643, "step": 4910 }, { "epoch": 0.6814404432132964, "grad_norm": 3.189419984817505, "learning_rate": 3.1083476404807206e-05, "loss": 0.4133, "step": 4920 }, { "epoch": 0.682825484764543, "grad_norm": 2.98185658454895, "learning_rate": 3.106491578117025e-05, "loss": 0.3862, "step": 4930 }, { "epoch": 0.6842105263157895, "grad_norm": 3.0567667484283447, "learning_rate": 3.10463551575333e-05, "loss": 0.4629, "step": 4940 }, { "epoch": 0.685595567867036, "grad_norm": 2.5653200149536133, "learning_rate": 3.102779453389634e-05, "loss": 0.4463, "step": 4950 }, { "epoch": 0.6869806094182825, "grad_norm": 4.169416427612305, "learning_rate": 3.100923391025939e-05, "loss": 0.4068, "step": 4960 }, { "epoch": 0.6883656509695291, "grad_norm": 2.639570951461792, "learning_rate": 3.099067328662243e-05, "loss": 0.4079, "step": 4970 }, { "epoch": 0.6897506925207756, "grad_norm": 2.7639517784118652, "learning_rate": 3.097211266298548e-05, "loss": 0.3994, "step": 4980 }, { "epoch": 0.6911357340720221, "grad_norm": 3.754966974258423, "learning_rate": 3.0953552039348526e-05, "loss": 0.377, "step": 4990 }, { "epoch": 0.6925207756232687, "grad_norm": 7.626768589019775, "learning_rate": 3.093499141571157e-05, "loss": 0.4358, "step": 5000 }, { "epoch": 0.6939058171745153, "grad_norm": 2.9823813438415527, "learning_rate": 3.0916430792074613e-05, "loss": 0.412, "step": 5010 }, { "epoch": 0.6952908587257618, "grad_norm": 2.7320168018341064, "learning_rate": 3.089787016843766e-05, "loss": 0.3826, "step": 5020 }, { "epoch": 0.6966759002770083, "grad_norm": 2.937063694000244, "learning_rate": 3.087930954480071e-05, "loss": 0.368, "step": 5030 }, { "epoch": 0.6980609418282548, "grad_norm": 3.7012999057769775, "learning_rate": 3.086074892116375e-05, "loss": 0.4141, "step": 5040 }, { "epoch": 0.6994459833795014, "grad_norm": 3.345500946044922, "learning_rate": 3.08421882975268e-05, "loss": 0.4539, "step": 5050 }, { "epoch": 0.7008310249307479, "grad_norm": 2.865732192993164, "learning_rate": 3.0823627673889846e-05, "loss": 0.404, "step": 5060 }, { "epoch": 0.7022160664819944, "grad_norm": 3.1072158813476562, "learning_rate": 3.080506705025289e-05, "loss": 0.4306, "step": 5070 }, { "epoch": 0.703601108033241, "grad_norm": 3.3509767055511475, "learning_rate": 3.0786506426615934e-05, "loss": 0.4512, "step": 5080 }, { "epoch": 0.7049861495844876, "grad_norm": 3.022902727127075, "learning_rate": 3.0767945802978984e-05, "loss": 0.4477, "step": 5090 }, { "epoch": 0.7063711911357341, "grad_norm": 3.4046220779418945, "learning_rate": 3.074938517934203e-05, "loss": 0.384, "step": 5100 }, { "epoch": 0.7077562326869806, "grad_norm": 4.66195011138916, "learning_rate": 3.073082455570507e-05, "loss": 0.436, "step": 5110 }, { "epoch": 0.7091412742382271, "grad_norm": 2.5576562881469727, "learning_rate": 3.071226393206812e-05, "loss": 0.3902, "step": 5120 }, { "epoch": 0.7105263157894737, "grad_norm": 3.4099183082580566, "learning_rate": 3.0693703308431166e-05, "loss": 0.4065, "step": 5130 }, { "epoch": 0.7119113573407202, "grad_norm": 2.215935230255127, "learning_rate": 3.067514268479421e-05, "loss": 0.3731, "step": 5140 }, { "epoch": 0.7132963988919667, "grad_norm": 5.798324108123779, "learning_rate": 3.0656582061157254e-05, "loss": 0.4359, "step": 5150 }, { "epoch": 0.7146814404432132, "grad_norm": 1.9546353816986084, "learning_rate": 3.0638021437520305e-05, "loss": 0.3753, "step": 5160 }, { "epoch": 0.7160664819944599, "grad_norm": 2.7487800121307373, "learning_rate": 3.061946081388335e-05, "loss": 0.4366, "step": 5170 }, { "epoch": 0.7174515235457064, "grad_norm": 4.348243713378906, "learning_rate": 3.06009001902464e-05, "loss": 0.3798, "step": 5180 }, { "epoch": 0.7188365650969529, "grad_norm": 5.212021350860596, "learning_rate": 3.058233956660944e-05, "loss": 0.3622, "step": 5190 }, { "epoch": 0.7202216066481995, "grad_norm": 2.5864319801330566, "learning_rate": 3.0563778942972487e-05, "loss": 0.4204, "step": 5200 }, { "epoch": 0.721606648199446, "grad_norm": 4.407830715179443, "learning_rate": 3.054521831933553e-05, "loss": 0.3879, "step": 5210 }, { "epoch": 0.7229916897506925, "grad_norm": 3.6577796936035156, "learning_rate": 3.0526657695698574e-05, "loss": 0.3786, "step": 5220 }, { "epoch": 0.724376731301939, "grad_norm": 3.39837384223938, "learning_rate": 3.0508097072061625e-05, "loss": 0.402, "step": 5230 }, { "epoch": 0.7257617728531855, "grad_norm": 3.36942982673645, "learning_rate": 3.048953644842467e-05, "loss": 0.3856, "step": 5240 }, { "epoch": 0.7271468144044322, "grad_norm": 5.817683219909668, "learning_rate": 3.0470975824787716e-05, "loss": 0.4173, "step": 5250 }, { "epoch": 0.7285318559556787, "grad_norm": 3.100637674331665, "learning_rate": 3.045241520115076e-05, "loss": 0.3486, "step": 5260 }, { "epoch": 0.7299168975069252, "grad_norm": 6.541337490081787, "learning_rate": 3.0433854577513807e-05, "loss": 0.4273, "step": 5270 }, { "epoch": 0.7313019390581718, "grad_norm": 2.8053693771362305, "learning_rate": 3.041529395387685e-05, "loss": 0.4533, "step": 5280 }, { "epoch": 0.7326869806094183, "grad_norm": 2.527963638305664, "learning_rate": 3.03967333302399e-05, "loss": 0.3753, "step": 5290 }, { "epoch": 0.7340720221606648, "grad_norm": 2.9458463191986084, "learning_rate": 3.0378172706602945e-05, "loss": 0.4179, "step": 5300 }, { "epoch": 0.7354570637119113, "grad_norm": 3.373678207397461, "learning_rate": 3.035961208296599e-05, "loss": 0.4206, "step": 5310 }, { "epoch": 0.7368421052631579, "grad_norm": 2.686825752258301, "learning_rate": 3.0341051459329036e-05, "loss": 0.3849, "step": 5320 }, { "epoch": 0.7382271468144044, "grad_norm": 2.6222784519195557, "learning_rate": 3.032249083569208e-05, "loss": 0.3689, "step": 5330 }, { "epoch": 0.739612188365651, "grad_norm": 3.494692087173462, "learning_rate": 3.0303930212055127e-05, "loss": 0.4181, "step": 5340 }, { "epoch": 0.7409972299168975, "grad_norm": 4.222794055938721, "learning_rate": 3.028536958841817e-05, "loss": 0.4103, "step": 5350 }, { "epoch": 0.7423822714681441, "grad_norm": 2.4981813430786133, "learning_rate": 3.026680896478122e-05, "loss": 0.3996, "step": 5360 }, { "epoch": 0.7437673130193906, "grad_norm": 4.366548538208008, "learning_rate": 3.0248248341144265e-05, "loss": 0.3658, "step": 5370 }, { "epoch": 0.7451523545706371, "grad_norm": 3.9107649326324463, "learning_rate": 3.0229687717507312e-05, "loss": 0.4083, "step": 5380 }, { "epoch": 0.7465373961218836, "grad_norm": 2.8175508975982666, "learning_rate": 3.0211127093870356e-05, "loss": 0.3993, "step": 5390 }, { "epoch": 0.7479224376731302, "grad_norm": 3.2297942638397217, "learning_rate": 3.0192566470233403e-05, "loss": 0.3592, "step": 5400 }, { "epoch": 0.7493074792243767, "grad_norm": 6.544715404510498, "learning_rate": 3.0174005846596447e-05, "loss": 0.4534, "step": 5410 }, { "epoch": 0.7506925207756233, "grad_norm": 4.909923553466797, "learning_rate": 3.015544522295949e-05, "loss": 0.4161, "step": 5420 }, { "epoch": 0.7520775623268698, "grad_norm": 4.12788724899292, "learning_rate": 3.013688459932254e-05, "loss": 0.3657, "step": 5430 }, { "epoch": 0.7534626038781164, "grad_norm": 4.8378071784973145, "learning_rate": 3.0118323975685585e-05, "loss": 0.3741, "step": 5440 }, { "epoch": 0.7548476454293629, "grad_norm": 2.4138081073760986, "learning_rate": 3.0099763352048633e-05, "loss": 0.3844, "step": 5450 }, { "epoch": 0.7562326869806094, "grad_norm": 3.8834354877471924, "learning_rate": 3.0081202728411676e-05, "loss": 0.3943, "step": 5460 }, { "epoch": 0.7576177285318559, "grad_norm": 5.033694744110107, "learning_rate": 3.0062642104774724e-05, "loss": 0.3897, "step": 5470 }, { "epoch": 0.7590027700831025, "grad_norm": 3.515544891357422, "learning_rate": 3.0044081481137767e-05, "loss": 0.363, "step": 5480 }, { "epoch": 0.760387811634349, "grad_norm": 4.053267002105713, "learning_rate": 3.0025520857500815e-05, "loss": 0.3566, "step": 5490 }, { "epoch": 0.7617728531855956, "grad_norm": 2.7113759517669678, "learning_rate": 3.000696023386386e-05, "loss": 0.358, "step": 5500 }, { "epoch": 0.7631578947368421, "grad_norm": 2.1109673976898193, "learning_rate": 2.998839961022691e-05, "loss": 0.4205, "step": 5510 }, { "epoch": 0.7645429362880887, "grad_norm": 4.754786014556885, "learning_rate": 2.9969838986589953e-05, "loss": 0.414, "step": 5520 }, { "epoch": 0.7659279778393352, "grad_norm": 2.2650856971740723, "learning_rate": 2.9951278362952997e-05, "loss": 0.3979, "step": 5530 }, { "epoch": 0.7673130193905817, "grad_norm": 2.702939033508301, "learning_rate": 2.9932717739316044e-05, "loss": 0.3778, "step": 5540 }, { "epoch": 0.7686980609418282, "grad_norm": 4.2221455574035645, "learning_rate": 2.9914157115679088e-05, "loss": 0.3941, "step": 5550 }, { "epoch": 0.7700831024930748, "grad_norm": 2.9708549976348877, "learning_rate": 2.9895596492042135e-05, "loss": 0.4391, "step": 5560 }, { "epoch": 0.7714681440443213, "grad_norm": 4.177848815917969, "learning_rate": 2.987703586840518e-05, "loss": 0.3986, "step": 5570 }, { "epoch": 0.7728531855955678, "grad_norm": 5.936276912689209, "learning_rate": 2.985847524476823e-05, "loss": 0.3538, "step": 5580 }, { "epoch": 0.7742382271468145, "grad_norm": 2.817993640899658, "learning_rate": 2.9839914621131273e-05, "loss": 0.3736, "step": 5590 }, { "epoch": 0.775623268698061, "grad_norm": 3.635772466659546, "learning_rate": 2.982135399749432e-05, "loss": 0.3802, "step": 5600 }, { "epoch": 0.7770083102493075, "grad_norm": 4.219478607177734, "learning_rate": 2.980464943622106e-05, "loss": 0.4488, "step": 5610 }, { "epoch": 0.778393351800554, "grad_norm": 2.885037422180176, "learning_rate": 2.9789800937311498e-05, "loss": 0.4068, "step": 5620 }, { "epoch": 0.7797783933518005, "grad_norm": 4.46284818649292, "learning_rate": 2.9771240313674542e-05, "loss": 0.3819, "step": 5630 }, { "epoch": 0.7811634349030471, "grad_norm": 4.2689080238342285, "learning_rate": 2.975267969003759e-05, "loss": 0.4599, "step": 5640 }, { "epoch": 0.7825484764542936, "grad_norm": 2.5441861152648926, "learning_rate": 2.9734119066400633e-05, "loss": 0.3504, "step": 5650 }, { "epoch": 0.7839335180055401, "grad_norm": 4.003300189971924, "learning_rate": 2.971555844276368e-05, "loss": 0.3054, "step": 5660 }, { "epoch": 0.7853185595567868, "grad_norm": 3.1598663330078125, "learning_rate": 2.9696997819126724e-05, "loss": 0.3683, "step": 5670 }, { "epoch": 0.7867036011080333, "grad_norm": 3.1675422191619873, "learning_rate": 2.9678437195489768e-05, "loss": 0.4038, "step": 5680 }, { "epoch": 0.7880886426592798, "grad_norm": 2.500715732574463, "learning_rate": 2.965987657185282e-05, "loss": 0.3432, "step": 5690 }, { "epoch": 0.7894736842105263, "grad_norm": 3.7989180088043213, "learning_rate": 2.9641315948215862e-05, "loss": 0.3607, "step": 5700 }, { "epoch": 0.7908587257617729, "grad_norm": 3.9156484603881836, "learning_rate": 2.962275532457891e-05, "loss": 0.3906, "step": 5710 }, { "epoch": 0.7922437673130194, "grad_norm": 3.2225682735443115, "learning_rate": 2.9604194700941953e-05, "loss": 0.414, "step": 5720 }, { "epoch": 0.7936288088642659, "grad_norm": 2.859682321548462, "learning_rate": 2.9585634077305e-05, "loss": 0.3411, "step": 5730 }, { "epoch": 0.7950138504155124, "grad_norm": 2.8549094200134277, "learning_rate": 2.9567073453668044e-05, "loss": 0.3899, "step": 5740 }, { "epoch": 0.796398891966759, "grad_norm": 4.423649787902832, "learning_rate": 2.954851283003109e-05, "loss": 0.4039, "step": 5750 }, { "epoch": 0.7977839335180056, "grad_norm": 2.772611379623413, "learning_rate": 2.9529952206394135e-05, "loss": 0.4265, "step": 5760 }, { "epoch": 0.7991689750692521, "grad_norm": 3.356656789779663, "learning_rate": 2.9511391582757186e-05, "loss": 0.3442, "step": 5770 }, { "epoch": 0.8005540166204986, "grad_norm": 2.5400006771087646, "learning_rate": 2.949283095912023e-05, "loss": 0.3697, "step": 5780 }, { "epoch": 0.8019390581717452, "grad_norm": 2.1723203659057617, "learning_rate": 2.9474270335483273e-05, "loss": 0.3727, "step": 5790 }, { "epoch": 0.8033240997229917, "grad_norm": 5.687036514282227, "learning_rate": 2.945570971184632e-05, "loss": 0.4095, "step": 5800 }, { "epoch": 0.8047091412742382, "grad_norm": 3.2231178283691406, "learning_rate": 2.9437149088209364e-05, "loss": 0.3397, "step": 5810 }, { "epoch": 0.8060941828254847, "grad_norm": 2.743824005126953, "learning_rate": 2.941858846457241e-05, "loss": 0.3663, "step": 5820 }, { "epoch": 0.8074792243767313, "grad_norm": 3.0033442974090576, "learning_rate": 2.9400027840935455e-05, "loss": 0.3799, "step": 5830 }, { "epoch": 0.8088642659279779, "grad_norm": 3.7056267261505127, "learning_rate": 2.9381467217298506e-05, "loss": 0.388, "step": 5840 }, { "epoch": 0.8102493074792244, "grad_norm": 4.837870121002197, "learning_rate": 2.936290659366155e-05, "loss": 0.4043, "step": 5850 }, { "epoch": 0.8116343490304709, "grad_norm": 3.815831422805786, "learning_rate": 2.9344345970024597e-05, "loss": 0.34, "step": 5860 }, { "epoch": 0.8130193905817175, "grad_norm": 3.2043607234954834, "learning_rate": 2.932578534638764e-05, "loss": 0.3182, "step": 5870 }, { "epoch": 0.814404432132964, "grad_norm": 3.9446229934692383, "learning_rate": 2.9307224722750688e-05, "loss": 0.3658, "step": 5880 }, { "epoch": 0.8157894736842105, "grad_norm": 1.8936973810195923, "learning_rate": 2.9288664099113732e-05, "loss": 0.3958, "step": 5890 }, { "epoch": 0.817174515235457, "grad_norm": 4.075359344482422, "learning_rate": 2.9270103475476776e-05, "loss": 0.4343, "step": 5900 }, { "epoch": 0.8185595567867036, "grad_norm": 3.6802127361297607, "learning_rate": 2.9251542851839826e-05, "loss": 0.3579, "step": 5910 }, { "epoch": 0.8199445983379502, "grad_norm": 2.847928285598755, "learning_rate": 2.923298222820287e-05, "loss": 0.3709, "step": 5920 }, { "epoch": 0.8213296398891967, "grad_norm": 3.1300485134124756, "learning_rate": 2.9214421604565917e-05, "loss": 0.3932, "step": 5930 }, { "epoch": 0.8227146814404432, "grad_norm": 4.011430263519287, "learning_rate": 2.919586098092896e-05, "loss": 0.3944, "step": 5940 }, { "epoch": 0.8240997229916898, "grad_norm": 2.8449504375457764, "learning_rate": 2.917730035729201e-05, "loss": 0.3857, "step": 5950 }, { "epoch": 0.8254847645429363, "grad_norm": 2.396585464477539, "learning_rate": 2.9158739733655052e-05, "loss": 0.3609, "step": 5960 }, { "epoch": 0.8268698060941828, "grad_norm": 2.1157963275909424, "learning_rate": 2.91401791100181e-05, "loss": 0.3388, "step": 5970 }, { "epoch": 0.8282548476454293, "grad_norm": 4.279614448547363, "learning_rate": 2.9121618486381143e-05, "loss": 0.3645, "step": 5980 }, { "epoch": 0.8296398891966759, "grad_norm": 2.4833781719207764, "learning_rate": 2.9103057862744187e-05, "loss": 0.3857, "step": 5990 }, { "epoch": 0.8310249307479224, "grad_norm": 3.525597095489502, "learning_rate": 2.9084497239107238e-05, "loss": 0.3576, "step": 6000 }, { "epoch": 0.8313019390581717, "eval_loss": 0.3741193115711212, "eval_runtime": 1433.4277, "eval_samples_per_second": 6.362, "eval_steps_per_second": 0.795, "step": 6002 }, { "epoch": 0.832409972299169, "grad_norm": 4.141083717346191, "learning_rate": 2.906593661547028e-05, "loss": 0.3799, "step": 6010 }, { "epoch": 0.8337950138504155, "grad_norm": 5.583711624145508, "learning_rate": 2.904737599183333e-05, "loss": 0.393, "step": 6020 }, { "epoch": 0.8351800554016621, "grad_norm": 2.304410457611084, "learning_rate": 2.9028815368196372e-05, "loss": 0.3457, "step": 6030 }, { "epoch": 0.8365650969529086, "grad_norm": 2.9231860637664795, "learning_rate": 2.901025474455942e-05, "loss": 0.3795, "step": 6040 }, { "epoch": 0.8379501385041551, "grad_norm": 5.1649980545043945, "learning_rate": 2.8991694120922463e-05, "loss": 0.3586, "step": 6050 }, { "epoch": 0.8393351800554016, "grad_norm": 3.0985405445098877, "learning_rate": 2.8973133497285514e-05, "loss": 0.4043, "step": 6060 }, { "epoch": 0.8407202216066482, "grad_norm": 3.0035006999969482, "learning_rate": 2.8954572873648558e-05, "loss": 0.3332, "step": 6070 }, { "epoch": 0.8421052631578947, "grad_norm": 3.4762930870056152, "learning_rate": 2.8936012250011605e-05, "loss": 0.4091, "step": 6080 }, { "epoch": 0.8434903047091413, "grad_norm": 5.735511302947998, "learning_rate": 2.891745162637465e-05, "loss": 0.3473, "step": 6090 }, { "epoch": 0.8448753462603878, "grad_norm": 3.5551562309265137, "learning_rate": 2.8898891002737693e-05, "loss": 0.3818, "step": 6100 }, { "epoch": 0.8462603878116344, "grad_norm": 3.491703510284424, "learning_rate": 2.888033037910074e-05, "loss": 0.3756, "step": 6110 }, { "epoch": 0.8476454293628809, "grad_norm": 2.843029260635376, "learning_rate": 2.8861769755463784e-05, "loss": 0.3641, "step": 6120 }, { "epoch": 0.8490304709141274, "grad_norm": 2.7495148181915283, "learning_rate": 2.8843209131826834e-05, "loss": 0.4011, "step": 6130 }, { "epoch": 0.850415512465374, "grad_norm": 4.130334377288818, "learning_rate": 2.8824648508189878e-05, "loss": 0.3776, "step": 6140 }, { "epoch": 0.8518005540166205, "grad_norm": 4.290791034698486, "learning_rate": 2.8806087884552925e-05, "loss": 0.4193, "step": 6150 }, { "epoch": 0.853185595567867, "grad_norm": 2.533083438873291, "learning_rate": 2.878752726091597e-05, "loss": 0.3763, "step": 6160 }, { "epoch": 0.8545706371191135, "grad_norm": 5.1947712898254395, "learning_rate": 2.8768966637279016e-05, "loss": 0.393, "step": 6170 }, { "epoch": 0.8559556786703602, "grad_norm": 2.4162371158599854, "learning_rate": 2.875040601364206e-05, "loss": 0.4049, "step": 6180 }, { "epoch": 0.8573407202216067, "grad_norm": 3.275009870529175, "learning_rate": 2.8731845390005107e-05, "loss": 0.3903, "step": 6190 }, { "epoch": 0.8587257617728532, "grad_norm": 2.260618209838867, "learning_rate": 2.871328476636815e-05, "loss": 0.3721, "step": 6200 }, { "epoch": 0.8601108033240997, "grad_norm": 2.923968553543091, "learning_rate": 2.8694724142731195e-05, "loss": 0.3749, "step": 6210 }, { "epoch": 0.8614958448753463, "grad_norm": 5.046422481536865, "learning_rate": 2.8676163519094245e-05, "loss": 0.4056, "step": 6220 }, { "epoch": 0.8628808864265928, "grad_norm": 3.8435168266296387, "learning_rate": 2.865760289545729e-05, "loss": 0.3755, "step": 6230 }, { "epoch": 0.8642659279778393, "grad_norm": 3.553178548812866, "learning_rate": 2.8639042271820336e-05, "loss": 0.3926, "step": 6240 }, { "epoch": 0.8656509695290858, "grad_norm": 3.8052849769592285, "learning_rate": 2.862048164818338e-05, "loss": 0.3685, "step": 6250 }, { "epoch": 0.8670360110803325, "grad_norm": 2.288756847381592, "learning_rate": 2.8601921024546427e-05, "loss": 0.3499, "step": 6260 }, { "epoch": 0.868421052631579, "grad_norm": 3.1977062225341797, "learning_rate": 2.858336040090947e-05, "loss": 0.3487, "step": 6270 }, { "epoch": 0.8698060941828255, "grad_norm": 3.752891778945923, "learning_rate": 2.8564799777272522e-05, "loss": 0.3952, "step": 6280 }, { "epoch": 0.871191135734072, "grad_norm": 2.5730514526367188, "learning_rate": 2.8546239153635566e-05, "loss": 0.349, "step": 6290 }, { "epoch": 0.8725761772853186, "grad_norm": 3.132359027862549, "learning_rate": 2.852767852999861e-05, "loss": 0.3595, "step": 6300 }, { "epoch": 0.8739612188365651, "grad_norm": 3.2165608406066895, "learning_rate": 2.8509117906361657e-05, "loss": 0.3478, "step": 6310 }, { "epoch": 0.8753462603878116, "grad_norm": 10.091190338134766, "learning_rate": 2.84905572827247e-05, "loss": 0.3414, "step": 6320 }, { "epoch": 0.8767313019390581, "grad_norm": 3.1347556114196777, "learning_rate": 2.8471996659087748e-05, "loss": 0.4414, "step": 6330 }, { "epoch": 0.8781163434903048, "grad_norm": 2.5149002075195312, "learning_rate": 2.845343603545079e-05, "loss": 0.3524, "step": 6340 }, { "epoch": 0.8795013850415513, "grad_norm": 3.1567630767822266, "learning_rate": 2.843487541181384e-05, "loss": 0.3568, "step": 6350 }, { "epoch": 0.8808864265927978, "grad_norm": 3.1177406311035156, "learning_rate": 2.8416314788176883e-05, "loss": 0.3671, "step": 6360 }, { "epoch": 0.8822714681440443, "grad_norm": 2.766374349594116, "learning_rate": 2.8397754164539933e-05, "loss": 0.3389, "step": 6370 }, { "epoch": 0.8836565096952909, "grad_norm": 2.419781446456909, "learning_rate": 2.8379193540902977e-05, "loss": 0.3868, "step": 6380 }, { "epoch": 0.8850415512465374, "grad_norm": 2.5221714973449707, "learning_rate": 2.8360632917266024e-05, "loss": 0.3744, "step": 6390 }, { "epoch": 0.8864265927977839, "grad_norm": 4.218471050262451, "learning_rate": 2.8342072293629068e-05, "loss": 0.3245, "step": 6400 }, { "epoch": 0.8878116343490304, "grad_norm": 2.45046067237854, "learning_rate": 2.8323511669992112e-05, "loss": 0.3801, "step": 6410 }, { "epoch": 0.889196675900277, "grad_norm": 3.297358989715576, "learning_rate": 2.830495104635516e-05, "loss": 0.364, "step": 6420 }, { "epoch": 0.8905817174515236, "grad_norm": 2.321702003479004, "learning_rate": 2.8286390422718203e-05, "loss": 0.3487, "step": 6430 }, { "epoch": 0.8919667590027701, "grad_norm": 2.7762234210968018, "learning_rate": 2.8267829799081253e-05, "loss": 0.3723, "step": 6440 }, { "epoch": 0.8933518005540166, "grad_norm": 2.534067392349243, "learning_rate": 2.8249269175444297e-05, "loss": 0.3825, "step": 6450 }, { "epoch": 0.8947368421052632, "grad_norm": 6.5774078369140625, "learning_rate": 2.8230708551807344e-05, "loss": 0.43, "step": 6460 }, { "epoch": 0.8961218836565097, "grad_norm": 2.7400996685028076, "learning_rate": 2.8212147928170388e-05, "loss": 0.3599, "step": 6470 }, { "epoch": 0.8975069252077562, "grad_norm": 1.8518990278244019, "learning_rate": 2.8193587304533435e-05, "loss": 0.3228, "step": 6480 }, { "epoch": 0.8988919667590027, "grad_norm": 2.071018695831299, "learning_rate": 2.817502668089648e-05, "loss": 0.3509, "step": 6490 }, { "epoch": 0.9002770083102493, "grad_norm": 3.566608428955078, "learning_rate": 2.815646605725953e-05, "loss": 0.3726, "step": 6500 }, { "epoch": 0.9016620498614959, "grad_norm": 2.569943904876709, "learning_rate": 2.8137905433622574e-05, "loss": 0.3811, "step": 6510 }, { "epoch": 0.9030470914127424, "grad_norm": 3.897632598876953, "learning_rate": 2.8119344809985617e-05, "loss": 0.3855, "step": 6520 }, { "epoch": 0.9044321329639889, "grad_norm": 2.6584584712982178, "learning_rate": 2.8100784186348665e-05, "loss": 0.3709, "step": 6530 }, { "epoch": 0.9058171745152355, "grad_norm": 2.8664989471435547, "learning_rate": 2.808222356271171e-05, "loss": 0.3666, "step": 6540 }, { "epoch": 0.907202216066482, "grad_norm": 2.7662582397460938, "learning_rate": 2.8063662939074756e-05, "loss": 0.3615, "step": 6550 }, { "epoch": 0.9085872576177285, "grad_norm": 3.2466659545898438, "learning_rate": 2.80451023154378e-05, "loss": 0.4115, "step": 6560 }, { "epoch": 0.909972299168975, "grad_norm": 2.0117952823638916, "learning_rate": 2.8026541691800847e-05, "loss": 0.336, "step": 6570 }, { "epoch": 0.9113573407202216, "grad_norm": 3.4352803230285645, "learning_rate": 2.800798106816389e-05, "loss": 0.3656, "step": 6580 }, { "epoch": 0.9127423822714681, "grad_norm": 2.4090380668640137, "learning_rate": 2.798942044452694e-05, "loss": 0.3774, "step": 6590 }, { "epoch": 0.9141274238227147, "grad_norm": 4.36330509185791, "learning_rate": 2.7970859820889985e-05, "loss": 0.4315, "step": 6600 }, { "epoch": 0.9155124653739612, "grad_norm": 3.1175436973571777, "learning_rate": 2.7952299197253032e-05, "loss": 0.3692, "step": 6610 }, { "epoch": 0.9168975069252078, "grad_norm": 3.6826770305633545, "learning_rate": 2.7933738573616076e-05, "loss": 0.4001, "step": 6620 }, { "epoch": 0.9182825484764543, "grad_norm": 2.450596570968628, "learning_rate": 2.791517794997912e-05, "loss": 0.3838, "step": 6630 }, { "epoch": 0.9196675900277008, "grad_norm": 3.6701254844665527, "learning_rate": 2.7896617326342167e-05, "loss": 0.3145, "step": 6640 }, { "epoch": 0.9210526315789473, "grad_norm": 3.4804341793060303, "learning_rate": 2.787805670270521e-05, "loss": 0.3894, "step": 6650 }, { "epoch": 0.9224376731301939, "grad_norm": 3.159144163131714, "learning_rate": 2.785949607906826e-05, "loss": 0.338, "step": 6660 }, { "epoch": 0.9238227146814404, "grad_norm": 4.329410076141357, "learning_rate": 2.7840935455431305e-05, "loss": 0.3899, "step": 6670 }, { "epoch": 0.925207756232687, "grad_norm": 3.3670008182525635, "learning_rate": 2.7822374831794352e-05, "loss": 0.307, "step": 6680 }, { "epoch": 0.9265927977839336, "grad_norm": 2.9940109252929688, "learning_rate": 2.7803814208157396e-05, "loss": 0.3342, "step": 6690 }, { "epoch": 0.9279778393351801, "grad_norm": 3.1586687564849854, "learning_rate": 2.7785253584520443e-05, "loss": 0.3562, "step": 6700 }, { "epoch": 0.9293628808864266, "grad_norm": 2.943342447280884, "learning_rate": 2.7766692960883487e-05, "loss": 0.3607, "step": 6710 }, { "epoch": 0.9307479224376731, "grad_norm": 2.752495050430298, "learning_rate": 2.774813233724653e-05, "loss": 0.4183, "step": 6720 }, { "epoch": 0.9321329639889196, "grad_norm": 4.039161205291748, "learning_rate": 2.772957171360958e-05, "loss": 0.3647, "step": 6730 }, { "epoch": 0.9335180055401662, "grad_norm": 3.0139365196228027, "learning_rate": 2.7711011089972625e-05, "loss": 0.3389, "step": 6740 }, { "epoch": 0.9349030470914127, "grad_norm": 3.619724750518799, "learning_rate": 2.7692450466335672e-05, "loss": 0.3462, "step": 6750 }, { "epoch": 0.9362880886426593, "grad_norm": 1.989438772201538, "learning_rate": 2.7673889842698716e-05, "loss": 0.3869, "step": 6760 }, { "epoch": 0.9376731301939059, "grad_norm": 2.1334989070892334, "learning_rate": 2.7655329219061763e-05, "loss": 0.317, "step": 6770 }, { "epoch": 0.9390581717451524, "grad_norm": 3.1076736450195312, "learning_rate": 2.7636768595424807e-05, "loss": 0.3654, "step": 6780 }, { "epoch": 0.9404432132963989, "grad_norm": 4.218199253082275, "learning_rate": 2.7618207971787854e-05, "loss": 0.356, "step": 6790 }, { "epoch": 0.9418282548476454, "grad_norm": 3.333724021911621, "learning_rate": 2.7599647348150898e-05, "loss": 0.3675, "step": 6800 }, { "epoch": 0.943213296398892, "grad_norm": 2.9643044471740723, "learning_rate": 2.758108672451395e-05, "loss": 0.3695, "step": 6810 }, { "epoch": 0.9445983379501385, "grad_norm": 4.082902431488037, "learning_rate": 2.7562526100876993e-05, "loss": 0.3575, "step": 6820 }, { "epoch": 0.945983379501385, "grad_norm": 2.4494361877441406, "learning_rate": 2.7543965477240037e-05, "loss": 0.3926, "step": 6830 }, { "epoch": 0.9473684210526315, "grad_norm": 2.541417121887207, "learning_rate": 2.7525404853603084e-05, "loss": 0.3666, "step": 6840 }, { "epoch": 0.9487534626038782, "grad_norm": 3.4122767448425293, "learning_rate": 2.7506844229966128e-05, "loss": 0.3842, "step": 6850 }, { "epoch": 0.9501385041551247, "grad_norm": 4.269729137420654, "learning_rate": 2.7488283606329175e-05, "loss": 0.375, "step": 6860 }, { "epoch": 0.9515235457063712, "grad_norm": 2.563281297683716, "learning_rate": 2.746972298269222e-05, "loss": 0.3856, "step": 6870 }, { "epoch": 0.9529085872576177, "grad_norm": 3.160914182662964, "learning_rate": 2.745116235905527e-05, "loss": 0.3851, "step": 6880 }, { "epoch": 0.9542936288088643, "grad_norm": 2.7195470333099365, "learning_rate": 2.7432601735418313e-05, "loss": 0.3667, "step": 6890 }, { "epoch": 0.9556786703601108, "grad_norm": 2.226888418197632, "learning_rate": 2.741404111178136e-05, "loss": 0.397, "step": 6900 }, { "epoch": 0.9570637119113573, "grad_norm": 2.3146603107452393, "learning_rate": 2.7395480488144404e-05, "loss": 0.3152, "step": 6910 }, { "epoch": 0.9584487534626038, "grad_norm": 5.1704487800598145, "learning_rate": 2.737691986450745e-05, "loss": 0.389, "step": 6920 }, { "epoch": 0.9598337950138505, "grad_norm": 3.0150206089019775, "learning_rate": 2.7358359240870495e-05, "loss": 0.3743, "step": 6930 }, { "epoch": 0.961218836565097, "grad_norm": 4.35557222366333, "learning_rate": 2.733979861723354e-05, "loss": 0.3308, "step": 6940 }, { "epoch": 0.9626038781163435, "grad_norm": 3.1160595417022705, "learning_rate": 2.732123799359659e-05, "loss": 0.3711, "step": 6950 }, { "epoch": 0.96398891966759, "grad_norm": 2.697511672973633, "learning_rate": 2.7302677369959633e-05, "loss": 0.3848, "step": 6960 }, { "epoch": 0.9653739612188366, "grad_norm": 2.6081149578094482, "learning_rate": 2.728411674632268e-05, "loss": 0.3419, "step": 6970 }, { "epoch": 0.9667590027700831, "grad_norm": 3.462388753890991, "learning_rate": 2.7265556122685724e-05, "loss": 0.351, "step": 6980 }, { "epoch": 0.9681440443213296, "grad_norm": 4.886613368988037, "learning_rate": 2.724699549904877e-05, "loss": 0.3263, "step": 6990 }, { "epoch": 0.9695290858725761, "grad_norm": 3.7553138732910156, "learning_rate": 2.7228434875411815e-05, "loss": 0.3353, "step": 7000 }, { "epoch": 0.9709141274238227, "grad_norm": 2.2478718757629395, "learning_rate": 2.7209874251774862e-05, "loss": 0.3363, "step": 7010 }, { "epoch": 0.9722991689750693, "grad_norm": 2.6377060413360596, "learning_rate": 2.7191313628137906e-05, "loss": 0.3089, "step": 7020 }, { "epoch": 0.9736842105263158, "grad_norm": 4.078857898712158, "learning_rate": 2.717275300450095e-05, "loss": 0.3259, "step": 7030 }, { "epoch": 0.9750692520775623, "grad_norm": 4.050114154815674, "learning_rate": 2.7154192380864e-05, "loss": 0.3485, "step": 7040 }, { "epoch": 0.9764542936288089, "grad_norm": 2.680589437484741, "learning_rate": 2.7135631757227044e-05, "loss": 0.3903, "step": 7050 }, { "epoch": 0.9778393351800554, "grad_norm": 2.600092649459839, "learning_rate": 2.711707113359009e-05, "loss": 0.3716, "step": 7060 }, { "epoch": 0.9792243767313019, "grad_norm": 2.405036211013794, "learning_rate": 2.7098510509953135e-05, "loss": 0.3366, "step": 7070 }, { "epoch": 0.9806094182825484, "grad_norm": 2.041038751602173, "learning_rate": 2.7079949886316183e-05, "loss": 0.3859, "step": 7080 }, { "epoch": 0.981994459833795, "grad_norm": 3.4083938598632812, "learning_rate": 2.7061389262679226e-05, "loss": 0.3576, "step": 7090 }, { "epoch": 0.9833795013850416, "grad_norm": 2.4382524490356445, "learning_rate": 2.7042828639042277e-05, "loss": 0.3199, "step": 7100 }, { "epoch": 0.9847645429362881, "grad_norm": 2.378704071044922, "learning_rate": 2.702426801540532e-05, "loss": 0.399, "step": 7110 }, { "epoch": 0.9861495844875346, "grad_norm": 1.8466202020645142, "learning_rate": 2.7005707391768368e-05, "loss": 0.3673, "step": 7120 }, { "epoch": 0.9875346260387812, "grad_norm": 2.051624059677124, "learning_rate": 2.6987146768131412e-05, "loss": 0.327, "step": 7130 }, { "epoch": 0.9889196675900277, "grad_norm": 3.790463924407959, "learning_rate": 2.6968586144494456e-05, "loss": 0.3481, "step": 7140 }, { "epoch": 0.9903047091412742, "grad_norm": 3.0214810371398926, "learning_rate": 2.6950025520857503e-05, "loss": 0.2776, "step": 7150 }, { "epoch": 0.9916897506925207, "grad_norm": 3.270256519317627, "learning_rate": 2.6931464897220547e-05, "loss": 0.4097, "step": 7160 }, { "epoch": 0.9930747922437673, "grad_norm": 1.8735462427139282, "learning_rate": 2.6912904273583594e-05, "loss": 0.3464, "step": 7170 }, { "epoch": 0.9944598337950139, "grad_norm": 2.376981496810913, "learning_rate": 2.6894343649946638e-05, "loss": 0.2948, "step": 7180 }, { "epoch": 0.9958448753462604, "grad_norm": 2.929786205291748, "learning_rate": 2.6875783026309688e-05, "loss": 0.3857, "step": 7190 }, { "epoch": 0.997229916897507, "grad_norm": 3.4466960430145264, "learning_rate": 2.6857222402672732e-05, "loss": 0.4073, "step": 7200 }, { "epoch": 0.9986149584487535, "grad_norm": 2.4240450859069824, "learning_rate": 2.683866177903578e-05, "loss": 0.3999, "step": 7210 }, { "epoch": 1.0, "grad_norm": 4.055125713348389, "learning_rate": 2.6820101155398823e-05, "loss": 0.3633, "step": 7220 }, { "epoch": 1.0013850415512466, "grad_norm": 2.4502010345458984, "learning_rate": 2.680154053176187e-05, "loss": 0.3134, "step": 7230 }, { "epoch": 1.002770083102493, "grad_norm": 6.7965779304504395, "learning_rate": 2.6782979908124914e-05, "loss": 0.2852, "step": 7240 }, { "epoch": 1.0041551246537397, "grad_norm": 3.1844305992126465, "learning_rate": 2.6764419284487958e-05, "loss": 0.2654, "step": 7250 }, { "epoch": 1.005540166204986, "grad_norm": 3.839517831802368, "learning_rate": 2.674585866085101e-05, "loss": 0.3125, "step": 7260 }, { "epoch": 1.0069252077562327, "grad_norm": 2.922276020050049, "learning_rate": 2.6727298037214052e-05, "loss": 0.3124, "step": 7270 }, { "epoch": 1.0083102493074791, "grad_norm": 1.8956462144851685, "learning_rate": 2.67087374135771e-05, "loss": 0.3266, "step": 7280 }, { "epoch": 1.0096952908587258, "grad_norm": 2.989438533782959, "learning_rate": 2.6690176789940143e-05, "loss": 0.3117, "step": 7290 }, { "epoch": 1.0110803324099722, "grad_norm": 4.252190589904785, "learning_rate": 2.667161616630319e-05, "loss": 0.3084, "step": 7300 }, { "epoch": 1.0124653739612188, "grad_norm": 2.619835138320923, "learning_rate": 2.6653055542666234e-05, "loss": 0.3015, "step": 7310 }, { "epoch": 1.0138504155124655, "grad_norm": 3.276683807373047, "learning_rate": 2.6634494919029285e-05, "loss": 0.3145, "step": 7320 }, { "epoch": 1.0152354570637119, "grad_norm": 1.749790906906128, "learning_rate": 2.661593429539233e-05, "loss": 0.291, "step": 7330 }, { "epoch": 1.0166204986149585, "grad_norm": 3.3701796531677246, "learning_rate": 2.6597373671755373e-05, "loss": 0.3213, "step": 7340 }, { "epoch": 1.018005540166205, "grad_norm": 2.0237927436828613, "learning_rate": 2.657881304811842e-05, "loss": 0.285, "step": 7350 }, { "epoch": 1.0193905817174516, "grad_norm": 5.210159778594971, "learning_rate": 2.6560252424481464e-05, "loss": 0.3031, "step": 7360 }, { "epoch": 1.020775623268698, "grad_norm": 3.3727283477783203, "learning_rate": 2.654169180084451e-05, "loss": 0.299, "step": 7370 }, { "epoch": 1.0221606648199446, "grad_norm": 2.3973450660705566, "learning_rate": 2.6523131177207555e-05, "loss": 0.29, "step": 7380 }, { "epoch": 1.0235457063711912, "grad_norm": 2.515382766723633, "learning_rate": 2.6504570553570602e-05, "loss": 0.2893, "step": 7390 }, { "epoch": 1.0249307479224377, "grad_norm": 2.33882999420166, "learning_rate": 2.6486009929933646e-05, "loss": 0.2978, "step": 7400 }, { "epoch": 1.0263157894736843, "grad_norm": 2.946920871734619, "learning_rate": 2.6467449306296696e-05, "loss": 0.2885, "step": 7410 }, { "epoch": 1.0277008310249307, "grad_norm": 6.108001708984375, "learning_rate": 2.644888868265974e-05, "loss": 0.283, "step": 7420 }, { "epoch": 1.0290858725761773, "grad_norm": 2.6035192012786865, "learning_rate": 2.6430328059022787e-05, "loss": 0.3212, "step": 7430 }, { "epoch": 1.0304709141274238, "grad_norm": 2.987347364425659, "learning_rate": 2.641176743538583e-05, "loss": 0.2891, "step": 7440 }, { "epoch": 1.0318559556786704, "grad_norm": 2.5733394622802734, "learning_rate": 2.6393206811748875e-05, "loss": 0.3054, "step": 7450 }, { "epoch": 1.0332409972299168, "grad_norm": 3.5433621406555176, "learning_rate": 2.6374646188111922e-05, "loss": 0.3316, "step": 7460 }, { "epoch": 1.0346260387811634, "grad_norm": 2.2597479820251465, "learning_rate": 2.6356085564474966e-05, "loss": 0.3214, "step": 7470 }, { "epoch": 1.03601108033241, "grad_norm": 3.3408634662628174, "learning_rate": 2.6337524940838016e-05, "loss": 0.2911, "step": 7480 }, { "epoch": 1.0373961218836565, "grad_norm": 2.973517417907715, "learning_rate": 2.631896431720106e-05, "loss": 0.263, "step": 7490 }, { "epoch": 1.0387811634349031, "grad_norm": 2.121873378753662, "learning_rate": 2.6300403693564107e-05, "loss": 0.2777, "step": 7500 }, { "epoch": 1.0401662049861495, "grad_norm": 1.9657741785049438, "learning_rate": 2.628184306992715e-05, "loss": 0.2595, "step": 7510 }, { "epoch": 1.0415512465373962, "grad_norm": 2.234025716781616, "learning_rate": 2.62632824462902e-05, "loss": 0.2688, "step": 7520 }, { "epoch": 1.0429362880886426, "grad_norm": 2.578979969024658, "learning_rate": 2.6244721822653242e-05, "loss": 0.3076, "step": 7530 }, { "epoch": 1.0443213296398892, "grad_norm": 3.483440637588501, "learning_rate": 2.6226161199016293e-05, "loss": 0.2832, "step": 7540 }, { "epoch": 1.0457063711911356, "grad_norm": 1.7430837154388428, "learning_rate": 2.6207600575379337e-05, "loss": 0.2926, "step": 7550 }, { "epoch": 1.0470914127423823, "grad_norm": 3.583524703979492, "learning_rate": 2.618903995174238e-05, "loss": 0.3038, "step": 7560 }, { "epoch": 1.048476454293629, "grad_norm": 2.33888840675354, "learning_rate": 2.6170479328105428e-05, "loss": 0.3276, "step": 7570 }, { "epoch": 1.0498614958448753, "grad_norm": 2.268723249435425, "learning_rate": 2.615191870446847e-05, "loss": 0.2792, "step": 7580 }, { "epoch": 1.051246537396122, "grad_norm": 2.320464611053467, "learning_rate": 2.613335808083152e-05, "loss": 0.2704, "step": 7590 }, { "epoch": 1.0526315789473684, "grad_norm": 2.0587191581726074, "learning_rate": 2.6114797457194562e-05, "loss": 0.3529, "step": 7600 }, { "epoch": 1.054016620498615, "grad_norm": 3.5844640731811523, "learning_rate": 2.609623683355761e-05, "loss": 0.3146, "step": 7610 }, { "epoch": 1.0554016620498614, "grad_norm": 2.7052695751190186, "learning_rate": 2.6077676209920653e-05, "loss": 0.3062, "step": 7620 }, { "epoch": 1.056786703601108, "grad_norm": 2.0099308490753174, "learning_rate": 2.6059115586283704e-05, "loss": 0.2721, "step": 7630 }, { "epoch": 1.0581717451523547, "grad_norm": 2.6577095985412598, "learning_rate": 2.6040554962646748e-05, "loss": 0.3372, "step": 7640 }, { "epoch": 1.059556786703601, "grad_norm": 5.141023635864258, "learning_rate": 2.602199433900979e-05, "loss": 0.2875, "step": 7650 }, { "epoch": 1.0609418282548477, "grad_norm": 3.8370797634124756, "learning_rate": 2.600343371537284e-05, "loss": 0.2623, "step": 7660 }, { "epoch": 1.0623268698060941, "grad_norm": 2.1279635429382324, "learning_rate": 2.5984873091735883e-05, "loss": 0.2965, "step": 7670 }, { "epoch": 1.0637119113573408, "grad_norm": 3.3452510833740234, "learning_rate": 2.596816853046263e-05, "loss": 0.2858, "step": 7680 }, { "epoch": 1.0650969529085872, "grad_norm": 1.935064435005188, "learning_rate": 2.5949607906825673e-05, "loss": 0.2622, "step": 7690 }, { "epoch": 1.0664819944598338, "grad_norm": 3.9529714584350586, "learning_rate": 2.593104728318872e-05, "loss": 0.3025, "step": 7700 }, { "epoch": 1.0678670360110802, "grad_norm": 4.040821075439453, "learning_rate": 2.5912486659551764e-05, "loss": 0.2804, "step": 7710 }, { "epoch": 1.0692520775623269, "grad_norm": 2.2184488773345947, "learning_rate": 2.5893926035914808e-05, "loss": 0.2972, "step": 7720 }, { "epoch": 1.0706371191135735, "grad_norm": 2.0147476196289062, "learning_rate": 2.5875365412277855e-05, "loss": 0.2688, "step": 7730 }, { "epoch": 1.07202216066482, "grad_norm": 2.8058619499206543, "learning_rate": 2.58568047886409e-05, "loss": 0.2834, "step": 7740 }, { "epoch": 1.0734072022160666, "grad_norm": 2.537014961242676, "learning_rate": 2.5838244165003946e-05, "loss": 0.3082, "step": 7750 }, { "epoch": 1.074792243767313, "grad_norm": 2.190993309020996, "learning_rate": 2.581968354136699e-05, "loss": 0.279, "step": 7760 }, { "epoch": 1.0761772853185596, "grad_norm": 2.4779722690582275, "learning_rate": 2.580112291773004e-05, "loss": 0.3169, "step": 7770 }, { "epoch": 1.077562326869806, "grad_norm": 2.9042325019836426, "learning_rate": 2.5782562294093084e-05, "loss": 0.2668, "step": 7780 }, { "epoch": 1.0789473684210527, "grad_norm": 3.5843074321746826, "learning_rate": 2.576400167045613e-05, "loss": 0.3598, "step": 7790 }, { "epoch": 1.080332409972299, "grad_norm": 3.1596109867095947, "learning_rate": 2.5745441046819175e-05, "loss": 0.3084, "step": 7800 }, { "epoch": 1.0817174515235457, "grad_norm": 2.0699515342712402, "learning_rate": 2.572688042318222e-05, "loss": 0.2557, "step": 7810 }, { "epoch": 1.0831024930747923, "grad_norm": 2.1602230072021484, "learning_rate": 2.5708319799545266e-05, "loss": 0.3003, "step": 7820 }, { "epoch": 1.0844875346260388, "grad_norm": 2.242274522781372, "learning_rate": 2.568975917590831e-05, "loss": 0.2886, "step": 7830 }, { "epoch": 1.0858725761772854, "grad_norm": 2.3011515140533447, "learning_rate": 2.567119855227136e-05, "loss": 0.3011, "step": 7840 }, { "epoch": 1.0872576177285318, "grad_norm": 2.657965898513794, "learning_rate": 2.56544939909981e-05, "loss": 0.3247, "step": 7850 }, { "epoch": 1.0886426592797784, "grad_norm": 3.9587013721466064, "learning_rate": 2.5635933367361144e-05, "loss": 0.2618, "step": 7860 }, { "epoch": 1.0900277008310248, "grad_norm": 3.764596939086914, "learning_rate": 2.561737274372419e-05, "loss": 0.3045, "step": 7870 }, { "epoch": 1.0914127423822715, "grad_norm": 2.5493414402008057, "learning_rate": 2.5598812120087235e-05, "loss": 0.2534, "step": 7880 }, { "epoch": 1.0927977839335181, "grad_norm": 2.782120704650879, "learning_rate": 2.5580251496450285e-05, "loss": 0.283, "step": 7890 }, { "epoch": 1.0941828254847645, "grad_norm": 3.238389253616333, "learning_rate": 2.5561690872813326e-05, "loss": 0.3168, "step": 7900 }, { "epoch": 1.0955678670360112, "grad_norm": 1.759173035621643, "learning_rate": 2.5543130249176376e-05, "loss": 0.2869, "step": 7910 }, { "epoch": 1.0969529085872576, "grad_norm": 3.056239604949951, "learning_rate": 2.552456962553942e-05, "loss": 0.2956, "step": 7920 }, { "epoch": 1.0983379501385042, "grad_norm": 2.110605478286743, "learning_rate": 2.5506009001902467e-05, "loss": 0.2692, "step": 7930 }, { "epoch": 1.0997229916897506, "grad_norm": 2.7864487171173096, "learning_rate": 2.548744837826551e-05, "loss": 0.3191, "step": 7940 }, { "epoch": 1.1011080332409973, "grad_norm": 4.533125400543213, "learning_rate": 2.5468887754628558e-05, "loss": 0.3144, "step": 7950 }, { "epoch": 1.1024930747922437, "grad_norm": 4.58744478225708, "learning_rate": 2.5450327130991602e-05, "loss": 0.286, "step": 7960 }, { "epoch": 1.1038781163434903, "grad_norm": 1.8122034072875977, "learning_rate": 2.5431766507354646e-05, "loss": 0.2732, "step": 7970 }, { "epoch": 1.1052631578947367, "grad_norm": 2.5709145069122314, "learning_rate": 2.5413205883717696e-05, "loss": 0.2861, "step": 7980 }, { "epoch": 1.1066481994459834, "grad_norm": 3.8308207988739014, "learning_rate": 2.539464526008074e-05, "loss": 0.2891, "step": 7990 }, { "epoch": 1.10803324099723, "grad_norm": 3.6823465824127197, "learning_rate": 2.5376084636443787e-05, "loss": 0.2855, "step": 8000 }, { "epoch": 1.1094182825484764, "grad_norm": 1.981024146080017, "learning_rate": 2.535752401280683e-05, "loss": 0.3265, "step": 8010 }, { "epoch": 1.110803324099723, "grad_norm": 2.555572748184204, "learning_rate": 2.533896338916988e-05, "loss": 0.3057, "step": 8020 }, { "epoch": 1.1121883656509695, "grad_norm": 2.066537380218506, "learning_rate": 2.5320402765532922e-05, "loss": 0.3195, "step": 8030 }, { "epoch": 1.113573407202216, "grad_norm": 2.2335755825042725, "learning_rate": 2.5301842141895973e-05, "loss": 0.255, "step": 8040 }, { "epoch": 1.1149584487534625, "grad_norm": 2.2410192489624023, "learning_rate": 2.5283281518259017e-05, "loss": 0.2554, "step": 8050 }, { "epoch": 1.1163434903047091, "grad_norm": 3.2448580265045166, "learning_rate": 2.5264720894622064e-05, "loss": 0.2927, "step": 8060 }, { "epoch": 1.1177285318559558, "grad_norm": 4.015766143798828, "learning_rate": 2.5246160270985108e-05, "loss": 0.3037, "step": 8070 }, { "epoch": 1.1191135734072022, "grad_norm": 3.3745741844177246, "learning_rate": 2.522759964734815e-05, "loss": 0.3126, "step": 8080 }, { "epoch": 1.1204986149584488, "grad_norm": 2.4334716796875, "learning_rate": 2.52090390237112e-05, "loss": 0.3142, "step": 8090 }, { "epoch": 1.1218836565096952, "grad_norm": 2.36714243888855, "learning_rate": 2.5190478400074242e-05, "loss": 0.2496, "step": 8100 }, { "epoch": 1.1232686980609419, "grad_norm": 1.657104730606079, "learning_rate": 2.517191777643729e-05, "loss": 0.3198, "step": 8110 }, { "epoch": 1.1246537396121883, "grad_norm": 10.06810188293457, "learning_rate": 2.5153357152800333e-05, "loss": 0.3197, "step": 8120 }, { "epoch": 1.126038781163435, "grad_norm": 3.1592838764190674, "learning_rate": 2.5134796529163384e-05, "loss": 0.3127, "step": 8130 }, { "epoch": 1.1274238227146816, "grad_norm": 2.0332980155944824, "learning_rate": 2.5116235905526428e-05, "loss": 0.2899, "step": 8140 }, { "epoch": 1.128808864265928, "grad_norm": 2.814194917678833, "learning_rate": 2.5097675281889475e-05, "loss": 0.3003, "step": 8150 }, { "epoch": 1.1301939058171746, "grad_norm": 2.8768527507781982, "learning_rate": 2.507911465825252e-05, "loss": 0.318, "step": 8160 }, { "epoch": 1.131578947368421, "grad_norm": 2.1469993591308594, "learning_rate": 2.5060554034615566e-05, "loss": 0.3028, "step": 8170 }, { "epoch": 1.1329639889196677, "grad_norm": 1.9456562995910645, "learning_rate": 2.504384947334231e-05, "loss": 0.2919, "step": 8180 }, { "epoch": 1.134349030470914, "grad_norm": 2.4310145378112793, "learning_rate": 2.5025288849705353e-05, "loss": 0.2333, "step": 8190 }, { "epoch": 1.1357340720221607, "grad_norm": 3.5691587924957275, "learning_rate": 2.50067282260684e-05, "loss": 0.2492, "step": 8200 }, { "epoch": 1.1371191135734071, "grad_norm": 2.5814597606658936, "learning_rate": 2.4988167602431444e-05, "loss": 0.2944, "step": 8210 }, { "epoch": 1.1385041551246537, "grad_norm": 2.083385467529297, "learning_rate": 2.496960697879449e-05, "loss": 0.2617, "step": 8220 }, { "epoch": 1.1398891966759002, "grad_norm": 3.2597713470458984, "learning_rate": 2.4951046355157535e-05, "loss": 0.2932, "step": 8230 }, { "epoch": 1.1412742382271468, "grad_norm": 2.046088695526123, "learning_rate": 2.493248573152058e-05, "loss": 0.2742, "step": 8240 }, { "epoch": 1.1426592797783934, "grad_norm": 2.5761780738830566, "learning_rate": 2.491392510788363e-05, "loss": 0.3342, "step": 8250 }, { "epoch": 1.1440443213296398, "grad_norm": 2.987704277038574, "learning_rate": 2.4895364484246673e-05, "loss": 0.2775, "step": 8260 }, { "epoch": 1.1454293628808865, "grad_norm": 1.9415042400360107, "learning_rate": 2.487680386060972e-05, "loss": 0.2547, "step": 8270 }, { "epoch": 1.146814404432133, "grad_norm": 2.4116508960723877, "learning_rate": 2.4858243236972764e-05, "loss": 0.2978, "step": 8280 }, { "epoch": 1.1481994459833795, "grad_norm": 4.54902458190918, "learning_rate": 2.483968261333581e-05, "loss": 0.3454, "step": 8290 }, { "epoch": 1.149584487534626, "grad_norm": 1.8645448684692383, "learning_rate": 2.4821121989698855e-05, "loss": 0.2742, "step": 8300 }, { "epoch": 1.1509695290858726, "grad_norm": 3.6969997882843018, "learning_rate": 2.4802561366061902e-05, "loss": 0.3105, "step": 8310 }, { "epoch": 1.1523545706371192, "grad_norm": 4.505978107452393, "learning_rate": 2.4784000742424946e-05, "loss": 0.2696, "step": 8320 }, { "epoch": 1.1537396121883656, "grad_norm": 3.0299696922302246, "learning_rate": 2.476544011878799e-05, "loss": 0.2728, "step": 8330 }, { "epoch": 1.1551246537396123, "grad_norm": 22.56894874572754, "learning_rate": 2.474687949515104e-05, "loss": 0.3225, "step": 8340 }, { "epoch": 1.1565096952908587, "grad_norm": 2.4208364486694336, "learning_rate": 2.4728318871514084e-05, "loss": 0.3178, "step": 8350 }, { "epoch": 1.1578947368421053, "grad_norm": 2.0815048217773438, "learning_rate": 2.470975824787713e-05, "loss": 0.3118, "step": 8360 }, { "epoch": 1.1592797783933517, "grad_norm": 2.7576022148132324, "learning_rate": 2.4691197624240175e-05, "loss": 0.3092, "step": 8370 }, { "epoch": 1.1606648199445984, "grad_norm": 4.915147304534912, "learning_rate": 2.4672637000603222e-05, "loss": 0.3538, "step": 8380 }, { "epoch": 1.162049861495845, "grad_norm": 3.7766640186309814, "learning_rate": 2.4654076376966266e-05, "loss": 0.2736, "step": 8390 }, { "epoch": 1.1634349030470914, "grad_norm": 2.9332292079925537, "learning_rate": 2.4635515753329317e-05, "loss": 0.2785, "step": 8400 }, { "epoch": 1.164819944598338, "grad_norm": 2.0695388317108154, "learning_rate": 2.461695512969236e-05, "loss": 0.2671, "step": 8410 }, { "epoch": 1.1662049861495845, "grad_norm": 2.214833974838257, "learning_rate": 2.4598394506055408e-05, "loss": 0.2717, "step": 8420 }, { "epoch": 1.167590027700831, "grad_norm": 2.94752836227417, "learning_rate": 2.457983388241845e-05, "loss": 0.277, "step": 8430 }, { "epoch": 1.1689750692520775, "grad_norm": 2.258779764175415, "learning_rate": 2.4561273258781495e-05, "loss": 0.2858, "step": 8440 }, { "epoch": 1.1703601108033241, "grad_norm": 3.7083449363708496, "learning_rate": 2.4542712635144543e-05, "loss": 0.2831, "step": 8450 }, { "epoch": 1.1717451523545706, "grad_norm": 2.4679653644561768, "learning_rate": 2.4524152011507586e-05, "loss": 0.292, "step": 8460 }, { "epoch": 1.1731301939058172, "grad_norm": 3.176544189453125, "learning_rate": 2.4505591387870634e-05, "loss": 0.3516, "step": 8470 }, { "epoch": 1.1745152354570636, "grad_norm": 2.446927070617676, "learning_rate": 2.4487030764233677e-05, "loss": 0.2781, "step": 8480 }, { "epoch": 1.1759002770083102, "grad_norm": 1.7947635650634766, "learning_rate": 2.4468470140596728e-05, "loss": 0.2913, "step": 8490 }, { "epoch": 1.1772853185595569, "grad_norm": 2.4101195335388184, "learning_rate": 2.4449909516959772e-05, "loss": 0.3274, "step": 8500 }, { "epoch": 1.1786703601108033, "grad_norm": 2.146688222885132, "learning_rate": 2.443134889332282e-05, "loss": 0.2721, "step": 8510 }, { "epoch": 1.18005540166205, "grad_norm": 2.8058419227600098, "learning_rate": 2.4412788269685863e-05, "loss": 0.2937, "step": 8520 }, { "epoch": 1.1814404432132963, "grad_norm": 7.315638065338135, "learning_rate": 2.439422764604891e-05, "loss": 0.2474, "step": 8530 }, { "epoch": 1.182825484764543, "grad_norm": 3.7715487480163574, "learning_rate": 2.4375667022411954e-05, "loss": 0.308, "step": 8540 }, { "epoch": 1.1842105263157894, "grad_norm": 2.942664861679077, "learning_rate": 2.4357106398774998e-05, "loss": 0.3335, "step": 8550 }, { "epoch": 1.185595567867036, "grad_norm": 2.307800769805908, "learning_rate": 2.4338545775138048e-05, "loss": 0.2615, "step": 8560 }, { "epoch": 1.1869806094182827, "grad_norm": 2.0695810317993164, "learning_rate": 2.4319985151501092e-05, "loss": 0.2943, "step": 8570 }, { "epoch": 1.188365650969529, "grad_norm": 1.8298561573028564, "learning_rate": 2.430142452786414e-05, "loss": 0.3084, "step": 8580 }, { "epoch": 1.1897506925207757, "grad_norm": 3.479372024536133, "learning_rate": 2.4282863904227183e-05, "loss": 0.3498, "step": 8590 }, { "epoch": 1.1911357340720221, "grad_norm": 3.58359694480896, "learning_rate": 2.426430328059023e-05, "loss": 0.3179, "step": 8600 }, { "epoch": 1.1925207756232687, "grad_norm": 2.570115327835083, "learning_rate": 2.4245742656953274e-05, "loss": 0.2642, "step": 8610 }, { "epoch": 1.1939058171745152, "grad_norm": 2.4691929817199707, "learning_rate": 2.4227182033316325e-05, "loss": 0.2939, "step": 8620 }, { "epoch": 1.1952908587257618, "grad_norm": 2.3441882133483887, "learning_rate": 2.420862140967937e-05, "loss": 0.2578, "step": 8630 }, { "epoch": 1.1966759002770084, "grad_norm": 2.204817295074463, "learning_rate": 2.4190060786042412e-05, "loss": 0.2726, "step": 8640 }, { "epoch": 1.1980609418282548, "grad_norm": 3.0930302143096924, "learning_rate": 2.417150016240546e-05, "loss": 0.2888, "step": 8650 }, { "epoch": 1.1994459833795015, "grad_norm": 5.16555643081665, "learning_rate": 2.4152939538768503e-05, "loss": 0.3081, "step": 8660 }, { "epoch": 1.200831024930748, "grad_norm": 2.1353518962860107, "learning_rate": 2.413437891513155e-05, "loss": 0.288, "step": 8670 }, { "epoch": 1.2022160664819945, "grad_norm": 3.657902956008911, "learning_rate": 2.4115818291494594e-05, "loss": 0.366, "step": 8680 }, { "epoch": 1.203601108033241, "grad_norm": 2.1335654258728027, "learning_rate": 2.409725766785764e-05, "loss": 0.3347, "step": 8690 }, { "epoch": 1.2049861495844876, "grad_norm": 1.3963650465011597, "learning_rate": 2.4078697044220685e-05, "loss": 0.2674, "step": 8700 }, { "epoch": 1.206371191135734, "grad_norm": 2.855625629425049, "learning_rate": 2.4060136420583736e-05, "loss": 0.2461, "step": 8710 }, { "epoch": 1.2077562326869806, "grad_norm": 6.864447593688965, "learning_rate": 2.404157579694678e-05, "loss": 0.2974, "step": 8720 }, { "epoch": 1.209141274238227, "grad_norm": 1.8673814535140991, "learning_rate": 2.4023015173309827e-05, "loss": 0.3228, "step": 8730 }, { "epoch": 1.2105263157894737, "grad_norm": 2.104527235031128, "learning_rate": 2.400445454967287e-05, "loss": 0.2705, "step": 8740 }, { "epoch": 1.2119113573407203, "grad_norm": 3.6864442825317383, "learning_rate": 2.3985893926035915e-05, "loss": 0.2893, "step": 8750 }, { "epoch": 1.2132963988919667, "grad_norm": 2.519566774368286, "learning_rate": 2.3967333302398962e-05, "loss": 0.2942, "step": 8760 }, { "epoch": 1.2146814404432134, "grad_norm": 2.556389093399048, "learning_rate": 2.3948772678762006e-05, "loss": 0.2899, "step": 8770 }, { "epoch": 1.2160664819944598, "grad_norm": 3.0239813327789307, "learning_rate": 2.3930212055125056e-05, "loss": 0.2676, "step": 8780 }, { "epoch": 1.2174515235457064, "grad_norm": 1.96883225440979, "learning_rate": 2.39116514314881e-05, "loss": 0.272, "step": 8790 }, { "epoch": 1.2188365650969528, "grad_norm": 2.565373659133911, "learning_rate": 2.3893090807851147e-05, "loss": 0.2608, "step": 8800 }, { "epoch": 1.2202216066481995, "grad_norm": 2.15969181060791, "learning_rate": 2.387453018421419e-05, "loss": 0.3213, "step": 8810 }, { "epoch": 1.221606648199446, "grad_norm": 2.249484062194824, "learning_rate": 2.3855969560577238e-05, "loss": 0.2686, "step": 8820 }, { "epoch": 1.2229916897506925, "grad_norm": 2.9420814514160156, "learning_rate": 2.3837408936940282e-05, "loss": 0.2769, "step": 8830 }, { "epoch": 1.2243767313019391, "grad_norm": 3.2192189693450928, "learning_rate": 2.3818848313303333e-05, "loss": 0.3125, "step": 8840 }, { "epoch": 1.2257617728531855, "grad_norm": 2.442768096923828, "learning_rate": 2.3800287689666376e-05, "loss": 0.283, "step": 8850 }, { "epoch": 1.2271468144044322, "grad_norm": 1.6071429252624512, "learning_rate": 2.378172706602942e-05, "loss": 0.3011, "step": 8860 }, { "epoch": 1.2285318559556786, "grad_norm": 3.044898748397827, "learning_rate": 2.3763166442392467e-05, "loss": 0.2661, "step": 8870 }, { "epoch": 1.2299168975069252, "grad_norm": 2.5329463481903076, "learning_rate": 2.374460581875551e-05, "loss": 0.276, "step": 8880 }, { "epoch": 1.2313019390581719, "grad_norm": 2.2597010135650635, "learning_rate": 2.372604519511856e-05, "loss": 0.2875, "step": 8890 }, { "epoch": 1.2326869806094183, "grad_norm": 2.011082649230957, "learning_rate": 2.3707484571481602e-05, "loss": 0.3035, "step": 8900 }, { "epoch": 1.234072022160665, "grad_norm": 2.701603412628174, "learning_rate": 2.368892394784465e-05, "loss": 0.2657, "step": 8910 }, { "epoch": 1.2354570637119113, "grad_norm": 2.4791486263275146, "learning_rate": 2.3670363324207693e-05, "loss": 0.3118, "step": 8920 }, { "epoch": 1.236842105263158, "grad_norm": 3.129568338394165, "learning_rate": 2.3651802700570744e-05, "loss": 0.3063, "step": 8930 }, { "epoch": 1.2382271468144044, "grad_norm": 1.9781174659729004, "learning_rate": 2.3633242076933788e-05, "loss": 0.2864, "step": 8940 }, { "epoch": 1.239612188365651, "grad_norm": 1.7614623308181763, "learning_rate": 2.3614681453296835e-05, "loss": 0.288, "step": 8950 }, { "epoch": 1.2409972299168974, "grad_norm": 1.9567925930023193, "learning_rate": 2.359612082965988e-05, "loss": 0.2872, "step": 8960 }, { "epoch": 1.242382271468144, "grad_norm": 2.3994760513305664, "learning_rate": 2.3577560206022922e-05, "loss": 0.3582, "step": 8970 }, { "epoch": 1.2437673130193905, "grad_norm": 1.8160407543182373, "learning_rate": 2.355899958238597e-05, "loss": 0.2995, "step": 8980 }, { "epoch": 1.245152354570637, "grad_norm": 2.4072775840759277, "learning_rate": 2.3540438958749013e-05, "loss": 0.273, "step": 8990 }, { "epoch": 1.2465373961218837, "grad_norm": 2.576134204864502, "learning_rate": 2.3521878335112064e-05, "loss": 0.2805, "step": 9000 }, { "epoch": 1.2469529085872577, "eval_loss": 0.32686564326286316, "eval_runtime": 1418.8287, "eval_samples_per_second": 6.428, "eval_steps_per_second": 0.803, "step": 9003 }, { "epoch": 1.2479224376731302, "grad_norm": 1.4296165704727173, "learning_rate": 2.3503317711475108e-05, "loss": 0.2445, "step": 9010 }, { "epoch": 1.2493074792243768, "grad_norm": 2.6902916431427, "learning_rate": 2.3484757087838155e-05, "loss": 0.288, "step": 9020 }, { "epoch": 1.2506925207756232, "grad_norm": 1.775841474533081, "learning_rate": 2.34661964642012e-05, "loss": 0.2803, "step": 9030 }, { "epoch": 1.2520775623268698, "grad_norm": 3.9191651344299316, "learning_rate": 2.3447635840564246e-05, "loss": 0.3268, "step": 9040 }, { "epoch": 1.2534626038781163, "grad_norm": 2.516214370727539, "learning_rate": 2.342907521692729e-05, "loss": 0.27, "step": 9050 }, { "epoch": 1.254847645429363, "grad_norm": 2.2284133434295654, "learning_rate": 2.3410514593290334e-05, "loss": 0.2835, "step": 9060 }, { "epoch": 1.2562326869806095, "grad_norm": 1.9421190023422241, "learning_rate": 2.3391953969653384e-05, "loss": 0.2742, "step": 9070 }, { "epoch": 1.257617728531856, "grad_norm": 3.4131953716278076, "learning_rate": 2.3373393346016425e-05, "loss": 0.3056, "step": 9080 }, { "epoch": 1.2590027700831026, "grad_norm": 2.111435890197754, "learning_rate": 2.3354832722379475e-05, "loss": 0.2678, "step": 9090 }, { "epoch": 1.260387811634349, "grad_norm": 2.7209439277648926, "learning_rate": 2.333627209874252e-05, "loss": 0.2866, "step": 9100 }, { "epoch": 1.2617728531855956, "grad_norm": 2.8803234100341797, "learning_rate": 2.3317711475105566e-05, "loss": 0.2763, "step": 9110 }, { "epoch": 1.263157894736842, "grad_norm": 1.6891577243804932, "learning_rate": 2.329915085146861e-05, "loss": 0.2875, "step": 9120 }, { "epoch": 1.2645429362880887, "grad_norm": 3.695586919784546, "learning_rate": 2.3280590227831657e-05, "loss": 0.2863, "step": 9130 }, { "epoch": 1.2659279778393353, "grad_norm": 2.572779417037964, "learning_rate": 2.32620296041947e-05, "loss": 0.2841, "step": 9140 }, { "epoch": 1.2673130193905817, "grad_norm": 2.819389581680298, "learning_rate": 2.3243468980557752e-05, "loss": 0.2626, "step": 9150 }, { "epoch": 1.2686980609418281, "grad_norm": 1.8481063842773438, "learning_rate": 2.3224908356920796e-05, "loss": 0.2853, "step": 9160 }, { "epoch": 1.2700831024930748, "grad_norm": 3.2458460330963135, "learning_rate": 2.320634773328384e-05, "loss": 0.286, "step": 9170 }, { "epoch": 1.2714681440443214, "grad_norm": 7.590353488922119, "learning_rate": 2.3187787109646887e-05, "loss": 0.2914, "step": 9180 }, { "epoch": 1.2728531855955678, "grad_norm": 2.576279401779175, "learning_rate": 2.316922648600993e-05, "loss": 0.2798, "step": 9190 }, { "epoch": 1.2742382271468145, "grad_norm": 2.004206657409668, "learning_rate": 2.3150665862372978e-05, "loss": 0.2607, "step": 9200 }, { "epoch": 1.2756232686980609, "grad_norm": 1.7435073852539062, "learning_rate": 2.313210523873602e-05, "loss": 0.297, "step": 9210 }, { "epoch": 1.2770083102493075, "grad_norm": 2.3837873935699463, "learning_rate": 2.3113544615099072e-05, "loss": 0.2865, "step": 9220 }, { "epoch": 1.278393351800554, "grad_norm": 2.763490676879883, "learning_rate": 2.3094983991462116e-05, "loss": 0.2951, "step": 9230 }, { "epoch": 1.2797783933518005, "grad_norm": 2.5408225059509277, "learning_rate": 2.3076423367825163e-05, "loss": 0.2737, "step": 9240 }, { "epoch": 1.2811634349030472, "grad_norm": 2.692471742630005, "learning_rate": 2.3057862744188207e-05, "loss": 0.2757, "step": 9250 }, { "epoch": 1.2825484764542936, "grad_norm": 2.177374839782715, "learning_rate": 2.3039302120551254e-05, "loss": 0.2898, "step": 9260 }, { "epoch": 1.2839335180055402, "grad_norm": 2.7410569190979004, "learning_rate": 2.3020741496914298e-05, "loss": 0.302, "step": 9270 }, { "epoch": 1.2853185595567866, "grad_norm": 2.6148903369903564, "learning_rate": 2.300218087327734e-05, "loss": 0.3334, "step": 9280 }, { "epoch": 1.2867036011080333, "grad_norm": 1.9452197551727295, "learning_rate": 2.298362024964039e-05, "loss": 0.2833, "step": 9290 }, { "epoch": 1.2880886426592797, "grad_norm": 2.417820453643799, "learning_rate": 2.2965059626003433e-05, "loss": 0.2605, "step": 9300 }, { "epoch": 1.2894736842105263, "grad_norm": 3.1312954425811768, "learning_rate": 2.2946499002366483e-05, "loss": 0.2887, "step": 9310 }, { "epoch": 1.290858725761773, "grad_norm": 3.3138206005096436, "learning_rate": 2.2927938378729527e-05, "loss": 0.2811, "step": 9320 }, { "epoch": 1.2922437673130194, "grad_norm": 2.579641342163086, "learning_rate": 2.2909377755092574e-05, "loss": 0.2698, "step": 9330 }, { "epoch": 1.293628808864266, "grad_norm": 3.3723838329315186, "learning_rate": 2.2890817131455618e-05, "loss": 0.2913, "step": 9340 }, { "epoch": 1.2950138504155124, "grad_norm": 3.1993815898895264, "learning_rate": 2.2872256507818665e-05, "loss": 0.2779, "step": 9350 }, { "epoch": 1.296398891966759, "grad_norm": 3.524803876876831, "learning_rate": 2.285369588418171e-05, "loss": 0.3189, "step": 9360 }, { "epoch": 1.2977839335180055, "grad_norm": 2.997694492340088, "learning_rate": 2.2835135260544753e-05, "loss": 0.2694, "step": 9370 }, { "epoch": 1.299168975069252, "grad_norm": 2.596389055252075, "learning_rate": 2.2816574636907803e-05, "loss": 0.3072, "step": 9380 }, { "epoch": 1.3005540166204987, "grad_norm": 1.9428213834762573, "learning_rate": 2.2798014013270847e-05, "loss": 0.278, "step": 9390 }, { "epoch": 1.3019390581717452, "grad_norm": 1.9896929264068604, "learning_rate": 2.2779453389633894e-05, "loss": 0.2886, "step": 9400 }, { "epoch": 1.3033240997229916, "grad_norm": 1.7544904947280884, "learning_rate": 2.2760892765996938e-05, "loss": 0.286, "step": 9410 }, { "epoch": 1.3047091412742382, "grad_norm": 3.5407917499542236, "learning_rate": 2.2742332142359985e-05, "loss": 0.2905, "step": 9420 }, { "epoch": 1.3060941828254848, "grad_norm": 2.418882131576538, "learning_rate": 2.272377151872303e-05, "loss": 0.2943, "step": 9430 }, { "epoch": 1.3074792243767313, "grad_norm": 2.2245256900787354, "learning_rate": 2.270521089508608e-05, "loss": 0.2887, "step": 9440 }, { "epoch": 1.3088642659279779, "grad_norm": 2.4972410202026367, "learning_rate": 2.2686650271449124e-05, "loss": 0.294, "step": 9450 }, { "epoch": 1.3102493074792243, "grad_norm": 2.2846741676330566, "learning_rate": 2.266808964781217e-05, "loss": 0.265, "step": 9460 }, { "epoch": 1.311634349030471, "grad_norm": 1.917407751083374, "learning_rate": 2.2649529024175215e-05, "loss": 0.2872, "step": 9470 }, { "epoch": 1.3130193905817173, "grad_norm": 2.495126247406006, "learning_rate": 2.263096840053826e-05, "loss": 0.2577, "step": 9480 }, { "epoch": 1.314404432132964, "grad_norm": 2.143667697906494, "learning_rate": 2.2612407776901306e-05, "loss": 0.2766, "step": 9490 }, { "epoch": 1.3157894736842106, "grad_norm": 3.016167402267456, "learning_rate": 2.259384715326435e-05, "loss": 0.2724, "step": 9500 }, { "epoch": 1.317174515235457, "grad_norm": 2.1139116287231445, "learning_rate": 2.2575286529627397e-05, "loss": 0.2746, "step": 9510 }, { "epoch": 1.3185595567867037, "grad_norm": 1.5689263343811035, "learning_rate": 2.255672590599044e-05, "loss": 0.2304, "step": 9520 }, { "epoch": 1.31994459833795, "grad_norm": 4.322932243347168, "learning_rate": 2.253816528235349e-05, "loss": 0.2583, "step": 9530 }, { "epoch": 1.3213296398891967, "grad_norm": 2.0311527252197266, "learning_rate": 2.2519604658716535e-05, "loss": 0.2542, "step": 9540 }, { "epoch": 1.3227146814404431, "grad_norm": 2.6098263263702393, "learning_rate": 2.2501044035079582e-05, "loss": 0.2703, "step": 9550 }, { "epoch": 1.3240997229916898, "grad_norm": 1.897586464881897, "learning_rate": 2.2482483411442626e-05, "loss": 0.282, "step": 9560 }, { "epoch": 1.3254847645429364, "grad_norm": 2.359265089035034, "learning_rate": 2.2463922787805673e-05, "loss": 0.2873, "step": 9570 }, { "epoch": 1.3268698060941828, "grad_norm": 2.188415288925171, "learning_rate": 2.2445362164168717e-05, "loss": 0.2575, "step": 9580 }, { "epoch": 1.3282548476454292, "grad_norm": 3.153799533843994, "learning_rate": 2.242680154053176e-05, "loss": 0.2411, "step": 9590 }, { "epoch": 1.3296398891966759, "grad_norm": 1.8487639427185059, "learning_rate": 2.240824091689481e-05, "loss": 0.2386, "step": 9600 }, { "epoch": 1.3310249307479225, "grad_norm": 2.406550407409668, "learning_rate": 2.2389680293257855e-05, "loss": 0.2674, "step": 9610 }, { "epoch": 1.332409972299169, "grad_norm": 2.741635322570801, "learning_rate": 2.2371119669620902e-05, "loss": 0.2715, "step": 9620 }, { "epoch": 1.3337950138504155, "grad_norm": 2.4122025966644287, "learning_rate": 2.2352559045983946e-05, "loss": 0.3105, "step": 9630 }, { "epoch": 1.3351800554016622, "grad_norm": 2.6568686962127686, "learning_rate": 2.2333998422346993e-05, "loss": 0.3032, "step": 9640 }, { "epoch": 1.3365650969529086, "grad_norm": 2.265321731567383, "learning_rate": 2.2315437798710037e-05, "loss": 0.2754, "step": 9650 }, { "epoch": 1.337950138504155, "grad_norm": 3.134011745452881, "learning_rate": 2.2296877175073088e-05, "loss": 0.2874, "step": 9660 }, { "epoch": 1.3393351800554016, "grad_norm": 2.3763790130615234, "learning_rate": 2.227831655143613e-05, "loss": 0.2227, "step": 9670 }, { "epoch": 1.3407202216066483, "grad_norm": 1.92081880569458, "learning_rate": 2.2259755927799175e-05, "loss": 0.2886, "step": 9680 }, { "epoch": 1.3421052631578947, "grad_norm": 3.245089054107666, "learning_rate": 2.2241195304162223e-05, "loss": 0.2795, "step": 9690 }, { "epoch": 1.3434903047091413, "grad_norm": 2.542462110519409, "learning_rate": 2.2222634680525266e-05, "loss": 0.2814, "step": 9700 }, { "epoch": 1.3448753462603877, "grad_norm": 2.3733303546905518, "learning_rate": 2.2204074056888314e-05, "loss": 0.2629, "step": 9710 }, { "epoch": 1.3462603878116344, "grad_norm": 3.7426581382751465, "learning_rate": 2.2185513433251357e-05, "loss": 0.334, "step": 9720 }, { "epoch": 1.3476454293628808, "grad_norm": 3.181501626968384, "learning_rate": 2.2166952809614405e-05, "loss": 0.2797, "step": 9730 }, { "epoch": 1.3490304709141274, "grad_norm": 2.654956817626953, "learning_rate": 2.214839218597745e-05, "loss": 0.3192, "step": 9740 }, { "epoch": 1.350415512465374, "grad_norm": 1.8692598342895508, "learning_rate": 2.21298315623405e-05, "loss": 0.2282, "step": 9750 }, { "epoch": 1.3518005540166205, "grad_norm": 2.027109384536743, "learning_rate": 2.2111270938703543e-05, "loss": 0.3065, "step": 9760 }, { "epoch": 1.353185595567867, "grad_norm": 1.7037060260772705, "learning_rate": 2.209271031506659e-05, "loss": 0.2624, "step": 9770 }, { "epoch": 1.3545706371191135, "grad_norm": 2.3476176261901855, "learning_rate": 2.2074149691429634e-05, "loss": 0.2572, "step": 9780 }, { "epoch": 1.3559556786703602, "grad_norm": 3.0191843509674072, "learning_rate": 2.2055589067792678e-05, "loss": 0.275, "step": 9790 }, { "epoch": 1.3573407202216066, "grad_norm": 2.4362123012542725, "learning_rate": 2.2037028444155725e-05, "loss": 0.2981, "step": 9800 }, { "epoch": 1.3587257617728532, "grad_norm": 1.9792042970657349, "learning_rate": 2.201846782051877e-05, "loss": 0.2992, "step": 9810 }, { "epoch": 1.3601108033240998, "grad_norm": 3.2633731365203857, "learning_rate": 2.199990719688182e-05, "loss": 0.329, "step": 9820 }, { "epoch": 1.3614958448753463, "grad_norm": 2.1251308917999268, "learning_rate": 2.1981346573244863e-05, "loss": 0.2727, "step": 9830 }, { "epoch": 1.3628808864265927, "grad_norm": 1.8353410959243774, "learning_rate": 2.196278594960791e-05, "loss": 0.2765, "step": 9840 }, { "epoch": 1.3642659279778393, "grad_norm": 3.0422475337982178, "learning_rate": 2.1944225325970954e-05, "loss": 0.2846, "step": 9850 }, { "epoch": 1.365650969529086, "grad_norm": 2.062798023223877, "learning_rate": 2.1925664702334e-05, "loss": 0.2542, "step": 9860 }, { "epoch": 1.3670360110803323, "grad_norm": 1.8410342931747437, "learning_rate": 2.1907104078697045e-05, "loss": 0.2511, "step": 9870 }, { "epoch": 1.368421052631579, "grad_norm": 2.4124655723571777, "learning_rate": 2.1888543455060092e-05, "loss": 0.3129, "step": 9880 }, { "epoch": 1.3698060941828256, "grad_norm": 2.5593621730804443, "learning_rate": 2.1869982831423136e-05, "loss": 0.2813, "step": 9890 }, { "epoch": 1.371191135734072, "grad_norm": 1.6741851568222046, "learning_rate": 2.185142220778618e-05, "loss": 0.3063, "step": 9900 }, { "epoch": 1.3725761772853184, "grad_norm": 4.773355007171631, "learning_rate": 2.183286158414923e-05, "loss": 0.2506, "step": 9910 }, { "epoch": 1.373961218836565, "grad_norm": 2.980382204055786, "learning_rate": 2.1814300960512274e-05, "loss": 0.3116, "step": 9920 }, { "epoch": 1.3753462603878117, "grad_norm": 2.324427366256714, "learning_rate": 2.179574033687532e-05, "loss": 0.2479, "step": 9930 }, { "epoch": 1.3767313019390581, "grad_norm": 2.517643451690674, "learning_rate": 2.1777179713238365e-05, "loss": 0.2692, "step": 9940 }, { "epoch": 1.3781163434903048, "grad_norm": 2.677506923675537, "learning_rate": 2.1758619089601412e-05, "loss": 0.2807, "step": 9950 }, { "epoch": 1.3795013850415512, "grad_norm": 2.6422669887542725, "learning_rate": 2.1740058465964456e-05, "loss": 0.2816, "step": 9960 }, { "epoch": 1.3808864265927978, "grad_norm": 2.5744709968566895, "learning_rate": 2.1721497842327507e-05, "loss": 0.2577, "step": 9970 }, { "epoch": 1.3822714681440442, "grad_norm": 1.9916622638702393, "learning_rate": 2.170293721869055e-05, "loss": 0.2952, "step": 9980 }, { "epoch": 1.3836565096952909, "grad_norm": 3.8083372116088867, "learning_rate": 2.1684376595053595e-05, "loss": 0.3098, "step": 9990 }, { "epoch": 1.3850415512465375, "grad_norm": 2.3398289680480957, "learning_rate": 2.1665815971416642e-05, "loss": 0.2562, "step": 10000 }, { "epoch": 1.386426592797784, "grad_norm": 2.8088276386260986, "learning_rate": 2.1647255347779686e-05, "loss": 0.29, "step": 10010 }, { "epoch": 1.3878116343490305, "grad_norm": 2.151501178741455, "learning_rate": 2.1628694724142733e-05, "loss": 0.3037, "step": 10020 }, { "epoch": 1.389196675900277, "grad_norm": 1.9896818399429321, "learning_rate": 2.1610134100505777e-05, "loss": 0.296, "step": 10030 }, { "epoch": 1.3905817174515236, "grad_norm": 2.18941330909729, "learning_rate": 2.1591573476868827e-05, "loss": 0.275, "step": 10040 }, { "epoch": 1.39196675900277, "grad_norm": 2.6739301681518555, "learning_rate": 2.157301285323187e-05, "loss": 0.2596, "step": 10050 }, { "epoch": 1.3933518005540166, "grad_norm": 2.3680918216705322, "learning_rate": 2.1554452229594918e-05, "loss": 0.3076, "step": 10060 }, { "epoch": 1.3947368421052633, "grad_norm": 2.2901291847229004, "learning_rate": 2.1535891605957962e-05, "loss": 0.2879, "step": 10070 }, { "epoch": 1.3961218836565097, "grad_norm": 2.8573126792907715, "learning_rate": 2.151733098232101e-05, "loss": 0.2727, "step": 10080 }, { "epoch": 1.397506925207756, "grad_norm": 2.1237952709198, "learning_rate": 2.1498770358684053e-05, "loss": 0.2954, "step": 10090 }, { "epoch": 1.3988919667590027, "grad_norm": 2.6920433044433594, "learning_rate": 2.1480209735047097e-05, "loss": 0.2848, "step": 10100 }, { "epoch": 1.4002770083102494, "grad_norm": 2.2819883823394775, "learning_rate": 2.1461649111410144e-05, "loss": 0.2435, "step": 10110 }, { "epoch": 1.4016620498614958, "grad_norm": 1.8151028156280518, "learning_rate": 2.1443088487773188e-05, "loss": 0.255, "step": 10120 }, { "epoch": 1.4030470914127424, "grad_norm": 3.254143714904785, "learning_rate": 2.142452786413624e-05, "loss": 0.2825, "step": 10130 }, { "epoch": 1.404432132963989, "grad_norm": 2.8257339000701904, "learning_rate": 2.1405967240499282e-05, "loss": 0.2738, "step": 10140 }, { "epoch": 1.4058171745152355, "grad_norm": 2.7441883087158203, "learning_rate": 2.138740661686233e-05, "loss": 0.2641, "step": 10150 }, { "epoch": 1.4072022160664819, "grad_norm": 2.6015915870666504, "learning_rate": 2.1368845993225373e-05, "loss": 0.2996, "step": 10160 }, { "epoch": 1.4085872576177285, "grad_norm": 2.5906431674957275, "learning_rate": 2.1352141431952113e-05, "loss": 0.2535, "step": 10170 }, { "epoch": 1.4099722991689752, "grad_norm": 2.5820693969726562, "learning_rate": 2.1333580808315163e-05, "loss": 0.2971, "step": 10180 }, { "epoch": 1.4113573407202216, "grad_norm": 1.439150094985962, "learning_rate": 2.1315020184678207e-05, "loss": 0.3055, "step": 10190 }, { "epoch": 1.4127423822714682, "grad_norm": 2.544900894165039, "learning_rate": 2.1296459561041254e-05, "loss": 0.2866, "step": 10200 }, { "epoch": 1.4141274238227146, "grad_norm": 3.0438737869262695, "learning_rate": 2.1277898937404298e-05, "loss": 0.2676, "step": 10210 }, { "epoch": 1.4155124653739612, "grad_norm": 1.7316502332687378, "learning_rate": 2.1259338313767345e-05, "loss": 0.261, "step": 10220 }, { "epoch": 1.4168975069252077, "grad_norm": 2.2577292919158936, "learning_rate": 2.124077769013039e-05, "loss": 0.2667, "step": 10230 }, { "epoch": 1.4182825484764543, "grad_norm": 1.731268048286438, "learning_rate": 2.122221706649344e-05, "loss": 0.3069, "step": 10240 }, { "epoch": 1.419667590027701, "grad_norm": 3.460369825363159, "learning_rate": 2.120365644285648e-05, "loss": 0.2672, "step": 10250 }, { "epoch": 1.4210526315789473, "grad_norm": 1.6917779445648193, "learning_rate": 2.1185095819219524e-05, "loss": 0.2759, "step": 10260 }, { "epoch": 1.422437673130194, "grad_norm": 2.418710708618164, "learning_rate": 2.1166535195582574e-05, "loss": 0.3133, "step": 10270 }, { "epoch": 1.4238227146814404, "grad_norm": 2.533979892730713, "learning_rate": 2.1147974571945618e-05, "loss": 0.2795, "step": 10280 }, { "epoch": 1.425207756232687, "grad_norm": 3.7644662857055664, "learning_rate": 2.1129413948308665e-05, "loss": 0.3077, "step": 10290 }, { "epoch": 1.4265927977839334, "grad_norm": 2.7289891242980957, "learning_rate": 2.111085332467171e-05, "loss": 0.2475, "step": 10300 }, { "epoch": 1.42797783933518, "grad_norm": 2.3104724884033203, "learning_rate": 2.1092292701034756e-05, "loss": 0.2295, "step": 10310 }, { "epoch": 1.4293628808864267, "grad_norm": 1.9208799600601196, "learning_rate": 2.10737320773978e-05, "loss": 0.2759, "step": 10320 }, { "epoch": 1.4307479224376731, "grad_norm": 2.4903857707977295, "learning_rate": 2.105517145376085e-05, "loss": 0.2779, "step": 10330 }, { "epoch": 1.4321329639889195, "grad_norm": 2.3281075954437256, "learning_rate": 2.1036610830123895e-05, "loss": 0.2968, "step": 10340 }, { "epoch": 1.4335180055401662, "grad_norm": 2.194763660430908, "learning_rate": 2.1018050206486942e-05, "loss": 0.2857, "step": 10350 }, { "epoch": 1.4349030470914128, "grad_norm": 3.951835870742798, "learning_rate": 2.0999489582849986e-05, "loss": 0.2627, "step": 10360 }, { "epoch": 1.4362880886426592, "grad_norm": 4.016772747039795, "learning_rate": 2.098092895921303e-05, "loss": 0.2799, "step": 10370 }, { "epoch": 1.4376731301939059, "grad_norm": 2.1924023628234863, "learning_rate": 2.0962368335576077e-05, "loss": 0.311, "step": 10380 }, { "epoch": 1.4390581717451525, "grad_norm": 2.053593635559082, "learning_rate": 2.094380771193912e-05, "loss": 0.2654, "step": 10390 }, { "epoch": 1.440443213296399, "grad_norm": 1.9551750421524048, "learning_rate": 2.092524708830217e-05, "loss": 0.2732, "step": 10400 }, { "epoch": 1.4418282548476453, "grad_norm": 2.2008869647979736, "learning_rate": 2.0906686464665215e-05, "loss": 0.2788, "step": 10410 }, { "epoch": 1.443213296398892, "grad_norm": 2.2866125106811523, "learning_rate": 2.0888125841028262e-05, "loss": 0.2468, "step": 10420 }, { "epoch": 1.4445983379501386, "grad_norm": 2.323117733001709, "learning_rate": 2.0869565217391306e-05, "loss": 0.2816, "step": 10430 }, { "epoch": 1.445983379501385, "grad_norm": 2.459951639175415, "learning_rate": 2.0851004593754353e-05, "loss": 0.2886, "step": 10440 }, { "epoch": 1.4473684210526316, "grad_norm": 2.0339560508728027, "learning_rate": 2.0832443970117397e-05, "loss": 0.259, "step": 10450 }, { "epoch": 1.448753462603878, "grad_norm": 2.2099788188934326, "learning_rate": 2.081388334648044e-05, "loss": 0.2358, "step": 10460 }, { "epoch": 1.4501385041551247, "grad_norm": 2.0454177856445312, "learning_rate": 2.0795322722843488e-05, "loss": 0.2324, "step": 10470 }, { "epoch": 1.451523545706371, "grad_norm": 1.9629887342453003, "learning_rate": 2.0776762099206532e-05, "loss": 0.2338, "step": 10480 }, { "epoch": 1.4529085872576177, "grad_norm": 2.8075554370880127, "learning_rate": 2.0758201475569582e-05, "loss": 0.2898, "step": 10490 }, { "epoch": 1.4542936288088644, "grad_norm": 1.5575898885726929, "learning_rate": 2.0739640851932626e-05, "loss": 0.2811, "step": 10500 }, { "epoch": 1.4556786703601108, "grad_norm": 1.6579359769821167, "learning_rate": 2.0721080228295673e-05, "loss": 0.2391, "step": 10510 }, { "epoch": 1.4570637119113574, "grad_norm": 2.505563259124756, "learning_rate": 2.0702519604658717e-05, "loss": 0.2564, "step": 10520 }, { "epoch": 1.4584487534626038, "grad_norm": 2.028754949569702, "learning_rate": 2.0683958981021764e-05, "loss": 0.2855, "step": 10530 }, { "epoch": 1.4598337950138505, "grad_norm": 2.3444249629974365, "learning_rate": 2.0665398357384808e-05, "loss": 0.2575, "step": 10540 }, { "epoch": 1.4612188365650969, "grad_norm": 1.9929412603378296, "learning_rate": 2.064683773374786e-05, "loss": 0.2481, "step": 10550 }, { "epoch": 1.4626038781163435, "grad_norm": 4.5691094398498535, "learning_rate": 2.0628277110110903e-05, "loss": 0.2479, "step": 10560 }, { "epoch": 1.4639889196675901, "grad_norm": 3.3610942363739014, "learning_rate": 2.0609716486473946e-05, "loss": 0.3265, "step": 10570 }, { "epoch": 1.4653739612188366, "grad_norm": 2.8776373863220215, "learning_rate": 2.0591155862836994e-05, "loss": 0.2893, "step": 10580 }, { "epoch": 1.466759002770083, "grad_norm": 3.1740386486053467, "learning_rate": 2.0572595239200037e-05, "loss": 0.2574, "step": 10590 }, { "epoch": 1.4681440443213296, "grad_norm": 2.4539008140563965, "learning_rate": 2.0554034615563085e-05, "loss": 0.3127, "step": 10600 }, { "epoch": 1.4695290858725762, "grad_norm": 1.850730538368225, "learning_rate": 2.053547399192613e-05, "loss": 0.2471, "step": 10610 }, { "epoch": 1.4709141274238227, "grad_norm": 6.172549724578857, "learning_rate": 2.051691336828918e-05, "loss": 0.2555, "step": 10620 }, { "epoch": 1.4722991689750693, "grad_norm": 1.7516671419143677, "learning_rate": 2.0498352744652223e-05, "loss": 0.225, "step": 10630 }, { "epoch": 1.4736842105263157, "grad_norm": 3.782275915145874, "learning_rate": 2.047979212101527e-05, "loss": 0.2735, "step": 10640 }, { "epoch": 1.4750692520775623, "grad_norm": 4.208226680755615, "learning_rate": 2.0461231497378314e-05, "loss": 0.2755, "step": 10650 }, { "epoch": 1.4764542936288088, "grad_norm": 2.2313504219055176, "learning_rate": 2.044267087374136e-05, "loss": 0.2913, "step": 10660 }, { "epoch": 1.4778393351800554, "grad_norm": 1.933903694152832, "learning_rate": 2.0424110250104405e-05, "loss": 0.2789, "step": 10670 }, { "epoch": 1.479224376731302, "grad_norm": 3.100245952606201, "learning_rate": 2.040554962646745e-05, "loss": 0.2594, "step": 10680 }, { "epoch": 1.4806094182825484, "grad_norm": 5.2458953857421875, "learning_rate": 2.0386989002830496e-05, "loss": 0.2565, "step": 10690 }, { "epoch": 1.481994459833795, "grad_norm": 2.68757963180542, "learning_rate": 2.036842837919354e-05, "loss": 0.2794, "step": 10700 }, { "epoch": 1.4833795013850415, "grad_norm": 2.6526389122009277, "learning_rate": 2.034986775555659e-05, "loss": 0.2529, "step": 10710 }, { "epoch": 1.4847645429362881, "grad_norm": 2.440277338027954, "learning_rate": 2.0331307131919634e-05, "loss": 0.2684, "step": 10720 }, { "epoch": 1.4861495844875345, "grad_norm": 3.8751096725463867, "learning_rate": 2.031274650828268e-05, "loss": 0.2469, "step": 10730 }, { "epoch": 1.4875346260387812, "grad_norm": 1.865172266960144, "learning_rate": 2.0294185884645725e-05, "loss": 0.3031, "step": 10740 }, { "epoch": 1.4889196675900278, "grad_norm": 2.3093738555908203, "learning_rate": 2.0275625261008772e-05, "loss": 0.2806, "step": 10750 }, { "epoch": 1.4903047091412742, "grad_norm": 3.2070140838623047, "learning_rate": 2.0257064637371816e-05, "loss": 0.2778, "step": 10760 }, { "epoch": 1.4916897506925209, "grad_norm": 2.056748151779175, "learning_rate": 2.023850401373486e-05, "loss": 0.2943, "step": 10770 }, { "epoch": 1.4930747922437673, "grad_norm": 1.8449196815490723, "learning_rate": 2.021994339009791e-05, "loss": 0.2767, "step": 10780 }, { "epoch": 1.494459833795014, "grad_norm": 2.679731607437134, "learning_rate": 2.0201382766460954e-05, "loss": 0.3003, "step": 10790 }, { "epoch": 1.4958448753462603, "grad_norm": 4.13687801361084, "learning_rate": 2.0182822142824e-05, "loss": 0.279, "step": 10800 }, { "epoch": 1.497229916897507, "grad_norm": 1.9377161264419556, "learning_rate": 2.016611758155074e-05, "loss": 0.3019, "step": 10810 }, { "epoch": 1.4986149584487536, "grad_norm": 2.9904537200927734, "learning_rate": 2.0147556957913788e-05, "loss": 0.3032, "step": 10820 }, { "epoch": 1.5, "grad_norm": 2.0070672035217285, "learning_rate": 2.0128996334276832e-05, "loss": 0.2294, "step": 10830 }, { "epoch": 1.5013850415512464, "grad_norm": 2.722790241241455, "learning_rate": 2.0110435710639876e-05, "loss": 0.2812, "step": 10840 }, { "epoch": 1.502770083102493, "grad_norm": 2.189173460006714, "learning_rate": 2.0091875087002926e-05, "loss": 0.2594, "step": 10850 }, { "epoch": 1.5041551246537397, "grad_norm": 2.6748456954956055, "learning_rate": 2.007331446336597e-05, "loss": 0.2776, "step": 10860 }, { "epoch": 1.505540166204986, "grad_norm": 1.898979663848877, "learning_rate": 2.0054753839729017e-05, "loss": 0.2419, "step": 10870 }, { "epoch": 1.5069252077562327, "grad_norm": 2.8842554092407227, "learning_rate": 2.003619321609206e-05, "loss": 0.2908, "step": 10880 }, { "epoch": 1.5083102493074794, "grad_norm": 2.4269087314605713, "learning_rate": 2.001763259245511e-05, "loss": 0.2506, "step": 10890 }, { "epoch": 1.5096952908587258, "grad_norm": 1.824051022529602, "learning_rate": 1.9999071968818152e-05, "loss": 0.2592, "step": 10900 }, { "epoch": 1.5110803324099722, "grad_norm": 2.767005443572998, "learning_rate": 1.99805113451812e-05, "loss": 0.2193, "step": 10910 }, { "epoch": 1.5124653739612188, "grad_norm": 2.4028172492980957, "learning_rate": 1.9961950721544247e-05, "loss": 0.2823, "step": 10920 }, { "epoch": 1.5138504155124655, "grad_norm": 1.6937496662139893, "learning_rate": 1.994339009790729e-05, "loss": 0.2885, "step": 10930 }, { "epoch": 1.5152354570637119, "grad_norm": 3.3346996307373047, "learning_rate": 1.9924829474270338e-05, "loss": 0.2499, "step": 10940 }, { "epoch": 1.5166204986149583, "grad_norm": 1.9965386390686035, "learning_rate": 1.9906268850633385e-05, "loss": 0.2726, "step": 10950 }, { "epoch": 1.5180055401662051, "grad_norm": 3.443002462387085, "learning_rate": 1.988770822699643e-05, "loss": 0.2485, "step": 10960 }, { "epoch": 1.5193905817174516, "grad_norm": 2.690556287765503, "learning_rate": 1.9869147603359476e-05, "loss": 0.2588, "step": 10970 }, { "epoch": 1.520775623268698, "grad_norm": 4.155259609222412, "learning_rate": 1.985058697972252e-05, "loss": 0.2734, "step": 10980 }, { "epoch": 1.5221606648199446, "grad_norm": 1.9144017696380615, "learning_rate": 1.9832026356085567e-05, "loss": 0.2525, "step": 10990 }, { "epoch": 1.5235457063711912, "grad_norm": 2.1264841556549072, "learning_rate": 1.981346573244861e-05, "loss": 0.2779, "step": 11000 }, { "epoch": 1.5249307479224377, "grad_norm": 1.881961464881897, "learning_rate": 1.9794905108811658e-05, "loss": 0.2508, "step": 11010 }, { "epoch": 1.526315789473684, "grad_norm": 2.455152750015259, "learning_rate": 1.9776344485174705e-05, "loss": 0.283, "step": 11020 }, { "epoch": 1.5277008310249307, "grad_norm": 3.0961544513702393, "learning_rate": 1.975778386153775e-05, "loss": 0.2755, "step": 11030 }, { "epoch": 1.5290858725761773, "grad_norm": 2.5754737854003906, "learning_rate": 1.9739223237900796e-05, "loss": 0.3062, "step": 11040 }, { "epoch": 1.5304709141274238, "grad_norm": 2.443967580795288, "learning_rate": 1.972066261426384e-05, "loss": 0.2293, "step": 11050 }, { "epoch": 1.5318559556786704, "grad_norm": 2.465186595916748, "learning_rate": 1.9702101990626887e-05, "loss": 0.2857, "step": 11060 }, { "epoch": 1.533240997229917, "grad_norm": 1.6813825368881226, "learning_rate": 1.9683541366989934e-05, "loss": 0.2465, "step": 11070 }, { "epoch": 1.5346260387811634, "grad_norm": 2.964087963104248, "learning_rate": 1.9664980743352978e-05, "loss": 0.2299, "step": 11080 }, { "epoch": 1.5360110803324099, "grad_norm": 2.690337657928467, "learning_rate": 1.9646420119716022e-05, "loss": 0.2486, "step": 11090 }, { "epoch": 1.5373961218836565, "grad_norm": 1.8331809043884277, "learning_rate": 1.962785949607907e-05, "loss": 0.2713, "step": 11100 }, { "epoch": 1.5387811634349031, "grad_norm": 2.079843759536743, "learning_rate": 1.9609298872442116e-05, "loss": 0.269, "step": 11110 }, { "epoch": 1.5401662049861495, "grad_norm": 2.1486682891845703, "learning_rate": 1.959073824880516e-05, "loss": 0.2159, "step": 11120 }, { "epoch": 1.5415512465373962, "grad_norm": 1.5516518354415894, "learning_rate": 1.9572177625168207e-05, "loss": 0.2468, "step": 11130 }, { "epoch": 1.5429362880886428, "grad_norm": 3.138228178024292, "learning_rate": 1.9553617001531254e-05, "loss": 0.2686, "step": 11140 }, { "epoch": 1.5443213296398892, "grad_norm": 3.1182003021240234, "learning_rate": 1.9535056377894298e-05, "loss": 0.271, "step": 11150 }, { "epoch": 1.5457063711911356, "grad_norm": 1.9098355770111084, "learning_rate": 1.9516495754257345e-05, "loss": 0.2678, "step": 11160 }, { "epoch": 1.5470914127423823, "grad_norm": 2.740372657775879, "learning_rate": 1.9497935130620393e-05, "loss": 0.2768, "step": 11170 }, { "epoch": 1.548476454293629, "grad_norm": 3.55387282371521, "learning_rate": 1.9479374506983436e-05, "loss": 0.26, "step": 11180 }, { "epoch": 1.5498614958448753, "grad_norm": 6.384958267211914, "learning_rate": 1.946081388334648e-05, "loss": 0.2666, "step": 11190 }, { "epoch": 1.5512465373961217, "grad_norm": 1.738592267036438, "learning_rate": 1.9442253259709527e-05, "loss": 0.2751, "step": 11200 }, { "epoch": 1.5526315789473686, "grad_norm": 1.8383572101593018, "learning_rate": 1.9423692636072575e-05, "loss": 0.2539, "step": 11210 }, { "epoch": 1.554016620498615, "grad_norm": 1.8780285120010376, "learning_rate": 1.940513201243562e-05, "loss": 0.2417, "step": 11220 }, { "epoch": 1.5554016620498614, "grad_norm": 2.927269697189331, "learning_rate": 1.9386571388798666e-05, "loss": 0.276, "step": 11230 }, { "epoch": 1.556786703601108, "grad_norm": 2.321641206741333, "learning_rate": 1.936801076516171e-05, "loss": 0.289, "step": 11240 }, { "epoch": 1.5581717451523547, "grad_norm": 2.094604015350342, "learning_rate": 1.9349450141524757e-05, "loss": 0.2822, "step": 11250 }, { "epoch": 1.559556786703601, "grad_norm": 2.0591633319854736, "learning_rate": 1.9330889517887804e-05, "loss": 0.2471, "step": 11260 }, { "epoch": 1.5609418282548475, "grad_norm": 1.5026451349258423, "learning_rate": 1.9312328894250848e-05, "loss": 0.2624, "step": 11270 }, { "epoch": 1.5623268698060941, "grad_norm": 4.523801803588867, "learning_rate": 1.9293768270613895e-05, "loss": 0.2795, "step": 11280 }, { "epoch": 1.5637119113573408, "grad_norm": 2.1060678958892822, "learning_rate": 1.927520764697694e-05, "loss": 0.2823, "step": 11290 }, { "epoch": 1.5650969529085872, "grad_norm": 2.6808173656463623, "learning_rate": 1.9256647023339986e-05, "loss": 0.2963, "step": 11300 }, { "epoch": 1.5664819944598338, "grad_norm": 2.0549914836883545, "learning_rate": 1.923808639970303e-05, "loss": 0.2472, "step": 11310 }, { "epoch": 1.5678670360110805, "grad_norm": 2.008946657180786, "learning_rate": 1.9219525776066077e-05, "loss": 0.2275, "step": 11320 }, { "epoch": 1.5692520775623269, "grad_norm": 1.7144265174865723, "learning_rate": 1.9200965152429124e-05, "loss": 0.2256, "step": 11330 }, { "epoch": 1.5706371191135733, "grad_norm": 2.5419716835021973, "learning_rate": 1.9182404528792168e-05, "loss": 0.2723, "step": 11340 }, { "epoch": 1.57202216066482, "grad_norm": 3.4930813312530518, "learning_rate": 1.9163843905155215e-05, "loss": 0.3037, "step": 11350 }, { "epoch": 1.5734072022160666, "grad_norm": 5.525070667266846, "learning_rate": 1.9145283281518262e-05, "loss": 0.2941, "step": 11360 }, { "epoch": 1.574792243767313, "grad_norm": 2.0388412475585938, "learning_rate": 1.9126722657881306e-05, "loss": 0.2377, "step": 11370 }, { "epoch": 1.5761772853185596, "grad_norm": 3.1941895484924316, "learning_rate": 1.9108162034244353e-05, "loss": 0.247, "step": 11380 }, { "epoch": 1.5775623268698062, "grad_norm": 2.0771210193634033, "learning_rate": 1.90896014106074e-05, "loss": 0.2612, "step": 11390 }, { "epoch": 1.5789473684210527, "grad_norm": 1.8206599950790405, "learning_rate": 1.9071040786970444e-05, "loss": 0.2669, "step": 11400 }, { "epoch": 1.580332409972299, "grad_norm": 2.1787009239196777, "learning_rate": 1.9052480163333488e-05, "loss": 0.2582, "step": 11410 }, { "epoch": 1.5817174515235457, "grad_norm": 2.1410577297210693, "learning_rate": 1.9033919539696535e-05, "loss": 0.262, "step": 11420 }, { "epoch": 1.5831024930747923, "grad_norm": 2.490060329437256, "learning_rate": 1.901535891605958e-05, "loss": 0.2337, "step": 11430 }, { "epoch": 1.5844875346260388, "grad_norm": 2.937830924987793, "learning_rate": 1.8996798292422626e-05, "loss": 0.3014, "step": 11440 }, { "epoch": 1.5858725761772852, "grad_norm": 2.0263595581054688, "learning_rate": 1.8978237668785674e-05, "loss": 0.2887, "step": 11450 }, { "epoch": 1.587257617728532, "grad_norm": 1.8784677982330322, "learning_rate": 1.8959677045148717e-05, "loss": 0.2547, "step": 11460 }, { "epoch": 1.5886426592797784, "grad_norm": 2.2667269706726074, "learning_rate": 1.8941116421511765e-05, "loss": 0.2693, "step": 11470 }, { "epoch": 1.5900277008310248, "grad_norm": 2.0640201568603516, "learning_rate": 1.8922555797874812e-05, "loss": 0.2414, "step": 11480 }, { "epoch": 1.5914127423822715, "grad_norm": 2.125096559524536, "learning_rate": 1.8903995174237856e-05, "loss": 0.2657, "step": 11490 }, { "epoch": 1.5927977839335181, "grad_norm": 3.6004934310913086, "learning_rate": 1.88854345506009e-05, "loss": 0.2973, "step": 11500 }, { "epoch": 1.5941828254847645, "grad_norm": 2.045348644256592, "learning_rate": 1.8866873926963947e-05, "loss": 0.2659, "step": 11510 }, { "epoch": 1.595567867036011, "grad_norm": 1.4382702112197876, "learning_rate": 1.8848313303326994e-05, "loss": 0.2434, "step": 11520 }, { "epoch": 1.5969529085872576, "grad_norm": 2.119570016860962, "learning_rate": 1.8829752679690038e-05, "loss": 0.3205, "step": 11530 }, { "epoch": 1.5983379501385042, "grad_norm": 1.7247291803359985, "learning_rate": 1.8811192056053085e-05, "loss": 0.2501, "step": 11540 }, { "epoch": 1.5997229916897506, "grad_norm": 3.151301145553589, "learning_rate": 1.8792631432416132e-05, "loss": 0.3, "step": 11550 }, { "epoch": 1.6011080332409973, "grad_norm": 2.6698825359344482, "learning_rate": 1.8774070808779176e-05, "loss": 0.2233, "step": 11560 }, { "epoch": 1.602493074792244, "grad_norm": 3.476079225540161, "learning_rate": 1.8755510185142223e-05, "loss": 0.3094, "step": 11570 }, { "epoch": 1.6038781163434903, "grad_norm": 1.5601874589920044, "learning_rate": 1.873694956150527e-05, "loss": 0.2747, "step": 11580 }, { "epoch": 1.6052631578947367, "grad_norm": 2.1733288764953613, "learning_rate": 1.8718388937868314e-05, "loss": 0.2538, "step": 11590 }, { "epoch": 1.6066481994459834, "grad_norm": 1.6128807067871094, "learning_rate": 1.869982831423136e-05, "loss": 0.2436, "step": 11600 }, { "epoch": 1.60803324099723, "grad_norm": 6.194311618804932, "learning_rate": 1.8681267690594405e-05, "loss": 0.2285, "step": 11610 }, { "epoch": 1.6094182825484764, "grad_norm": 1.6359772682189941, "learning_rate": 1.8662707066957452e-05, "loss": 0.286, "step": 11620 }, { "epoch": 1.610803324099723, "grad_norm": 1.7897950410842896, "learning_rate": 1.8644146443320496e-05, "loss": 0.2714, "step": 11630 }, { "epoch": 1.6121883656509697, "grad_norm": 3.663844585418701, "learning_rate": 1.8625585819683543e-05, "loss": 0.2954, "step": 11640 }, { "epoch": 1.613573407202216, "grad_norm": 2.3490071296691895, "learning_rate": 1.8607025196046587e-05, "loss": 0.2516, "step": 11650 }, { "epoch": 1.6149584487534625, "grad_norm": 1.6959421634674072, "learning_rate": 1.8588464572409634e-05, "loss": 0.2355, "step": 11660 }, { "epoch": 1.6163434903047091, "grad_norm": 4.680472373962402, "learning_rate": 1.856990394877268e-05, "loss": 0.2936, "step": 11670 }, { "epoch": 1.6177285318559558, "grad_norm": 3.6040542125701904, "learning_rate": 1.8551343325135725e-05, "loss": 0.2481, "step": 11680 }, { "epoch": 1.6191135734072022, "grad_norm": 2.7770557403564453, "learning_rate": 1.8532782701498772e-05, "loss": 0.2705, "step": 11690 }, { "epoch": 1.6204986149584486, "grad_norm": 2.4273083209991455, "learning_rate": 1.851422207786182e-05, "loss": 0.2342, "step": 11700 }, { "epoch": 1.6218836565096952, "grad_norm": 3.629061460494995, "learning_rate": 1.8495661454224863e-05, "loss": 0.2755, "step": 11710 }, { "epoch": 1.6232686980609419, "grad_norm": 1.9627671241760254, "learning_rate": 1.8477100830587907e-05, "loss": 0.2337, "step": 11720 }, { "epoch": 1.6246537396121883, "grad_norm": 2.104898452758789, "learning_rate": 1.8458540206950954e-05, "loss": 0.2648, "step": 11730 }, { "epoch": 1.626038781163435, "grad_norm": 3.2535183429718018, "learning_rate": 1.8439979583314e-05, "loss": 0.2304, "step": 11740 }, { "epoch": 1.6274238227146816, "grad_norm": 3.0654265880584717, "learning_rate": 1.8421418959677045e-05, "loss": 0.2601, "step": 11750 }, { "epoch": 1.628808864265928, "grad_norm": 5.312877655029297, "learning_rate": 1.8402858336040093e-05, "loss": 0.2391, "step": 11760 }, { "epoch": 1.6301939058171744, "grad_norm": 2.483633518218994, "learning_rate": 1.838429771240314e-05, "loss": 0.2387, "step": 11770 }, { "epoch": 1.631578947368421, "grad_norm": 2.8224122524261475, "learning_rate": 1.8365737088766184e-05, "loss": 0.2573, "step": 11780 }, { "epoch": 1.6329639889196677, "grad_norm": 5.308402061462402, "learning_rate": 1.834717646512923e-05, "loss": 0.2999, "step": 11790 }, { "epoch": 1.634349030470914, "grad_norm": 3.696424722671509, "learning_rate": 1.8328615841492278e-05, "loss": 0.2781, "step": 11800 }, { "epoch": 1.6357340720221607, "grad_norm": 1.6067745685577393, "learning_rate": 1.8310055217855322e-05, "loss": 0.2638, "step": 11810 }, { "epoch": 1.6371191135734073, "grad_norm": 1.8693360090255737, "learning_rate": 1.8291494594218366e-05, "loss": 0.2355, "step": 11820 }, { "epoch": 1.6385041551246537, "grad_norm": 2.0607070922851562, "learning_rate": 1.8272933970581413e-05, "loss": 0.2543, "step": 11830 }, { "epoch": 1.6398891966759002, "grad_norm": 1.770366907119751, "learning_rate": 1.8254373346944457e-05, "loss": 0.2417, "step": 11840 }, { "epoch": 1.6412742382271468, "grad_norm": 2.100710153579712, "learning_rate": 1.8235812723307504e-05, "loss": 0.2341, "step": 11850 }, { "epoch": 1.6426592797783934, "grad_norm": 3.348633050918579, "learning_rate": 1.821725209967055e-05, "loss": 0.2537, "step": 11860 }, { "epoch": 1.6440443213296398, "grad_norm": 2.102045774459839, "learning_rate": 1.8198691476033595e-05, "loss": 0.2416, "step": 11870 }, { "epoch": 1.6454293628808865, "grad_norm": 4.399439334869385, "learning_rate": 1.8180130852396642e-05, "loss": 0.2804, "step": 11880 }, { "epoch": 1.6468144044321331, "grad_norm": 2.43581485748291, "learning_rate": 1.816157022875969e-05, "loss": 0.2577, "step": 11890 }, { "epoch": 1.6481994459833795, "grad_norm": 5.009562015533447, "learning_rate": 1.8143009605122733e-05, "loss": 0.2739, "step": 11900 }, { "epoch": 1.649584487534626, "grad_norm": 1.979146122932434, "learning_rate": 1.812444898148578e-05, "loss": 0.243, "step": 11910 }, { "epoch": 1.6509695290858726, "grad_norm": 2.0031955242156982, "learning_rate": 1.8105888357848824e-05, "loss": 0.2533, "step": 11920 }, { "epoch": 1.6523545706371192, "grad_norm": 2.9630982875823975, "learning_rate": 1.808732773421187e-05, "loss": 0.3004, "step": 11930 }, { "epoch": 1.6537396121883656, "grad_norm": 4.047551155090332, "learning_rate": 1.8068767110574915e-05, "loss": 0.2414, "step": 11940 }, { "epoch": 1.655124653739612, "grad_norm": 2.5093228816986084, "learning_rate": 1.8050206486937962e-05, "loss": 0.2627, "step": 11950 }, { "epoch": 1.6565096952908587, "grad_norm": 2.37447190284729, "learning_rate": 1.803164586330101e-05, "loss": 0.2665, "step": 11960 }, { "epoch": 1.6578947368421053, "grad_norm": 3.737452507019043, "learning_rate": 1.8013085239664053e-05, "loss": 0.2694, "step": 11970 }, { "epoch": 1.6592797783933517, "grad_norm": 2.8096158504486084, "learning_rate": 1.79945246160271e-05, "loss": 0.2797, "step": 11980 }, { "epoch": 1.6606648199445984, "grad_norm": 1.845086932182312, "learning_rate": 1.7975963992390148e-05, "loss": 0.2508, "step": 11990 }, { "epoch": 1.662049861495845, "grad_norm": 3.416377067565918, "learning_rate": 1.795740336875319e-05, "loss": 0.2383, "step": 12000 }, { "epoch": 1.6626038781163435, "eval_loss": 0.29201897978782654, "eval_runtime": 1417.9458, "eval_samples_per_second": 6.432, "eval_steps_per_second": 0.804, "step": 12004 }, { "epoch": 1.6634349030470914, "grad_norm": 2.057034730911255, "learning_rate": 1.793884274511624e-05, "loss": 0.2622, "step": 12010 }, { "epoch": 1.6648199445983378, "grad_norm": 2.438023805618286, "learning_rate": 1.7920282121479283e-05, "loss": 0.3083, "step": 12020 }, { "epoch": 1.6662049861495845, "grad_norm": 4.761218547821045, "learning_rate": 1.790172149784233e-05, "loss": 0.2488, "step": 12030 }, { "epoch": 1.667590027700831, "grad_norm": 3.3737921714782715, "learning_rate": 1.7883160874205374e-05, "loss": 0.2418, "step": 12040 }, { "epoch": 1.6689750692520775, "grad_norm": 2.7591564655303955, "learning_rate": 1.786460025056842e-05, "loss": 0.2849, "step": 12050 }, { "epoch": 1.6703601108033241, "grad_norm": 2.5756118297576904, "learning_rate": 1.7846039626931465e-05, "loss": 0.2317, "step": 12060 }, { "epoch": 1.6717451523545708, "grad_norm": 2.3147664070129395, "learning_rate": 1.7827479003294512e-05, "loss": 0.2806, "step": 12070 }, { "epoch": 1.6731301939058172, "grad_norm": 3.616359233856201, "learning_rate": 1.780891837965756e-05, "loss": 0.2557, "step": 12080 }, { "epoch": 1.6745152354570636, "grad_norm": 2.078604221343994, "learning_rate": 1.7790357756020603e-05, "loss": 0.2752, "step": 12090 }, { "epoch": 1.6759002770083102, "grad_norm": 1.7128912210464478, "learning_rate": 1.777179713238365e-05, "loss": 0.2697, "step": 12100 }, { "epoch": 1.6772853185595569, "grad_norm": 1.9377413988113403, "learning_rate": 1.7753236508746697e-05, "loss": 0.2526, "step": 12110 }, { "epoch": 1.6786703601108033, "grad_norm": 2.5126800537109375, "learning_rate": 1.773467588510974e-05, "loss": 0.2824, "step": 12120 }, { "epoch": 1.6800554016620497, "grad_norm": 1.8020765781402588, "learning_rate": 1.7716115261472785e-05, "loss": 0.2659, "step": 12130 }, { "epoch": 1.6814404432132966, "grad_norm": 2.5396931171417236, "learning_rate": 1.7697554637835832e-05, "loss": 0.2583, "step": 12140 }, { "epoch": 1.682825484764543, "grad_norm": 2.1180899143218994, "learning_rate": 1.767899401419888e-05, "loss": 0.2561, "step": 12150 }, { "epoch": 1.6842105263157894, "grad_norm": 2.3820462226867676, "learning_rate": 1.7660433390561923e-05, "loss": 0.2675, "step": 12160 }, { "epoch": 1.685595567867036, "grad_norm": 2.2656431198120117, "learning_rate": 1.764187276692497e-05, "loss": 0.2718, "step": 12170 }, { "epoch": 1.6869806094182827, "grad_norm": 1.6295078992843628, "learning_rate": 1.7623312143288017e-05, "loss": 0.2418, "step": 12180 }, { "epoch": 1.688365650969529, "grad_norm": 2.898495674133301, "learning_rate": 1.760475151965106e-05, "loss": 0.2562, "step": 12190 }, { "epoch": 1.6897506925207755, "grad_norm": 2.290031671524048, "learning_rate": 1.758619089601411e-05, "loss": 0.262, "step": 12200 }, { "epoch": 1.6911357340720221, "grad_norm": 3.4145724773406982, "learning_rate": 1.7567630272377156e-05, "loss": 0.2502, "step": 12210 }, { "epoch": 1.6925207756232687, "grad_norm": 1.9184894561767578, "learning_rate": 1.75490696487402e-05, "loss": 0.2703, "step": 12220 }, { "epoch": 1.6939058171745152, "grad_norm": 1.8107856512069702, "learning_rate": 1.7530509025103243e-05, "loss": 0.2628, "step": 12230 }, { "epoch": 1.6952908587257618, "grad_norm": 3.8108813762664795, "learning_rate": 1.751194840146629e-05, "loss": 0.2893, "step": 12240 }, { "epoch": 1.6966759002770084, "grad_norm": 2.709773063659668, "learning_rate": 1.7493387777829334e-05, "loss": 0.2641, "step": 12250 }, { "epoch": 1.6980609418282548, "grad_norm": 1.906054139137268, "learning_rate": 1.747482715419238e-05, "loss": 0.2278, "step": 12260 }, { "epoch": 1.6994459833795013, "grad_norm": 1.6972445249557495, "learning_rate": 1.745626653055543e-05, "loss": 0.2602, "step": 12270 }, { "epoch": 1.700831024930748, "grad_norm": 2.5641844272613525, "learning_rate": 1.7437705906918473e-05, "loss": 0.2422, "step": 12280 }, { "epoch": 1.7022160664819945, "grad_norm": 3.255160093307495, "learning_rate": 1.741914528328152e-05, "loss": 0.242, "step": 12290 }, { "epoch": 1.703601108033241, "grad_norm": 1.6492815017700195, "learning_rate": 1.7400584659644567e-05, "loss": 0.2636, "step": 12300 }, { "epoch": 1.7049861495844876, "grad_norm": 2.919952630996704, "learning_rate": 1.738202403600761e-05, "loss": 0.2592, "step": 12310 }, { "epoch": 1.7063711911357342, "grad_norm": 2.054386854171753, "learning_rate": 1.7363463412370658e-05, "loss": 0.2757, "step": 12320 }, { "epoch": 1.7077562326869806, "grad_norm": 3.644105911254883, "learning_rate": 1.7344902788733702e-05, "loss": 0.2622, "step": 12330 }, { "epoch": 1.709141274238227, "grad_norm": 1.5332365036010742, "learning_rate": 1.732634216509675e-05, "loss": 0.261, "step": 12340 }, { "epoch": 1.7105263157894737, "grad_norm": 1.936841368675232, "learning_rate": 1.7307781541459793e-05, "loss": 0.2524, "step": 12350 }, { "epoch": 1.7119113573407203, "grad_norm": 2.621289014816284, "learning_rate": 1.728922091782284e-05, "loss": 0.2641, "step": 12360 }, { "epoch": 1.7132963988919667, "grad_norm": 2.3396177291870117, "learning_rate": 1.7270660294185887e-05, "loss": 0.2534, "step": 12370 }, { "epoch": 1.7146814404432131, "grad_norm": 4.555574893951416, "learning_rate": 1.725209967054893e-05, "loss": 0.2479, "step": 12380 }, { "epoch": 1.71606648199446, "grad_norm": 2.390497922897339, "learning_rate": 1.7233539046911978e-05, "loss": 0.2653, "step": 12390 }, { "epoch": 1.7174515235457064, "grad_norm": 2.245471954345703, "learning_rate": 1.7214978423275025e-05, "loss": 0.2436, "step": 12400 }, { "epoch": 1.7188365650969528, "grad_norm": 2.013014554977417, "learning_rate": 1.719641779963807e-05, "loss": 0.2447, "step": 12410 }, { "epoch": 1.7202216066481995, "grad_norm": 2.733628988265991, "learning_rate": 1.7177857176001116e-05, "loss": 0.215, "step": 12420 }, { "epoch": 1.721606648199446, "grad_norm": 3.422853946685791, "learning_rate": 1.7159296552364164e-05, "loss": 0.2922, "step": 12430 }, { "epoch": 1.7229916897506925, "grad_norm": 1.407198429107666, "learning_rate": 1.7140735928727207e-05, "loss": 0.2427, "step": 12440 }, { "epoch": 1.724376731301939, "grad_norm": 2.1980371475219727, "learning_rate": 1.712217530509025e-05, "loss": 0.2996, "step": 12450 }, { "epoch": 1.7257617728531855, "grad_norm": 2.1491382122039795, "learning_rate": 1.71036146814533e-05, "loss": 0.2432, "step": 12460 }, { "epoch": 1.7271468144044322, "grad_norm": 3.203516960144043, "learning_rate": 1.7085054057816342e-05, "loss": 0.2397, "step": 12470 }, { "epoch": 1.7285318559556786, "grad_norm": 1.9788877964019775, "learning_rate": 1.706649343417939e-05, "loss": 0.2265, "step": 12480 }, { "epoch": 1.7299168975069252, "grad_norm": 2.2260544300079346, "learning_rate": 1.7047932810542437e-05, "loss": 0.2524, "step": 12490 }, { "epoch": 1.7313019390581719, "grad_norm": 3.5737645626068115, "learning_rate": 1.702937218690548e-05, "loss": 0.2211, "step": 12500 }, { "epoch": 1.7326869806094183, "grad_norm": 2.1288580894470215, "learning_rate": 1.7010811563268528e-05, "loss": 0.2408, "step": 12510 }, { "epoch": 1.7340720221606647, "grad_norm": 2.047696113586426, "learning_rate": 1.6992250939631575e-05, "loss": 0.2221, "step": 12520 }, { "epoch": 1.7354570637119113, "grad_norm": 2.0065062046051025, "learning_rate": 1.697369031599462e-05, "loss": 0.2525, "step": 12530 }, { "epoch": 1.736842105263158, "grad_norm": 11.675251007080078, "learning_rate": 1.6955129692357662e-05, "loss": 0.3013, "step": 12540 }, { "epoch": 1.7382271468144044, "grad_norm": 3.4562997817993164, "learning_rate": 1.693656906872071e-05, "loss": 0.3053, "step": 12550 }, { "epoch": 1.739612188365651, "grad_norm": 1.8160446882247925, "learning_rate": 1.6918008445083757e-05, "loss": 0.253, "step": 12560 }, { "epoch": 1.7409972299168976, "grad_norm": 2.6544394493103027, "learning_rate": 1.68994478214468e-05, "loss": 0.2457, "step": 12570 }, { "epoch": 1.742382271468144, "grad_norm": 2.2523653507232666, "learning_rate": 1.6880887197809848e-05, "loss": 0.2778, "step": 12580 }, { "epoch": 1.7437673130193905, "grad_norm": 3.4826242923736572, "learning_rate": 1.6862326574172895e-05, "loss": 0.2264, "step": 12590 }, { "epoch": 1.745152354570637, "grad_norm": 3.331815242767334, "learning_rate": 1.684376595053594e-05, "loss": 0.2287, "step": 12600 }, { "epoch": 1.7465373961218837, "grad_norm": 2.1905879974365234, "learning_rate": 1.6825205326898986e-05, "loss": 0.2654, "step": 12610 }, { "epoch": 1.7479224376731302, "grad_norm": 1.8256255388259888, "learning_rate": 1.6806644703262033e-05, "loss": 0.2156, "step": 12620 }, { "epoch": 1.7493074792243766, "grad_norm": 2.4552223682403564, "learning_rate": 1.6788084079625077e-05, "loss": 0.2542, "step": 12630 }, { "epoch": 1.7506925207756234, "grad_norm": 2.0152883529663086, "learning_rate": 1.676952345598812e-05, "loss": 0.2457, "step": 12640 }, { "epoch": 1.7520775623268698, "grad_norm": 2.6501598358154297, "learning_rate": 1.6750962832351168e-05, "loss": 0.2577, "step": 12650 }, { "epoch": 1.7534626038781163, "grad_norm": 2.4002187252044678, "learning_rate": 1.6732402208714212e-05, "loss": 0.2704, "step": 12660 }, { "epoch": 1.754847645429363, "grad_norm": 1.940310001373291, "learning_rate": 1.671384158507726e-05, "loss": 0.264, "step": 12670 }, { "epoch": 1.7562326869806095, "grad_norm": 2.7032876014709473, "learning_rate": 1.6695280961440306e-05, "loss": 0.2555, "step": 12680 }, { "epoch": 1.757617728531856, "grad_norm": 3.1682300567626953, "learning_rate": 1.667672033780335e-05, "loss": 0.2463, "step": 12690 }, { "epoch": 1.7590027700831024, "grad_norm": 1.8988887071609497, "learning_rate": 1.6658159714166397e-05, "loss": 0.2538, "step": 12700 }, { "epoch": 1.760387811634349, "grad_norm": 2.3494110107421875, "learning_rate": 1.6639599090529444e-05, "loss": 0.2371, "step": 12710 }, { "epoch": 1.7617728531855956, "grad_norm": 1.774849772453308, "learning_rate": 1.6621038466892488e-05, "loss": 0.3002, "step": 12720 }, { "epoch": 1.763157894736842, "grad_norm": 9.336555480957031, "learning_rate": 1.6602477843255536e-05, "loss": 0.2931, "step": 12730 }, { "epoch": 1.7645429362880887, "grad_norm": 1.589762568473816, "learning_rate": 1.6583917219618583e-05, "loss": 0.2275, "step": 12740 }, { "epoch": 1.7659279778393353, "grad_norm": 2.8760030269622803, "learning_rate": 1.6565356595981627e-05, "loss": 0.2964, "step": 12750 }, { "epoch": 1.7673130193905817, "grad_norm": 2.5935494899749756, "learning_rate": 1.654679597234467e-05, "loss": 0.2206, "step": 12760 }, { "epoch": 1.7686980609418281, "grad_norm": 1.6379286050796509, "learning_rate": 1.6528235348707718e-05, "loss": 0.2307, "step": 12770 }, { "epoch": 1.7700831024930748, "grad_norm": 3.137521266937256, "learning_rate": 1.6509674725070765e-05, "loss": 0.2435, "step": 12780 }, { "epoch": 1.7714681440443214, "grad_norm": 2.229773998260498, "learning_rate": 1.649111410143381e-05, "loss": 0.2427, "step": 12790 }, { "epoch": 1.7728531855955678, "grad_norm": 1.566584587097168, "learning_rate": 1.6472553477796856e-05, "loss": 0.2731, "step": 12800 }, { "epoch": 1.7742382271468145, "grad_norm": 1.718691110610962, "learning_rate": 1.6453992854159903e-05, "loss": 0.2549, "step": 12810 }, { "epoch": 1.775623268698061, "grad_norm": 2.1555016040802, "learning_rate": 1.6435432230522947e-05, "loss": 0.2222, "step": 12820 }, { "epoch": 1.7770083102493075, "grad_norm": 1.805146336555481, "learning_rate": 1.6416871606885994e-05, "loss": 0.2555, "step": 12830 }, { "epoch": 1.778393351800554, "grad_norm": 1.7481995820999146, "learning_rate": 1.639831098324904e-05, "loss": 0.256, "step": 12840 }, { "epoch": 1.7797783933518005, "grad_norm": 1.9545890092849731, "learning_rate": 1.6379750359612085e-05, "loss": 0.2571, "step": 12850 }, { "epoch": 1.7811634349030472, "grad_norm": 1.8375189304351807, "learning_rate": 1.636118973597513e-05, "loss": 0.2388, "step": 12860 }, { "epoch": 1.7825484764542936, "grad_norm": 1.7798672914505005, "learning_rate": 1.6342629112338176e-05, "loss": 0.2692, "step": 12870 }, { "epoch": 1.78393351800554, "grad_norm": 4.45228385925293, "learning_rate": 1.632406848870122e-05, "loss": 0.2504, "step": 12880 }, { "epoch": 1.7853185595567869, "grad_norm": 1.8389291763305664, "learning_rate": 1.6305507865064267e-05, "loss": 0.2507, "step": 12890 }, { "epoch": 1.7867036011080333, "grad_norm": 1.5367673635482788, "learning_rate": 1.6286947241427314e-05, "loss": 0.2639, "step": 12900 }, { "epoch": 1.7880886426592797, "grad_norm": 2.4199717044830322, "learning_rate": 1.6268386617790358e-05, "loss": 0.2615, "step": 12910 }, { "epoch": 1.7894736842105263, "grad_norm": 2.12934947013855, "learning_rate": 1.6249825994153405e-05, "loss": 0.2774, "step": 12920 }, { "epoch": 1.790858725761773, "grad_norm": 1.7487562894821167, "learning_rate": 1.6231265370516452e-05, "loss": 0.3096, "step": 12930 }, { "epoch": 1.7922437673130194, "grad_norm": 1.8843803405761719, "learning_rate": 1.6212704746879496e-05, "loss": 0.2188, "step": 12940 }, { "epoch": 1.7936288088642658, "grad_norm": 2.1922812461853027, "learning_rate": 1.6194144123242543e-05, "loss": 0.267, "step": 12950 }, { "epoch": 1.7950138504155124, "grad_norm": 1.6102290153503418, "learning_rate": 1.6175583499605587e-05, "loss": 0.2425, "step": 12960 }, { "epoch": 1.796398891966759, "grad_norm": 2.733360767364502, "learning_rate": 1.6157022875968634e-05, "loss": 0.2836, "step": 12970 }, { "epoch": 1.7977839335180055, "grad_norm": 2.1498701572418213, "learning_rate": 1.6138462252331678e-05, "loss": 0.2823, "step": 12980 }, { "epoch": 1.799168975069252, "grad_norm": 1.5078741312026978, "learning_rate": 1.6119901628694725e-05, "loss": 0.2216, "step": 12990 }, { "epoch": 1.8005540166204987, "grad_norm": 1.9320144653320312, "learning_rate": 1.6101341005057773e-05, "loss": 0.2468, "step": 13000 }, { "epoch": 1.8019390581717452, "grad_norm": 1.9885637760162354, "learning_rate": 1.6082780381420816e-05, "loss": 0.2413, "step": 13010 }, { "epoch": 1.8033240997229916, "grad_norm": 2.5898048877716064, "learning_rate": 1.6064219757783864e-05, "loss": 0.2327, "step": 13020 }, { "epoch": 1.8047091412742382, "grad_norm": 1.5741478204727173, "learning_rate": 1.604565913414691e-05, "loss": 0.2658, "step": 13030 }, { "epoch": 1.8060941828254848, "grad_norm": 2.036979913711548, "learning_rate": 1.6027098510509955e-05, "loss": 0.2436, "step": 13040 }, { "epoch": 1.8074792243767313, "grad_norm": 1.650017499923706, "learning_rate": 1.6008537886873002e-05, "loss": 0.2696, "step": 13050 }, { "epoch": 1.8088642659279779, "grad_norm": 2.0959882736206055, "learning_rate": 1.5989977263236046e-05, "loss": 0.247, "step": 13060 }, { "epoch": 1.8102493074792245, "grad_norm": 2.870182514190674, "learning_rate": 1.597141663959909e-05, "loss": 0.2529, "step": 13070 }, { "epoch": 1.811634349030471, "grad_norm": 1.9680346250534058, "learning_rate": 1.5952856015962137e-05, "loss": 0.2584, "step": 13080 }, { "epoch": 1.8130193905817173, "grad_norm": 3.2609808444976807, "learning_rate": 1.5934295392325184e-05, "loss": 0.2222, "step": 13090 }, { "epoch": 1.814404432132964, "grad_norm": 1.9616683721542358, "learning_rate": 1.5915734768688228e-05, "loss": 0.2617, "step": 13100 }, { "epoch": 1.8157894736842106, "grad_norm": 1.50966215133667, "learning_rate": 1.5897174145051275e-05, "loss": 0.2489, "step": 13110 }, { "epoch": 1.817174515235457, "grad_norm": 1.8157463073730469, "learning_rate": 1.5878613521414322e-05, "loss": 0.2753, "step": 13120 }, { "epoch": 1.8185595567867034, "grad_norm": 2.0493199825286865, "learning_rate": 1.5860052897777366e-05, "loss": 0.2452, "step": 13130 }, { "epoch": 1.8199445983379503, "grad_norm": 1.4649819135665894, "learning_rate": 1.5841492274140413e-05, "loss": 0.2371, "step": 13140 }, { "epoch": 1.8213296398891967, "grad_norm": 2.031050205230713, "learning_rate": 1.582293165050346e-05, "loss": 0.2287, "step": 13150 }, { "epoch": 1.8227146814404431, "grad_norm": 1.8526753187179565, "learning_rate": 1.5804371026866504e-05, "loss": 0.2441, "step": 13160 }, { "epoch": 1.8240997229916898, "grad_norm": 1.8726290464401245, "learning_rate": 1.5785810403229548e-05, "loss": 0.2684, "step": 13170 }, { "epoch": 1.8254847645429364, "grad_norm": 2.1293463706970215, "learning_rate": 1.5767249779592595e-05, "loss": 0.2394, "step": 13180 }, { "epoch": 1.8268698060941828, "grad_norm": 2.2473442554473877, "learning_rate": 1.5748689155955642e-05, "loss": 0.2238, "step": 13190 }, { "epoch": 1.8282548476454292, "grad_norm": 12.403087615966797, "learning_rate": 1.5730128532318686e-05, "loss": 0.2396, "step": 13200 }, { "epoch": 1.8296398891966759, "grad_norm": 2.3015928268432617, "learning_rate": 1.5711567908681733e-05, "loss": 0.2427, "step": 13210 }, { "epoch": 1.8310249307479225, "grad_norm": 1.473212480545044, "learning_rate": 1.569300728504478e-05, "loss": 0.2138, "step": 13220 }, { "epoch": 1.832409972299169, "grad_norm": 1.7529211044311523, "learning_rate": 1.5674446661407824e-05, "loss": 0.2489, "step": 13230 }, { "epoch": 1.8337950138504155, "grad_norm": 3.970229387283325, "learning_rate": 1.565588603777087e-05, "loss": 0.2636, "step": 13240 }, { "epoch": 1.8351800554016622, "grad_norm": 2.284043312072754, "learning_rate": 1.563732541413392e-05, "loss": 0.2571, "step": 13250 }, { "epoch": 1.8365650969529086, "grad_norm": 2.4445765018463135, "learning_rate": 1.5618764790496963e-05, "loss": 0.219, "step": 13260 }, { "epoch": 1.837950138504155, "grad_norm": 4.571364879608154, "learning_rate": 1.5600204166860006e-05, "loss": 0.2162, "step": 13270 }, { "epoch": 1.8393351800554016, "grad_norm": 2.4418375492095947, "learning_rate": 1.5581643543223054e-05, "loss": 0.2431, "step": 13280 }, { "epoch": 1.8407202216066483, "grad_norm": 2.472238302230835, "learning_rate": 1.5563082919586097e-05, "loss": 0.2483, "step": 13290 }, { "epoch": 1.8421052631578947, "grad_norm": 2.308302879333496, "learning_rate": 1.5544522295949145e-05, "loss": 0.2642, "step": 13300 }, { "epoch": 1.8434903047091413, "grad_norm": 1.9199141263961792, "learning_rate": 1.5525961672312192e-05, "loss": 0.2178, "step": 13310 }, { "epoch": 1.844875346260388, "grad_norm": 1.79179847240448, "learning_rate": 1.5507401048675236e-05, "loss": 0.2351, "step": 13320 }, { "epoch": 1.8462603878116344, "grad_norm": 2.497844934463501, "learning_rate": 1.5488840425038283e-05, "loss": 0.2773, "step": 13330 }, { "epoch": 1.8476454293628808, "grad_norm": 2.2991247177124023, "learning_rate": 1.547027980140133e-05, "loss": 0.2314, "step": 13340 }, { "epoch": 1.8490304709141274, "grad_norm": 1.5995428562164307, "learning_rate": 1.5451719177764374e-05, "loss": 0.2704, "step": 13350 }, { "epoch": 1.850415512465374, "grad_norm": 3.168255567550659, "learning_rate": 1.543315855412742e-05, "loss": 0.2568, "step": 13360 }, { "epoch": 1.8518005540166205, "grad_norm": 2.057743787765503, "learning_rate": 1.5414597930490465e-05, "loss": 0.2159, "step": 13370 }, { "epoch": 1.8531855955678669, "grad_norm": 1.9587434530258179, "learning_rate": 1.5396037306853512e-05, "loss": 0.2616, "step": 13380 }, { "epoch": 1.8545706371191135, "grad_norm": 1.7825109958648682, "learning_rate": 1.5377476683216556e-05, "loss": 0.2419, "step": 13390 }, { "epoch": 1.8559556786703602, "grad_norm": 2.3962173461914062, "learning_rate": 1.5358916059579603e-05, "loss": 0.2711, "step": 13400 }, { "epoch": 1.8573407202216066, "grad_norm": 1.998607873916626, "learning_rate": 1.534035543594265e-05, "loss": 0.2384, "step": 13410 }, { "epoch": 1.8587257617728532, "grad_norm": 1.8341917991638184, "learning_rate": 1.5321794812305694e-05, "loss": 0.2455, "step": 13420 }, { "epoch": 1.8601108033240998, "grad_norm": 2.9478933811187744, "learning_rate": 1.530323418866874e-05, "loss": 0.2249, "step": 13430 }, { "epoch": 1.8614958448753463, "grad_norm": 2.184619426727295, "learning_rate": 1.528652962739548e-05, "loss": 0.2616, "step": 13440 }, { "epoch": 1.8628808864265927, "grad_norm": 1.2912342548370361, "learning_rate": 1.5267969003758528e-05, "loss": 0.2492, "step": 13450 }, { "epoch": 1.8642659279778393, "grad_norm": 1.6502461433410645, "learning_rate": 1.5249408380121573e-05, "loss": 0.2248, "step": 13460 }, { "epoch": 1.865650969529086, "grad_norm": 2.327885389328003, "learning_rate": 1.5230847756484619e-05, "loss": 0.2457, "step": 13470 }, { "epoch": 1.8670360110803323, "grad_norm": 2.1829628944396973, "learning_rate": 1.5212287132847664e-05, "loss": 0.2727, "step": 13480 }, { "epoch": 1.868421052631579, "grad_norm": 2.1626899242401123, "learning_rate": 1.5193726509210712e-05, "loss": 0.2715, "step": 13490 }, { "epoch": 1.8698060941828256, "grad_norm": 1.8352174758911133, "learning_rate": 1.5175165885573757e-05, "loss": 0.2519, "step": 13500 }, { "epoch": 1.871191135734072, "grad_norm": 2.3800196647644043, "learning_rate": 1.5156605261936803e-05, "loss": 0.2763, "step": 13510 }, { "epoch": 1.8725761772853184, "grad_norm": 3.2877066135406494, "learning_rate": 1.513804463829985e-05, "loss": 0.2352, "step": 13520 }, { "epoch": 1.873961218836565, "grad_norm": 2.6012349128723145, "learning_rate": 1.5119484014662894e-05, "loss": 0.2523, "step": 13530 }, { "epoch": 1.8753462603878117, "grad_norm": 3.118762493133545, "learning_rate": 1.5100923391025939e-05, "loss": 0.2247, "step": 13540 }, { "epoch": 1.8767313019390581, "grad_norm": 1.825679898262024, "learning_rate": 1.5082362767388985e-05, "loss": 0.2371, "step": 13550 }, { "epoch": 1.8781163434903048, "grad_norm": 1.7097175121307373, "learning_rate": 1.5063802143752032e-05, "loss": 0.2948, "step": 13560 }, { "epoch": 1.8795013850415514, "grad_norm": 2.4472036361694336, "learning_rate": 1.5045241520115077e-05, "loss": 0.2461, "step": 13570 }, { "epoch": 1.8808864265927978, "grad_norm": 2.5957345962524414, "learning_rate": 1.5026680896478123e-05, "loss": 0.2742, "step": 13580 }, { "epoch": 1.8822714681440442, "grad_norm": 1.9023525714874268, "learning_rate": 1.5008120272841168e-05, "loss": 0.2592, "step": 13590 }, { "epoch": 1.8836565096952909, "grad_norm": 2.0497195720672607, "learning_rate": 1.4989559649204215e-05, "loss": 0.2265, "step": 13600 }, { "epoch": 1.8850415512465375, "grad_norm": 1.9706491231918335, "learning_rate": 1.4970999025567261e-05, "loss": 0.2663, "step": 13610 }, { "epoch": 1.886426592797784, "grad_norm": 2.4385011196136475, "learning_rate": 1.4952438401930306e-05, "loss": 0.2172, "step": 13620 }, { "epoch": 1.8878116343490303, "grad_norm": 2.258354663848877, "learning_rate": 1.493387777829335e-05, "loss": 0.2382, "step": 13630 }, { "epoch": 1.889196675900277, "grad_norm": 2.848677635192871, "learning_rate": 1.4915317154656398e-05, "loss": 0.2219, "step": 13640 }, { "epoch": 1.8905817174515236, "grad_norm": 2.522754430770874, "learning_rate": 1.4896756531019443e-05, "loss": 0.2295, "step": 13650 }, { "epoch": 1.89196675900277, "grad_norm": 2.012890100479126, "learning_rate": 1.4878195907382489e-05, "loss": 0.2387, "step": 13660 }, { "epoch": 1.8933518005540166, "grad_norm": 2.1994481086730957, "learning_rate": 1.4859635283745536e-05, "loss": 0.2494, "step": 13670 }, { "epoch": 1.8947368421052633, "grad_norm": 1.7102662324905396, "learning_rate": 1.4841074660108581e-05, "loss": 0.2619, "step": 13680 }, { "epoch": 1.8961218836565097, "grad_norm": 2.5355281829833984, "learning_rate": 1.4822514036471627e-05, "loss": 0.2366, "step": 13690 }, { "epoch": 1.897506925207756, "grad_norm": 1.9577605724334717, "learning_rate": 1.4803953412834672e-05, "loss": 0.2639, "step": 13700 }, { "epoch": 1.8988919667590027, "grad_norm": 2.3210580348968506, "learning_rate": 1.478539278919772e-05, "loss": 0.2418, "step": 13710 }, { "epoch": 1.9002770083102494, "grad_norm": 1.7135709524154663, "learning_rate": 1.4766832165560765e-05, "loss": 0.2498, "step": 13720 }, { "epoch": 1.9016620498614958, "grad_norm": 1.5019444227218628, "learning_rate": 1.4750127604287506e-05, "loss": 0.2527, "step": 13730 }, { "epoch": 1.9030470914127424, "grad_norm": 2.4784412384033203, "learning_rate": 1.4733423043014245e-05, "loss": 0.2757, "step": 13740 }, { "epoch": 1.904432132963989, "grad_norm": 2.923137664794922, "learning_rate": 1.4714862419377291e-05, "loss": 0.224, "step": 13750 }, { "epoch": 1.9058171745152355, "grad_norm": 2.0658411979675293, "learning_rate": 1.4696301795740338e-05, "loss": 0.218, "step": 13760 }, { "epoch": 1.9072022160664819, "grad_norm": 4.4282612800598145, "learning_rate": 1.4677741172103384e-05, "loss": 0.2321, "step": 13770 }, { "epoch": 1.9085872576177285, "grad_norm": 2.6436312198638916, "learning_rate": 1.465918054846643e-05, "loss": 0.2189, "step": 13780 }, { "epoch": 1.9099722991689752, "grad_norm": 1.3336904048919678, "learning_rate": 1.4640619924829476e-05, "loss": 0.2594, "step": 13790 }, { "epoch": 1.9113573407202216, "grad_norm": 1.4303447008132935, "learning_rate": 1.4622059301192522e-05, "loss": 0.2072, "step": 13800 }, { "epoch": 1.912742382271468, "grad_norm": 2.0474507808685303, "learning_rate": 1.4603498677555567e-05, "loss": 0.2438, "step": 13810 }, { "epoch": 1.9141274238227148, "grad_norm": 2.0859858989715576, "learning_rate": 1.4584938053918613e-05, "loss": 0.2408, "step": 13820 }, { "epoch": 1.9155124653739612, "grad_norm": 1.888232946395874, "learning_rate": 1.456637743028166e-05, "loss": 0.2586, "step": 13830 }, { "epoch": 1.9168975069252077, "grad_norm": 1.5036896467208862, "learning_rate": 1.4547816806644704e-05, "loss": 0.2402, "step": 13840 }, { "epoch": 1.9182825484764543, "grad_norm": 3.7954695224761963, "learning_rate": 1.452925618300775e-05, "loss": 0.2273, "step": 13850 }, { "epoch": 1.919667590027701, "grad_norm": 7.413731098175049, "learning_rate": 1.4510695559370795e-05, "loss": 0.2082, "step": 13860 }, { "epoch": 1.9210526315789473, "grad_norm": 2.4324305057525635, "learning_rate": 1.4492134935733842e-05, "loss": 0.2318, "step": 13870 }, { "epoch": 1.9224376731301938, "grad_norm": 2.1324281692504883, "learning_rate": 1.4473574312096888e-05, "loss": 0.2511, "step": 13880 }, { "epoch": 1.9238227146814404, "grad_norm": 1.7724601030349731, "learning_rate": 1.4455013688459933e-05, "loss": 0.2458, "step": 13890 }, { "epoch": 1.925207756232687, "grad_norm": 2.6785428524017334, "learning_rate": 1.4436453064822979e-05, "loss": 0.2472, "step": 13900 }, { "epoch": 1.9265927977839334, "grad_norm": 2.5979104042053223, "learning_rate": 1.4417892441186026e-05, "loss": 0.2446, "step": 13910 }, { "epoch": 1.92797783933518, "grad_norm": 1.9766836166381836, "learning_rate": 1.4399331817549071e-05, "loss": 0.2529, "step": 13920 }, { "epoch": 1.9293628808864267, "grad_norm": 1.5978502035140991, "learning_rate": 1.4380771193912117e-05, "loss": 0.2544, "step": 13930 }, { "epoch": 1.9307479224376731, "grad_norm": 2.787135362625122, "learning_rate": 1.4362210570275164e-05, "loss": 0.2549, "step": 13940 }, { "epoch": 1.9321329639889195, "grad_norm": 1.7493928670883179, "learning_rate": 1.4343649946638208e-05, "loss": 0.2343, "step": 13950 }, { "epoch": 1.9335180055401662, "grad_norm": 4.298009872436523, "learning_rate": 1.4325089323001253e-05, "loss": 0.2137, "step": 13960 }, { "epoch": 1.9349030470914128, "grad_norm": 1.7205559015274048, "learning_rate": 1.4306528699364299e-05, "loss": 0.212, "step": 13970 }, { "epoch": 1.9362880886426592, "grad_norm": 2.38380765914917, "learning_rate": 1.4287968075727346e-05, "loss": 0.2249, "step": 13980 }, { "epoch": 1.9376731301939059, "grad_norm": 1.6681180000305176, "learning_rate": 1.4269407452090392e-05, "loss": 0.2049, "step": 13990 }, { "epoch": 1.9390581717451525, "grad_norm": 1.9246506690979004, "learning_rate": 1.4250846828453437e-05, "loss": 0.2088, "step": 14000 }, { "epoch": 1.940443213296399, "grad_norm": 1.7347540855407715, "learning_rate": 1.4232286204816483e-05, "loss": 0.2386, "step": 14010 }, { "epoch": 1.9418282548476453, "grad_norm": 2.201300621032715, "learning_rate": 1.421372558117953e-05, "loss": 0.2532, "step": 14020 }, { "epoch": 1.943213296398892, "grad_norm": 1.9885168075561523, "learning_rate": 1.4195164957542575e-05, "loss": 0.2625, "step": 14030 }, { "epoch": 1.9445983379501386, "grad_norm": 2.3285436630249023, "learning_rate": 1.417660433390562e-05, "loss": 0.3034, "step": 14040 }, { "epoch": 1.945983379501385, "grad_norm": 2.0906248092651367, "learning_rate": 1.4158043710268665e-05, "loss": 0.2216, "step": 14050 }, { "epoch": 1.9473684210526314, "grad_norm": 4.676970958709717, "learning_rate": 1.4139483086631712e-05, "loss": 0.2359, "step": 14060 }, { "epoch": 1.9487534626038783, "grad_norm": 1.8214576244354248, "learning_rate": 1.4120922462994757e-05, "loss": 0.2811, "step": 14070 }, { "epoch": 1.9501385041551247, "grad_norm": 2.022029161453247, "learning_rate": 1.4102361839357803e-05, "loss": 0.2421, "step": 14080 }, { "epoch": 1.951523545706371, "grad_norm": 1.716650366783142, "learning_rate": 1.408380121572085e-05, "loss": 0.2361, "step": 14090 }, { "epoch": 1.9529085872576177, "grad_norm": 5.157698631286621, "learning_rate": 1.4065240592083896e-05, "loss": 0.2323, "step": 14100 }, { "epoch": 1.9542936288088644, "grad_norm": 2.0606625080108643, "learning_rate": 1.4046679968446941e-05, "loss": 0.2289, "step": 14110 }, { "epoch": 1.9556786703601108, "grad_norm": 1.8860923051834106, "learning_rate": 1.4028119344809987e-05, "loss": 0.263, "step": 14120 }, { "epoch": 1.9570637119113572, "grad_norm": 1.9369615316390991, "learning_rate": 1.4009558721173034e-05, "loss": 0.2373, "step": 14130 }, { "epoch": 1.9584487534626038, "grad_norm": 2.072380781173706, "learning_rate": 1.399099809753608e-05, "loss": 0.2269, "step": 14140 }, { "epoch": 1.9598337950138505, "grad_norm": 1.8450301885604858, "learning_rate": 1.3972437473899123e-05, "loss": 0.2124, "step": 14150 }, { "epoch": 1.9612188365650969, "grad_norm": 2.4834184646606445, "learning_rate": 1.3953876850262169e-05, "loss": 0.2375, "step": 14160 }, { "epoch": 1.9626038781163435, "grad_norm": 2.2427444458007812, "learning_rate": 1.3935316226625216e-05, "loss": 0.2936, "step": 14170 }, { "epoch": 1.9639889196675901, "grad_norm": 2.3271632194519043, "learning_rate": 1.3916755602988261e-05, "loss": 0.2835, "step": 14180 }, { "epoch": 1.9653739612188366, "grad_norm": 1.8166530132293701, "learning_rate": 1.3898194979351307e-05, "loss": 0.2144, "step": 14190 }, { "epoch": 1.966759002770083, "grad_norm": 2.4603664875030518, "learning_rate": 1.3879634355714354e-05, "loss": 0.23, "step": 14200 }, { "epoch": 1.9681440443213296, "grad_norm": 1.665629506111145, "learning_rate": 1.38610737320774e-05, "loss": 0.2648, "step": 14210 }, { "epoch": 1.9695290858725762, "grad_norm": 2.7033531665802, "learning_rate": 1.3842513108440445e-05, "loss": 0.2634, "step": 14220 }, { "epoch": 1.9709141274238227, "grad_norm": 2.542165517807007, "learning_rate": 1.382395248480349e-05, "loss": 0.2244, "step": 14230 }, { "epoch": 1.9722991689750693, "grad_norm": 2.3917179107666016, "learning_rate": 1.3805391861166538e-05, "loss": 0.2586, "step": 14240 }, { "epoch": 1.973684210526316, "grad_norm": 3.0972111225128174, "learning_rate": 1.3786831237529583e-05, "loss": 0.2225, "step": 14250 }, { "epoch": 1.9750692520775623, "grad_norm": 1.795332431793213, "learning_rate": 1.3768270613892627e-05, "loss": 0.2266, "step": 14260 }, { "epoch": 1.9764542936288088, "grad_norm": 3.2170612812042236, "learning_rate": 1.3749709990255673e-05, "loss": 0.245, "step": 14270 }, { "epoch": 1.9778393351800554, "grad_norm": 2.6222457885742188, "learning_rate": 1.373114936661872e-05, "loss": 0.2335, "step": 14280 }, { "epoch": 1.979224376731302, "grad_norm": 1.6004931926727295, "learning_rate": 1.3712588742981765e-05, "loss": 0.2409, "step": 14290 }, { "epoch": 1.9806094182825484, "grad_norm": 2.342514753341675, "learning_rate": 1.369402811934481e-05, "loss": 0.2446, "step": 14300 }, { "epoch": 1.9819944598337949, "grad_norm": 1.6561387777328491, "learning_rate": 1.3675467495707856e-05, "loss": 0.2394, "step": 14310 }, { "epoch": 1.9833795013850417, "grad_norm": 3.7785511016845703, "learning_rate": 1.3656906872070903e-05, "loss": 0.2463, "step": 14320 }, { "epoch": 1.9847645429362881, "grad_norm": 1.7614632844924927, "learning_rate": 1.3638346248433949e-05, "loss": 0.2448, "step": 14330 }, { "epoch": 1.9861495844875345, "grad_norm": 1.4015835523605347, "learning_rate": 1.3619785624796994e-05, "loss": 0.2339, "step": 14340 }, { "epoch": 1.9875346260387812, "grad_norm": 1.7252857685089111, "learning_rate": 1.3601225001160042e-05, "loss": 0.2338, "step": 14350 }, { "epoch": 1.9889196675900278, "grad_norm": 1.8692224025726318, "learning_rate": 1.3582664377523085e-05, "loss": 0.2114, "step": 14360 }, { "epoch": 1.9903047091412742, "grad_norm": 1.5987521409988403, "learning_rate": 1.3564103753886131e-05, "loss": 0.228, "step": 14370 }, { "epoch": 1.9916897506925206, "grad_norm": 2.2164413928985596, "learning_rate": 1.3545543130249176e-05, "loss": 0.2284, "step": 14380 }, { "epoch": 1.9930747922437673, "grad_norm": 2.2460415363311768, "learning_rate": 1.3526982506612224e-05, "loss": 0.2315, "step": 14390 }, { "epoch": 1.994459833795014, "grad_norm": 2.1210713386535645, "learning_rate": 1.350842188297527e-05, "loss": 0.2825, "step": 14400 }, { "epoch": 1.9958448753462603, "grad_norm": 2.85060977935791, "learning_rate": 1.3489861259338315e-05, "loss": 0.2321, "step": 14410 }, { "epoch": 1.997229916897507, "grad_norm": 1.8384126424789429, "learning_rate": 1.347130063570136e-05, "loss": 0.2197, "step": 14420 }, { "epoch": 1.9986149584487536, "grad_norm": 1.542345404624939, "learning_rate": 1.3452740012064407e-05, "loss": 0.2353, "step": 14430 }, { "epoch": 2.0, "grad_norm": 2.0727384090423584, "learning_rate": 1.3434179388427453e-05, "loss": 0.2297, "step": 14440 }, { "epoch": 2.0013850415512464, "grad_norm": 1.7574317455291748, "learning_rate": 1.3415618764790498e-05, "loss": 0.1947, "step": 14450 }, { "epoch": 2.0027700831024933, "grad_norm": 1.6832847595214844, "learning_rate": 1.3397058141153542e-05, "loss": 0.1959, "step": 14460 }, { "epoch": 2.0041551246537397, "grad_norm": 1.722998857498169, "learning_rate": 1.337849751751659e-05, "loss": 0.2054, "step": 14470 }, { "epoch": 2.005540166204986, "grad_norm": 2.045290231704712, "learning_rate": 1.3359936893879635e-05, "loss": 0.1766, "step": 14480 }, { "epoch": 2.0069252077562325, "grad_norm": 1.2657092809677124, "learning_rate": 1.334137627024268e-05, "loss": 0.1511, "step": 14490 }, { "epoch": 2.0083102493074794, "grad_norm": 1.6155598163604736, "learning_rate": 1.3322815646605728e-05, "loss": 0.1591, "step": 14500 }, { "epoch": 2.009695290858726, "grad_norm": 3.8692007064819336, "learning_rate": 1.3304255022968773e-05, "loss": 0.208, "step": 14510 }, { "epoch": 2.011080332409972, "grad_norm": 2.4857311248779297, "learning_rate": 1.3285694399331819e-05, "loss": 0.1712, "step": 14520 }, { "epoch": 2.012465373961219, "grad_norm": 2.3056626319885254, "learning_rate": 1.3267133775694864e-05, "loss": 0.1958, "step": 14530 }, { "epoch": 2.0138504155124655, "grad_norm": 2.1693966388702393, "learning_rate": 1.3248573152057911e-05, "loss": 0.1853, "step": 14540 }, { "epoch": 2.015235457063712, "grad_norm": 4.402191162109375, "learning_rate": 1.3230012528420957e-05, "loss": 0.2022, "step": 14550 }, { "epoch": 2.0166204986149583, "grad_norm": 1.5139687061309814, "learning_rate": 1.3211451904784002e-05, "loss": 0.2085, "step": 14560 }, { "epoch": 2.018005540166205, "grad_norm": 2.144901990890503, "learning_rate": 1.3192891281147046e-05, "loss": 0.2012, "step": 14570 }, { "epoch": 2.0193905817174516, "grad_norm": 3.4196605682373047, "learning_rate": 1.3174330657510093e-05, "loss": 0.215, "step": 14580 }, { "epoch": 2.020775623268698, "grad_norm": 1.5005124807357788, "learning_rate": 1.3155770033873139e-05, "loss": 0.1661, "step": 14590 }, { "epoch": 2.0221606648199444, "grad_norm": 1.7353737354278564, "learning_rate": 1.3137209410236184e-05, "loss": 0.1908, "step": 14600 }, { "epoch": 2.0235457063711912, "grad_norm": 2.1631791591644287, "learning_rate": 1.311864878659923e-05, "loss": 0.1913, "step": 14610 }, { "epoch": 2.0249307479224377, "grad_norm": 2.5623526573181152, "learning_rate": 1.3100088162962277e-05, "loss": 0.1889, "step": 14620 }, { "epoch": 2.026315789473684, "grad_norm": 1.8216127157211304, "learning_rate": 1.3081527539325323e-05, "loss": 0.2089, "step": 14630 }, { "epoch": 2.027700831024931, "grad_norm": 2.1311192512512207, "learning_rate": 1.3062966915688368e-05, "loss": 0.1938, "step": 14640 }, { "epoch": 2.0290858725761773, "grad_norm": 2.107114315032959, "learning_rate": 1.3044406292051415e-05, "loss": 0.1792, "step": 14650 }, { "epoch": 2.0304709141274238, "grad_norm": 2.94160532951355, "learning_rate": 1.302584566841446e-05, "loss": 0.1977, "step": 14660 }, { "epoch": 2.03185595567867, "grad_norm": 2.283118724822998, "learning_rate": 1.3007285044777505e-05, "loss": 0.1676, "step": 14670 }, { "epoch": 2.033240997229917, "grad_norm": 3.027130365371704, "learning_rate": 1.298872442114055e-05, "loss": 0.1662, "step": 14680 }, { "epoch": 2.0346260387811634, "grad_norm": 1.537743091583252, "learning_rate": 1.2970163797503597e-05, "loss": 0.1946, "step": 14690 }, { "epoch": 2.03601108033241, "grad_norm": 1.9911495447158813, "learning_rate": 1.2951603173866643e-05, "loss": 0.1961, "step": 14700 }, { "epoch": 2.0373961218836567, "grad_norm": 2.0069620609283447, "learning_rate": 1.2933042550229688e-05, "loss": 0.217, "step": 14710 }, { "epoch": 2.038781163434903, "grad_norm": 1.7512022256851196, "learning_rate": 1.2914481926592734e-05, "loss": 0.1887, "step": 14720 }, { "epoch": 2.0401662049861495, "grad_norm": 2.5431530475616455, "learning_rate": 1.2895921302955781e-05, "loss": 0.18, "step": 14730 }, { "epoch": 2.041551246537396, "grad_norm": 3.3946635723114014, "learning_rate": 1.2877360679318827e-05, "loss": 0.2296, "step": 14740 }, { "epoch": 2.042936288088643, "grad_norm": 3.5196239948272705, "learning_rate": 1.2858800055681872e-05, "loss": 0.1829, "step": 14750 }, { "epoch": 2.044321329639889, "grad_norm": 2.5390875339508057, "learning_rate": 1.284023943204492e-05, "loss": 0.1804, "step": 14760 }, { "epoch": 2.0457063711911356, "grad_norm": 2.016263723373413, "learning_rate": 1.2821678808407965e-05, "loss": 0.1856, "step": 14770 }, { "epoch": 2.0470914127423825, "grad_norm": 2.1532645225524902, "learning_rate": 1.2803118184771009e-05, "loss": 0.1719, "step": 14780 }, { "epoch": 2.048476454293629, "grad_norm": 3.4412078857421875, "learning_rate": 1.2784557561134054e-05, "loss": 0.1574, "step": 14790 }, { "epoch": 2.0498614958448753, "grad_norm": 1.2067993879318237, "learning_rate": 1.2765996937497101e-05, "loss": 0.1661, "step": 14800 }, { "epoch": 2.0512465373961217, "grad_norm": 1.7038114070892334, "learning_rate": 1.2747436313860147e-05, "loss": 0.175, "step": 14810 }, { "epoch": 2.0526315789473686, "grad_norm": 1.9622423648834229, "learning_rate": 1.2728875690223192e-05, "loss": 0.192, "step": 14820 }, { "epoch": 2.054016620498615, "grad_norm": 1.806934118270874, "learning_rate": 1.2710315066586238e-05, "loss": 0.1814, "step": 14830 }, { "epoch": 2.0554016620498614, "grad_norm": 2.2102761268615723, "learning_rate": 1.2691754442949285e-05, "loss": 0.2402, "step": 14840 }, { "epoch": 2.056786703601108, "grad_norm": 1.8251749277114868, "learning_rate": 1.267319381931233e-05, "loss": 0.1764, "step": 14850 }, { "epoch": 2.0581717451523547, "grad_norm": 1.8050317764282227, "learning_rate": 1.2654633195675376e-05, "loss": 0.1671, "step": 14860 }, { "epoch": 2.059556786703601, "grad_norm": 1.6401031017303467, "learning_rate": 1.2636072572038423e-05, "loss": 0.1983, "step": 14870 }, { "epoch": 2.0609418282548475, "grad_norm": 2.121783494949341, "learning_rate": 1.2617511948401467e-05, "loss": 0.1669, "step": 14880 }, { "epoch": 2.0623268698060944, "grad_norm": 1.7833797931671143, "learning_rate": 1.2598951324764512e-05, "loss": 0.1812, "step": 14890 }, { "epoch": 2.063711911357341, "grad_norm": 1.8813719749450684, "learning_rate": 1.2580390701127558e-05, "loss": 0.1865, "step": 14900 }, { "epoch": 2.065096952908587, "grad_norm": 1.707542061805725, "learning_rate": 1.2561830077490605e-05, "loss": 0.2263, "step": 14910 }, { "epoch": 2.0664819944598336, "grad_norm": 2.113223075866699, "learning_rate": 1.254326945385365e-05, "loss": 0.2098, "step": 14920 }, { "epoch": 2.0678670360110805, "grad_norm": 1.8287922143936157, "learning_rate": 1.2524708830216696e-05, "loss": 0.1732, "step": 14930 }, { "epoch": 2.069252077562327, "grad_norm": 1.389681339263916, "learning_rate": 1.2506148206579742e-05, "loss": 0.1981, "step": 14940 }, { "epoch": 2.0706371191135733, "grad_norm": 2.0864617824554443, "learning_rate": 1.2487587582942789e-05, "loss": 0.1812, "step": 14950 }, { "epoch": 2.07202216066482, "grad_norm": 1.4142706394195557, "learning_rate": 1.2469026959305834e-05, "loss": 0.1618, "step": 14960 }, { "epoch": 2.0734072022160666, "grad_norm": 2.4035658836364746, "learning_rate": 1.245046633566888e-05, "loss": 0.2163, "step": 14970 }, { "epoch": 2.074792243767313, "grad_norm": 3.36538028717041, "learning_rate": 1.2431905712031924e-05, "loss": 0.1659, "step": 14980 }, { "epoch": 2.0761772853185594, "grad_norm": 4.023022651672363, "learning_rate": 1.2413345088394971e-05, "loss": 0.1942, "step": 14990 }, { "epoch": 2.0775623268698062, "grad_norm": 3.7977612018585205, "learning_rate": 1.2394784464758016e-05, "loss": 0.1708, "step": 15000 }, { "epoch": 2.0782548476454292, "eval_loss": 0.2703794538974762, "eval_runtime": 1428.8894, "eval_samples_per_second": 6.383, "eval_steps_per_second": 0.798, "step": 15005 }, { "epoch": 2.0789473684210527, "grad_norm": 1.9419952630996704, "learning_rate": 1.2376223841121062e-05, "loss": 0.1724, "step": 15010 }, { "epoch": 2.080332409972299, "grad_norm": 1.5353316068649292, "learning_rate": 1.2357663217484107e-05, "loss": 0.1838, "step": 15020 }, { "epoch": 2.081717451523546, "grad_norm": 1.7038683891296387, "learning_rate": 1.2339102593847155e-05, "loss": 0.1659, "step": 15030 }, { "epoch": 2.0831024930747923, "grad_norm": 1.4729688167572021, "learning_rate": 1.23205419702102e-05, "loss": 0.1753, "step": 15040 }, { "epoch": 2.0844875346260388, "grad_norm": 1.7278324365615845, "learning_rate": 1.2301981346573246e-05, "loss": 0.1754, "step": 15050 }, { "epoch": 2.085872576177285, "grad_norm": 1.8487584590911865, "learning_rate": 1.2283420722936293e-05, "loss": 0.1642, "step": 15060 }, { "epoch": 2.087257617728532, "grad_norm": 5.234111785888672, "learning_rate": 1.2264860099299338e-05, "loss": 0.2079, "step": 15070 }, { "epoch": 2.0886426592797784, "grad_norm": 2.1265268325805664, "learning_rate": 1.2246299475662384e-05, "loss": 0.1834, "step": 15080 }, { "epoch": 2.090027700831025, "grad_norm": 2.6006407737731934, "learning_rate": 1.2227738852025428e-05, "loss": 0.1703, "step": 15090 }, { "epoch": 2.0914127423822713, "grad_norm": 1.6003869771957397, "learning_rate": 1.2209178228388475e-05, "loss": 0.1881, "step": 15100 }, { "epoch": 2.092797783933518, "grad_norm": 1.5022588968276978, "learning_rate": 1.219061760475152e-05, "loss": 0.1654, "step": 15110 }, { "epoch": 2.0941828254847645, "grad_norm": 1.9021573066711426, "learning_rate": 1.2172056981114566e-05, "loss": 0.1449, "step": 15120 }, { "epoch": 2.095567867036011, "grad_norm": 2.192850112915039, "learning_rate": 1.2153496357477611e-05, "loss": 0.148, "step": 15130 }, { "epoch": 2.096952908587258, "grad_norm": 2.480792284011841, "learning_rate": 1.2134935733840659e-05, "loss": 0.2227, "step": 15140 }, { "epoch": 2.098337950138504, "grad_norm": 1.867102026939392, "learning_rate": 1.2116375110203704e-05, "loss": 0.2002, "step": 15150 }, { "epoch": 2.0997229916897506, "grad_norm": 2.851119041442871, "learning_rate": 1.209781448656675e-05, "loss": 0.1931, "step": 15160 }, { "epoch": 2.101108033240997, "grad_norm": 2.097811698913574, "learning_rate": 1.2079253862929797e-05, "loss": 0.1952, "step": 15170 }, { "epoch": 2.102493074792244, "grad_norm": 1.8236339092254639, "learning_rate": 1.2060693239292842e-05, "loss": 0.1726, "step": 15180 }, { "epoch": 2.1038781163434903, "grad_norm": 2.647979259490967, "learning_rate": 1.2042132615655886e-05, "loss": 0.18, "step": 15190 }, { "epoch": 2.1052631578947367, "grad_norm": 2.0955395698547363, "learning_rate": 1.2023571992018932e-05, "loss": 0.2037, "step": 15200 }, { "epoch": 2.1066481994459836, "grad_norm": 2.227377414703369, "learning_rate": 1.2005011368381979e-05, "loss": 0.2027, "step": 15210 }, { "epoch": 2.10803324099723, "grad_norm": 1.8922470808029175, "learning_rate": 1.1986450744745024e-05, "loss": 0.2151, "step": 15220 }, { "epoch": 2.1094182825484764, "grad_norm": 1.543190360069275, "learning_rate": 1.196789012110807e-05, "loss": 0.1816, "step": 15230 }, { "epoch": 2.110803324099723, "grad_norm": 1.440812110900879, "learning_rate": 1.1949329497471115e-05, "loss": 0.1457, "step": 15240 }, { "epoch": 2.1121883656509697, "grad_norm": 1.3764451742172241, "learning_rate": 1.1930768873834163e-05, "loss": 0.2024, "step": 15250 }, { "epoch": 2.113573407202216, "grad_norm": 1.9469165802001953, "learning_rate": 1.1912208250197208e-05, "loss": 0.1938, "step": 15260 }, { "epoch": 2.1149584487534625, "grad_norm": 2.06449818611145, "learning_rate": 1.1893647626560254e-05, "loss": 0.1746, "step": 15270 }, { "epoch": 2.1163434903047094, "grad_norm": 2.1765999794006348, "learning_rate": 1.18750870029233e-05, "loss": 0.1746, "step": 15280 }, { "epoch": 2.1177285318559558, "grad_norm": 1.6806100606918335, "learning_rate": 1.1856526379286346e-05, "loss": 0.146, "step": 15290 }, { "epoch": 2.119113573407202, "grad_norm": 1.8645600080490112, "learning_rate": 1.183796575564939e-05, "loss": 0.1805, "step": 15300 }, { "epoch": 2.1204986149584486, "grad_norm": 1.9853228330612183, "learning_rate": 1.1819405132012436e-05, "loss": 0.1881, "step": 15310 }, { "epoch": 2.1218836565096955, "grad_norm": 2.438157796859741, "learning_rate": 1.1800844508375481e-05, "loss": 0.1605, "step": 15320 }, { "epoch": 2.123268698060942, "grad_norm": 1.974244475364685, "learning_rate": 1.1782283884738528e-05, "loss": 0.2067, "step": 15330 }, { "epoch": 2.1246537396121883, "grad_norm": 2.455505847930908, "learning_rate": 1.1763723261101574e-05, "loss": 0.2004, "step": 15340 }, { "epoch": 2.1260387811634347, "grad_norm": 1.9168258905410767, "learning_rate": 1.174516263746462e-05, "loss": 0.2129, "step": 15350 }, { "epoch": 2.1274238227146816, "grad_norm": 1.6309889554977417, "learning_rate": 1.1726602013827666e-05, "loss": 0.1975, "step": 15360 }, { "epoch": 2.128808864265928, "grad_norm": 3.4665346145629883, "learning_rate": 1.1708041390190712e-05, "loss": 0.2004, "step": 15370 }, { "epoch": 2.1301939058171744, "grad_norm": 1.8809685707092285, "learning_rate": 1.1689480766553757e-05, "loss": 0.1998, "step": 15380 }, { "epoch": 2.1315789473684212, "grad_norm": 2.7498867511749268, "learning_rate": 1.1670920142916805e-05, "loss": 0.1567, "step": 15390 }, { "epoch": 2.1329639889196677, "grad_norm": 2.1605582237243652, "learning_rate": 1.1652359519279848e-05, "loss": 0.2108, "step": 15400 }, { "epoch": 2.134349030470914, "grad_norm": 1.3878306150436401, "learning_rate": 1.1633798895642894e-05, "loss": 0.1653, "step": 15410 }, { "epoch": 2.1357340720221605, "grad_norm": 1.9377297163009644, "learning_rate": 1.161523827200594e-05, "loss": 0.1893, "step": 15420 }, { "epoch": 2.1371191135734073, "grad_norm": 1.818156361579895, "learning_rate": 1.1596677648368985e-05, "loss": 0.1557, "step": 15430 }, { "epoch": 2.1385041551246537, "grad_norm": 2.047635555267334, "learning_rate": 1.1578117024732032e-05, "loss": 0.2056, "step": 15440 }, { "epoch": 2.1398891966759, "grad_norm": 1.8148167133331299, "learning_rate": 1.1559556401095078e-05, "loss": 0.204, "step": 15450 }, { "epoch": 2.141274238227147, "grad_norm": 1.9170321226119995, "learning_rate": 1.1540995777458123e-05, "loss": 0.1645, "step": 15460 }, { "epoch": 2.1426592797783934, "grad_norm": 2.07987117767334, "learning_rate": 1.152243515382117e-05, "loss": 0.1828, "step": 15470 }, { "epoch": 2.14404432132964, "grad_norm": 1.5561580657958984, "learning_rate": 1.1503874530184216e-05, "loss": 0.1782, "step": 15480 }, { "epoch": 2.1454293628808863, "grad_norm": 1.8059701919555664, "learning_rate": 1.1485313906547261e-05, "loss": 0.209, "step": 15490 }, { "epoch": 2.146814404432133, "grad_norm": 1.7873286008834839, "learning_rate": 1.1466753282910305e-05, "loss": 0.1892, "step": 15500 }, { "epoch": 2.1481994459833795, "grad_norm": 2.170375347137451, "learning_rate": 1.1448192659273352e-05, "loss": 0.1714, "step": 15510 }, { "epoch": 2.149584487534626, "grad_norm": 1.824212908744812, "learning_rate": 1.1429632035636398e-05, "loss": 0.1742, "step": 15520 }, { "epoch": 2.150969529085873, "grad_norm": 3.1203479766845703, "learning_rate": 1.1411071411999443e-05, "loss": 0.2095, "step": 15530 }, { "epoch": 2.152354570637119, "grad_norm": 2.1490399837493896, "learning_rate": 1.1392510788362489e-05, "loss": 0.1543, "step": 15540 }, { "epoch": 2.1537396121883656, "grad_norm": 2.8778560161590576, "learning_rate": 1.1373950164725536e-05, "loss": 0.19, "step": 15550 }, { "epoch": 2.155124653739612, "grad_norm": 2.4876325130462646, "learning_rate": 1.1355389541088582e-05, "loss": 0.2228, "step": 15560 }, { "epoch": 2.156509695290859, "grad_norm": 1.8188594579696655, "learning_rate": 1.1336828917451627e-05, "loss": 0.1879, "step": 15570 }, { "epoch": 2.1578947368421053, "grad_norm": 2.6546733379364014, "learning_rate": 1.1318268293814674e-05, "loss": 0.1667, "step": 15580 }, { "epoch": 2.1592797783933517, "grad_norm": 1.4731812477111816, "learning_rate": 1.129970767017772e-05, "loss": 0.1701, "step": 15590 }, { "epoch": 2.160664819944598, "grad_norm": 1.7102782726287842, "learning_rate": 1.1281147046540765e-05, "loss": 0.1939, "step": 15600 }, { "epoch": 2.162049861495845, "grad_norm": 1.9763939380645752, "learning_rate": 1.126258642290381e-05, "loss": 0.1957, "step": 15610 }, { "epoch": 2.1634349030470914, "grad_norm": 2.430368661880493, "learning_rate": 1.1244025799266856e-05, "loss": 0.2212, "step": 15620 }, { "epoch": 2.164819944598338, "grad_norm": 2.098733425140381, "learning_rate": 1.1225465175629902e-05, "loss": 0.1502, "step": 15630 }, { "epoch": 2.1662049861495847, "grad_norm": 1.9996839761734009, "learning_rate": 1.1206904551992947e-05, "loss": 0.2054, "step": 15640 }, { "epoch": 2.167590027700831, "grad_norm": 2.2129204273223877, "learning_rate": 1.1188343928355993e-05, "loss": 0.1923, "step": 15650 }, { "epoch": 2.1689750692520775, "grad_norm": 2.9811301231384277, "learning_rate": 1.116978330471904e-05, "loss": 0.1995, "step": 15660 }, { "epoch": 2.170360110803324, "grad_norm": 1.7352439165115356, "learning_rate": 1.1151222681082086e-05, "loss": 0.1815, "step": 15670 }, { "epoch": 2.1717451523545708, "grad_norm": 2.3152153491973877, "learning_rate": 1.1132662057445131e-05, "loss": 0.2229, "step": 15680 }, { "epoch": 2.173130193905817, "grad_norm": 2.018587350845337, "learning_rate": 1.1114101433808178e-05, "loss": 0.1789, "step": 15690 }, { "epoch": 2.1745152354570636, "grad_norm": 2.000908613204956, "learning_rate": 1.1095540810171224e-05, "loss": 0.1999, "step": 15700 }, { "epoch": 2.1759002770083105, "grad_norm": 2.4677839279174805, "learning_rate": 1.1076980186534268e-05, "loss": 0.1765, "step": 15710 }, { "epoch": 2.177285318559557, "grad_norm": 1.403136134147644, "learning_rate": 1.1058419562897313e-05, "loss": 0.1749, "step": 15720 }, { "epoch": 2.1786703601108033, "grad_norm": 1.935598611831665, "learning_rate": 1.1039858939260359e-05, "loss": 0.1506, "step": 15730 }, { "epoch": 2.1800554016620497, "grad_norm": 2.197678565979004, "learning_rate": 1.1021298315623406e-05, "loss": 0.1765, "step": 15740 }, { "epoch": 2.1814404432132966, "grad_norm": 1.7003947496414185, "learning_rate": 1.1002737691986451e-05, "loss": 0.1747, "step": 15750 }, { "epoch": 2.182825484764543, "grad_norm": 1.7991876602172852, "learning_rate": 1.0984177068349497e-05, "loss": 0.178, "step": 15760 }, { "epoch": 2.1842105263157894, "grad_norm": 3.0078437328338623, "learning_rate": 1.0965616444712544e-05, "loss": 0.164, "step": 15770 }, { "epoch": 2.1855955678670362, "grad_norm": 1.8674951791763306, "learning_rate": 1.094705582107559e-05, "loss": 0.2086, "step": 15780 }, { "epoch": 2.1869806094182827, "grad_norm": 1.6032930612564087, "learning_rate": 1.0928495197438635e-05, "loss": 0.151, "step": 15790 }, { "epoch": 2.188365650969529, "grad_norm": 2.2988972663879395, "learning_rate": 1.0909934573801682e-05, "loss": 0.1991, "step": 15800 }, { "epoch": 2.1897506925207755, "grad_norm": 2.0465915203094482, "learning_rate": 1.0891373950164726e-05, "loss": 0.1855, "step": 15810 }, { "epoch": 2.1911357340720223, "grad_norm": 1.6645904779434204, "learning_rate": 1.0874669388891467e-05, "loss": 0.1907, "step": 15820 }, { "epoch": 2.1925207756232687, "grad_norm": 1.4574722051620483, "learning_rate": 1.0856108765254514e-05, "loss": 0.1808, "step": 15830 }, { "epoch": 2.193905817174515, "grad_norm": 3.1846442222595215, "learning_rate": 1.083754814161756e-05, "loss": 0.2199, "step": 15840 }, { "epoch": 2.1952908587257616, "grad_norm": 2.510326385498047, "learning_rate": 1.0818987517980605e-05, "loss": 0.1804, "step": 15850 }, { "epoch": 2.1966759002770084, "grad_norm": 1.9207181930541992, "learning_rate": 1.0800426894343653e-05, "loss": 0.1802, "step": 15860 }, { "epoch": 2.198060941828255, "grad_norm": 1.5893478393554688, "learning_rate": 1.0781866270706696e-05, "loss": 0.1877, "step": 15870 }, { "epoch": 2.1994459833795013, "grad_norm": 2.8297579288482666, "learning_rate": 1.0763305647069742e-05, "loss": 0.1832, "step": 15880 }, { "epoch": 2.200831024930748, "grad_norm": 2.0689809322357178, "learning_rate": 1.0744745023432787e-05, "loss": 0.1696, "step": 15890 }, { "epoch": 2.2022160664819945, "grad_norm": 2.195650577545166, "learning_rate": 1.0726184399795833e-05, "loss": 0.1962, "step": 15900 }, { "epoch": 2.203601108033241, "grad_norm": 2.4047739505767822, "learning_rate": 1.070762377615888e-05, "loss": 0.1616, "step": 15910 }, { "epoch": 2.2049861495844874, "grad_norm": 2.298598289489746, "learning_rate": 1.0689063152521926e-05, "loss": 0.1956, "step": 15920 }, { "epoch": 2.206371191135734, "grad_norm": 2.514930009841919, "learning_rate": 1.0670502528884971e-05, "loss": 0.192, "step": 15930 }, { "epoch": 2.2077562326869806, "grad_norm": 2.1210293769836426, "learning_rate": 1.0651941905248018e-05, "loss": 0.1501, "step": 15940 }, { "epoch": 2.209141274238227, "grad_norm": 1.428801417350769, "learning_rate": 1.0633381281611064e-05, "loss": 0.1822, "step": 15950 }, { "epoch": 2.2105263157894735, "grad_norm": 1.8047534227371216, "learning_rate": 1.061482065797411e-05, "loss": 0.1616, "step": 15960 }, { "epoch": 2.2119113573407203, "grad_norm": 1.586500644683838, "learning_rate": 1.0596260034337153e-05, "loss": 0.1567, "step": 15970 }, { "epoch": 2.2132963988919667, "grad_norm": 1.429770588874817, "learning_rate": 1.05776994107002e-05, "loss": 0.1873, "step": 15980 }, { "epoch": 2.214681440443213, "grad_norm": 3.8475210666656494, "learning_rate": 1.0559138787063246e-05, "loss": 0.1759, "step": 15990 }, { "epoch": 2.21606648199446, "grad_norm": 2.3511464595794678, "learning_rate": 1.0540578163426291e-05, "loss": 0.1683, "step": 16000 }, { "epoch": 2.2174515235457064, "grad_norm": 2.5054216384887695, "learning_rate": 1.0522017539789337e-05, "loss": 0.1761, "step": 16010 }, { "epoch": 2.218836565096953, "grad_norm": 1.5130455493927002, "learning_rate": 1.0503456916152384e-05, "loss": 0.1769, "step": 16020 }, { "epoch": 2.2202216066481997, "grad_norm": 2.0365564823150635, "learning_rate": 1.048489629251543e-05, "loss": 0.1929, "step": 16030 }, { "epoch": 2.221606648199446, "grad_norm": 2.6106247901916504, "learning_rate": 1.0466335668878475e-05, "loss": 0.1926, "step": 16040 }, { "epoch": 2.2229916897506925, "grad_norm": 1.5493723154067993, "learning_rate": 1.0447775045241522e-05, "loss": 0.177, "step": 16050 }, { "epoch": 2.224376731301939, "grad_norm": 2.3234448432922363, "learning_rate": 1.0429214421604568e-05, "loss": 0.2127, "step": 16060 }, { "epoch": 2.2257617728531858, "grad_norm": 2.54538893699646, "learning_rate": 1.0410653797967613e-05, "loss": 0.1878, "step": 16070 }, { "epoch": 2.227146814404432, "grad_norm": 1.641268014907837, "learning_rate": 1.0392093174330657e-05, "loss": 0.1522, "step": 16080 }, { "epoch": 2.2285318559556786, "grad_norm": 2.013471841812134, "learning_rate": 1.0373532550693704e-05, "loss": 0.1901, "step": 16090 }, { "epoch": 2.229916897506925, "grad_norm": 2.284034490585327, "learning_rate": 1.035497192705675e-05, "loss": 0.1998, "step": 16100 }, { "epoch": 2.231301939058172, "grad_norm": 1.8952127695083618, "learning_rate": 1.0336411303419795e-05, "loss": 0.1582, "step": 16110 }, { "epoch": 2.2326869806094183, "grad_norm": 2.218730926513672, "learning_rate": 1.031785067978284e-05, "loss": 0.1871, "step": 16120 }, { "epoch": 2.2340720221606647, "grad_norm": 1.4592667818069458, "learning_rate": 1.0299290056145888e-05, "loss": 0.1613, "step": 16130 }, { "epoch": 2.2354570637119116, "grad_norm": 2.5558269023895264, "learning_rate": 1.0280729432508934e-05, "loss": 0.1907, "step": 16140 }, { "epoch": 2.236842105263158, "grad_norm": 2.4344468116760254, "learning_rate": 1.0262168808871979e-05, "loss": 0.1599, "step": 16150 }, { "epoch": 2.2382271468144044, "grad_norm": 1.4910821914672852, "learning_rate": 1.0243608185235026e-05, "loss": 0.1576, "step": 16160 }, { "epoch": 2.239612188365651, "grad_norm": 2.0156784057617188, "learning_rate": 1.0225047561598072e-05, "loss": 0.191, "step": 16170 }, { "epoch": 2.2409972299168976, "grad_norm": 3.575056314468384, "learning_rate": 1.0206486937961116e-05, "loss": 0.2032, "step": 16180 }, { "epoch": 2.242382271468144, "grad_norm": 2.0502638816833496, "learning_rate": 1.0187926314324161e-05, "loss": 0.1915, "step": 16190 }, { "epoch": 2.2437673130193905, "grad_norm": 1.5162267684936523, "learning_rate": 1.0169365690687207e-05, "loss": 0.1873, "step": 16200 }, { "epoch": 2.245152354570637, "grad_norm": 2.2193379402160645, "learning_rate": 1.0150805067050254e-05, "loss": 0.1924, "step": 16210 }, { "epoch": 2.2465373961218837, "grad_norm": 2.0222744941711426, "learning_rate": 1.01322444434133e-05, "loss": 0.1615, "step": 16220 }, { "epoch": 2.24792243767313, "grad_norm": 1.7618517875671387, "learning_rate": 1.0113683819776345e-05, "loss": 0.1622, "step": 16230 }, { "epoch": 2.2493074792243766, "grad_norm": 1.4167286157608032, "learning_rate": 1.0095123196139392e-05, "loss": 0.1759, "step": 16240 }, { "epoch": 2.2506925207756234, "grad_norm": 1.6987353563308716, "learning_rate": 1.0076562572502437e-05, "loss": 0.1722, "step": 16250 }, { "epoch": 2.25207756232687, "grad_norm": 1.8066134452819824, "learning_rate": 1.0058001948865483e-05, "loss": 0.1547, "step": 16260 }, { "epoch": 2.2534626038781163, "grad_norm": 1.7205597162246704, "learning_rate": 1.003944132522853e-05, "loss": 0.1733, "step": 16270 }, { "epoch": 2.254847645429363, "grad_norm": 1.9057214260101318, "learning_rate": 1.0020880701591574e-05, "loss": 0.1933, "step": 16280 }, { "epoch": 2.2562326869806095, "grad_norm": 1.8358714580535889, "learning_rate": 1.000232007795462e-05, "loss": 0.1775, "step": 16290 }, { "epoch": 2.257617728531856, "grad_norm": 3.4662563800811768, "learning_rate": 9.983759454317667e-06, "loss": 0.1976, "step": 16300 }, { "epoch": 2.2590027700831024, "grad_norm": 2.5907483100891113, "learning_rate": 9.96519883068071e-06, "loss": 0.1674, "step": 16310 }, { "epoch": 2.260387811634349, "grad_norm": 2.281738042831421, "learning_rate": 9.946638207043758e-06, "loss": 0.1941, "step": 16320 }, { "epoch": 2.2617728531855956, "grad_norm": 2.349745035171509, "learning_rate": 9.928077583406803e-06, "loss": 0.1961, "step": 16330 }, { "epoch": 2.263157894736842, "grad_norm": 1.396642804145813, "learning_rate": 9.909516959769849e-06, "loss": 0.1475, "step": 16340 }, { "epoch": 2.2645429362880884, "grad_norm": 1.8703711032867432, "learning_rate": 9.892812398496592e-06, "loss": 0.1619, "step": 16350 }, { "epoch": 2.2659279778393353, "grad_norm": 2.279014825820923, "learning_rate": 9.874251774859635e-06, "loss": 0.1788, "step": 16360 }, { "epoch": 2.2673130193905817, "grad_norm": 1.6533187627792358, "learning_rate": 9.855691151222681e-06, "loss": 0.1947, "step": 16370 }, { "epoch": 2.268698060941828, "grad_norm": 1.9308675527572632, "learning_rate": 9.837130527585728e-06, "loss": 0.1818, "step": 16380 }, { "epoch": 2.270083102493075, "grad_norm": 2.1606502532958984, "learning_rate": 9.818569903948774e-06, "loss": 0.1866, "step": 16390 }, { "epoch": 2.2714681440443214, "grad_norm": 2.4330480098724365, "learning_rate": 9.800009280311819e-06, "loss": 0.1811, "step": 16400 }, { "epoch": 2.272853185595568, "grad_norm": 2.237210512161255, "learning_rate": 9.781448656674865e-06, "loss": 0.16, "step": 16410 }, { "epoch": 2.2742382271468142, "grad_norm": 5.955811977386475, "learning_rate": 9.76288803303791e-06, "loss": 0.1702, "step": 16420 }, { "epoch": 2.275623268698061, "grad_norm": 1.7061985731124878, "learning_rate": 9.744327409400957e-06, "loss": 0.1758, "step": 16430 }, { "epoch": 2.2770083102493075, "grad_norm": 3.15761399269104, "learning_rate": 9.725766785764003e-06, "loss": 0.1877, "step": 16440 }, { "epoch": 2.278393351800554, "grad_norm": 1.7660126686096191, "learning_rate": 9.707206162127048e-06, "loss": 0.1702, "step": 16450 }, { "epoch": 2.2797783933518003, "grad_norm": 2.0085220336914062, "learning_rate": 9.688645538490094e-06, "loss": 0.176, "step": 16460 }, { "epoch": 2.281163434903047, "grad_norm": 2.159771203994751, "learning_rate": 9.67008491485314e-06, "loss": 0.1999, "step": 16470 }, { "epoch": 2.2825484764542936, "grad_norm": 2.3023860454559326, "learning_rate": 9.651524291216185e-06, "loss": 0.1575, "step": 16480 }, { "epoch": 2.28393351800554, "grad_norm": 1.5170584917068481, "learning_rate": 9.632963667579232e-06, "loss": 0.1908, "step": 16490 }, { "epoch": 2.285318559556787, "grad_norm": 2.998106002807617, "learning_rate": 9.614403043942278e-06, "loss": 0.1824, "step": 16500 }, { "epoch": 2.2867036011080333, "grad_norm": 2.6079273223876953, "learning_rate": 9.595842420305323e-06, "loss": 0.1978, "step": 16510 }, { "epoch": 2.2880886426592797, "grad_norm": 2.0469090938568115, "learning_rate": 9.577281796668369e-06, "loss": 0.181, "step": 16520 }, { "epoch": 2.2894736842105265, "grad_norm": 1.8900119066238403, "learning_rate": 9.558721173031414e-06, "loss": 0.1852, "step": 16530 }, { "epoch": 2.290858725761773, "grad_norm": 1.3825252056121826, "learning_rate": 9.540160549394461e-06, "loss": 0.2009, "step": 16540 }, { "epoch": 2.2922437673130194, "grad_norm": 2.3342790603637695, "learning_rate": 9.521599925757507e-06, "loss": 0.208, "step": 16550 }, { "epoch": 2.293628808864266, "grad_norm": 3.8766579627990723, "learning_rate": 9.503039302120552e-06, "loss": 0.1838, "step": 16560 }, { "epoch": 2.2950138504155126, "grad_norm": 2.5892879962921143, "learning_rate": 9.484478678483598e-06, "loss": 0.1826, "step": 16570 }, { "epoch": 2.296398891966759, "grad_norm": 1.9301683902740479, "learning_rate": 9.465918054846643e-06, "loss": 0.1768, "step": 16580 }, { "epoch": 2.2977839335180055, "grad_norm": 1.7350757122039795, "learning_rate": 9.447357431209689e-06, "loss": 0.1769, "step": 16590 }, { "epoch": 2.299168975069252, "grad_norm": 2.5931735038757324, "learning_rate": 9.428796807572736e-06, "loss": 0.1934, "step": 16600 }, { "epoch": 2.3005540166204987, "grad_norm": 1.9637055397033691, "learning_rate": 9.410236183935781e-06, "loss": 0.1603, "step": 16610 }, { "epoch": 2.301939058171745, "grad_norm": 2.185250997543335, "learning_rate": 9.391675560298827e-06, "loss": 0.1741, "step": 16620 }, { "epoch": 2.3033240997229916, "grad_norm": 2.1665894985198975, "learning_rate": 9.373114936661872e-06, "loss": 0.1598, "step": 16630 }, { "epoch": 2.3047091412742384, "grad_norm": 2.147671937942505, "learning_rate": 9.354554313024918e-06, "loss": 0.1916, "step": 16640 }, { "epoch": 2.306094182825485, "grad_norm": 2.141869068145752, "learning_rate": 9.335993689387965e-06, "loss": 0.16, "step": 16650 }, { "epoch": 2.3074792243767313, "grad_norm": 2.271667957305908, "learning_rate": 9.31743306575101e-06, "loss": 0.1961, "step": 16660 }, { "epoch": 2.3088642659279777, "grad_norm": 2.0384247303009033, "learning_rate": 9.298872442114054e-06, "loss": 0.1948, "step": 16670 }, { "epoch": 2.3102493074792245, "grad_norm": 1.5816787481307983, "learning_rate": 9.280311818477102e-06, "loss": 0.1827, "step": 16680 }, { "epoch": 2.311634349030471, "grad_norm": 1.7221791744232178, "learning_rate": 9.261751194840147e-06, "loss": 0.1798, "step": 16690 }, { "epoch": 2.3130193905817173, "grad_norm": 1.7499399185180664, "learning_rate": 9.243190571203193e-06, "loss": 0.1946, "step": 16700 }, { "epoch": 2.3144044321329638, "grad_norm": 2.3477981090545654, "learning_rate": 9.22462994756624e-06, "loss": 0.1628, "step": 16710 }, { "epoch": 2.3157894736842106, "grad_norm": 2.5354487895965576, "learning_rate": 9.206069323929285e-06, "loss": 0.1748, "step": 16720 }, { "epoch": 2.317174515235457, "grad_norm": 2.410682201385498, "learning_rate": 9.187508700292331e-06, "loss": 0.1858, "step": 16730 }, { "epoch": 2.3185595567867034, "grad_norm": 2.190929651260376, "learning_rate": 9.168948076655376e-06, "loss": 0.1744, "step": 16740 }, { "epoch": 2.3199445983379503, "grad_norm": 2.5295703411102295, "learning_rate": 9.150387453018422e-06, "loss": 0.1855, "step": 16750 }, { "epoch": 2.3213296398891967, "grad_norm": 1.313277244567871, "learning_rate": 9.133682891745163e-06, "loss": 0.1862, "step": 16760 }, { "epoch": 2.322714681440443, "grad_norm": 2.6606435775756836, "learning_rate": 9.11512226810821e-06, "loss": 0.1474, "step": 16770 }, { "epoch": 2.32409972299169, "grad_norm": 2.0530002117156982, "learning_rate": 9.096561644471254e-06, "loss": 0.1808, "step": 16780 }, { "epoch": 2.3254847645429364, "grad_norm": 1.8997056484222412, "learning_rate": 9.078001020834301e-06, "loss": 0.1928, "step": 16790 }, { "epoch": 2.326869806094183, "grad_norm": 2.367507219314575, "learning_rate": 9.059440397197347e-06, "loss": 0.1783, "step": 16800 }, { "epoch": 2.3282548476454292, "grad_norm": 1.7851054668426514, "learning_rate": 9.040879773560392e-06, "loss": 0.1978, "step": 16810 }, { "epoch": 2.329639889196676, "grad_norm": 1.6374269723892212, "learning_rate": 9.022319149923438e-06, "loss": 0.1696, "step": 16820 }, { "epoch": 2.3310249307479225, "grad_norm": 1.65655517578125, "learning_rate": 9.003758526286483e-06, "loss": 0.1911, "step": 16830 }, { "epoch": 2.332409972299169, "grad_norm": 1.8260128498077393, "learning_rate": 8.985197902649529e-06, "loss": 0.1993, "step": 16840 }, { "epoch": 2.3337950138504153, "grad_norm": 1.7871785163879395, "learning_rate": 8.966637279012576e-06, "loss": 0.1881, "step": 16850 }, { "epoch": 2.335180055401662, "grad_norm": 2.481699228286743, "learning_rate": 8.948076655375621e-06, "loss": 0.1825, "step": 16860 }, { "epoch": 2.3365650969529086, "grad_norm": 1.8377317190170288, "learning_rate": 8.929516031738667e-06, "loss": 0.1687, "step": 16870 }, { "epoch": 2.337950138504155, "grad_norm": 1.7560187578201294, "learning_rate": 8.910955408101712e-06, "loss": 0.1762, "step": 16880 }, { "epoch": 2.339335180055402, "grad_norm": 2.3741097450256348, "learning_rate": 8.892394784464758e-06, "loss": 0.1885, "step": 16890 }, { "epoch": 2.3407202216066483, "grad_norm": 2.3812923431396484, "learning_rate": 8.873834160827805e-06, "loss": 0.203, "step": 16900 }, { "epoch": 2.3421052631578947, "grad_norm": 1.9939693212509155, "learning_rate": 8.85527353719085e-06, "loss": 0.2002, "step": 16910 }, { "epoch": 2.343490304709141, "grad_norm": 2.0856332778930664, "learning_rate": 8.836712913553896e-06, "loss": 0.2113, "step": 16920 }, { "epoch": 2.344875346260388, "grad_norm": 1.7179794311523438, "learning_rate": 8.818152289916942e-06, "loss": 0.1582, "step": 16930 }, { "epoch": 2.3462603878116344, "grad_norm": 2.0003726482391357, "learning_rate": 8.799591666279987e-06, "loss": 0.2333, "step": 16940 }, { "epoch": 2.347645429362881, "grad_norm": 1.780743956565857, "learning_rate": 8.781031042643033e-06, "loss": 0.1748, "step": 16950 }, { "epoch": 2.349030470914127, "grad_norm": 3.4279558658599854, "learning_rate": 8.76247041900608e-06, "loss": 0.1969, "step": 16960 }, { "epoch": 2.350415512465374, "grad_norm": 1.674522042274475, "learning_rate": 8.743909795369125e-06, "loss": 0.1952, "step": 16970 }, { "epoch": 2.3518005540166205, "grad_norm": 1.9975336790084839, "learning_rate": 8.725349171732171e-06, "loss": 0.1838, "step": 16980 }, { "epoch": 2.353185595567867, "grad_norm": 2.15136456489563, "learning_rate": 8.706788548095216e-06, "loss": 0.1835, "step": 16990 }, { "epoch": 2.3545706371191137, "grad_norm": 1.4658695459365845, "learning_rate": 8.688227924458262e-06, "loss": 0.1541, "step": 17000 }, { "epoch": 2.35595567867036, "grad_norm": 2.032360792160034, "learning_rate": 8.669667300821309e-06, "loss": 0.1694, "step": 17010 }, { "epoch": 2.3573407202216066, "grad_norm": 2.0457422733306885, "learning_rate": 8.651106677184355e-06, "loss": 0.1971, "step": 17020 }, { "epoch": 2.3587257617728534, "grad_norm": 1.985001564025879, "learning_rate": 8.6325460535474e-06, "loss": 0.1649, "step": 17030 }, { "epoch": 2.3601108033241, "grad_norm": 1.8881748914718628, "learning_rate": 8.613985429910446e-06, "loss": 0.1711, "step": 17040 }, { "epoch": 2.3614958448753463, "grad_norm": 2.045001745223999, "learning_rate": 8.595424806273491e-06, "loss": 0.1908, "step": 17050 }, { "epoch": 2.3628808864265927, "grad_norm": 2.6451058387756348, "learning_rate": 8.576864182636537e-06, "loss": 0.1899, "step": 17060 }, { "epoch": 2.3642659279778395, "grad_norm": 2.57863187789917, "learning_rate": 8.558303558999584e-06, "loss": 0.1604, "step": 17070 }, { "epoch": 2.365650969529086, "grad_norm": 4.005101203918457, "learning_rate": 8.53974293536263e-06, "loss": 0.1464, "step": 17080 }, { "epoch": 2.3670360110803323, "grad_norm": 2.0819427967071533, "learning_rate": 8.521182311725675e-06, "loss": 0.1723, "step": 17090 }, { "epoch": 2.3684210526315788, "grad_norm": 2.7164533138275146, "learning_rate": 8.50262168808872e-06, "loss": 0.2051, "step": 17100 }, { "epoch": 2.3698060941828256, "grad_norm": 1.8587703704833984, "learning_rate": 8.484061064451766e-06, "loss": 0.1622, "step": 17110 }, { "epoch": 2.371191135734072, "grad_norm": 1.5107576847076416, "learning_rate": 8.465500440814813e-06, "loss": 0.176, "step": 17120 }, { "epoch": 2.3725761772853184, "grad_norm": 3.7486839294433594, "learning_rate": 8.446939817177859e-06, "loss": 0.1531, "step": 17130 }, { "epoch": 2.3739612188365653, "grad_norm": 2.4600539207458496, "learning_rate": 8.428379193540902e-06, "loss": 0.2029, "step": 17140 }, { "epoch": 2.3753462603878117, "grad_norm": 1.6657919883728027, "learning_rate": 8.40981856990395e-06, "loss": 0.163, "step": 17150 }, { "epoch": 2.376731301939058, "grad_norm": 1.488139271736145, "learning_rate": 8.391257946266995e-06, "loss": 0.1732, "step": 17160 }, { "epoch": 2.3781163434903045, "grad_norm": 1.7949665784835815, "learning_rate": 8.37269732263004e-06, "loss": 0.2044, "step": 17170 }, { "epoch": 2.3795013850415514, "grad_norm": 4.909599781036377, "learning_rate": 8.354136698993088e-06, "loss": 0.1786, "step": 17180 }, { "epoch": 2.380886426592798, "grad_norm": 1.3685661554336548, "learning_rate": 8.335576075356132e-06, "loss": 0.1948, "step": 17190 }, { "epoch": 2.3822714681440442, "grad_norm": 1.8317877054214478, "learning_rate": 8.317015451719179e-06, "loss": 0.1766, "step": 17200 }, { "epoch": 2.3836565096952906, "grad_norm": 2.7056987285614014, "learning_rate": 8.298454828082224e-06, "loss": 0.166, "step": 17210 }, { "epoch": 2.3850415512465375, "grad_norm": 2.0532264709472656, "learning_rate": 8.27989420444527e-06, "loss": 0.1553, "step": 17220 }, { "epoch": 2.386426592797784, "grad_norm": 1.6754969358444214, "learning_rate": 8.261333580808315e-06, "loss": 0.1811, "step": 17230 }, { "epoch": 2.3878116343490303, "grad_norm": 1.6388112306594849, "learning_rate": 8.242772957171363e-06, "loss": 0.201, "step": 17240 }, { "epoch": 2.389196675900277, "grad_norm": 1.9280608892440796, "learning_rate": 8.224212333534406e-06, "loss": 0.1714, "step": 17250 }, { "epoch": 2.3905817174515236, "grad_norm": 1.5879631042480469, "learning_rate": 8.205651709897454e-06, "loss": 0.1941, "step": 17260 }, { "epoch": 2.39196675900277, "grad_norm": 1.5063045024871826, "learning_rate": 8.187091086260499e-06, "loss": 0.1816, "step": 17270 }, { "epoch": 2.393351800554017, "grad_norm": 1.6192227602005005, "learning_rate": 8.168530462623545e-06, "loss": 0.1703, "step": 17280 }, { "epoch": 2.3947368421052633, "grad_norm": 1.5527336597442627, "learning_rate": 8.149969838986592e-06, "loss": 0.1936, "step": 17290 }, { "epoch": 2.3961218836565097, "grad_norm": 3.69242000579834, "learning_rate": 8.131409215349636e-06, "loss": 0.1867, "step": 17300 }, { "epoch": 2.397506925207756, "grad_norm": 2.34364914894104, "learning_rate": 8.112848591712683e-06, "loss": 0.1905, "step": 17310 }, { "epoch": 2.398891966759003, "grad_norm": 1.7522242069244385, "learning_rate": 8.094287968075728e-06, "loss": 0.1719, "step": 17320 }, { "epoch": 2.4002770083102494, "grad_norm": 1.8389091491699219, "learning_rate": 8.075727344438774e-06, "loss": 0.157, "step": 17330 }, { "epoch": 2.401662049861496, "grad_norm": 1.5576094388961792, "learning_rate": 8.05716672080182e-06, "loss": 0.1705, "step": 17340 }, { "epoch": 2.403047091412742, "grad_norm": 3.3434033393859863, "learning_rate": 8.038606097164865e-06, "loss": 0.2162, "step": 17350 }, { "epoch": 2.404432132963989, "grad_norm": 1.8596067428588867, "learning_rate": 8.02004547352791e-06, "loss": 0.1546, "step": 17360 }, { "epoch": 2.4058171745152355, "grad_norm": 1.6991015672683716, "learning_rate": 8.001484849890957e-06, "loss": 0.1709, "step": 17370 }, { "epoch": 2.407202216066482, "grad_norm": 1.4487606287002563, "learning_rate": 7.982924226254003e-06, "loss": 0.1858, "step": 17380 }, { "epoch": 2.4085872576177287, "grad_norm": 1.7766512632369995, "learning_rate": 7.964363602617048e-06, "loss": 0.1825, "step": 17390 }, { "epoch": 2.409972299168975, "grad_norm": 2.881904363632202, "learning_rate": 7.945802978980094e-06, "loss": 0.2057, "step": 17400 }, { "epoch": 2.4113573407202216, "grad_norm": 1.7567830085754395, "learning_rate": 7.92724235534314e-06, "loss": 0.1728, "step": 17410 }, { "epoch": 2.412742382271468, "grad_norm": 2.2497482299804688, "learning_rate": 7.908681731706187e-06, "loss": 0.1695, "step": 17420 }, { "epoch": 2.414127423822715, "grad_norm": 1.412612795829773, "learning_rate": 7.890121108069232e-06, "loss": 0.1562, "step": 17430 }, { "epoch": 2.4155124653739612, "grad_norm": 2.2560765743255615, "learning_rate": 7.871560484432278e-06, "loss": 0.1865, "step": 17440 }, { "epoch": 2.4168975069252077, "grad_norm": 4.349898338317871, "learning_rate": 7.852999860795323e-06, "loss": 0.1842, "step": 17450 }, { "epoch": 2.418282548476454, "grad_norm": 1.4243489503860474, "learning_rate": 7.834439237158369e-06, "loss": 0.1711, "step": 17460 }, { "epoch": 2.419667590027701, "grad_norm": 1.888654351234436, "learning_rate": 7.815878613521414e-06, "loss": 0.2135, "step": 17470 }, { "epoch": 2.4210526315789473, "grad_norm": 2.302077293395996, "learning_rate": 7.797317989884461e-06, "loss": 0.1969, "step": 17480 }, { "epoch": 2.4224376731301938, "grad_norm": 1.739283800125122, "learning_rate": 7.778757366247507e-06, "loss": 0.1409, "step": 17490 }, { "epoch": 2.4238227146814406, "grad_norm": 2.2265708446502686, "learning_rate": 7.760196742610552e-06, "loss": 0.1555, "step": 17500 }, { "epoch": 2.425207756232687, "grad_norm": 3.4545814990997314, "learning_rate": 7.741636118973598e-06, "loss": 0.1922, "step": 17510 }, { "epoch": 2.4265927977839334, "grad_norm": 1.799225091934204, "learning_rate": 7.723075495336643e-06, "loss": 0.1541, "step": 17520 }, { "epoch": 2.4279778393351803, "grad_norm": 3.538924217224121, "learning_rate": 7.70451487169969e-06, "loss": 0.1595, "step": 17530 }, { "epoch": 2.4293628808864267, "grad_norm": 3.1783182621002197, "learning_rate": 7.685954248062736e-06, "loss": 0.1915, "step": 17540 }, { "epoch": 2.430747922437673, "grad_norm": 1.569459080696106, "learning_rate": 7.667393624425782e-06, "loss": 0.1822, "step": 17550 }, { "epoch": 2.4321329639889195, "grad_norm": 1.8626432418823242, "learning_rate": 7.648833000788827e-06, "loss": 0.1566, "step": 17560 }, { "epoch": 2.4335180055401664, "grad_norm": 1.8595503568649292, "learning_rate": 7.630272377151873e-06, "loss": 0.1844, "step": 17570 }, { "epoch": 2.434903047091413, "grad_norm": 1.743048906326294, "learning_rate": 7.611711753514919e-06, "loss": 0.1704, "step": 17580 }, { "epoch": 2.436288088642659, "grad_norm": 2.213383674621582, "learning_rate": 7.5931511298779645e-06, "loss": 0.1702, "step": 17590 }, { "epoch": 2.4376731301939056, "grad_norm": 1.8055698871612549, "learning_rate": 7.574590506241011e-06, "loss": 0.1555, "step": 17600 }, { "epoch": 2.4390581717451525, "grad_norm": 2.1330082416534424, "learning_rate": 7.5560298826040555e-06, "loss": 0.2097, "step": 17610 }, { "epoch": 2.440443213296399, "grad_norm": 2.3374805450439453, "learning_rate": 7.537469258967102e-06, "loss": 0.1959, "step": 17620 }, { "epoch": 2.4418282548476453, "grad_norm": 1.8412171602249146, "learning_rate": 7.518908635330147e-06, "loss": 0.1711, "step": 17630 }, { "epoch": 2.443213296398892, "grad_norm": 1.8535511493682861, "learning_rate": 7.500348011693194e-06, "loss": 0.1739, "step": 17640 }, { "epoch": 2.4445983379501386, "grad_norm": 1.7968270778656006, "learning_rate": 7.48178738805624e-06, "loss": 0.1841, "step": 17650 }, { "epoch": 2.445983379501385, "grad_norm": 2.1843976974487305, "learning_rate": 7.463226764419285e-06, "loss": 0.142, "step": 17660 }, { "epoch": 2.4473684210526314, "grad_norm": 1.7341108322143555, "learning_rate": 7.44466614078233e-06, "loss": 0.1807, "step": 17670 }, { "epoch": 2.4487534626038783, "grad_norm": 1.8200232982635498, "learning_rate": 7.426105517145377e-06, "loss": 0.1564, "step": 17680 }, { "epoch": 2.4501385041551247, "grad_norm": 3.129906415939331, "learning_rate": 7.407544893508423e-06, "loss": 0.1846, "step": 17690 }, { "epoch": 2.451523545706371, "grad_norm": 2.155862331390381, "learning_rate": 7.3889842698714685e-06, "loss": 0.1618, "step": 17700 }, { "epoch": 2.4529085872576175, "grad_norm": 1.7917015552520752, "learning_rate": 7.370423646234514e-06, "loss": 0.229, "step": 17710 }, { "epoch": 2.4542936288088644, "grad_norm": 1.904911756515503, "learning_rate": 7.3518630225975595e-06, "loss": 0.1847, "step": 17720 }, { "epoch": 2.455678670360111, "grad_norm": 2.046907901763916, "learning_rate": 7.333302398960606e-06, "loss": 0.1729, "step": 17730 }, { "epoch": 2.457063711911357, "grad_norm": 2.4984397888183594, "learning_rate": 7.314741775323651e-06, "loss": 0.1569, "step": 17740 }, { "epoch": 2.458448753462604, "grad_norm": 1.768541693687439, "learning_rate": 7.296181151686698e-06, "loss": 0.1665, "step": 17750 }, { "epoch": 2.4598337950138505, "grad_norm": 2.0088300704956055, "learning_rate": 7.277620528049743e-06, "loss": 0.1772, "step": 17760 }, { "epoch": 2.461218836565097, "grad_norm": 2.9492692947387695, "learning_rate": 7.259059904412789e-06, "loss": 0.1923, "step": 17770 }, { "epoch": 2.4626038781163437, "grad_norm": 1.7164802551269531, "learning_rate": 7.240499280775834e-06, "loss": 0.1693, "step": 17780 }, { "epoch": 2.46398891966759, "grad_norm": 1.7039817571640015, "learning_rate": 7.2219386571388806e-06, "loss": 0.1357, "step": 17790 }, { "epoch": 2.4653739612188366, "grad_norm": 1.6272010803222656, "learning_rate": 7.203378033501927e-06, "loss": 0.1675, "step": 17800 }, { "epoch": 2.466759002770083, "grad_norm": 2.1668031215667725, "learning_rate": 7.1848174098649724e-06, "loss": 0.2196, "step": 17810 }, { "epoch": 2.46814404432133, "grad_norm": 2.1896896362304688, "learning_rate": 7.166256786228017e-06, "loss": 0.1705, "step": 17820 }, { "epoch": 2.4695290858725762, "grad_norm": 1.9577006101608276, "learning_rate": 7.1476961625910634e-06, "loss": 0.1603, "step": 17830 }, { "epoch": 2.4709141274238227, "grad_norm": 4.705322742462158, "learning_rate": 7.12913553895411e-06, "loss": 0.2003, "step": 17840 }, { "epoch": 2.472299168975069, "grad_norm": 1.6234703063964844, "learning_rate": 7.110574915317155e-06, "loss": 0.2046, "step": 17850 }, { "epoch": 2.473684210526316, "grad_norm": 2.3687870502471924, "learning_rate": 7.092014291680202e-06, "loss": 0.18, "step": 17860 }, { "epoch": 2.4750692520775623, "grad_norm": 1.580241322517395, "learning_rate": 7.073453668043246e-06, "loss": 0.1803, "step": 17870 }, { "epoch": 2.4764542936288088, "grad_norm": 2.288207530975342, "learning_rate": 7.054893044406293e-06, "loss": 0.1567, "step": 17880 }, { "epoch": 2.4778393351800556, "grad_norm": 2.1847081184387207, "learning_rate": 7.036332420769338e-06, "loss": 0.1676, "step": 17890 }, { "epoch": 2.479224376731302, "grad_norm": 2.0806612968444824, "learning_rate": 7.0177717971323845e-06, "loss": 0.1605, "step": 17900 }, { "epoch": 2.4806094182825484, "grad_norm": 1.6858736276626587, "learning_rate": 6.999211173495431e-06, "loss": 0.1498, "step": 17910 }, { "epoch": 2.481994459833795, "grad_norm": 1.8091322183609009, "learning_rate": 6.9806505498584755e-06, "loss": 0.1508, "step": 17920 }, { "epoch": 2.4833795013850417, "grad_norm": 2.947080135345459, "learning_rate": 6.962089926221521e-06, "loss": 0.1519, "step": 17930 }, { "epoch": 2.484764542936288, "grad_norm": 1.562985897064209, "learning_rate": 6.943529302584567e-06, "loss": 0.1828, "step": 17940 }, { "epoch": 2.4861495844875345, "grad_norm": 2.3566386699676514, "learning_rate": 6.924968678947614e-06, "loss": 0.1745, "step": 17950 }, { "epoch": 2.487534626038781, "grad_norm": 2.319905996322632, "learning_rate": 6.906408055310659e-06, "loss": 0.1805, "step": 17960 }, { "epoch": 2.488919667590028, "grad_norm": 4.170083999633789, "learning_rate": 6.887847431673704e-06, "loss": 0.1932, "step": 17970 }, { "epoch": 2.490304709141274, "grad_norm": 3.8577587604522705, "learning_rate": 6.86928680803675e-06, "loss": 0.1939, "step": 17980 }, { "epoch": 2.4916897506925206, "grad_norm": 1.916488528251648, "learning_rate": 6.850726184399797e-06, "loss": 0.1808, "step": 17990 }, { "epoch": 2.4930747922437675, "grad_norm": 2.2478854656219482, "learning_rate": 6.832165560762842e-06, "loss": 0.1912, "step": 18000 } ], "logging_steps": 10, "max_steps": 21660, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.194381944570511e+18, "train_batch_size": 12, "trial_name": null, "trial_params": null }