{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3665, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013642564802182811, "grad_norm": 5.18676233291626, "learning_rate": 7.999880967748602e-05, "loss": 2.6446, "step": 10 }, { "epoch": 0.027285129604365622, "grad_norm": 5.224745273590088, "learning_rate": 7.999469507332807e-05, "loss": 2.5833, "step": 20 }, { "epoch": 0.040927694406548434, "grad_norm": 5.343475341796875, "learning_rate": 7.998764179444167e-05, "loss": 2.3323, "step": 30 }, { "epoch": 0.054570259208731244, "grad_norm": 3.960702657699585, "learning_rate": 7.997765035907784e-05, "loss": 2.2764, "step": 40 }, { "epoch": 0.06821282401091405, "grad_norm": 5.80894136428833, "learning_rate": 7.99661465457664e-05, "loss": 2.3653, "step": 50 }, { "epoch": 0.08185538881309687, "grad_norm": 3.9454762935638428, "learning_rate": 7.995057481402883e-05, "loss": 2.3655, "step": 60 }, { "epoch": 0.09549795361527967, "grad_norm": 4.658463478088379, "learning_rate": 7.993206764937005e-05, "loss": 2.2008, "step": 70 }, { "epoch": 0.10914051841746249, "grad_norm": 7.062836647033691, "learning_rate": 7.99106264116338e-05, "loss": 2.4619, "step": 80 }, { "epoch": 0.12278308321964529, "grad_norm": 5.339938640594482, "learning_rate": 7.988625267624962e-05, "loss": 2.4662, "step": 90 }, { "epoch": 0.1364256480218281, "grad_norm": 6.298858165740967, "learning_rate": 7.9858948234117e-05, "loss": 2.3909, "step": 100 }, { "epoch": 0.15006821282401092, "grad_norm": 5.622934818267822, "learning_rate": 7.98287150914739e-05, "loss": 2.4736, "step": 110 }, { "epoch": 0.16371077762619374, "grad_norm": 3.229802370071411, "learning_rate": 7.979555546974922e-05, "loss": 2.3796, "step": 120 }, { "epoch": 0.17735334242837653, "grad_norm": 5.506136894226074, "learning_rate": 7.975947180539966e-05, "loss": 2.2357, "step": 130 }, { "epoch": 0.19099590723055934, "grad_norm": 5.04240608215332, "learning_rate": 7.972046674973065e-05, "loss": 2.3435, "step": 140 }, { "epoch": 0.20463847203274216, "grad_norm": 6.470039367675781, "learning_rate": 7.967854316870156e-05, "loss": 2.4652, "step": 150 }, { "epoch": 0.21828103683492497, "grad_norm": 3.5564420223236084, "learning_rate": 7.963370414271514e-05, "loss": 2.2549, "step": 160 }, { "epoch": 0.23192360163710776, "grad_norm": 3.998383045196533, "learning_rate": 7.958595296639115e-05, "loss": 2.3102, "step": 170 }, { "epoch": 0.24556616643929058, "grad_norm": 4.615527629852295, "learning_rate": 7.953529314832426e-05, "loss": 2.2444, "step": 180 }, { "epoch": 0.2592087312414734, "grad_norm": 6.488475322723389, "learning_rate": 7.948172841082635e-05, "loss": 2.2928, "step": 190 }, { "epoch": 0.2728512960436562, "grad_norm": 4.134075164794922, "learning_rate": 7.942526268965287e-05, "loss": 2.2113, "step": 200 }, { "epoch": 0.286493860845839, "grad_norm": 5.373613357543945, "learning_rate": 7.936590013371378e-05, "loss": 2.0795, "step": 210 }, { "epoch": 0.30013642564802184, "grad_norm": 5.965483665466309, "learning_rate": 7.93036451047686e-05, "loss": 2.3991, "step": 220 }, { "epoch": 0.31377899045020463, "grad_norm": 4.945356369018555, "learning_rate": 7.923850217710604e-05, "loss": 2.3399, "step": 230 }, { "epoch": 0.3274215552523875, "grad_norm": 5.210105895996094, "learning_rate": 7.917047613720773e-05, "loss": 2.3305, "step": 240 }, { "epoch": 0.34106412005457026, "grad_norm": 3.3301730155944824, "learning_rate": 7.909957198339669e-05, "loss": 2.2404, "step": 250 }, { "epoch": 0.35470668485675305, "grad_norm": 3.685528039932251, "learning_rate": 7.902579492546998e-05, "loss": 2.1853, "step": 260 }, { "epoch": 0.3683492496589359, "grad_norm": 5.30728006362915, "learning_rate": 7.894915038431595e-05, "loss": 2.3177, "step": 270 }, { "epoch": 0.3819918144611187, "grad_norm": 2.644773006439209, "learning_rate": 7.886964399151586e-05, "loss": 2.3621, "step": 280 }, { "epoch": 0.3956343792633015, "grad_norm": 4.921834468841553, "learning_rate": 7.878728158893015e-05, "loss": 2.3687, "step": 290 }, { "epoch": 0.4092769440654843, "grad_norm": 3.9288289546966553, "learning_rate": 7.870206922826921e-05, "loss": 2.2021, "step": 300 }, { "epoch": 0.4229195088676671, "grad_norm": 6.143318176269531, "learning_rate": 7.862294656001264e-05, "loss": 2.0236, "step": 310 }, { "epoch": 0.43656207366984995, "grad_norm": 4.3533806800842285, "learning_rate": 7.853233670108533e-05, "loss": 2.4181, "step": 320 }, { "epoch": 0.45020463847203274, "grad_norm": 3.7804017066955566, "learning_rate": 7.843889561656962e-05, "loss": 2.2359, "step": 330 }, { "epoch": 0.4638472032742155, "grad_norm": 2.7896721363067627, "learning_rate": 7.83426301721999e-05, "loss": 2.2156, "step": 340 }, { "epoch": 0.47748976807639837, "grad_norm": 4.168877601623535, "learning_rate": 7.824354744123498e-05, "loss": 2.256, "step": 350 }, { "epoch": 0.49113233287858116, "grad_norm": 7.003695011138916, "learning_rate": 7.814165470393832e-05, "loss": 2.2627, "step": 360 }, { "epoch": 0.504774897680764, "grad_norm": 9.064947128295898, "learning_rate": 7.803695944704313e-05, "loss": 2.1303, "step": 370 }, { "epoch": 0.5184174624829468, "grad_norm": 6.936222553253174, "learning_rate": 7.79294693632023e-05, "loss": 2.307, "step": 380 }, { "epoch": 0.5320600272851296, "grad_norm": 4.441155433654785, "learning_rate": 7.781919235042309e-05, "loss": 2.2182, "step": 390 }, { "epoch": 0.5457025920873124, "grad_norm": 7.236617088317871, "learning_rate": 7.770613651148688e-05, "loss": 2.2309, "step": 400 }, { "epoch": 0.5593451568894953, "grad_norm": 4.184124946594238, "learning_rate": 7.75903101533538e-05, "loss": 2.0949, "step": 410 }, { "epoch": 0.572987721691678, "grad_norm": 2.9721992015838623, "learning_rate": 7.747172178655232e-05, "loss": 2.0541, "step": 420 }, { "epoch": 0.5866302864938608, "grad_norm": 2.36011004447937, "learning_rate": 7.735038012455398e-05, "loss": 2.0429, "step": 430 }, { "epoch": 0.6002728512960437, "grad_norm": 5.892980575561523, "learning_rate": 7.722629408313309e-05, "loss": 2.1867, "step": 440 }, { "epoch": 0.6139154160982264, "grad_norm": 5.935312747955322, "learning_rate": 7.709947277971168e-05, "loss": 2.2556, "step": 450 }, { "epoch": 0.6275579809004093, "grad_norm": 5.781426906585693, "learning_rate": 7.69699255326896e-05, "loss": 2.3728, "step": 460 }, { "epoch": 0.6412005457025921, "grad_norm": 6.5159149169921875, "learning_rate": 7.683766186075973e-05, "loss": 2.4319, "step": 470 }, { "epoch": 0.654843110504775, "grad_norm": 3.946726083755493, "learning_rate": 7.670269148220874e-05, "loss": 2.2417, "step": 480 }, { "epoch": 0.6684856753069577, "grad_norm": 5.711483001708984, "learning_rate": 7.656502431420286e-05, "loss": 2.1867, "step": 490 }, { "epoch": 0.6821282401091405, "grad_norm": 1.9001113176345825, "learning_rate": 7.64246704720593e-05, "loss": 2.2426, "step": 500 }, { "epoch": 0.6957708049113234, "grad_norm": 6.293882846832275, "learning_rate": 7.628164026850302e-05, "loss": 2.1057, "step": 510 }, { "epoch": 0.7094133697135061, "grad_norm": 2.980394124984741, "learning_rate": 7.613594421290888e-05, "loss": 2.0724, "step": 520 }, { "epoch": 0.723055934515689, "grad_norm": 7.354122161865234, "learning_rate": 7.59875930105296e-05, "loss": 2.1513, "step": 530 }, { "epoch": 0.7366984993178718, "grad_norm": 3.451127290725708, "learning_rate": 7.583659756170904e-05, "loss": 1.996, "step": 540 }, { "epoch": 0.7503410641200545, "grad_norm": 4.8892951011657715, "learning_rate": 7.568296896108135e-05, "loss": 2.2927, "step": 550 }, { "epoch": 0.7639836289222374, "grad_norm": 5.78281831741333, "learning_rate": 7.552671849675571e-05, "loss": 2.2445, "step": 560 }, { "epoch": 0.7776261937244202, "grad_norm": 3.0643134117126465, "learning_rate": 7.536785764948705e-05, "loss": 2.1723, "step": 570 }, { "epoch": 0.791268758526603, "grad_norm": 5.0908732414245605, "learning_rate": 7.520639809183234e-05, "loss": 2.1945, "step": 580 }, { "epoch": 0.8049113233287858, "grad_norm": 4.066890716552734, "learning_rate": 7.5042351687293e-05, "loss": 2.385, "step": 590 }, { "epoch": 0.8185538881309686, "grad_norm": 6.034420967102051, "learning_rate": 7.487573048944318e-05, "loss": 1.8776, "step": 600 }, { "epoch": 0.8321964529331515, "grad_norm": 2.5823731422424316, "learning_rate": 7.470654674104415e-05, "loss": 2.1586, "step": 610 }, { "epoch": 0.8458390177353342, "grad_norm": 4.781266212463379, "learning_rate": 7.453481287314469e-05, "loss": 2.1229, "step": 620 }, { "epoch": 0.859481582537517, "grad_norm": 4.008852481842041, "learning_rate": 7.436054150416777e-05, "loss": 2.1562, "step": 630 }, { "epoch": 0.8731241473396999, "grad_norm": 7.812346458435059, "learning_rate": 7.418374543898328e-05, "loss": 2.2563, "step": 640 }, { "epoch": 0.8867667121418826, "grad_norm": 5.588919639587402, "learning_rate": 7.400443766796728e-05, "loss": 2.0738, "step": 650 }, { "epoch": 0.9004092769440655, "grad_norm": 3.1923656463623047, "learning_rate": 7.382263136604744e-05, "loss": 1.9363, "step": 660 }, { "epoch": 0.9140518417462483, "grad_norm": 6.451751708984375, "learning_rate": 7.363833989173504e-05, "loss": 2.3014, "step": 670 }, { "epoch": 0.927694406548431, "grad_norm": 4.7484517097473145, "learning_rate": 7.34515767861434e-05, "loss": 2.1957, "step": 680 }, { "epoch": 0.9413369713506139, "grad_norm": 4.468317031860352, "learning_rate": 7.326235577199292e-05, "loss": 2.4058, "step": 690 }, { "epoch": 0.9549795361527967, "grad_norm": 4.4900078773498535, "learning_rate": 7.307069075260283e-05, "loss": 2.0714, "step": 700 }, { "epoch": 0.9686221009549796, "grad_norm": 4.51573371887207, "learning_rate": 7.287659581086957e-05, "loss": 2.0244, "step": 710 }, { "epoch": 0.9822646657571623, "grad_norm": 2.892160654067993, "learning_rate": 7.268008520823203e-05, "loss": 2.1891, "step": 720 }, { "epoch": 0.9959072305593452, "grad_norm": 5.765126705169678, "learning_rate": 7.248117338362371e-05, "loss": 2.0008, "step": 730 }, { "epoch": 1.009549795361528, "grad_norm": 4.076498031616211, "learning_rate": 7.227987495241174e-05, "loss": 1.7823, "step": 740 }, { "epoch": 1.0231923601637107, "grad_norm": 5.407172203063965, "learning_rate": 7.207620470532302e-05, "loss": 1.7272, "step": 750 }, { "epoch": 1.0368349249658937, "grad_norm": 2.3942642211914062, "learning_rate": 7.187017760735748e-05, "loss": 1.7809, "step": 760 }, { "epoch": 1.0504774897680764, "grad_norm": 6.462409019470215, "learning_rate": 7.166180879668843e-05, "loss": 1.5226, "step": 770 }, { "epoch": 1.0641200545702592, "grad_norm": 10.609193801879883, "learning_rate": 7.145111358355031e-05, "loss": 1.6743, "step": 780 }, { "epoch": 1.077762619372442, "grad_norm": 5.816223621368408, "learning_rate": 7.123810744911376e-05, "loss": 1.3974, "step": 790 }, { "epoch": 1.0914051841746248, "grad_norm": 8.911855697631836, "learning_rate": 7.102280604434805e-05, "loss": 1.4882, "step": 800 }, { "epoch": 1.1050477489768076, "grad_norm": 8.37972640991211, "learning_rate": 7.080522518887116e-05, "loss": 1.8217, "step": 810 }, { "epoch": 1.1186903137789905, "grad_norm": 5.856902599334717, "learning_rate": 7.058538086978738e-05, "loss": 1.5443, "step": 820 }, { "epoch": 1.1323328785811733, "grad_norm": 8.661952018737793, "learning_rate": 7.036328924051266e-05, "loss": 1.426, "step": 830 }, { "epoch": 1.145975443383356, "grad_norm": 7.178607940673828, "learning_rate": 7.013896661958766e-05, "loss": 1.6086, "step": 840 }, { "epoch": 1.159618008185539, "grad_norm": 6.399686813354492, "learning_rate": 6.991242948947879e-05, "loss": 1.64, "step": 850 }, { "epoch": 1.1732605729877217, "grad_norm": 5.770147800445557, "learning_rate": 6.968369449536705e-05, "loss": 1.6026, "step": 860 }, { "epoch": 1.1869031377899044, "grad_norm": 7.86605167388916, "learning_rate": 6.945277844392504e-05, "loss": 1.5026, "step": 870 }, { "epoch": 1.2005457025920874, "grad_norm": 3.953367233276367, "learning_rate": 6.921969830208212e-05, "loss": 1.6705, "step": 880 }, { "epoch": 1.21418826739427, "grad_norm": 10.93819808959961, "learning_rate": 6.898447119577764e-05, "loss": 1.6982, "step": 890 }, { "epoch": 1.2278308321964528, "grad_norm": 4.714999675750732, "learning_rate": 6.874711440870264e-05, "loss": 1.2631, "step": 900 }, { "epoch": 1.2414733969986358, "grad_norm": 4.0259480476379395, "learning_rate": 6.85076453810299e-05, "loss": 1.5005, "step": 910 }, { "epoch": 1.2551159618008185, "grad_norm": 7.3578643798828125, "learning_rate": 6.826608170813243e-05, "loss": 1.6076, "step": 920 }, { "epoch": 1.2687585266030013, "grad_norm": 5.368468284606934, "learning_rate": 6.802244113929075e-05, "loss": 1.7727, "step": 930 }, { "epoch": 1.2824010914051842, "grad_norm": 7.0559916496276855, "learning_rate": 6.777674157638862e-05, "loss": 1.6125, "step": 940 }, { "epoch": 1.296043656207367, "grad_norm": 6.349229335784912, "learning_rate": 6.75290010725977e-05, "loss": 1.4289, "step": 950 }, { "epoch": 1.30968622100955, "grad_norm": 4.133677959442139, "learning_rate": 6.727923783105111e-05, "loss": 1.6043, "step": 960 }, { "epoch": 1.3233287858117326, "grad_norm": 5.688299655914307, "learning_rate": 6.702747020350586e-05, "loss": 1.6637, "step": 970 }, { "epoch": 1.3369713506139154, "grad_norm": 8.215998649597168, "learning_rate": 6.677371668899448e-05, "loss": 1.717, "step": 980 }, { "epoch": 1.350613915416098, "grad_norm": 6.267200946807861, "learning_rate": 6.651799593246569e-05, "loss": 1.572, "step": 990 }, { "epoch": 1.364256480218281, "grad_norm": 3.5424532890319824, "learning_rate": 6.626032672341458e-05, "loss": 1.6264, "step": 1000 }, { "epoch": 1.3778990450204638, "grad_norm": 6.2543206214904785, "learning_rate": 6.600072799450186e-05, "loss": 1.5057, "step": 1010 }, { "epoch": 1.3915416098226467, "grad_norm": 5.060370445251465, "learning_rate": 6.573921882016284e-05, "loss": 1.6963, "step": 1020 }, { "epoch": 1.4051841746248295, "grad_norm": 7.101504325866699, "learning_rate": 6.547581841520589e-05, "loss": 1.3956, "step": 1030 }, { "epoch": 1.4188267394270122, "grad_norm": 7.740041732788086, "learning_rate": 6.521054613340064e-05, "loss": 1.469, "step": 1040 }, { "epoch": 1.4324693042291952, "grad_norm": 7.723188877105713, "learning_rate": 6.494342146605581e-05, "loss": 1.5518, "step": 1050 }, { "epoch": 1.446111869031378, "grad_norm": 5.523282527923584, "learning_rate": 6.467446404058722e-05, "loss": 1.5123, "step": 1060 }, { "epoch": 1.4597544338335606, "grad_norm": 10.065855026245117, "learning_rate": 6.44036936190755e-05, "loss": 1.5065, "step": 1070 }, { "epoch": 1.4733969986357436, "grad_norm": 8.757994651794434, "learning_rate": 6.413113009681411e-05, "loss": 1.5362, "step": 1080 }, { "epoch": 1.4870395634379263, "grad_norm": 10.307364463806152, "learning_rate": 6.385679350084743e-05, "loss": 1.7138, "step": 1090 }, { "epoch": 1.500682128240109, "grad_norm": 7.119322299957275, "learning_rate": 6.358070398849938e-05, "loss": 1.6659, "step": 1100 }, { "epoch": 1.514324693042292, "grad_norm": 12.579729080200195, "learning_rate": 6.330288184589216e-05, "loss": 1.6374, "step": 1110 }, { "epoch": 1.5279672578444747, "grad_norm": 8.232572555541992, "learning_rate": 6.30233474864558e-05, "loss": 1.6296, "step": 1120 }, { "epoch": 1.5416098226466577, "grad_norm": 10.136465072631836, "learning_rate": 6.274212144942824e-05, "loss": 1.6491, "step": 1130 }, { "epoch": 1.5552523874488404, "grad_norm": 5.127601623535156, "learning_rate": 6.245922439834612e-05, "loss": 1.6309, "step": 1140 }, { "epoch": 1.5688949522510232, "grad_norm": 6.231171607971191, "learning_rate": 6.217467711952658e-05, "loss": 1.5365, "step": 1150 }, { "epoch": 1.5825375170532059, "grad_norm": 7.796588897705078, "learning_rate": 6.188850052053985e-05, "loss": 1.493, "step": 1160 }, { "epoch": 1.5961800818553888, "grad_norm": 8.250170707702637, "learning_rate": 6.160071562867316e-05, "loss": 1.6771, "step": 1170 }, { "epoch": 1.6098226466575716, "grad_norm": 4.563484191894531, "learning_rate": 6.131134358938559e-05, "loss": 1.5381, "step": 1180 }, { "epoch": 1.6234652114597545, "grad_norm": 7.776663303375244, "learning_rate": 6.1020405664754455e-05, "loss": 1.4791, "step": 1190 }, { "epoch": 1.6371077762619373, "grad_norm": 6.122801780700684, "learning_rate": 6.0727923231913035e-05, "loss": 1.5296, "step": 1200 }, { "epoch": 1.65075034106412, "grad_norm": 5.5693278312683105, "learning_rate": 6.04339177814798e-05, "loss": 1.4165, "step": 1210 }, { "epoch": 1.6643929058663027, "grad_norm": 8.107104301452637, "learning_rate": 6.013841091597947e-05, "loss": 1.5694, "step": 1220 }, { "epoch": 1.6780354706684857, "grad_norm": 12.389050483703613, "learning_rate": 5.9841424348255596e-05, "loss": 1.6024, "step": 1230 }, { "epoch": 1.6916780354706686, "grad_norm": 6.175624370574951, "learning_rate": 5.954297989987526e-05, "loss": 1.499, "step": 1240 }, { "epoch": 1.7053206002728514, "grad_norm": 4.447775840759277, "learning_rate": 5.924309949952565e-05, "loss": 1.5177, "step": 1250 }, { "epoch": 1.718963165075034, "grad_norm": 6.788479804992676, "learning_rate": 5.8941805181402886e-05, "loss": 1.4564, "step": 1260 }, { "epoch": 1.7326057298772168, "grad_norm": 6.172934532165527, "learning_rate": 5.8639119083592954e-05, "loss": 1.3263, "step": 1270 }, { "epoch": 1.7462482946793996, "grad_norm": 7.947272777557373, "learning_rate": 5.833506344644507e-05, "loss": 1.6518, "step": 1280 }, { "epoch": 1.7598908594815825, "grad_norm": 5.520730018615723, "learning_rate": 5.802966061093762e-05, "loss": 1.4169, "step": 1290 }, { "epoch": 1.7735334242837655, "grad_norm": 6.108306407928467, "learning_rate": 5.7722933017036515e-05, "loss": 1.4631, "step": 1300 }, { "epoch": 1.7871759890859482, "grad_norm": 3.4584460258483887, "learning_rate": 5.741490320204644e-05, "loss": 1.5939, "step": 1310 }, { "epoch": 1.800818553888131, "grad_norm": 7.870245456695557, "learning_rate": 5.7105593798954895e-05, "loss": 1.4775, "step": 1320 }, { "epoch": 1.8144611186903137, "grad_norm": 6.256468772888184, "learning_rate": 5.679502753476913e-05, "loss": 1.6513, "step": 1330 }, { "epoch": 1.8281036834924966, "grad_norm": 8.314029693603516, "learning_rate": 5.648322722884635e-05, "loss": 1.6092, "step": 1340 }, { "epoch": 1.8417462482946794, "grad_norm": 7.979041576385498, "learning_rate": 5.6170215791216926e-05, "loss": 1.5116, "step": 1350 }, { "epoch": 1.8553888130968623, "grad_norm": 7.900625228881836, "learning_rate": 5.58560162209011e-05, "loss": 1.3667, "step": 1360 }, { "epoch": 1.869031377899045, "grad_norm": 7.304281234741211, "learning_rate": 5.554065160421907e-05, "loss": 1.4584, "step": 1370 }, { "epoch": 1.8826739427012278, "grad_norm": 5.6456618309021, "learning_rate": 5.522414511309472e-05, "loss": 1.7889, "step": 1380 }, { "epoch": 1.8963165075034105, "grad_norm": 5.90806770324707, "learning_rate": 5.490652000335297e-05, "loss": 1.5623, "step": 1390 }, { "epoch": 1.9099590723055935, "grad_norm": 6.941085338592529, "learning_rate": 5.461972027286809e-05, "loss": 1.5508, "step": 1400 }, { "epoch": 1.9236016371077762, "grad_norm": 7.18194580078125, "learning_rate": 5.430003415054097e-05, "loss": 1.4951, "step": 1410 }, { "epoch": 1.9372442019099592, "grad_norm": 9.118647575378418, "learning_rate": 5.397929731013993e-05, "loss": 1.601, "step": 1420 }, { "epoch": 1.950886766712142, "grad_norm": 7.178833961486816, "learning_rate": 5.365753331832165e-05, "loss": 1.5427, "step": 1430 }, { "epoch": 1.9645293315143246, "grad_norm": 5.578968524932861, "learning_rate": 5.3334765817214407e-05, "loss": 1.4017, "step": 1440 }, { "epoch": 1.9781718963165074, "grad_norm": 10.137267112731934, "learning_rate": 5.301101852268093e-05, "loss": 1.6255, "step": 1450 }, { "epoch": 1.9918144611186903, "grad_norm": 7.634779930114746, "learning_rate": 5.268631522257586e-05, "loss": 1.5628, "step": 1460 }, { "epoch": 2.0054570259208733, "grad_norm": 4.639617919921875, "learning_rate": 5.23606797749979e-05, "loss": 1.2903, "step": 1470 }, { "epoch": 2.019099590723056, "grad_norm": 9.048364639282227, "learning_rate": 5.2034136106536784e-05, "loss": 0.8694, "step": 1480 }, { "epoch": 2.0327421555252387, "grad_norm": 12.236319541931152, "learning_rate": 5.1706708210515225e-05, "loss": 1.0926, "step": 1490 }, { "epoch": 2.0463847203274215, "grad_norm": 8.067400932312012, "learning_rate": 5.1378420145226e-05, "loss": 0.9019, "step": 1500 }, { "epoch": 2.060027285129604, "grad_norm": 5.9596028327941895, "learning_rate": 5.104929603216422e-05, "loss": 0.9077, "step": 1510 }, { "epoch": 2.0736698499317874, "grad_norm": 16.68425941467285, "learning_rate": 5.0719360054254925e-05, "loss": 0.8833, "step": 1520 }, { "epoch": 2.08731241473397, "grad_norm": 10.05809497833252, "learning_rate": 5.0388636454076256e-05, "loss": 0.8182, "step": 1530 }, { "epoch": 2.100954979536153, "grad_norm": 7.299218654632568, "learning_rate": 5.0057149532078165e-05, "loss": 0.7097, "step": 1540 }, { "epoch": 2.1145975443383356, "grad_norm": 7.193046569824219, "learning_rate": 4.9724923644796904e-05, "loss": 0.8243, "step": 1550 }, { "epoch": 2.1282401091405183, "grad_norm": 6.852187633514404, "learning_rate": 4.939198320306537e-05, "loss": 0.8468, "step": 1560 }, { "epoch": 2.141882673942701, "grad_norm": 8.87316608428955, "learning_rate": 4.9058352670219576e-05, "loss": 0.8348, "step": 1570 }, { "epoch": 2.155525238744884, "grad_norm": 7.688050270080566, "learning_rate": 4.872405656030099e-05, "loss": 0.9292, "step": 1580 }, { "epoch": 2.169167803547067, "grad_norm": 9.83198356628418, "learning_rate": 4.83891194362555e-05, "loss": 0.7706, "step": 1590 }, { "epoch": 2.1828103683492497, "grad_norm": 10.323474884033203, "learning_rate": 4.805356590812852e-05, "loss": 0.8487, "step": 1600 }, { "epoch": 2.1964529331514324, "grad_norm": 8.7017183303833, "learning_rate": 4.771742063125674e-05, "loss": 0.8509, "step": 1610 }, { "epoch": 2.210095497953615, "grad_norm": 6.59673547744751, "learning_rate": 4.7380708304456554e-05, "loss": 0.8833, "step": 1620 }, { "epoch": 2.223738062755798, "grad_norm": 15.534720420837402, "learning_rate": 4.704345366820927e-05, "loss": 0.8968, "step": 1630 }, { "epoch": 2.237380627557981, "grad_norm": 11.261545181274414, "learning_rate": 4.670568150284323e-05, "loss": 0.9044, "step": 1640 }, { "epoch": 2.251023192360164, "grad_norm": 8.76096248626709, "learning_rate": 4.636741662671308e-05, "loss": 0.974, "step": 1650 }, { "epoch": 2.2646657571623465, "grad_norm": 5.140685081481934, "learning_rate": 4.602868389437622e-05, "loss": 0.8121, "step": 1660 }, { "epoch": 2.2783083219645293, "grad_norm": 9.906394004821777, "learning_rate": 4.568950819476648e-05, "loss": 1.1324, "step": 1670 }, { "epoch": 2.291950886766712, "grad_norm": 13.181589126586914, "learning_rate": 4.5349914449365435e-05, "loss": 0.8856, "step": 1680 }, { "epoch": 2.305593451568895, "grad_norm": 12.309771537780762, "learning_rate": 4.500992761037129e-05, "loss": 0.8276, "step": 1690 }, { "epoch": 2.319236016371078, "grad_norm": 10.110664367675781, "learning_rate": 4.4669572658865405e-05, "loss": 0.7492, "step": 1700 }, { "epoch": 2.3328785811732606, "grad_norm": 8.248074531555176, "learning_rate": 4.4328874602976786e-05, "loss": 0.9167, "step": 1710 }, { "epoch": 2.3465211459754434, "grad_norm": 8.356489181518555, "learning_rate": 4.3987858476044617e-05, "loss": 0.9453, "step": 1720 }, { "epoch": 2.360163710777626, "grad_norm": 5.316605091094971, "learning_rate": 4.364654933477886e-05, "loss": 0.8714, "step": 1730 }, { "epoch": 2.373806275579809, "grad_norm": 9.405285835266113, "learning_rate": 4.330497225741917e-05, "loss": 0.9251, "step": 1740 }, { "epoch": 2.3874488403819916, "grad_norm": 9.250997543334961, "learning_rate": 4.296315234189223e-05, "loss": 0.9479, "step": 1750 }, { "epoch": 2.4010914051841747, "grad_norm": 7.570913791656494, "learning_rate": 4.262111470396766e-05, "loss": 0.8346, "step": 1760 }, { "epoch": 2.4147339699863575, "grad_norm": 7.284623146057129, "learning_rate": 4.2278884475412585e-05, "loss": 0.754, "step": 1770 }, { "epoch": 2.42837653478854, "grad_norm": 8.737554550170898, "learning_rate": 4.193648680214505e-05, "loss": 0.7743, "step": 1780 }, { "epoch": 2.442019099590723, "grad_norm": 9.590819358825684, "learning_rate": 4.159394684238635e-05, "loss": 0.9273, "step": 1790 }, { "epoch": 2.4556616643929057, "grad_norm": 9.176666259765625, "learning_rate": 4.1251289764812495e-05, "loss": 1.0037, "step": 1800 }, { "epoch": 2.469304229195089, "grad_norm": 9.587228775024414, "learning_rate": 4.090854074670495e-05, "loss": 0.7728, "step": 1810 }, { "epoch": 2.4829467939972716, "grad_norm": 3.603977918624878, "learning_rate": 4.056572497210066e-05, "loss": 0.8098, "step": 1820 }, { "epoch": 2.4965893587994543, "grad_norm": 10.71971321105957, "learning_rate": 4.0222867629941554e-05, "loss": 0.8404, "step": 1830 }, { "epoch": 2.510231923601637, "grad_norm": 8.777016639709473, "learning_rate": 3.987999391222389e-05, "loss": 0.7021, "step": 1840 }, { "epoch": 2.52387448840382, "grad_norm": 9.952046394348145, "learning_rate": 3.953712901214707e-05, "loss": 0.9656, "step": 1850 }, { "epoch": 2.5375170532060025, "grad_norm": 10.772456169128418, "learning_rate": 3.9194298122262666e-05, "loss": 0.7487, "step": 1860 }, { "epoch": 2.5511596180081857, "grad_norm": 7.986364364624023, "learning_rate": 3.8851526432623254e-05, "loss": 0.8693, "step": 1870 }, { "epoch": 2.5648021828103684, "grad_norm": 8.993766784667969, "learning_rate": 3.850883912893158e-05, "loss": 0.8933, "step": 1880 }, { "epoch": 2.578444747612551, "grad_norm": 8.045419692993164, "learning_rate": 3.816626139069004e-05, "loss": 0.7098, "step": 1890 }, { "epoch": 2.592087312414734, "grad_norm": 7.299800872802734, "learning_rate": 3.782381838935047e-05, "loss": 0.8742, "step": 1900 }, { "epoch": 2.6057298772169166, "grad_norm": 8.76268482208252, "learning_rate": 3.748153528646472e-05, "loss": 0.7846, "step": 1910 }, { "epoch": 2.6193724420191, "grad_norm": 13.77757453918457, "learning_rate": 3.713943723183587e-05, "loss": 0.9032, "step": 1920 }, { "epoch": 2.6330150068212825, "grad_norm": 9.216800689697266, "learning_rate": 3.6797549361670257e-05, "loss": 0.8044, "step": 1930 }, { "epoch": 2.6466575716234653, "grad_norm": 12.811419486999512, "learning_rate": 3.6455896796730554e-05, "loss": 0.8528, "step": 1940 }, { "epoch": 2.660300136425648, "grad_norm": 8.257354736328125, "learning_rate": 3.611450464049005e-05, "loss": 0.9294, "step": 1950 }, { "epoch": 2.6739427012278307, "grad_norm": 6.599920272827148, "learning_rate": 3.577339797728805e-05, "loss": 0.9441, "step": 1960 }, { "epoch": 2.6875852660300135, "grad_norm": 10.485870361328125, "learning_rate": 3.5432601870486795e-05, "loss": 0.8572, "step": 1970 }, { "epoch": 2.701227830832196, "grad_norm": 5.167399883270264, "learning_rate": 3.509214136062993e-05, "loss": 0.9912, "step": 1980 }, { "epoch": 2.7148703956343794, "grad_norm": 9.000060081481934, "learning_rate": 3.475204146360254e-05, "loss": 0.9876, "step": 1990 }, { "epoch": 2.728512960436562, "grad_norm": 7.356120586395264, "learning_rate": 3.44123271687931e-05, "loss": 0.8796, "step": 2000 }, { "epoch": 2.742155525238745, "grad_norm": 10.706610679626465, "learning_rate": 3.407302343725737e-05, "loss": 0.8394, "step": 2010 }, { "epoch": 2.7557980900409276, "grad_norm": 6.853879928588867, "learning_rate": 3.3734155199884275e-05, "loss": 0.7603, "step": 2020 }, { "epoch": 2.7694406548431107, "grad_norm": 8.325788497924805, "learning_rate": 3.339574735556412e-05, "loss": 0.8944, "step": 2030 }, { "epoch": 2.7830832196452935, "grad_norm": 12.427726745605469, "learning_rate": 3.3057824769359104e-05, "loss": 0.9737, "step": 2040 }, { "epoch": 2.796725784447476, "grad_norm": 11.044093132019043, "learning_rate": 3.2720412270676275e-05, "loss": 0.9483, "step": 2050 }, { "epoch": 2.810368349249659, "grad_norm": 5.907009601593018, "learning_rate": 3.2383534651443206e-05, "loss": 0.9254, "step": 2060 }, { "epoch": 2.8240109140518417, "grad_norm": 5.986649036407471, "learning_rate": 3.204721666428631e-05, "loss": 0.7472, "step": 2070 }, { "epoch": 2.8376534788540244, "grad_norm": 9.0693359375, "learning_rate": 3.171148302071215e-05, "loss": 0.8689, "step": 2080 }, { "epoch": 2.851296043656207, "grad_norm": 7.178049087524414, "learning_rate": 3.137635838929169e-05, "loss": 0.7493, "step": 2090 }, { "epoch": 2.8649386084583903, "grad_norm": 8.808353424072266, "learning_rate": 3.1041867393847764e-05, "loss": 1.0867, "step": 2100 }, { "epoch": 2.878581173260573, "grad_norm": 11.312020301818848, "learning_rate": 3.070803461164575e-05, "loss": 0.9846, "step": 2110 }, { "epoch": 2.892223738062756, "grad_norm": 2.6739227771759033, "learning_rate": 3.0374884571587776e-05, "loss": 0.7862, "step": 2120 }, { "epoch": 2.9058663028649385, "grad_norm": 9.54980182647705, "learning_rate": 3.004244175241038e-05, "loss": 1.0586, "step": 2130 }, { "epoch": 2.9195088676671213, "grad_norm": 10.564549446105957, "learning_rate": 2.971073058088587e-05, "loss": 0.8757, "step": 2140 }, { "epoch": 2.9331514324693044, "grad_norm": 9.608818054199219, "learning_rate": 2.937977543002764e-05, "loss": 0.9026, "step": 2150 }, { "epoch": 2.946793997271487, "grad_norm": 4.548944473266602, "learning_rate": 2.9049600617299188e-05, "loss": 0.8864, "step": 2160 }, { "epoch": 2.96043656207367, "grad_norm": 7.556874752044678, "learning_rate": 2.872023040282739e-05, "loss": 0.8224, "step": 2170 }, { "epoch": 2.9740791268758526, "grad_norm": 9.024819374084473, "learning_rate": 2.8391688987620045e-05, "loss": 0.794, "step": 2180 }, { "epoch": 2.9877216916780354, "grad_norm": 5.675693035125732, "learning_rate": 2.8064000511787523e-05, "loss": 0.9792, "step": 2190 }, { "epoch": 3.001364256480218, "grad_norm": 6.426331520080566, "learning_rate": 2.77371890527691e-05, "loss": 0.7812, "step": 2200 }, { "epoch": 3.0150068212824013, "grad_norm": 7.24752140045166, "learning_rate": 2.741127862356389e-05, "loss": 0.496, "step": 2210 }, { "epoch": 3.028649386084584, "grad_norm": 8.4566650390625, "learning_rate": 2.7086293170966312e-05, "loss": 0.3416, "step": 2220 }, { "epoch": 3.0422919508867667, "grad_norm": 5.965997219085693, "learning_rate": 2.6762256573806664e-05, "loss": 0.3462, "step": 2230 }, { "epoch": 3.0559345156889495, "grad_norm": 10.749195098876953, "learning_rate": 2.6439192641196583e-05, "loss": 0.4756, "step": 2240 }, { "epoch": 3.069577080491132, "grad_norm": 7.666036605834961, "learning_rate": 2.611712511077959e-05, "loss": 0.357, "step": 2250 }, { "epoch": 3.083219645293315, "grad_norm": 7.038111209869385, "learning_rate": 2.5796077646986922e-05, "loss": 0.3361, "step": 2260 }, { "epoch": 3.096862210095498, "grad_norm": 10.389034271240234, "learning_rate": 2.5476073839298857e-05, "loss": 0.419, "step": 2270 }, { "epoch": 3.110504774897681, "grad_norm": 6.829967021942139, "learning_rate": 2.5157137200511253e-05, "loss": 0.4211, "step": 2280 }, { "epoch": 3.1241473396998636, "grad_norm": 4.85243558883667, "learning_rate": 2.4839291165008073e-05, "loss": 0.5248, "step": 2290 }, { "epoch": 3.1377899045020463, "grad_norm": 7.922482490539551, "learning_rate": 2.452255908703945e-05, "loss": 0.3983, "step": 2300 }, { "epoch": 3.151432469304229, "grad_norm": 4.674210071563721, "learning_rate": 2.420696423900567e-05, "loss": 0.3728, "step": 2310 }, { "epoch": 3.1650750341064118, "grad_norm": 16.18085289001465, "learning_rate": 2.3892529809747195e-05, "loss": 0.4099, "step": 2320 }, { "epoch": 3.178717598908595, "grad_norm": 10.127204895019531, "learning_rate": 2.35792789028409e-05, "loss": 0.5466, "step": 2330 }, { "epoch": 3.1923601637107777, "grad_norm": 6.371789932250977, "learning_rate": 2.32672345349024e-05, "loss": 0.3956, "step": 2340 }, { "epoch": 3.2060027285129604, "grad_norm": 6.170881748199463, "learning_rate": 2.2956419633894922e-05, "loss": 0.2686, "step": 2350 }, { "epoch": 3.219645293315143, "grad_norm": 8.651374816894531, "learning_rate": 2.264685703744466e-05, "loss": 0.3545, "step": 2360 }, { "epoch": 3.233287858117326, "grad_norm": 9.146707534790039, "learning_rate": 2.2338569491162688e-05, "loss": 0.4361, "step": 2370 }, { "epoch": 3.246930422919509, "grad_norm": 10.473475456237793, "learning_rate": 2.2031579646973662e-05, "loss": 0.4616, "step": 2380 }, { "epoch": 3.260572987721692, "grad_norm": 7.665262222290039, "learning_rate": 2.1725910061451582e-05, "loss": 0.4095, "step": 2390 }, { "epoch": 3.2742155525238745, "grad_norm": 14.176889419555664, "learning_rate": 2.1421583194162237e-05, "loss": 0.4428, "step": 2400 }, { "epoch": 3.2878581173260573, "grad_norm": 10.604450225830078, "learning_rate": 2.1118621406013045e-05, "loss": 0.4946, "step": 2410 }, { "epoch": 3.30150068212824, "grad_norm": 9.188895225524902, "learning_rate": 2.0817046957610073e-05, "loss": 0.3676, "step": 2420 }, { "epoch": 3.3151432469304227, "grad_norm": 5.148584842681885, "learning_rate": 2.0516882007622318e-05, "loss": 0.4319, "step": 2430 }, { "epoch": 3.328785811732606, "grad_norm": 10.153374671936035, "learning_rate": 2.0218148611153614e-05, "loss": 0.3973, "step": 2440 }, { "epoch": 3.3424283765347886, "grad_norm": 2.798651695251465, "learning_rate": 1.99208687181221e-05, "loss": 0.3728, "step": 2450 }, { "epoch": 3.3560709413369714, "grad_norm": 7.391672611236572, "learning_rate": 1.9625064171647403e-05, "loss": 0.4029, "step": 2460 }, { "epoch": 3.369713506139154, "grad_norm": 8.293444633483887, "learning_rate": 1.933075670644566e-05, "loss": 0.5182, "step": 2470 }, { "epoch": 3.383356070941337, "grad_norm": 12.008163452148438, "learning_rate": 1.903796794723261e-05, "loss": 0.3211, "step": 2480 }, { "epoch": 3.39699863574352, "grad_norm": 4.839990139007568, "learning_rate": 1.8746719407134558e-05, "loss": 0.3218, "step": 2490 }, { "epoch": 3.4106412005457027, "grad_norm": 5.815826892852783, "learning_rate": 1.8457032486107733e-05, "loss": 0.3561, "step": 2500 }, { "epoch": 3.4242837653478855, "grad_norm": 5.609132766723633, "learning_rate": 1.816892846936592e-05, "loss": 0.4223, "step": 2510 }, { "epoch": 3.437926330150068, "grad_norm": 6.10213041305542, "learning_rate": 1.7882428525816434e-05, "loss": 0.2911, "step": 2520 }, { "epoch": 3.451568894952251, "grad_norm": 6.413115501403809, "learning_rate": 1.759755370650472e-05, "loss": 0.3991, "step": 2530 }, { "epoch": 3.4652114597544337, "grad_norm": 10.340503692626953, "learning_rate": 1.7314324943067598e-05, "loss": 0.3823, "step": 2540 }, { "epoch": 3.4788540245566164, "grad_norm": 10.008712768554688, "learning_rate": 1.70327630461953e-05, "loss": 0.3564, "step": 2550 }, { "epoch": 3.4924965893587996, "grad_norm": 7.237100601196289, "learning_rate": 1.6752888704102304e-05, "loss": 0.4012, "step": 2560 }, { "epoch": 3.5061391541609823, "grad_norm": 7.369434833526611, "learning_rate": 1.6474722481007344e-05, "loss": 0.3622, "step": 2570 }, { "epoch": 3.519781718963165, "grad_norm": 10.581721305847168, "learning_rate": 1.619828481562229e-05, "loss": 0.3502, "step": 2580 }, { "epoch": 3.533424283765348, "grad_norm": 7.968098163604736, "learning_rate": 1.5923596019650517e-05, "loss": 0.4129, "step": 2590 }, { "epoch": 3.547066848567531, "grad_norm": 9.581936836242676, "learning_rate": 1.565067627629432e-05, "loss": 0.455, "step": 2600 }, { "epoch": 3.5607094133697137, "grad_norm": 7.407210350036621, "learning_rate": 1.5379545638772032e-05, "loss": 0.3905, "step": 2610 }, { "epoch": 3.5743519781718964, "grad_norm": 9.555901527404785, "learning_rate": 1.511022402884459e-05, "loss": 0.4603, "step": 2620 }, { "epoch": 3.587994542974079, "grad_norm": 5.677056312561035, "learning_rate": 1.4842731235351653e-05, "loss": 0.3654, "step": 2630 }, { "epoch": 3.601637107776262, "grad_norm": 8.808677673339844, "learning_rate": 1.4577086912757659e-05, "loss": 0.4481, "step": 2640 }, { "epoch": 3.6152796725784446, "grad_norm": 11.250635147094727, "learning_rate": 1.4313310579707697e-05, "loss": 0.4829, "step": 2650 }, { "epoch": 3.6289222373806274, "grad_norm": 8.87308406829834, "learning_rate": 1.405142161759327e-05, "loss": 0.3937, "step": 2660 }, { "epoch": 3.64256480218281, "grad_norm": 7.986633777618408, "learning_rate": 1.3791439269128274e-05, "loss": 0.3902, "step": 2670 }, { "epoch": 3.6562073669849933, "grad_norm": 6.5852155685424805, "learning_rate": 1.3533382636935092e-05, "loss": 0.4495, "step": 2680 }, { "epoch": 3.669849931787176, "grad_norm": 22.476903915405273, "learning_rate": 1.3277270682140996e-05, "loss": 0.4707, "step": 2690 }, { "epoch": 3.6834924965893587, "grad_norm": 6.103644371032715, "learning_rate": 1.3023122222984941e-05, "loss": 0.4189, "step": 2700 }, { "epoch": 3.6971350613915415, "grad_norm": 8.584049224853516, "learning_rate": 1.2770955933434906e-05, "loss": 0.3741, "step": 2710 }, { "epoch": 3.7107776261937246, "grad_norm": 8.539772033691406, "learning_rate": 1.2520790341815726e-05, "loss": 0.3724, "step": 2720 }, { "epoch": 3.7244201909959074, "grad_norm": 8.408806800842285, "learning_rate": 1.2272643829447723e-05, "loss": 0.4002, "step": 2730 }, { "epoch": 3.73806275579809, "grad_norm": 13.712347030639648, "learning_rate": 1.2026534629296168e-05, "loss": 0.4129, "step": 2740 }, { "epoch": 3.751705320600273, "grad_norm": 7.925304412841797, "learning_rate": 1.1782480824631478e-05, "loss": 0.3354, "step": 2750 }, { "epoch": 3.7653478854024556, "grad_norm": 6.753428936004639, "learning_rate": 1.154050034770057e-05, "loss": 0.3, "step": 2760 }, { "epoch": 3.7789904502046383, "grad_norm": 11.436506271362305, "learning_rate": 1.1300610978409301e-05, "loss": 0.4613, "step": 2770 }, { "epoch": 3.792633015006821, "grad_norm": 7.002927780151367, "learning_rate": 1.1062830343015998e-05, "loss": 0.3782, "step": 2780 }, { "epoch": 3.806275579809004, "grad_norm": 8.752483367919922, "learning_rate": 1.0827175912836352e-05, "loss": 0.3154, "step": 2790 }, { "epoch": 3.819918144611187, "grad_norm": 8.298131942749023, "learning_rate": 1.059366500295973e-05, "loss": 0.3941, "step": 2800 }, { "epoch": 3.8335607094133697, "grad_norm": 9.577731132507324, "learning_rate": 1.0362314770976858e-05, "loss": 0.4402, "step": 2810 }, { "epoch": 3.8472032742155524, "grad_norm": 8.360587120056152, "learning_rate": 1.0133142215719176e-05, "loss": 0.3588, "step": 2820 }, { "epoch": 3.8608458390177356, "grad_norm": 7.106631755828857, "learning_rate": 9.906164176009825e-06, "loss": 0.4171, "step": 2830 }, { "epoch": 3.8744884038199183, "grad_norm": 7.905004501342773, "learning_rate": 9.681397329426363e-06, "loss": 0.4008, "step": 2840 }, { "epoch": 3.888130968622101, "grad_norm": 7.430079460144043, "learning_rate": 9.458858191075358e-06, "loss": 0.2851, "step": 2850 }, { "epoch": 3.901773533424284, "grad_norm": 6.089357852935791, "learning_rate": 9.238563112378967e-06, "loss": 0.486, "step": 2860 }, { "epoch": 3.9154160982264665, "grad_norm": 10.433391571044922, "learning_rate": 9.02052827987339e-06, "loss": 0.3321, "step": 2870 }, { "epoch": 3.9290586630286493, "grad_norm": 11.072164535522461, "learning_rate": 8.804769714019619e-06, "loss": 0.2981, "step": 2880 }, { "epoch": 3.942701227830832, "grad_norm": 6.538973331451416, "learning_rate": 8.591303268026293e-06, "loss": 0.3412, "step": 2890 }, { "epoch": 3.956343792633015, "grad_norm": 6.748257637023926, "learning_rate": 8.380144626684829e-06, "loss": 0.3648, "step": 2900 }, { "epoch": 3.969986357435198, "grad_norm": 8.447965621948242, "learning_rate": 8.171309305216973e-06, "loss": 0.4028, "step": 2910 }, { "epoch": 3.9836289222373806, "grad_norm": 11.228553771972656, "learning_rate": 7.96481264813481e-06, "loss": 0.5365, "step": 2920 }, { "epoch": 3.9972714870395634, "grad_norm": 16.4576416015625, "learning_rate": 7.760669828113276e-06, "loss": 0.3312, "step": 2930 }, { "epoch": 4.0109140518417465, "grad_norm": 9.181668281555176, "learning_rate": 7.558895844875325e-06, "loss": 0.2586, "step": 2940 }, { "epoch": 4.024556616643929, "grad_norm": 6.005313873291016, "learning_rate": 7.359505524089843e-06, "loss": 0.174, "step": 2950 }, { "epoch": 4.038199181446112, "grad_norm": 9.592679977416992, "learning_rate": 7.162513516282236e-06, "loss": 0.1319, "step": 2960 }, { "epoch": 4.051841746248295, "grad_norm": 4.006359100341797, "learning_rate": 6.967934295758003e-06, "loss": 0.1098, "step": 2970 }, { "epoch": 4.0654843110504775, "grad_norm": 8.887368202209473, "learning_rate": 6.775782159539237e-06, "loss": 0.1351, "step": 2980 }, { "epoch": 4.07912687585266, "grad_norm": 4.626531600952148, "learning_rate": 6.586071226314046e-06, "loss": 0.2408, "step": 2990 }, { "epoch": 4.092769440654843, "grad_norm": 3.412660598754883, "learning_rate": 6.3988154353992285e-06, "loss": 0.1759, "step": 3000 }, { "epoch": 4.106412005457026, "grad_norm": 3.9307303428649902, "learning_rate": 6.214028545716071e-06, "loss": 0.2398, "step": 3010 }, { "epoch": 4.120054570259208, "grad_norm": 4.589839935302734, "learning_rate": 6.031724134779331e-06, "loss": 0.1491, "step": 3020 }, { "epoch": 4.133697135061391, "grad_norm": 5.797046184539795, "learning_rate": 5.851915597699638e-06, "loss": 0.1711, "step": 3030 }, { "epoch": 4.147339699863575, "grad_norm": 6.0500030517578125, "learning_rate": 5.674616146199277e-06, "loss": 0.1815, "step": 3040 }, { "epoch": 4.1609822646657575, "grad_norm": 5.424590587615967, "learning_rate": 5.499838807641413e-06, "loss": 0.2796, "step": 3050 }, { "epoch": 4.17462482946794, "grad_norm": 10.55384349822998, "learning_rate": 5.327596424072896e-06, "loss": 0.2307, "step": 3060 }, { "epoch": 4.188267394270123, "grad_norm": 11.20399284362793, "learning_rate": 5.157901651280672e-06, "loss": 0.1955, "step": 3070 }, { "epoch": 4.201909959072306, "grad_norm": 5.694499492645264, "learning_rate": 4.990766957861875e-06, "loss": 0.2332, "step": 3080 }, { "epoch": 4.215552523874488, "grad_norm": 3.623840093612671, "learning_rate": 4.826204624307665e-06, "loss": 0.1317, "step": 3090 }, { "epoch": 4.229195088676671, "grad_norm": 6.306783199310303, "learning_rate": 4.664226742100946e-06, "loss": 0.1772, "step": 3100 }, { "epoch": 4.242837653478854, "grad_norm": 5.530797004699707, "learning_rate": 4.504845212827848e-06, "loss": 0.1893, "step": 3110 }, { "epoch": 4.256480218281037, "grad_norm": 6.482370853424072, "learning_rate": 4.348071747303322e-06, "loss": 0.2459, "step": 3120 }, { "epoch": 4.270122783083219, "grad_norm": 6.513676643371582, "learning_rate": 4.193917864710599e-06, "loss": 0.1615, "step": 3130 }, { "epoch": 4.283765347885402, "grad_norm": 7.2312703132629395, "learning_rate": 4.042394891754846e-06, "loss": 0.1488, "step": 3140 }, { "epoch": 4.297407912687586, "grad_norm": 8.509649276733398, "learning_rate": 3.893513961830886e-06, "loss": 0.1507, "step": 3150 }, { "epoch": 4.311050477489768, "grad_norm": 8.615159034729004, "learning_rate": 3.74728601420518e-06, "loss": 0.1768, "step": 3160 }, { "epoch": 4.324693042291951, "grad_norm": 9.077027320861816, "learning_rate": 3.6037217932120272e-06, "loss": 0.3008, "step": 3170 }, { "epoch": 4.338335607094134, "grad_norm": 4.60724401473999, "learning_rate": 3.4628318474641344e-06, "loss": 0.1839, "step": 3180 }, { "epoch": 4.351978171896317, "grad_norm": 3.5988879203796387, "learning_rate": 3.3246265290775013e-06, "loss": 0.1681, "step": 3190 }, { "epoch": 4.365620736698499, "grad_norm": 3.6327638626098633, "learning_rate": 3.1891159929108074e-06, "loss": 0.2091, "step": 3200 }, { "epoch": 4.379263301500682, "grad_norm": 7.489198207855225, "learning_rate": 3.0563101958192677e-06, "loss": 0.175, "step": 3210 }, { "epoch": 4.392905866302865, "grad_norm": 7.076563835144043, "learning_rate": 2.9262188959230297e-06, "loss": 0.2715, "step": 3220 }, { "epoch": 4.406548431105048, "grad_norm": 4.808744430541992, "learning_rate": 2.7988516518901643e-06, "loss": 0.205, "step": 3230 }, { "epoch": 4.42019099590723, "grad_norm": 12.732096672058105, "learning_rate": 2.674217822234382e-06, "loss": 0.1804, "step": 3240 }, { "epoch": 4.433833560709413, "grad_norm": 4.898436546325684, "learning_rate": 2.5523265646273252e-06, "loss": 0.2301, "step": 3250 }, { "epoch": 4.447476125511596, "grad_norm": 3.7656116485595703, "learning_rate": 2.433186835225745e-06, "loss": 0.2665, "step": 3260 }, { "epoch": 4.461118690313779, "grad_norm": 4.772073745727539, "learning_rate": 2.316807388013431e-06, "loss": 0.1443, "step": 3270 }, { "epoch": 4.474761255115962, "grad_norm": 7.5682830810546875, "learning_rate": 2.203196774157972e-06, "loss": 0.2292, "step": 3280 }, { "epoch": 4.488403819918145, "grad_norm": 4.153563976287842, "learning_rate": 2.0923633413824663e-06, "loss": 0.1765, "step": 3290 }, { "epoch": 4.502046384720328, "grad_norm": 3.0278851985931396, "learning_rate": 1.98431523335215e-06, "loss": 0.1966, "step": 3300 }, { "epoch": 4.51568894952251, "grad_norm": 3.25349497795105, "learning_rate": 1.8790603890760328e-06, "loss": 0.2525, "step": 3310 }, { "epoch": 4.529331514324693, "grad_norm": 6.384403228759766, "learning_rate": 1.7766065423235624e-06, "loss": 0.1502, "step": 3320 }, { "epoch": 4.542974079126876, "grad_norm": 8.483981132507324, "learning_rate": 1.6769612210563834e-06, "loss": 0.2251, "step": 3330 }, { "epoch": 4.5566166439290585, "grad_norm": 5.477599620819092, "learning_rate": 1.5801317468751954e-06, "loss": 0.1724, "step": 3340 }, { "epoch": 4.570259208731241, "grad_norm": 6.450118541717529, "learning_rate": 1.4861252344817812e-06, "loss": 0.1416, "step": 3350 }, { "epoch": 4.583901773533424, "grad_norm": 8.463088035583496, "learning_rate": 1.3949485911562799e-06, "loss": 0.1507, "step": 3360 }, { "epoch": 4.597544338335607, "grad_norm": 5.667326927185059, "learning_rate": 1.3066085162496057e-06, "loss": 0.1486, "step": 3370 }, { "epoch": 4.61118690313779, "grad_norm": 8.173833847045898, "learning_rate": 1.2211115006912499e-06, "loss": 0.1823, "step": 3380 }, { "epoch": 4.624829467939973, "grad_norm": 7.300376892089844, "learning_rate": 1.1384638265123305e-06, "loss": 0.1146, "step": 3390 }, { "epoch": 4.638472032742156, "grad_norm": 5.938165664672852, "learning_rate": 1.0586715663840175e-06, "loss": 0.2206, "step": 3400 }, { "epoch": 4.6521145975443385, "grad_norm": 5.717035293579102, "learning_rate": 9.817405831713135e-07, "loss": 0.137, "step": 3410 }, { "epoch": 4.665757162346521, "grad_norm": 9.58558464050293, "learning_rate": 9.076765295022949e-07, "loss": 0.1969, "step": 3420 }, { "epoch": 4.679399727148704, "grad_norm": 5.858943462371826, "learning_rate": 8.364848473527698e-07, "loss": 0.1712, "step": 3430 }, { "epoch": 4.693042291950887, "grad_norm": 5.517107009887695, "learning_rate": 7.748725387928791e-07, "loss": 0.1606, "step": 3440 }, { "epoch": 4.7066848567530695, "grad_norm": 2.8044917583465576, "learning_rate": 7.09152598980536e-07, "loss": 0.1722, "step": 3450 }, { "epoch": 4.720327421555252, "grad_norm": 5.587821960449219, "learning_rate": 6.463196175536768e-07, "loss": 0.2592, "step": 3460 }, { "epoch": 4.733969986357435, "grad_norm": 8.284170150756836, "learning_rate": 5.863782112669647e-07, "loss": 0.2729, "step": 3470 }, { "epoch": 4.747612551159618, "grad_norm": 7.003023147583008, "learning_rate": 5.293327844118956e-07, "loss": 0.1888, "step": 3480 }, { "epoch": 4.7612551159618, "grad_norm": 1.9856537580490112, "learning_rate": 4.751875284932217e-07, "loss": 0.1367, "step": 3490 }, { "epoch": 4.774897680763983, "grad_norm": 6.205258846282959, "learning_rate": 4.2394642192095327e-07, "loss": 0.2003, "step": 3500 }, { "epoch": 4.788540245566167, "grad_norm": 5.503448009490967, "learning_rate": 3.7561322971803706e-07, "loss": 0.1503, "step": 3510 }, { "epoch": 4.8021828103683495, "grad_norm": 5.814334392547607, "learning_rate": 3.301915032437375e-07, "loss": 0.1425, "step": 3520 }, { "epoch": 4.815825375170532, "grad_norm": 7.47700309753418, "learning_rate": 2.8768457993266775e-07, "loss": 0.1265, "step": 3530 }, { "epoch": 4.829467939972715, "grad_norm": 6.920642375946045, "learning_rate": 2.480955830495679e-07, "loss": 0.2103, "step": 3540 }, { "epoch": 4.843110504774898, "grad_norm": 8.557290077209473, "learning_rate": 2.1142742145984442e-07, "loss": 0.2395, "step": 3550 }, { "epoch": 4.85675306957708, "grad_norm": 7.730421543121338, "learning_rate": 1.7768278941581617e-07, "loss": 0.1317, "step": 3560 }, { "epoch": 4.870395634379263, "grad_norm": 4.612334728240967, "learning_rate": 1.4686416635874445e-07, "loss": 0.1802, "step": 3570 }, { "epoch": 4.884038199181446, "grad_norm": 6.5652899742126465, "learning_rate": 1.1897381673666719e-07, "loss": 0.2484, "step": 3580 }, { "epoch": 4.897680763983629, "grad_norm": 8.53544807434082, "learning_rate": 9.40137898380078e-08, "loss": 0.1346, "step": 3590 }, { "epoch": 4.911323328785811, "grad_norm": 6.831415176391602, "learning_rate": 7.198591964099777e-08, "loss": 0.2097, "step": 3600 }, { "epoch": 4.924965893587995, "grad_norm": 11.197553634643555, "learning_rate": 5.289182467893561e-08, "loss": 0.1865, "step": 3610 }, { "epoch": 4.938608458390178, "grad_norm": 9.906291007995605, "learning_rate": 3.6732907921241956e-08, "loss": 0.2137, "step": 3620 }, { "epoch": 4.95225102319236, "grad_norm": 6.347635269165039, "learning_rate": 2.351035667038648e-08, "loss": 0.1699, "step": 3630 }, { "epoch": 4.965893587994543, "grad_norm": 2.5308990478515625, "learning_rate": 1.3225142474651009e-08, "loss": 0.247, "step": 3640 }, { "epoch": 4.979536152796726, "grad_norm": 8.235413551330566, "learning_rate": 5.878021056742178e-09, "loss": 0.1884, "step": 3650 }, { "epoch": 4.993178717598909, "grad_norm": 8.303121566772461, "learning_rate": 1.4695322582491956e-09, "loss": 0.186, "step": 3660 }, { "epoch": 5.0, "step": 3665, "total_flos": 2.2102292765343744e+17, "train_loss": 1.054027732430181, "train_runtime": 2482.8245, "train_samples_per_second": 5.905, "train_steps_per_second": 1.476 } ], "logging_steps": 10, "max_steps": 3665, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2102292765343744e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }