{ "best_metric": 0.04224640876054764, "best_model_checkpoint": "./testVal_default_model/checkpoint-15844", "epoch": 5.0, "eval_steps": 500, "global_step": 19805, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025246149962130774, "grad_norm": 2.1267971992492676, "learning_rate": 2.9984852310022722e-05, "loss": 0.3068, "step": 10 }, { "epoch": 0.005049229992426155, "grad_norm": 0.8344613909721375, "learning_rate": 2.9969704620045444e-05, "loss": 0.1692, "step": 20 }, { "epoch": 0.007573844988639233, "grad_norm": 1.6312780380249023, "learning_rate": 2.9954556930068166e-05, "loss": 0.1614, "step": 30 }, { "epoch": 0.01009845998485231, "grad_norm": 2.278165102005005, "learning_rate": 2.9939409240090887e-05, "loss": 0.1488, "step": 40 }, { "epoch": 0.012623074981065387, "grad_norm": 0.7996057868003845, "learning_rate": 2.992426155011361e-05, "loss": 0.1342, "step": 50 }, { "epoch": 0.015147689977278465, "grad_norm": 0.9300896525382996, "learning_rate": 2.990911386013633e-05, "loss": 0.1341, "step": 60 }, { "epoch": 0.017672304973491544, "grad_norm": 1.3064889907836914, "learning_rate": 2.9893966170159052e-05, "loss": 0.1469, "step": 70 }, { "epoch": 0.02019691996970462, "grad_norm": 0.842812180519104, "learning_rate": 2.9878818480181774e-05, "loss": 0.1306, "step": 80 }, { "epoch": 0.022721534965917698, "grad_norm": 1.0661952495574951, "learning_rate": 2.9863670790204495e-05, "loss": 0.1243, "step": 90 }, { "epoch": 0.025246149962130773, "grad_norm": 0.8375621438026428, "learning_rate": 2.9848523100227217e-05, "loss": 0.1137, "step": 100 }, { "epoch": 0.027770764958343852, "grad_norm": 0.6965815424919128, "learning_rate": 2.9833375410249938e-05, "loss": 0.097, "step": 110 }, { "epoch": 0.03029537995455693, "grad_norm": 1.5321495532989502, "learning_rate": 2.981822772027266e-05, "loss": 0.1218, "step": 120 }, { "epoch": 0.03281999495077001, "grad_norm": 0.7771267890930176, "learning_rate": 2.980308003029538e-05, "loss": 0.1044, "step": 130 }, { "epoch": 0.03534460994698309, "grad_norm": 1.137905478477478, "learning_rate": 2.9787932340318103e-05, "loss": 0.1219, "step": 140 }, { "epoch": 0.03786922494319616, "grad_norm": 0.6131516098976135, "learning_rate": 2.9772784650340825e-05, "loss": 0.1163, "step": 150 }, { "epoch": 0.04039383993940924, "grad_norm": 0.6713071465492249, "learning_rate": 2.9757636960363543e-05, "loss": 0.1046, "step": 160 }, { "epoch": 0.04291845493562232, "grad_norm": 0.7513856887817383, "learning_rate": 2.9742489270386268e-05, "loss": 0.1186, "step": 170 }, { "epoch": 0.045443069931835396, "grad_norm": 0.9550772905349731, "learning_rate": 2.972734158040899e-05, "loss": 0.1053, "step": 180 }, { "epoch": 0.047967684928048475, "grad_norm": 1.595443606376648, "learning_rate": 2.9712193890431708e-05, "loss": 0.1229, "step": 190 }, { "epoch": 0.05049229992426155, "grad_norm": 0.4370063543319702, "learning_rate": 2.9697046200454433e-05, "loss": 0.1324, "step": 200 }, { "epoch": 0.053016914920474625, "grad_norm": 1.201798439025879, "learning_rate": 2.9681898510477154e-05, "loss": 0.1222, "step": 210 }, { "epoch": 0.055541529916687704, "grad_norm": 0.6518549919128418, "learning_rate": 2.9666750820499872e-05, "loss": 0.1039, "step": 220 }, { "epoch": 0.05806614491290078, "grad_norm": 0.9148812294006348, "learning_rate": 2.9651603130522597e-05, "loss": 0.1213, "step": 230 }, { "epoch": 0.06059075990911386, "grad_norm": 0.9625220894813538, "learning_rate": 2.963645544054532e-05, "loss": 0.0982, "step": 240 }, { "epoch": 0.06311537490532694, "grad_norm": 1.0822298526763916, "learning_rate": 2.9621307750568037e-05, "loss": 0.1058, "step": 250 }, { "epoch": 0.06563998990154002, "grad_norm": 0.5486002564430237, "learning_rate": 2.9606160060590762e-05, "loss": 0.1103, "step": 260 }, { "epoch": 0.0681646048977531, "grad_norm": 0.6268118619918823, "learning_rate": 2.9591012370613484e-05, "loss": 0.1082, "step": 270 }, { "epoch": 0.07068921989396618, "grad_norm": 0.950860321521759, "learning_rate": 2.9575864680636202e-05, "loss": 0.1153, "step": 280 }, { "epoch": 0.07321383489017924, "grad_norm": 0.5327135324478149, "learning_rate": 2.9560716990658924e-05, "loss": 0.1022, "step": 290 }, { "epoch": 0.07573844988639232, "grad_norm": 0.5201209187507629, "learning_rate": 2.954556930068165e-05, "loss": 0.1222, "step": 300 }, { "epoch": 0.0782630648826054, "grad_norm": 0.5655984282493591, "learning_rate": 2.9530421610704367e-05, "loss": 0.0962, "step": 310 }, { "epoch": 0.08078767987881848, "grad_norm": 0.48904240131378174, "learning_rate": 2.951527392072709e-05, "loss": 0.0881, "step": 320 }, { "epoch": 0.08331229487503156, "grad_norm": 0.8683993220329285, "learning_rate": 2.9500126230749813e-05, "loss": 0.1149, "step": 330 }, { "epoch": 0.08583690987124463, "grad_norm": 0.8078688383102417, "learning_rate": 2.948497854077253e-05, "loss": 0.0921, "step": 340 }, { "epoch": 0.08836152486745771, "grad_norm": 2.072779893875122, "learning_rate": 2.9469830850795253e-05, "loss": 0.0912, "step": 350 }, { "epoch": 0.09088613986367079, "grad_norm": 0.41440829634666443, "learning_rate": 2.9454683160817978e-05, "loss": 0.094, "step": 360 }, { "epoch": 0.09341075485988387, "grad_norm": 0.5622707605361938, "learning_rate": 2.9439535470840696e-05, "loss": 0.098, "step": 370 }, { "epoch": 0.09593536985609695, "grad_norm": 0.9337944388389587, "learning_rate": 2.9424387780863418e-05, "loss": 0.0998, "step": 380 }, { "epoch": 0.09845998485231003, "grad_norm": 0.6288129091262817, "learning_rate": 2.9409240090886143e-05, "loss": 0.1113, "step": 390 }, { "epoch": 0.1009845998485231, "grad_norm": 0.5012751817703247, "learning_rate": 2.939409240090886e-05, "loss": 0.1035, "step": 400 }, { "epoch": 0.10350921484473617, "grad_norm": 0.5585261583328247, "learning_rate": 2.9378944710931583e-05, "loss": 0.1073, "step": 410 }, { "epoch": 0.10603382984094925, "grad_norm": 0.8108246922492981, "learning_rate": 2.9363797020954308e-05, "loss": 0.0965, "step": 420 }, { "epoch": 0.10855844483716233, "grad_norm": 0.9611131548881531, "learning_rate": 2.9348649330977026e-05, "loss": 0.1109, "step": 430 }, { "epoch": 0.11108305983337541, "grad_norm": 0.6837782859802246, "learning_rate": 2.9333501640999748e-05, "loss": 0.1084, "step": 440 }, { "epoch": 0.11360767482958849, "grad_norm": 1.0511689186096191, "learning_rate": 2.931835395102247e-05, "loss": 0.0979, "step": 450 }, { "epoch": 0.11613228982580157, "grad_norm": 1.4842995405197144, "learning_rate": 2.930320626104519e-05, "loss": 0.0915, "step": 460 }, { "epoch": 0.11865690482201464, "grad_norm": 0.948907732963562, "learning_rate": 2.9288058571067912e-05, "loss": 0.0774, "step": 470 }, { "epoch": 0.12118151981822772, "grad_norm": 0.8867286443710327, "learning_rate": 2.9272910881090634e-05, "loss": 0.0858, "step": 480 }, { "epoch": 0.1237061348144408, "grad_norm": 0.5692402720451355, "learning_rate": 2.9257763191113356e-05, "loss": 0.0939, "step": 490 }, { "epoch": 0.12623074981065388, "grad_norm": 0.4716489613056183, "learning_rate": 2.9242615501136077e-05, "loss": 0.108, "step": 500 }, { "epoch": 0.12875536480686695, "grad_norm": 0.4867834448814392, "learning_rate": 2.92274678111588e-05, "loss": 0.0858, "step": 510 }, { "epoch": 0.13127997980308004, "grad_norm": 0.4366174638271332, "learning_rate": 2.921232012118152e-05, "loss": 0.0695, "step": 520 }, { "epoch": 0.1338045947992931, "grad_norm": 1.3339869976043701, "learning_rate": 2.9197172431204242e-05, "loss": 0.0955, "step": 530 }, { "epoch": 0.1363292097955062, "grad_norm": 0.9120462536811829, "learning_rate": 2.9182024741226963e-05, "loss": 0.103, "step": 540 }, { "epoch": 0.13885382479171926, "grad_norm": 0.3417504131793976, "learning_rate": 2.9166877051249685e-05, "loss": 0.0878, "step": 550 }, { "epoch": 0.14137843978793235, "grad_norm": 0.5997135043144226, "learning_rate": 2.9151729361272407e-05, "loss": 0.0944, "step": 560 }, { "epoch": 0.14390305478414542, "grad_norm": 0.7205860018730164, "learning_rate": 2.9136581671295128e-05, "loss": 0.0877, "step": 570 }, { "epoch": 0.14642766978035848, "grad_norm": 0.5023823976516724, "learning_rate": 2.912143398131785e-05, "loss": 0.091, "step": 580 }, { "epoch": 0.14895228477657158, "grad_norm": 0.6862772703170776, "learning_rate": 2.910628629134057e-05, "loss": 0.0964, "step": 590 }, { "epoch": 0.15147689977278464, "grad_norm": 0.7713685035705566, "learning_rate": 2.9091138601363293e-05, "loss": 0.0962, "step": 600 }, { "epoch": 0.15400151476899773, "grad_norm": 0.45930472016334534, "learning_rate": 2.9075990911386015e-05, "loss": 0.1, "step": 610 }, { "epoch": 0.1565261297652108, "grad_norm": 0.6255579590797424, "learning_rate": 2.9060843221408736e-05, "loss": 0.0957, "step": 620 }, { "epoch": 0.1590507447614239, "grad_norm": 0.7254714369773865, "learning_rate": 2.9045695531431458e-05, "loss": 0.1, "step": 630 }, { "epoch": 0.16157535975763695, "grad_norm": 0.9384628534317017, "learning_rate": 2.903054784145418e-05, "loss": 0.0923, "step": 640 }, { "epoch": 0.16409997475385005, "grad_norm": 0.5457489490509033, "learning_rate": 2.90154001514769e-05, "loss": 0.1059, "step": 650 }, { "epoch": 0.1666245897500631, "grad_norm": 0.5641859769821167, "learning_rate": 2.9000252461499623e-05, "loss": 0.0952, "step": 660 }, { "epoch": 0.1691492047462762, "grad_norm": 0.5558760166168213, "learning_rate": 2.8985104771522344e-05, "loss": 0.1043, "step": 670 }, { "epoch": 0.17167381974248927, "grad_norm": 0.6601101756095886, "learning_rate": 2.8969957081545066e-05, "loss": 0.0875, "step": 680 }, { "epoch": 0.17419843473870233, "grad_norm": 0.6963163614273071, "learning_rate": 2.8954809391567787e-05, "loss": 0.0937, "step": 690 }, { "epoch": 0.17672304973491543, "grad_norm": 1.265081524848938, "learning_rate": 2.893966170159051e-05, "loss": 0.0816, "step": 700 }, { "epoch": 0.1792476647311285, "grad_norm": 0.5968114137649536, "learning_rate": 2.892451401161323e-05, "loss": 0.0842, "step": 710 }, { "epoch": 0.18177227972734158, "grad_norm": 0.9470769762992859, "learning_rate": 2.8909366321635952e-05, "loss": 0.0863, "step": 720 }, { "epoch": 0.18429689472355465, "grad_norm": 0.4141634702682495, "learning_rate": 2.8894218631658674e-05, "loss": 0.0825, "step": 730 }, { "epoch": 0.18682150971976774, "grad_norm": 0.5291838645935059, "learning_rate": 2.8879070941681392e-05, "loss": 0.1073, "step": 740 }, { "epoch": 0.1893461247159808, "grad_norm": 0.6476059556007385, "learning_rate": 2.8863923251704117e-05, "loss": 0.0852, "step": 750 }, { "epoch": 0.1918707397121939, "grad_norm": 1.3100022077560425, "learning_rate": 2.884877556172684e-05, "loss": 0.0987, "step": 760 }, { "epoch": 0.19439535470840696, "grad_norm": 0.4951756000518799, "learning_rate": 2.8833627871749557e-05, "loss": 0.0842, "step": 770 }, { "epoch": 0.19691996970462006, "grad_norm": 0.9130496978759766, "learning_rate": 2.8818480181772282e-05, "loss": 0.0877, "step": 780 }, { "epoch": 0.19944458470083312, "grad_norm": 1.0746809244155884, "learning_rate": 2.8803332491795003e-05, "loss": 0.0784, "step": 790 }, { "epoch": 0.2019691996970462, "grad_norm": 0.5604913830757141, "learning_rate": 2.878818480181772e-05, "loss": 0.0991, "step": 800 }, { "epoch": 0.20449381469325928, "grad_norm": 0.512355625629425, "learning_rate": 2.8773037111840447e-05, "loss": 0.0974, "step": 810 }, { "epoch": 0.20701842968947234, "grad_norm": 0.6891873478889465, "learning_rate": 2.8757889421863168e-05, "loss": 0.0803, "step": 820 }, { "epoch": 0.20954304468568544, "grad_norm": 0.4630875289440155, "learning_rate": 2.8742741731885886e-05, "loss": 0.0803, "step": 830 }, { "epoch": 0.2120676596818985, "grad_norm": 0.9669208526611328, "learning_rate": 2.8727594041908608e-05, "loss": 0.1017, "step": 840 }, { "epoch": 0.2145922746781116, "grad_norm": 0.36673077940940857, "learning_rate": 2.8712446351931333e-05, "loss": 0.0864, "step": 850 }, { "epoch": 0.21711688967432466, "grad_norm": 0.587645947933197, "learning_rate": 2.869729866195405e-05, "loss": 0.0832, "step": 860 }, { "epoch": 0.21964150467053775, "grad_norm": 0.7566137909889221, "learning_rate": 2.8682150971976773e-05, "loss": 0.0991, "step": 870 }, { "epoch": 0.22216611966675082, "grad_norm": 0.9127172827720642, "learning_rate": 2.8667003281999498e-05, "loss": 0.0725, "step": 880 }, { "epoch": 0.2246907346629639, "grad_norm": 0.44545140862464905, "learning_rate": 2.8651855592022216e-05, "loss": 0.1114, "step": 890 }, { "epoch": 0.22721534965917697, "grad_norm": 0.6126995086669922, "learning_rate": 2.8636707902044937e-05, "loss": 0.0939, "step": 900 }, { "epoch": 0.22973996465539007, "grad_norm": 0.5383595824241638, "learning_rate": 2.8621560212067662e-05, "loss": 0.0956, "step": 910 }, { "epoch": 0.23226457965160313, "grad_norm": 0.6282172203063965, "learning_rate": 2.860641252209038e-05, "loss": 0.087, "step": 920 }, { "epoch": 0.2347891946478162, "grad_norm": 0.8885302543640137, "learning_rate": 2.8591264832113102e-05, "loss": 0.0951, "step": 930 }, { "epoch": 0.2373138096440293, "grad_norm": 0.7967470288276672, "learning_rate": 2.8576117142135827e-05, "loss": 0.1023, "step": 940 }, { "epoch": 0.23983842464024235, "grad_norm": 0.7743595838546753, "learning_rate": 2.8560969452158545e-05, "loss": 0.0797, "step": 950 }, { "epoch": 0.24236303963645545, "grad_norm": 0.5401091575622559, "learning_rate": 2.8545821762181267e-05, "loss": 0.1063, "step": 960 }, { "epoch": 0.2448876546326685, "grad_norm": 0.8554522395133972, "learning_rate": 2.8530674072203992e-05, "loss": 0.0833, "step": 970 }, { "epoch": 0.2474122696288816, "grad_norm": 0.7112722396850586, "learning_rate": 2.851552638222671e-05, "loss": 0.0891, "step": 980 }, { "epoch": 0.24993688462509467, "grad_norm": 0.5287074446678162, "learning_rate": 2.8500378692249432e-05, "loss": 0.0877, "step": 990 }, { "epoch": 0.25246149962130776, "grad_norm": 0.6009781956672668, "learning_rate": 2.8485231002272157e-05, "loss": 0.0882, "step": 1000 }, { "epoch": 0.25498611461752085, "grad_norm": 0.3697584271430969, "learning_rate": 2.8470083312294875e-05, "loss": 0.0659, "step": 1010 }, { "epoch": 0.2575107296137339, "grad_norm": 0.6940627694129944, "learning_rate": 2.8454935622317597e-05, "loss": 0.0897, "step": 1020 }, { "epoch": 0.260035344609947, "grad_norm": 0.43336808681488037, "learning_rate": 2.8439787932340318e-05, "loss": 0.0721, "step": 1030 }, { "epoch": 0.2625599596061601, "grad_norm": 0.7710297703742981, "learning_rate": 2.842464024236304e-05, "loss": 0.0886, "step": 1040 }, { "epoch": 0.2650845746023731, "grad_norm": 1.1904844045639038, "learning_rate": 2.840949255238576e-05, "loss": 0.0903, "step": 1050 }, { "epoch": 0.2676091895985862, "grad_norm": 0.6289048194885254, "learning_rate": 2.8394344862408483e-05, "loss": 0.0782, "step": 1060 }, { "epoch": 0.2701338045947993, "grad_norm": 0.5819368958473206, "learning_rate": 2.8379197172431205e-05, "loss": 0.0841, "step": 1070 }, { "epoch": 0.2726584195910124, "grad_norm": 0.9544229507446289, "learning_rate": 2.8364049482453926e-05, "loss": 0.0813, "step": 1080 }, { "epoch": 0.27518303458722543, "grad_norm": 0.5838118195533752, "learning_rate": 2.8348901792476648e-05, "loss": 0.1027, "step": 1090 }, { "epoch": 0.2777076495834385, "grad_norm": 0.5665274858474731, "learning_rate": 2.833375410249937e-05, "loss": 0.0932, "step": 1100 }, { "epoch": 0.2802322645796516, "grad_norm": 0.7049827575683594, "learning_rate": 2.831860641252209e-05, "loss": 0.0895, "step": 1110 }, { "epoch": 0.2827568795758647, "grad_norm": 0.7130704522132874, "learning_rate": 2.8303458722544813e-05, "loss": 0.0922, "step": 1120 }, { "epoch": 0.28528149457207774, "grad_norm": 0.606171727180481, "learning_rate": 2.8288311032567534e-05, "loss": 0.0763, "step": 1130 }, { "epoch": 0.28780610956829084, "grad_norm": 0.6686602830886841, "learning_rate": 2.8273163342590256e-05, "loss": 0.0853, "step": 1140 }, { "epoch": 0.29033072456450393, "grad_norm": 0.555953860282898, "learning_rate": 2.8258015652612977e-05, "loss": 0.0844, "step": 1150 }, { "epoch": 0.29285533956071697, "grad_norm": 0.763724684715271, "learning_rate": 2.82428679626357e-05, "loss": 0.0927, "step": 1160 }, { "epoch": 0.29537995455693006, "grad_norm": 0.6845389008522034, "learning_rate": 2.822772027265842e-05, "loss": 0.082, "step": 1170 }, { "epoch": 0.29790456955314315, "grad_norm": 0.5240347385406494, "learning_rate": 2.8212572582681142e-05, "loss": 0.0791, "step": 1180 }, { "epoch": 0.30042918454935624, "grad_norm": 0.7150965332984924, "learning_rate": 2.8197424892703864e-05, "loss": 0.0898, "step": 1190 }, { "epoch": 0.3029537995455693, "grad_norm": 0.45540598034858704, "learning_rate": 2.8182277202726585e-05, "loss": 0.0802, "step": 1200 }, { "epoch": 0.3054784145417824, "grad_norm": 0.8016244173049927, "learning_rate": 2.8167129512749307e-05, "loss": 0.078, "step": 1210 }, { "epoch": 0.30800302953799547, "grad_norm": 0.5537816286087036, "learning_rate": 2.815198182277203e-05, "loss": 0.0809, "step": 1220 }, { "epoch": 0.31052764453420856, "grad_norm": 0.6857221722602844, "learning_rate": 2.813683413279475e-05, "loss": 0.0905, "step": 1230 }, { "epoch": 0.3130522595304216, "grad_norm": 0.5408879518508911, "learning_rate": 2.8121686442817472e-05, "loss": 0.0829, "step": 1240 }, { "epoch": 0.3155768745266347, "grad_norm": 0.3268249034881592, "learning_rate": 2.8106538752840193e-05, "loss": 0.0928, "step": 1250 }, { "epoch": 0.3181014895228478, "grad_norm": 0.5064918398857117, "learning_rate": 2.8091391062862915e-05, "loss": 0.0805, "step": 1260 }, { "epoch": 0.3206261045190608, "grad_norm": 1.134167194366455, "learning_rate": 2.8076243372885636e-05, "loss": 0.091, "step": 1270 }, { "epoch": 0.3231507195152739, "grad_norm": 0.6449709534645081, "learning_rate": 2.8061095682908358e-05, "loss": 0.0926, "step": 1280 }, { "epoch": 0.325675334511487, "grad_norm": 0.4470975399017334, "learning_rate": 2.804594799293108e-05, "loss": 0.0888, "step": 1290 }, { "epoch": 0.3281999495077001, "grad_norm": 0.6847784519195557, "learning_rate": 2.80308003029538e-05, "loss": 0.0872, "step": 1300 }, { "epoch": 0.33072456450391313, "grad_norm": 0.5377852320671082, "learning_rate": 2.8015652612976523e-05, "loss": 0.0802, "step": 1310 }, { "epoch": 0.3332491795001262, "grad_norm": 0.7764335870742798, "learning_rate": 2.800050492299924e-05, "loss": 0.083, "step": 1320 }, { "epoch": 0.3357737944963393, "grad_norm": 0.6579951047897339, "learning_rate": 2.7985357233021966e-05, "loss": 0.0828, "step": 1330 }, { "epoch": 0.3382984094925524, "grad_norm": 0.7836153507232666, "learning_rate": 2.7970209543044688e-05, "loss": 0.0716, "step": 1340 }, { "epoch": 0.34082302448876545, "grad_norm": 1.0645248889923096, "learning_rate": 2.7955061853067406e-05, "loss": 0.0828, "step": 1350 }, { "epoch": 0.34334763948497854, "grad_norm": 0.4852674603462219, "learning_rate": 2.7939914163090127e-05, "loss": 0.0797, "step": 1360 }, { "epoch": 0.34587225448119163, "grad_norm": 0.7467886805534363, "learning_rate": 2.7924766473112852e-05, "loss": 0.0865, "step": 1370 }, { "epoch": 0.34839686947740467, "grad_norm": 0.8338529467582703, "learning_rate": 2.790961878313557e-05, "loss": 0.075, "step": 1380 }, { "epoch": 0.35092148447361776, "grad_norm": 0.9489770531654358, "learning_rate": 2.7894471093158292e-05, "loss": 0.0799, "step": 1390 }, { "epoch": 0.35344609946983085, "grad_norm": 0.4182811677455902, "learning_rate": 2.7879323403181017e-05, "loss": 0.0953, "step": 1400 }, { "epoch": 0.35597071446604395, "grad_norm": 0.472494900226593, "learning_rate": 2.7864175713203735e-05, "loss": 0.0939, "step": 1410 }, { "epoch": 0.358495329462257, "grad_norm": 0.6279084086418152, "learning_rate": 2.7849028023226457e-05, "loss": 0.0977, "step": 1420 }, { "epoch": 0.3610199444584701, "grad_norm": 0.5314123630523682, "learning_rate": 2.7833880333249182e-05, "loss": 0.0905, "step": 1430 }, { "epoch": 0.36354455945468317, "grad_norm": 0.3234538435935974, "learning_rate": 2.78187326432719e-05, "loss": 0.0792, "step": 1440 }, { "epoch": 0.36606917445089626, "grad_norm": 0.7543266415596008, "learning_rate": 2.7803584953294622e-05, "loss": 0.0828, "step": 1450 }, { "epoch": 0.3685937894471093, "grad_norm": 0.38049548864364624, "learning_rate": 2.7788437263317347e-05, "loss": 0.0783, "step": 1460 }, { "epoch": 0.3711184044433224, "grad_norm": 0.3925035893917084, "learning_rate": 2.7773289573340065e-05, "loss": 0.0738, "step": 1470 }, { "epoch": 0.3736430194395355, "grad_norm": 0.6012548208236694, "learning_rate": 2.7758141883362787e-05, "loss": 0.0767, "step": 1480 }, { "epoch": 0.3761676344357485, "grad_norm": 0.6800134181976318, "learning_rate": 2.774299419338551e-05, "loss": 0.0778, "step": 1490 }, { "epoch": 0.3786922494319616, "grad_norm": 0.561687707901001, "learning_rate": 2.772784650340823e-05, "loss": 0.079, "step": 1500 }, { "epoch": 0.3812168644281747, "grad_norm": 0.48705849051475525, "learning_rate": 2.771269881343095e-05, "loss": 0.0813, "step": 1510 }, { "epoch": 0.3837414794243878, "grad_norm": 0.5197842121124268, "learning_rate": 2.7697551123453676e-05, "loss": 0.0872, "step": 1520 }, { "epoch": 0.38626609442060084, "grad_norm": 0.476550430059433, "learning_rate": 2.7682403433476395e-05, "loss": 0.0684, "step": 1530 }, { "epoch": 0.38879070941681393, "grad_norm": 0.7136000394821167, "learning_rate": 2.7667255743499116e-05, "loss": 0.0787, "step": 1540 }, { "epoch": 0.391315324413027, "grad_norm": 0.8119826912879944, "learning_rate": 2.765210805352184e-05, "loss": 0.0834, "step": 1550 }, { "epoch": 0.3938399394092401, "grad_norm": 0.7646836638450623, "learning_rate": 2.763696036354456e-05, "loss": 0.0867, "step": 1560 }, { "epoch": 0.39636455440545315, "grad_norm": 0.5930790901184082, "learning_rate": 2.762181267356728e-05, "loss": 0.0731, "step": 1570 }, { "epoch": 0.39888916940166624, "grad_norm": 0.3663583993911743, "learning_rate": 2.7606664983590006e-05, "loss": 0.0727, "step": 1580 }, { "epoch": 0.40141378439787934, "grad_norm": 0.3528522551059723, "learning_rate": 2.7591517293612724e-05, "loss": 0.0779, "step": 1590 }, { "epoch": 0.4039383993940924, "grad_norm": 0.3986479938030243, "learning_rate": 2.7576369603635446e-05, "loss": 0.0876, "step": 1600 }, { "epoch": 0.40646301439030547, "grad_norm": 0.5565474033355713, "learning_rate": 2.7561221913658167e-05, "loss": 0.0884, "step": 1610 }, { "epoch": 0.40898762938651856, "grad_norm": 0.49433985352516174, "learning_rate": 2.754607422368089e-05, "loss": 0.0772, "step": 1620 }, { "epoch": 0.41151224438273165, "grad_norm": 0.6990385055541992, "learning_rate": 2.753092653370361e-05, "loss": 0.0718, "step": 1630 }, { "epoch": 0.4140368593789447, "grad_norm": 1.0656036138534546, "learning_rate": 2.7515778843726332e-05, "loss": 0.071, "step": 1640 }, { "epoch": 0.4165614743751578, "grad_norm": 0.4659973978996277, "learning_rate": 2.7500631153749054e-05, "loss": 0.0885, "step": 1650 }, { "epoch": 0.4190860893713709, "grad_norm": 0.6749240159988403, "learning_rate": 2.7485483463771775e-05, "loss": 0.0854, "step": 1660 }, { "epoch": 0.42161070436758397, "grad_norm": 0.4509848952293396, "learning_rate": 2.7470335773794497e-05, "loss": 0.082, "step": 1670 }, { "epoch": 0.424135319363797, "grad_norm": 0.6541846394538879, "learning_rate": 2.745518808381722e-05, "loss": 0.0989, "step": 1680 }, { "epoch": 0.4266599343600101, "grad_norm": 0.603756308555603, "learning_rate": 2.744004039383994e-05, "loss": 0.0834, "step": 1690 }, { "epoch": 0.4291845493562232, "grad_norm": 0.4919886589050293, "learning_rate": 2.742489270386266e-05, "loss": 0.0847, "step": 1700 }, { "epoch": 0.4317091643524363, "grad_norm": 0.8659531474113464, "learning_rate": 2.7409745013885383e-05, "loss": 0.0878, "step": 1710 }, { "epoch": 0.4342337793486493, "grad_norm": 0.6441717743873596, "learning_rate": 2.7394597323908105e-05, "loss": 0.0704, "step": 1720 }, { "epoch": 0.4367583943448624, "grad_norm": 0.5323107838630676, "learning_rate": 2.7379449633930826e-05, "loss": 0.0676, "step": 1730 }, { "epoch": 0.4392830093410755, "grad_norm": 0.5645779967308044, "learning_rate": 2.7364301943953548e-05, "loss": 0.0745, "step": 1740 }, { "epoch": 0.44180762433728854, "grad_norm": 0.6338810920715332, "learning_rate": 2.734915425397627e-05, "loss": 0.083, "step": 1750 }, { "epoch": 0.44433223933350163, "grad_norm": 0.8070118427276611, "learning_rate": 2.733400656399899e-05, "loss": 0.0679, "step": 1760 }, { "epoch": 0.4468568543297147, "grad_norm": 0.5226871967315674, "learning_rate": 2.7318858874021713e-05, "loss": 0.0966, "step": 1770 }, { "epoch": 0.4493814693259278, "grad_norm": 0.4776107668876648, "learning_rate": 2.7303711184044434e-05, "loss": 0.0861, "step": 1780 }, { "epoch": 0.45190608432214086, "grad_norm": 0.5352398753166199, "learning_rate": 2.7288563494067156e-05, "loss": 0.0743, "step": 1790 }, { "epoch": 0.45443069931835395, "grad_norm": 0.6204835772514343, "learning_rate": 2.7273415804089878e-05, "loss": 0.0852, "step": 1800 }, { "epoch": 0.45695531431456704, "grad_norm": 0.7186635732650757, "learning_rate": 2.72582681141126e-05, "loss": 0.0832, "step": 1810 }, { "epoch": 0.45947992931078013, "grad_norm": 0.3791876435279846, "learning_rate": 2.724312042413532e-05, "loss": 0.083, "step": 1820 }, { "epoch": 0.46200454430699317, "grad_norm": 0.6428934931755066, "learning_rate": 2.7227972734158042e-05, "loss": 0.0872, "step": 1830 }, { "epoch": 0.46452915930320626, "grad_norm": 0.3516116440296173, "learning_rate": 2.7212825044180764e-05, "loss": 0.0659, "step": 1840 }, { "epoch": 0.46705377429941936, "grad_norm": 0.44267144799232483, "learning_rate": 2.7197677354203486e-05, "loss": 0.0873, "step": 1850 }, { "epoch": 0.4695783892956324, "grad_norm": 0.5157018899917603, "learning_rate": 2.7182529664226207e-05, "loss": 0.0841, "step": 1860 }, { "epoch": 0.4721030042918455, "grad_norm": 0.39743056893348694, "learning_rate": 2.716738197424893e-05, "loss": 0.0751, "step": 1870 }, { "epoch": 0.4746276192880586, "grad_norm": 0.9152094721794128, "learning_rate": 2.715223428427165e-05, "loss": 0.0809, "step": 1880 }, { "epoch": 0.47715223428427167, "grad_norm": 0.5350621342658997, "learning_rate": 2.7137086594294372e-05, "loss": 0.0792, "step": 1890 }, { "epoch": 0.4796768492804847, "grad_norm": 0.6785259246826172, "learning_rate": 2.712193890431709e-05, "loss": 0.0932, "step": 1900 }, { "epoch": 0.4822014642766978, "grad_norm": 0.5591861605644226, "learning_rate": 2.7106791214339812e-05, "loss": 0.0875, "step": 1910 }, { "epoch": 0.4847260792729109, "grad_norm": 0.4783095419406891, "learning_rate": 2.7091643524362537e-05, "loss": 0.0953, "step": 1920 }, { "epoch": 0.487250694269124, "grad_norm": 0.3992745578289032, "learning_rate": 2.7076495834385255e-05, "loss": 0.0897, "step": 1930 }, { "epoch": 0.489775309265337, "grad_norm": 0.5237228870391846, "learning_rate": 2.7061348144407977e-05, "loss": 0.0735, "step": 1940 }, { "epoch": 0.4922999242615501, "grad_norm": 0.43609362840652466, "learning_rate": 2.70462004544307e-05, "loss": 0.0885, "step": 1950 }, { "epoch": 0.4948245392577632, "grad_norm": 0.9206698536872864, "learning_rate": 2.703105276445342e-05, "loss": 0.0861, "step": 1960 }, { "epoch": 0.49734915425397624, "grad_norm": 0.9243408441543579, "learning_rate": 2.701590507447614e-05, "loss": 0.0923, "step": 1970 }, { "epoch": 0.49987376925018934, "grad_norm": 0.7312402725219727, "learning_rate": 2.7000757384498866e-05, "loss": 0.0856, "step": 1980 }, { "epoch": 0.5023983842464024, "grad_norm": 0.40028661489486694, "learning_rate": 2.6985609694521585e-05, "loss": 0.0676, "step": 1990 }, { "epoch": 0.5049229992426155, "grad_norm": 0.4305866062641144, "learning_rate": 2.6970462004544306e-05, "loss": 0.0744, "step": 2000 }, { "epoch": 0.5074476142388286, "grad_norm": 0.30411994457244873, "learning_rate": 2.695531431456703e-05, "loss": 0.064, "step": 2010 }, { "epoch": 0.5099722292350417, "grad_norm": 0.44871994853019714, "learning_rate": 2.694016662458975e-05, "loss": 0.0883, "step": 2020 }, { "epoch": 0.5124968442312547, "grad_norm": 0.8206811547279358, "learning_rate": 2.692501893461247e-05, "loss": 0.0803, "step": 2030 }, { "epoch": 0.5150214592274678, "grad_norm": 0.6455752849578857, "learning_rate": 2.6909871244635196e-05, "loss": 0.0951, "step": 2040 }, { "epoch": 0.5175460742236809, "grad_norm": 0.5514742732048035, "learning_rate": 2.6894723554657914e-05, "loss": 0.0855, "step": 2050 }, { "epoch": 0.520070689219894, "grad_norm": 0.3301815688610077, "learning_rate": 2.6879575864680636e-05, "loss": 0.0916, "step": 2060 }, { "epoch": 0.522595304216107, "grad_norm": 0.4964681565761566, "learning_rate": 2.686442817470336e-05, "loss": 0.0794, "step": 2070 }, { "epoch": 0.5251199192123202, "grad_norm": 0.6347307562828064, "learning_rate": 2.684928048472608e-05, "loss": 0.0756, "step": 2080 }, { "epoch": 0.5276445342085332, "grad_norm": 0.7548183798789978, "learning_rate": 2.68341327947488e-05, "loss": 0.0781, "step": 2090 }, { "epoch": 0.5301691492047462, "grad_norm": 0.43172019720077515, "learning_rate": 2.6818985104771525e-05, "loss": 0.0674, "step": 2100 }, { "epoch": 0.5326937642009594, "grad_norm": 0.8760651350021362, "learning_rate": 2.6803837414794244e-05, "loss": 0.0815, "step": 2110 }, { "epoch": 0.5352183791971724, "grad_norm": 0.795153021812439, "learning_rate": 2.6788689724816965e-05, "loss": 0.0828, "step": 2120 }, { "epoch": 0.5377429941933856, "grad_norm": 0.7313557267189026, "learning_rate": 2.677354203483969e-05, "loss": 0.0983, "step": 2130 }, { "epoch": 0.5402676091895986, "grad_norm": 0.4594115912914276, "learning_rate": 2.675839434486241e-05, "loss": 0.0705, "step": 2140 }, { "epoch": 0.5427922241858116, "grad_norm": 0.5128085613250732, "learning_rate": 2.674324665488513e-05, "loss": 0.0823, "step": 2150 }, { "epoch": 0.5453168391820248, "grad_norm": 0.7316603064537048, "learning_rate": 2.6728098964907855e-05, "loss": 0.0736, "step": 2160 }, { "epoch": 0.5478414541782378, "grad_norm": 0.47318318486213684, "learning_rate": 2.6712951274930573e-05, "loss": 0.0714, "step": 2170 }, { "epoch": 0.5503660691744509, "grad_norm": 0.669712245464325, "learning_rate": 2.6697803584953295e-05, "loss": 0.0787, "step": 2180 }, { "epoch": 0.552890684170664, "grad_norm": 0.5920892357826233, "learning_rate": 2.6682655894976016e-05, "loss": 0.0787, "step": 2190 }, { "epoch": 0.555415299166877, "grad_norm": 0.3415123522281647, "learning_rate": 2.6667508204998738e-05, "loss": 0.0824, "step": 2200 }, { "epoch": 0.5579399141630901, "grad_norm": 0.37938541173934937, "learning_rate": 2.665236051502146e-05, "loss": 0.0749, "step": 2210 }, { "epoch": 0.5604645291593032, "grad_norm": 0.6105532646179199, "learning_rate": 2.663721282504418e-05, "loss": 0.0832, "step": 2220 }, { "epoch": 0.5629891441555163, "grad_norm": 0.6222105026245117, "learning_rate": 2.6622065135066903e-05, "loss": 0.0662, "step": 2230 }, { "epoch": 0.5655137591517294, "grad_norm": 0.7301434874534607, "learning_rate": 2.6606917445089624e-05, "loss": 0.082, "step": 2240 }, { "epoch": 0.5680383741479424, "grad_norm": 0.36286047101020813, "learning_rate": 2.6591769755112346e-05, "loss": 0.0823, "step": 2250 }, { "epoch": 0.5705629891441555, "grad_norm": 0.483547180891037, "learning_rate": 2.6576622065135068e-05, "loss": 0.0882, "step": 2260 }, { "epoch": 0.5730876041403686, "grad_norm": 0.3633301854133606, "learning_rate": 2.656147437515779e-05, "loss": 0.0847, "step": 2270 }, { "epoch": 0.5756122191365817, "grad_norm": 0.44909363985061646, "learning_rate": 2.654632668518051e-05, "loss": 0.0886, "step": 2280 }, { "epoch": 0.5781368341327947, "grad_norm": 0.5327022671699524, "learning_rate": 2.6531178995203232e-05, "loss": 0.0714, "step": 2290 }, { "epoch": 0.5806614491290079, "grad_norm": 0.7483319044113159, "learning_rate": 2.6516031305225954e-05, "loss": 0.0806, "step": 2300 }, { "epoch": 0.5831860641252209, "grad_norm": 0.5000852346420288, "learning_rate": 2.6500883615248676e-05, "loss": 0.0725, "step": 2310 }, { "epoch": 0.5857106791214339, "grad_norm": 0.3488561809062958, "learning_rate": 2.6485735925271397e-05, "loss": 0.0653, "step": 2320 }, { "epoch": 0.5882352941176471, "grad_norm": 0.32613256573677063, "learning_rate": 2.647058823529412e-05, "loss": 0.0682, "step": 2330 }, { "epoch": 0.5907599091138601, "grad_norm": 0.8107202649116516, "learning_rate": 2.645544054531684e-05, "loss": 0.0848, "step": 2340 }, { "epoch": 0.5932845241100733, "grad_norm": 0.6575340628623962, "learning_rate": 2.6440292855339562e-05, "loss": 0.1013, "step": 2350 }, { "epoch": 0.5958091391062863, "grad_norm": 0.43987488746643066, "learning_rate": 2.6425145165362283e-05, "loss": 0.0749, "step": 2360 }, { "epoch": 0.5983337541024993, "grad_norm": 0.9867390394210815, "learning_rate": 2.6409997475385005e-05, "loss": 0.0859, "step": 2370 }, { "epoch": 0.6008583690987125, "grad_norm": 0.5467984080314636, "learning_rate": 2.6394849785407727e-05, "loss": 0.0743, "step": 2380 }, { "epoch": 0.6033829840949255, "grad_norm": 0.42555686831474304, "learning_rate": 2.6379702095430448e-05, "loss": 0.0906, "step": 2390 }, { "epoch": 0.6059075990911386, "grad_norm": 0.33940818905830383, "learning_rate": 2.636455440545317e-05, "loss": 0.0734, "step": 2400 }, { "epoch": 0.6084322140873517, "grad_norm": 0.4564431309700012, "learning_rate": 2.634940671547589e-05, "loss": 0.0687, "step": 2410 }, { "epoch": 0.6109568290835647, "grad_norm": 0.591241180896759, "learning_rate": 2.6334259025498613e-05, "loss": 0.0701, "step": 2420 }, { "epoch": 0.6134814440797778, "grad_norm": 0.49482131004333496, "learning_rate": 2.6319111335521335e-05, "loss": 0.0741, "step": 2430 }, { "epoch": 0.6160060590759909, "grad_norm": 0.32807767391204834, "learning_rate": 2.6303963645544056e-05, "loss": 0.0836, "step": 2440 }, { "epoch": 0.618530674072204, "grad_norm": 0.3978806734085083, "learning_rate": 2.6288815955566778e-05, "loss": 0.0796, "step": 2450 }, { "epoch": 0.6210552890684171, "grad_norm": 0.3510526418685913, "learning_rate": 2.6273668265589496e-05, "loss": 0.0735, "step": 2460 }, { "epoch": 0.6235799040646302, "grad_norm": 0.42759060859680176, "learning_rate": 2.625852057561222e-05, "loss": 0.0703, "step": 2470 }, { "epoch": 0.6261045190608432, "grad_norm": 0.7114939093589783, "learning_rate": 2.624337288563494e-05, "loss": 0.0843, "step": 2480 }, { "epoch": 0.6286291340570563, "grad_norm": 0.7506195306777954, "learning_rate": 2.622822519565766e-05, "loss": 0.0835, "step": 2490 }, { "epoch": 0.6311537490532694, "grad_norm": 0.41576099395751953, "learning_rate": 2.6213077505680386e-05, "loss": 0.0788, "step": 2500 }, { "epoch": 0.6336783640494824, "grad_norm": 0.5157233476638794, "learning_rate": 2.6197929815703104e-05, "loss": 0.0765, "step": 2510 }, { "epoch": 0.6362029790456956, "grad_norm": 0.532408595085144, "learning_rate": 2.6182782125725826e-05, "loss": 0.0861, "step": 2520 }, { "epoch": 0.6387275940419086, "grad_norm": 0.4434366822242737, "learning_rate": 2.616763443574855e-05, "loss": 0.0619, "step": 2530 }, { "epoch": 0.6412522090381216, "grad_norm": 0.5986734628677368, "learning_rate": 2.615248674577127e-05, "loss": 0.0902, "step": 2540 }, { "epoch": 0.6437768240343348, "grad_norm": 0.4010680615901947, "learning_rate": 2.613733905579399e-05, "loss": 0.0894, "step": 2550 }, { "epoch": 0.6463014390305478, "grad_norm": 0.9059281349182129, "learning_rate": 2.6122191365816715e-05, "loss": 0.0713, "step": 2560 }, { "epoch": 0.648826054026761, "grad_norm": 0.5254572033882141, "learning_rate": 2.6107043675839434e-05, "loss": 0.0818, "step": 2570 }, { "epoch": 0.651350669022974, "grad_norm": 0.3511335253715515, "learning_rate": 2.6091895985862155e-05, "loss": 0.0775, "step": 2580 }, { "epoch": 0.653875284019187, "grad_norm": 0.39499038457870483, "learning_rate": 2.607674829588488e-05, "loss": 0.0738, "step": 2590 }, { "epoch": 0.6563998990154002, "grad_norm": 0.4215822219848633, "learning_rate": 2.60616006059076e-05, "loss": 0.0704, "step": 2600 }, { "epoch": 0.6589245140116132, "grad_norm": 0.5732513070106506, "learning_rate": 2.604645291593032e-05, "loss": 0.066, "step": 2610 }, { "epoch": 0.6614491290078263, "grad_norm": 0.6704108715057373, "learning_rate": 2.6031305225953045e-05, "loss": 0.0688, "step": 2620 }, { "epoch": 0.6639737440040394, "grad_norm": 0.47199252247810364, "learning_rate": 2.6016157535975763e-05, "loss": 0.0735, "step": 2630 }, { "epoch": 0.6664983590002524, "grad_norm": 0.7372543215751648, "learning_rate": 2.6001009845998485e-05, "loss": 0.0732, "step": 2640 }, { "epoch": 0.6690229739964655, "grad_norm": 0.6057668924331665, "learning_rate": 2.598586215602121e-05, "loss": 0.0812, "step": 2650 }, { "epoch": 0.6715475889926786, "grad_norm": 0.5473082065582275, "learning_rate": 2.5970714466043928e-05, "loss": 0.0792, "step": 2660 }, { "epoch": 0.6740722039888917, "grad_norm": 0.5566405057907104, "learning_rate": 2.595556677606665e-05, "loss": 0.0784, "step": 2670 }, { "epoch": 0.6765968189851048, "grad_norm": 0.41001176834106445, "learning_rate": 2.5940419086089375e-05, "loss": 0.0836, "step": 2680 }, { "epoch": 0.6791214339813179, "grad_norm": 0.7479286789894104, "learning_rate": 2.5925271396112093e-05, "loss": 0.0766, "step": 2690 }, { "epoch": 0.6816460489775309, "grad_norm": 0.5025691390037537, "learning_rate": 2.5910123706134814e-05, "loss": 0.0751, "step": 2700 }, { "epoch": 0.684170663973744, "grad_norm": 0.5500156283378601, "learning_rate": 2.589497601615754e-05, "loss": 0.0838, "step": 2710 }, { "epoch": 0.6866952789699571, "grad_norm": 0.5829827785491943, "learning_rate": 2.5879828326180258e-05, "loss": 0.0836, "step": 2720 }, { "epoch": 0.6892198939661701, "grad_norm": 0.4228859543800354, "learning_rate": 2.586468063620298e-05, "loss": 0.0784, "step": 2730 }, { "epoch": 0.6917445089623833, "grad_norm": 0.46277573704719543, "learning_rate": 2.5849532946225704e-05, "loss": 0.0859, "step": 2740 }, { "epoch": 0.6942691239585963, "grad_norm": 0.446417897939682, "learning_rate": 2.5834385256248422e-05, "loss": 0.07, "step": 2750 }, { "epoch": 0.6967937389548093, "grad_norm": 0.5821120738983154, "learning_rate": 2.5819237566271144e-05, "loss": 0.0847, "step": 2760 }, { "epoch": 0.6993183539510225, "grad_norm": 0.4102267622947693, "learning_rate": 2.5804089876293865e-05, "loss": 0.0724, "step": 2770 }, { "epoch": 0.7018429689472355, "grad_norm": 0.6494462490081787, "learning_rate": 2.5788942186316587e-05, "loss": 0.0787, "step": 2780 }, { "epoch": 0.7043675839434487, "grad_norm": 0.45610910654067993, "learning_rate": 2.577379449633931e-05, "loss": 0.0789, "step": 2790 }, { "epoch": 0.7068921989396617, "grad_norm": 0.763862133026123, "learning_rate": 2.575864680636203e-05, "loss": 0.0691, "step": 2800 }, { "epoch": 0.7094168139358747, "grad_norm": 0.6916351318359375, "learning_rate": 2.5743499116384752e-05, "loss": 0.0714, "step": 2810 }, { "epoch": 0.7119414289320879, "grad_norm": 0.48409590125083923, "learning_rate": 2.5728351426407473e-05, "loss": 0.0705, "step": 2820 }, { "epoch": 0.7144660439283009, "grad_norm": 0.5418862700462341, "learning_rate": 2.5713203736430195e-05, "loss": 0.0785, "step": 2830 }, { "epoch": 0.716990658924514, "grad_norm": 0.384924978017807, "learning_rate": 2.5698056046452917e-05, "loss": 0.0758, "step": 2840 }, { "epoch": 0.7195152739207271, "grad_norm": 0.313711553812027, "learning_rate": 2.5682908356475638e-05, "loss": 0.0784, "step": 2850 }, { "epoch": 0.7220398889169402, "grad_norm": 0.33260729908943176, "learning_rate": 2.566776066649836e-05, "loss": 0.0546, "step": 2860 }, { "epoch": 0.7245645039131532, "grad_norm": 0.8136101365089417, "learning_rate": 2.565261297652108e-05, "loss": 0.0762, "step": 2870 }, { "epoch": 0.7270891189093663, "grad_norm": 0.363004595041275, "learning_rate": 2.5637465286543803e-05, "loss": 0.0853, "step": 2880 }, { "epoch": 0.7296137339055794, "grad_norm": 0.533036470413208, "learning_rate": 2.5622317596566525e-05, "loss": 0.0831, "step": 2890 }, { "epoch": 0.7321383489017925, "grad_norm": 0.5486681461334229, "learning_rate": 2.5607169906589246e-05, "loss": 0.0661, "step": 2900 }, { "epoch": 0.7346629638980056, "grad_norm": 0.4462204873561859, "learning_rate": 2.5592022216611968e-05, "loss": 0.0792, "step": 2910 }, { "epoch": 0.7371875788942186, "grad_norm": 0.39562398195266724, "learning_rate": 2.557687452663469e-05, "loss": 0.0905, "step": 2920 }, { "epoch": 0.7397121938904317, "grad_norm": 0.3897605836391449, "learning_rate": 2.556172683665741e-05, "loss": 0.0817, "step": 2930 }, { "epoch": 0.7422368088866448, "grad_norm": 0.44640547037124634, "learning_rate": 2.5546579146680133e-05, "loss": 0.0666, "step": 2940 }, { "epoch": 0.7447614238828578, "grad_norm": 0.3504391312599182, "learning_rate": 2.5531431456702854e-05, "loss": 0.0765, "step": 2950 }, { "epoch": 0.747286038879071, "grad_norm": 0.7800899147987366, "learning_rate": 2.5516283766725576e-05, "loss": 0.0763, "step": 2960 }, { "epoch": 0.749810653875284, "grad_norm": 0.6443584561347961, "learning_rate": 2.5501136076748297e-05, "loss": 0.0909, "step": 2970 }, { "epoch": 0.752335268871497, "grad_norm": 0.3258838951587677, "learning_rate": 2.548598838677102e-05, "loss": 0.0717, "step": 2980 }, { "epoch": 0.7548598838677102, "grad_norm": 0.545174241065979, "learning_rate": 2.547084069679374e-05, "loss": 0.0675, "step": 2990 }, { "epoch": 0.7573844988639232, "grad_norm": 0.751238226890564, "learning_rate": 2.5455693006816462e-05, "loss": 0.0767, "step": 3000 }, { "epoch": 0.7599091138601364, "grad_norm": 0.5401891469955444, "learning_rate": 2.544054531683918e-05, "loss": 0.0733, "step": 3010 }, { "epoch": 0.7624337288563494, "grad_norm": 0.7195361256599426, "learning_rate": 2.5425397626861905e-05, "loss": 0.0724, "step": 3020 }, { "epoch": 0.7649583438525625, "grad_norm": 0.5175593495368958, "learning_rate": 2.5410249936884627e-05, "loss": 0.0819, "step": 3030 }, { "epoch": 0.7674829588487756, "grad_norm": 0.5216336250305176, "learning_rate": 2.5395102246907345e-05, "loss": 0.0685, "step": 3040 }, { "epoch": 0.7700075738449886, "grad_norm": 0.618516743183136, "learning_rate": 2.537995455693007e-05, "loss": 0.0766, "step": 3050 }, { "epoch": 0.7725321888412017, "grad_norm": 0.6617169380187988, "learning_rate": 2.536480686695279e-05, "loss": 0.0909, "step": 3060 }, { "epoch": 0.7750568038374148, "grad_norm": 0.43062853813171387, "learning_rate": 2.534965917697551e-05, "loss": 0.0671, "step": 3070 }, { "epoch": 0.7775814188336279, "grad_norm": 0.3703617453575134, "learning_rate": 2.5334511486998235e-05, "loss": 0.0685, "step": 3080 }, { "epoch": 0.7801060338298409, "grad_norm": 0.397579550743103, "learning_rate": 2.5319363797020953e-05, "loss": 0.0764, "step": 3090 }, { "epoch": 0.782630648826054, "grad_norm": 0.42598873376846313, "learning_rate": 2.5304216107043675e-05, "loss": 0.0708, "step": 3100 }, { "epoch": 0.7851552638222671, "grad_norm": 0.49477070569992065, "learning_rate": 2.52890684170664e-05, "loss": 0.0755, "step": 3110 }, { "epoch": 0.7876798788184802, "grad_norm": 0.3231316804885864, "learning_rate": 2.5273920727089118e-05, "loss": 0.0702, "step": 3120 }, { "epoch": 0.7902044938146933, "grad_norm": 0.4189174175262451, "learning_rate": 2.525877303711184e-05, "loss": 0.0853, "step": 3130 }, { "epoch": 0.7927291088109063, "grad_norm": 0.48693427443504333, "learning_rate": 2.5243625347134564e-05, "loss": 0.0669, "step": 3140 }, { "epoch": 0.7952537238071195, "grad_norm": 0.48928236961364746, "learning_rate": 2.5228477657157283e-05, "loss": 0.0855, "step": 3150 }, { "epoch": 0.7977783388033325, "grad_norm": 0.4300285279750824, "learning_rate": 2.5213329967180004e-05, "loss": 0.0947, "step": 3160 }, { "epoch": 0.8003029537995455, "grad_norm": 0.5028986930847168, "learning_rate": 2.519818227720273e-05, "loss": 0.0901, "step": 3170 }, { "epoch": 0.8028275687957587, "grad_norm": 0.6309892535209656, "learning_rate": 2.5183034587225447e-05, "loss": 0.072, "step": 3180 }, { "epoch": 0.8053521837919717, "grad_norm": 0.34405893087387085, "learning_rate": 2.516788689724817e-05, "loss": 0.0805, "step": 3190 }, { "epoch": 0.8078767987881847, "grad_norm": 0.5418539643287659, "learning_rate": 2.5152739207270894e-05, "loss": 0.0638, "step": 3200 }, { "epoch": 0.8104014137843979, "grad_norm": 0.47962188720703125, "learning_rate": 2.5137591517293612e-05, "loss": 0.0646, "step": 3210 }, { "epoch": 0.8129260287806109, "grad_norm": 0.2669812738895416, "learning_rate": 2.5122443827316334e-05, "loss": 0.0737, "step": 3220 }, { "epoch": 0.8154506437768241, "grad_norm": 0.5387107133865356, "learning_rate": 2.510729613733906e-05, "loss": 0.0855, "step": 3230 }, { "epoch": 0.8179752587730371, "grad_norm": 1.0121177434921265, "learning_rate": 2.5092148447361777e-05, "loss": 0.0778, "step": 3240 }, { "epoch": 0.8204998737692502, "grad_norm": 0.5904279351234436, "learning_rate": 2.50770007573845e-05, "loss": 0.0713, "step": 3250 }, { "epoch": 0.8230244887654633, "grad_norm": 0.49105462431907654, "learning_rate": 2.5061853067407224e-05, "loss": 0.0758, "step": 3260 }, { "epoch": 0.8255491037616763, "grad_norm": 0.8744997382164001, "learning_rate": 2.5046705377429942e-05, "loss": 0.0769, "step": 3270 }, { "epoch": 0.8280737187578894, "grad_norm": 0.7501155734062195, "learning_rate": 2.5031557687452663e-05, "loss": 0.0735, "step": 3280 }, { "epoch": 0.8305983337541025, "grad_norm": 0.42036619782447815, "learning_rate": 2.501640999747539e-05, "loss": 0.0822, "step": 3290 }, { "epoch": 0.8331229487503156, "grad_norm": 0.44184601306915283, "learning_rate": 2.5001262307498107e-05, "loss": 0.072, "step": 3300 }, { "epoch": 0.8356475637465287, "grad_norm": 0.3335505425930023, "learning_rate": 2.4986114617520828e-05, "loss": 0.0723, "step": 3310 }, { "epoch": 0.8381721787427417, "grad_norm": 0.4426600933074951, "learning_rate": 2.497096692754355e-05, "loss": 0.0789, "step": 3320 }, { "epoch": 0.8406967937389548, "grad_norm": 0.45263999700546265, "learning_rate": 2.495581923756627e-05, "loss": 0.0744, "step": 3330 }, { "epoch": 0.8432214087351679, "grad_norm": 0.2730228900909424, "learning_rate": 2.4940671547588993e-05, "loss": 0.0712, "step": 3340 }, { "epoch": 0.845746023731381, "grad_norm": 0.34930211305618286, "learning_rate": 2.4925523857611715e-05, "loss": 0.0699, "step": 3350 }, { "epoch": 0.848270638727594, "grad_norm": 0.3079373240470886, "learning_rate": 2.4910376167634436e-05, "loss": 0.0693, "step": 3360 }, { "epoch": 0.8507952537238072, "grad_norm": 0.5083094239234924, "learning_rate": 2.4895228477657158e-05, "loss": 0.08, "step": 3370 }, { "epoch": 0.8533198687200202, "grad_norm": 0.40638256072998047, "learning_rate": 2.488008078767988e-05, "loss": 0.0752, "step": 3380 }, { "epoch": 0.8558444837162332, "grad_norm": 0.5014817714691162, "learning_rate": 2.48649330977026e-05, "loss": 0.0719, "step": 3390 }, { "epoch": 0.8583690987124464, "grad_norm": 0.5369516015052795, "learning_rate": 2.4849785407725323e-05, "loss": 0.0846, "step": 3400 }, { "epoch": 0.8608937137086594, "grad_norm": 0.5256063342094421, "learning_rate": 2.4834637717748044e-05, "loss": 0.0902, "step": 3410 }, { "epoch": 0.8634183287048726, "grad_norm": 0.4781491756439209, "learning_rate": 2.4819490027770766e-05, "loss": 0.0902, "step": 3420 }, { "epoch": 0.8659429437010856, "grad_norm": 0.48606160283088684, "learning_rate": 2.4804342337793487e-05, "loss": 0.0868, "step": 3430 }, { "epoch": 0.8684675586972986, "grad_norm": 0.6008639931678772, "learning_rate": 2.478919464781621e-05, "loss": 0.0726, "step": 3440 }, { "epoch": 0.8709921736935118, "grad_norm": 0.35723280906677246, "learning_rate": 2.477404695783893e-05, "loss": 0.0614, "step": 3450 }, { "epoch": 0.8735167886897248, "grad_norm": 0.8890093564987183, "learning_rate": 2.4758899267861652e-05, "loss": 0.0674, "step": 3460 }, { "epoch": 0.8760414036859379, "grad_norm": 0.6494946479797363, "learning_rate": 2.4743751577884374e-05, "loss": 0.0679, "step": 3470 }, { "epoch": 0.878566018682151, "grad_norm": 0.5928673148155212, "learning_rate": 2.4728603887907095e-05, "loss": 0.0638, "step": 3480 }, { "epoch": 0.881090633678364, "grad_norm": 0.3160926103591919, "learning_rate": 2.4713456197929817e-05, "loss": 0.0826, "step": 3490 }, { "epoch": 0.8836152486745771, "grad_norm": 0.585054874420166, "learning_rate": 2.469830850795254e-05, "loss": 0.0732, "step": 3500 }, { "epoch": 0.8861398636707902, "grad_norm": 0.4782266318798065, "learning_rate": 2.468316081797526e-05, "loss": 0.0752, "step": 3510 }, { "epoch": 0.8886644786670033, "grad_norm": 0.43204379081726074, "learning_rate": 2.466801312799798e-05, "loss": 0.089, "step": 3520 }, { "epoch": 0.8911890936632164, "grad_norm": 0.5396738052368164, "learning_rate": 2.46528654380207e-05, "loss": 0.0818, "step": 3530 }, { "epoch": 0.8937137086594295, "grad_norm": 0.39785364270210266, "learning_rate": 2.4637717748043425e-05, "loss": 0.0775, "step": 3540 }, { "epoch": 0.8962383236556425, "grad_norm": 0.41307616233825684, "learning_rate": 2.4622570058066146e-05, "loss": 0.0754, "step": 3550 }, { "epoch": 0.8987629386518556, "grad_norm": 0.5757405161857605, "learning_rate": 2.4607422368088865e-05, "loss": 0.0808, "step": 3560 }, { "epoch": 0.9012875536480687, "grad_norm": 0.4765954911708832, "learning_rate": 2.459227467811159e-05, "loss": 0.0752, "step": 3570 }, { "epoch": 0.9038121686442817, "grad_norm": 0.553316593170166, "learning_rate": 2.457712698813431e-05, "loss": 0.0646, "step": 3580 }, { "epoch": 0.9063367836404949, "grad_norm": 0.6468276977539062, "learning_rate": 2.456197929815703e-05, "loss": 0.0804, "step": 3590 }, { "epoch": 0.9088613986367079, "grad_norm": 0.38345563411712646, "learning_rate": 2.4546831608179754e-05, "loss": 0.0761, "step": 3600 }, { "epoch": 0.9113860136329209, "grad_norm": 0.38474252820014954, "learning_rate": 2.4531683918202476e-05, "loss": 0.0543, "step": 3610 }, { "epoch": 0.9139106286291341, "grad_norm": 0.44275468587875366, "learning_rate": 2.4516536228225194e-05, "loss": 0.0909, "step": 3620 }, { "epoch": 0.9164352436253471, "grad_norm": 0.40391191840171814, "learning_rate": 2.450138853824792e-05, "loss": 0.0655, "step": 3630 }, { "epoch": 0.9189598586215603, "grad_norm": 0.7056028246879578, "learning_rate": 2.4486240848270637e-05, "loss": 0.0774, "step": 3640 }, { "epoch": 0.9214844736177733, "grad_norm": 0.6490464806556702, "learning_rate": 2.447109315829336e-05, "loss": 0.0827, "step": 3650 }, { "epoch": 0.9240090886139863, "grad_norm": 0.6265650391578674, "learning_rate": 2.4455945468316084e-05, "loss": 0.0795, "step": 3660 }, { "epoch": 0.9265337036101995, "grad_norm": 0.38877788186073303, "learning_rate": 2.4440797778338802e-05, "loss": 0.0741, "step": 3670 }, { "epoch": 0.9290583186064125, "grad_norm": 0.5186750888824463, "learning_rate": 2.4425650088361524e-05, "loss": 0.0725, "step": 3680 }, { "epoch": 0.9315829336026256, "grad_norm": 0.6181837916374207, "learning_rate": 2.441050239838425e-05, "loss": 0.0807, "step": 3690 }, { "epoch": 0.9341075485988387, "grad_norm": 0.4812741279602051, "learning_rate": 2.4395354708406967e-05, "loss": 0.0806, "step": 3700 }, { "epoch": 0.9366321635950517, "grad_norm": 0.45176827907562256, "learning_rate": 2.438020701842969e-05, "loss": 0.0712, "step": 3710 }, { "epoch": 0.9391567785912648, "grad_norm": 0.5601783990859985, "learning_rate": 2.4365059328452414e-05, "loss": 0.0747, "step": 3720 }, { "epoch": 0.9416813935874779, "grad_norm": 0.5894845128059387, "learning_rate": 2.4349911638475132e-05, "loss": 0.0795, "step": 3730 }, { "epoch": 0.944206008583691, "grad_norm": 0.42322757840156555, "learning_rate": 2.4334763948497853e-05, "loss": 0.072, "step": 3740 }, { "epoch": 0.9467306235799041, "grad_norm": 0.5656612515449524, "learning_rate": 2.431961625852058e-05, "loss": 0.0689, "step": 3750 }, { "epoch": 0.9492552385761172, "grad_norm": 0.4395703971385956, "learning_rate": 2.4304468568543297e-05, "loss": 0.0916, "step": 3760 }, { "epoch": 0.9517798535723302, "grad_norm": 0.3628901243209839, "learning_rate": 2.4289320878566018e-05, "loss": 0.0727, "step": 3770 }, { "epoch": 0.9543044685685433, "grad_norm": 0.3859265148639679, "learning_rate": 2.4274173188588743e-05, "loss": 0.0835, "step": 3780 }, { "epoch": 0.9568290835647564, "grad_norm": 0.2310730367898941, "learning_rate": 2.425902549861146e-05, "loss": 0.0687, "step": 3790 }, { "epoch": 0.9593536985609694, "grad_norm": 0.30712732672691345, "learning_rate": 2.4243877808634183e-05, "loss": 0.0695, "step": 3800 }, { "epoch": 0.9618783135571826, "grad_norm": 0.7812157273292542, "learning_rate": 2.4228730118656908e-05, "loss": 0.0709, "step": 3810 }, { "epoch": 0.9644029285533956, "grad_norm": 0.3865745961666107, "learning_rate": 2.4213582428679626e-05, "loss": 0.0837, "step": 3820 }, { "epoch": 0.9669275435496086, "grad_norm": 0.5264056921005249, "learning_rate": 2.4198434738702348e-05, "loss": 0.0771, "step": 3830 }, { "epoch": 0.9694521585458218, "grad_norm": 0.4684106111526489, "learning_rate": 2.4183287048725073e-05, "loss": 0.0876, "step": 3840 }, { "epoch": 0.9719767735420348, "grad_norm": 0.37889453768730164, "learning_rate": 2.416813935874779e-05, "loss": 0.078, "step": 3850 }, { "epoch": 0.974501388538248, "grad_norm": 0.4028097987174988, "learning_rate": 2.4152991668770513e-05, "loss": 0.0825, "step": 3860 }, { "epoch": 0.977026003534461, "grad_norm": 0.4436962902545929, "learning_rate": 2.4137843978793237e-05, "loss": 0.0627, "step": 3870 }, { "epoch": 0.979550618530674, "grad_norm": 0.35176676511764526, "learning_rate": 2.4122696288815956e-05, "loss": 0.0706, "step": 3880 }, { "epoch": 0.9820752335268872, "grad_norm": 0.5100188255310059, "learning_rate": 2.4107548598838677e-05, "loss": 0.066, "step": 3890 }, { "epoch": 0.9845998485231002, "grad_norm": 0.3400685489177704, "learning_rate": 2.40924009088614e-05, "loss": 0.0611, "step": 3900 }, { "epoch": 0.9871244635193133, "grad_norm": 0.4831116497516632, "learning_rate": 2.407725321888412e-05, "loss": 0.0907, "step": 3910 }, { "epoch": 0.9896490785155264, "grad_norm": 0.7926459312438965, "learning_rate": 2.4062105528906842e-05, "loss": 0.0956, "step": 3920 }, { "epoch": 0.9921736935117395, "grad_norm": 0.5712438225746155, "learning_rate": 2.4046957838929564e-05, "loss": 0.0807, "step": 3930 }, { "epoch": 0.9946983085079525, "grad_norm": 0.4294516146183014, "learning_rate": 2.4031810148952285e-05, "loss": 0.075, "step": 3940 }, { "epoch": 0.9972229235041656, "grad_norm": 0.37027591466903687, "learning_rate": 2.4016662458975007e-05, "loss": 0.071, "step": 3950 }, { "epoch": 0.9997475385003787, "grad_norm": 0.5219939947128296, "learning_rate": 2.400151476899773e-05, "loss": 0.0659, "step": 3960 }, { "epoch": 1.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.05425933748483658, "eval_runtime": 969.6738, "eval_samples_per_second": 212.713, "eval_steps_per_second": 3.324, "step": 3961 }, { "epoch": 1.0022721534965917, "grad_norm": 0.4360177218914032, "learning_rate": 2.398636707902045e-05, "loss": 0.0797, "step": 3970 }, { "epoch": 1.0047967684928047, "grad_norm": 0.6967335343360901, "learning_rate": 2.397121938904317e-05, "loss": 0.0688, "step": 3980 }, { "epoch": 1.007321383489018, "grad_norm": 0.6186540722846985, "learning_rate": 2.3956071699065893e-05, "loss": 0.0703, "step": 3990 }, { "epoch": 1.009845998485231, "grad_norm": 0.6125244498252869, "learning_rate": 2.3940924009088615e-05, "loss": 0.0759, "step": 4000 }, { "epoch": 1.012370613481444, "grad_norm": 0.48123863339424133, "learning_rate": 2.3925776319111336e-05, "loss": 0.0646, "step": 4010 }, { "epoch": 1.0148952284776571, "grad_norm": 0.32201361656188965, "learning_rate": 2.3910628629134058e-05, "loss": 0.0741, "step": 4020 }, { "epoch": 1.0174198434738702, "grad_norm": 0.7003934979438782, "learning_rate": 2.389548093915678e-05, "loss": 0.0704, "step": 4030 }, { "epoch": 1.0199444584700834, "grad_norm": 0.5227739810943604, "learning_rate": 2.38803332491795e-05, "loss": 0.0705, "step": 4040 }, { "epoch": 1.0224690734662965, "grad_norm": 0.59422367811203, "learning_rate": 2.3865185559202223e-05, "loss": 0.0751, "step": 4050 }, { "epoch": 1.0249936884625095, "grad_norm": 0.5077184438705444, "learning_rate": 2.3850037869224944e-05, "loss": 0.0818, "step": 4060 }, { "epoch": 1.0275183034587225, "grad_norm": 0.3867091238498688, "learning_rate": 2.3834890179247666e-05, "loss": 0.0768, "step": 4070 }, { "epoch": 1.0300429184549356, "grad_norm": 0.43073731660842896, "learning_rate": 2.3819742489270384e-05, "loss": 0.0794, "step": 4080 }, { "epoch": 1.0325675334511486, "grad_norm": 0.5008924007415771, "learning_rate": 2.380459479929311e-05, "loss": 0.0676, "step": 4090 }, { "epoch": 1.0350921484473619, "grad_norm": 0.3357384204864502, "learning_rate": 2.378944710931583e-05, "loss": 0.0751, "step": 4100 }, { "epoch": 1.037616763443575, "grad_norm": 0.2947876453399658, "learning_rate": 2.377429941933855e-05, "loss": 0.0705, "step": 4110 }, { "epoch": 1.040141378439788, "grad_norm": 0.4923837184906006, "learning_rate": 2.3759151729361274e-05, "loss": 0.0854, "step": 4120 }, { "epoch": 1.042665993436001, "grad_norm": 0.6966848373413086, "learning_rate": 2.3744004039383996e-05, "loss": 0.0707, "step": 4130 }, { "epoch": 1.045190608432214, "grad_norm": 0.5456628799438477, "learning_rate": 2.3728856349406714e-05, "loss": 0.0732, "step": 4140 }, { "epoch": 1.0477152234284273, "grad_norm": 0.5405559539794922, "learning_rate": 2.371370865942944e-05, "loss": 0.075, "step": 4150 }, { "epoch": 1.0502398384246403, "grad_norm": 0.3989148736000061, "learning_rate": 2.369856096945216e-05, "loss": 0.0717, "step": 4160 }, { "epoch": 1.0527644534208533, "grad_norm": 0.4704185724258423, "learning_rate": 2.368341327947488e-05, "loss": 0.0652, "step": 4170 }, { "epoch": 1.0552890684170664, "grad_norm": 0.42349693179130554, "learning_rate": 2.3668265589497604e-05, "loss": 0.0745, "step": 4180 }, { "epoch": 1.0578136834132794, "grad_norm": 0.4445798993110657, "learning_rate": 2.3653117899520325e-05, "loss": 0.0807, "step": 4190 }, { "epoch": 1.0603382984094925, "grad_norm": 0.8739972710609436, "learning_rate": 2.3637970209543043e-05, "loss": 0.0841, "step": 4200 }, { "epoch": 1.0628629134057057, "grad_norm": 0.45838242769241333, "learning_rate": 2.362282251956577e-05, "loss": 0.0774, "step": 4210 }, { "epoch": 1.0653875284019187, "grad_norm": 0.43052664399147034, "learning_rate": 2.3607674829588487e-05, "loss": 0.0797, "step": 4220 }, { "epoch": 1.0679121433981318, "grad_norm": 0.5692147016525269, "learning_rate": 2.3592527139611208e-05, "loss": 0.0787, "step": 4230 }, { "epoch": 1.0704367583943448, "grad_norm": 0.48966166377067566, "learning_rate": 2.3577379449633933e-05, "loss": 0.0791, "step": 4240 }, { "epoch": 1.0729613733905579, "grad_norm": 0.510415256023407, "learning_rate": 2.356223175965665e-05, "loss": 0.0802, "step": 4250 }, { "epoch": 1.0754859883867711, "grad_norm": 0.4185769855976105, "learning_rate": 2.3547084069679373e-05, "loss": 0.0648, "step": 4260 }, { "epoch": 1.0780106033829842, "grad_norm": 0.5046850442886353, "learning_rate": 2.3531936379702098e-05, "loss": 0.0812, "step": 4270 }, { "epoch": 1.0805352183791972, "grad_norm": 0.41549035906791687, "learning_rate": 2.3516788689724816e-05, "loss": 0.0786, "step": 4280 }, { "epoch": 1.0830598333754102, "grad_norm": 0.4319119155406952, "learning_rate": 2.3501640999747538e-05, "loss": 0.0783, "step": 4290 }, { "epoch": 1.0855844483716233, "grad_norm": 0.8472285270690918, "learning_rate": 2.3486493309770263e-05, "loss": 0.0663, "step": 4300 }, { "epoch": 1.0881090633678363, "grad_norm": 0.2649112641811371, "learning_rate": 2.347134561979298e-05, "loss": 0.069, "step": 4310 }, { "epoch": 1.0906336783640496, "grad_norm": 0.6875, "learning_rate": 2.3456197929815702e-05, "loss": 0.072, "step": 4320 }, { "epoch": 1.0931582933602626, "grad_norm": 0.6910480260848999, "learning_rate": 2.3441050239838427e-05, "loss": 0.0663, "step": 4330 }, { "epoch": 1.0956829083564756, "grad_norm": 0.413908988237381, "learning_rate": 2.3425902549861146e-05, "loss": 0.075, "step": 4340 }, { "epoch": 1.0982075233526887, "grad_norm": 0.5234224200248718, "learning_rate": 2.3410754859883867e-05, "loss": 0.0653, "step": 4350 }, { "epoch": 1.1007321383489017, "grad_norm": 0.5041384100914001, "learning_rate": 2.3395607169906592e-05, "loss": 0.0616, "step": 4360 }, { "epoch": 1.103256753345115, "grad_norm": 0.34713175892829895, "learning_rate": 2.338045947992931e-05, "loss": 0.0835, "step": 4370 }, { "epoch": 1.105781368341328, "grad_norm": 0.5200866460800171, "learning_rate": 2.3365311789952032e-05, "loss": 0.0796, "step": 4380 }, { "epoch": 1.108305983337541, "grad_norm": 0.5935444831848145, "learning_rate": 2.3350164099974757e-05, "loss": 0.0824, "step": 4390 }, { "epoch": 1.110830598333754, "grad_norm": 0.45419350266456604, "learning_rate": 2.3335016409997475e-05, "loss": 0.0782, "step": 4400 }, { "epoch": 1.1133552133299671, "grad_norm": 0.6200574040412903, "learning_rate": 2.3319868720020197e-05, "loss": 0.0721, "step": 4410 }, { "epoch": 1.1158798283261802, "grad_norm": 0.3939630687236786, "learning_rate": 2.3304721030042922e-05, "loss": 0.0672, "step": 4420 }, { "epoch": 1.1184044433223934, "grad_norm": 0.44664672017097473, "learning_rate": 2.328957334006564e-05, "loss": 0.0639, "step": 4430 }, { "epoch": 1.1209290583186065, "grad_norm": 0.3891284167766571, "learning_rate": 2.327442565008836e-05, "loss": 0.0637, "step": 4440 }, { "epoch": 1.1234536733148195, "grad_norm": 0.5123685002326965, "learning_rate": 2.3259277960111087e-05, "loss": 0.0577, "step": 4450 }, { "epoch": 1.1259782883110325, "grad_norm": 0.5410445332527161, "learning_rate": 2.3244130270133805e-05, "loss": 0.0777, "step": 4460 }, { "epoch": 1.1285029033072456, "grad_norm": 0.39902034401893616, "learning_rate": 2.3228982580156526e-05, "loss": 0.0684, "step": 4470 }, { "epoch": 1.1310275183034588, "grad_norm": 0.40197306871414185, "learning_rate": 2.3213834890179248e-05, "loss": 0.0607, "step": 4480 }, { "epoch": 1.1335521332996719, "grad_norm": 0.44786280393600464, "learning_rate": 2.319868720020197e-05, "loss": 0.0665, "step": 4490 }, { "epoch": 1.136076748295885, "grad_norm": 0.5214644074440002, "learning_rate": 2.318353951022469e-05, "loss": 0.0601, "step": 4500 }, { "epoch": 1.138601363292098, "grad_norm": 0.4208206832408905, "learning_rate": 2.3168391820247413e-05, "loss": 0.075, "step": 4510 }, { "epoch": 1.141125978288311, "grad_norm": 0.46941113471984863, "learning_rate": 2.3153244130270134e-05, "loss": 0.0644, "step": 4520 }, { "epoch": 1.1436505932845242, "grad_norm": 0.4159145951271057, "learning_rate": 2.3138096440292856e-05, "loss": 0.0689, "step": 4530 }, { "epoch": 1.1461752082807373, "grad_norm": 0.7756341099739075, "learning_rate": 2.3122948750315578e-05, "loss": 0.075, "step": 4540 }, { "epoch": 1.1486998232769503, "grad_norm": 0.49775972962379456, "learning_rate": 2.31078010603383e-05, "loss": 0.0781, "step": 4550 }, { "epoch": 1.1512244382731633, "grad_norm": 0.3479367792606354, "learning_rate": 2.309265337036102e-05, "loss": 0.0725, "step": 4560 }, { "epoch": 1.1537490532693764, "grad_norm": 0.4266480803489685, "learning_rate": 2.3077505680383742e-05, "loss": 0.0791, "step": 4570 }, { "epoch": 1.1562736682655894, "grad_norm": 0.46766582131385803, "learning_rate": 2.3062357990406464e-05, "loss": 0.0754, "step": 4580 }, { "epoch": 1.1587982832618025, "grad_norm": 0.6098841428756714, "learning_rate": 2.3047210300429186e-05, "loss": 0.0701, "step": 4590 }, { "epoch": 1.1613228982580157, "grad_norm": 0.496367484331131, "learning_rate": 2.3032062610451907e-05, "loss": 0.0822, "step": 4600 }, { "epoch": 1.1638475132542287, "grad_norm": 0.4577995538711548, "learning_rate": 2.301691492047463e-05, "loss": 0.079, "step": 4610 }, { "epoch": 1.1663721282504418, "grad_norm": 0.33575066924095154, "learning_rate": 2.300176723049735e-05, "loss": 0.0712, "step": 4620 }, { "epoch": 1.1688967432466548, "grad_norm": 0.35166892409324646, "learning_rate": 2.298661954052007e-05, "loss": 0.0736, "step": 4630 }, { "epoch": 1.1714213582428679, "grad_norm": 0.46313348412513733, "learning_rate": 2.2971471850542793e-05, "loss": 0.0676, "step": 4640 }, { "epoch": 1.1739459732390811, "grad_norm": 0.6426845788955688, "learning_rate": 2.2956324160565515e-05, "loss": 0.0621, "step": 4650 }, { "epoch": 1.1764705882352942, "grad_norm": 0.403133749961853, "learning_rate": 2.2941176470588233e-05, "loss": 0.0774, "step": 4660 }, { "epoch": 1.1789952032315072, "grad_norm": 0.4596996307373047, "learning_rate": 2.2926028780610958e-05, "loss": 0.0702, "step": 4670 }, { "epoch": 1.1815198182277202, "grad_norm": 0.4984603226184845, "learning_rate": 2.291088109063368e-05, "loss": 0.0761, "step": 4680 }, { "epoch": 1.1840444332239333, "grad_norm": 0.6605084538459778, "learning_rate": 2.2895733400656398e-05, "loss": 0.0718, "step": 4690 }, { "epoch": 1.1865690482201465, "grad_norm": 0.6148932576179504, "learning_rate": 2.2880585710679123e-05, "loss": 0.0647, "step": 4700 }, { "epoch": 1.1890936632163596, "grad_norm": 0.7315832376480103, "learning_rate": 2.2865438020701845e-05, "loss": 0.0812, "step": 4710 }, { "epoch": 1.1916182782125726, "grad_norm": 0.45521411299705505, "learning_rate": 2.2850290330724563e-05, "loss": 0.0745, "step": 4720 }, { "epoch": 1.1941428932087856, "grad_norm": 0.3817095160484314, "learning_rate": 2.2835142640747288e-05, "loss": 0.0652, "step": 4730 }, { "epoch": 1.1966675082049987, "grad_norm": 0.6603373289108276, "learning_rate": 2.281999495077001e-05, "loss": 0.0837, "step": 4740 }, { "epoch": 1.199192123201212, "grad_norm": 0.44251519441604614, "learning_rate": 2.2804847260792728e-05, "loss": 0.0652, "step": 4750 }, { "epoch": 1.201716738197425, "grad_norm": 0.41316279768943787, "learning_rate": 2.2789699570815453e-05, "loss": 0.058, "step": 4760 }, { "epoch": 1.204241353193638, "grad_norm": 0.3292355239391327, "learning_rate": 2.2774551880838174e-05, "loss": 0.0701, "step": 4770 }, { "epoch": 1.206765968189851, "grad_norm": 0.47908949851989746, "learning_rate": 2.2759404190860892e-05, "loss": 0.0719, "step": 4780 }, { "epoch": 1.209290583186064, "grad_norm": 0.5355591773986816, "learning_rate": 2.2744256500883617e-05, "loss": 0.0772, "step": 4790 }, { "epoch": 1.2118151981822771, "grad_norm": 0.3404475450515747, "learning_rate": 2.2729108810906336e-05, "loss": 0.0659, "step": 4800 }, { "epoch": 1.2143398131784902, "grad_norm": 0.5191195607185364, "learning_rate": 2.2713961120929057e-05, "loss": 0.0697, "step": 4810 }, { "epoch": 1.2168644281747034, "grad_norm": 0.29288583993911743, "learning_rate": 2.2698813430951782e-05, "loss": 0.0628, "step": 4820 }, { "epoch": 1.2193890431709165, "grad_norm": 0.3372870981693268, "learning_rate": 2.26836657409745e-05, "loss": 0.0724, "step": 4830 }, { "epoch": 1.2219136581671295, "grad_norm": 0.4657537639141083, "learning_rate": 2.2668518050997222e-05, "loss": 0.0582, "step": 4840 }, { "epoch": 1.2244382731633425, "grad_norm": 0.4123993217945099, "learning_rate": 2.2653370361019947e-05, "loss": 0.0669, "step": 4850 }, { "epoch": 1.2269628881595556, "grad_norm": 0.560075044631958, "learning_rate": 2.2638222671042665e-05, "loss": 0.0824, "step": 4860 }, { "epoch": 1.2294875031557688, "grad_norm": 0.4414028525352478, "learning_rate": 2.2623074981065387e-05, "loss": 0.0727, "step": 4870 }, { "epoch": 1.2320121181519819, "grad_norm": 0.450916588306427, "learning_rate": 2.2607927291088112e-05, "loss": 0.0718, "step": 4880 }, { "epoch": 1.234536733148195, "grad_norm": 0.6700304746627808, "learning_rate": 2.259277960111083e-05, "loss": 0.0705, "step": 4890 }, { "epoch": 1.237061348144408, "grad_norm": 0.5820491909980774, "learning_rate": 2.257763191113355e-05, "loss": 0.0691, "step": 4900 }, { "epoch": 1.239585963140621, "grad_norm": 0.7849363088607788, "learning_rate": 2.2562484221156277e-05, "loss": 0.0746, "step": 4910 }, { "epoch": 1.2421105781368342, "grad_norm": 0.3631155490875244, "learning_rate": 2.2547336531178995e-05, "loss": 0.072, "step": 4920 }, { "epoch": 1.2446351931330473, "grad_norm": 0.35678336024284363, "learning_rate": 2.2532188841201716e-05, "loss": 0.0675, "step": 4930 }, { "epoch": 1.2471598081292603, "grad_norm": 0.4498080015182495, "learning_rate": 2.251704115122444e-05, "loss": 0.0692, "step": 4940 }, { "epoch": 1.2496844231254733, "grad_norm": 0.5682255029678345, "learning_rate": 2.250189346124716e-05, "loss": 0.0832, "step": 4950 }, { "epoch": 1.2522090381216864, "grad_norm": 0.5812047719955444, "learning_rate": 2.248674577126988e-05, "loss": 0.0848, "step": 4960 }, { "epoch": 1.2547336531178996, "grad_norm": 0.4898669123649597, "learning_rate": 2.2471598081292606e-05, "loss": 0.0699, "step": 4970 }, { "epoch": 1.2572582681141127, "grad_norm": 0.45778876543045044, "learning_rate": 2.2456450391315324e-05, "loss": 0.078, "step": 4980 }, { "epoch": 1.2597828831103257, "grad_norm": 0.32259121537208557, "learning_rate": 2.2441302701338046e-05, "loss": 0.0813, "step": 4990 }, { "epoch": 1.2623074981065388, "grad_norm": 0.34969955682754517, "learning_rate": 2.242615501136077e-05, "loss": 0.064, "step": 5000 }, { "epoch": 1.2648321131027518, "grad_norm": 0.4658315181732178, "learning_rate": 2.241100732138349e-05, "loss": 0.0641, "step": 5010 }, { "epoch": 1.267356728098965, "grad_norm": 0.7253788113594055, "learning_rate": 2.239585963140621e-05, "loss": 0.0669, "step": 5020 }, { "epoch": 1.2698813430951779, "grad_norm": 0.4734630882740021, "learning_rate": 2.2380711941428936e-05, "loss": 0.0692, "step": 5030 }, { "epoch": 1.2724059580913911, "grad_norm": 0.496377170085907, "learning_rate": 2.2365564251451654e-05, "loss": 0.0701, "step": 5040 }, { "epoch": 1.2749305730876042, "grad_norm": 0.427693247795105, "learning_rate": 2.2350416561474375e-05, "loss": 0.0724, "step": 5050 }, { "epoch": 1.2774551880838172, "grad_norm": 0.3826102316379547, "learning_rate": 2.2335268871497097e-05, "loss": 0.0559, "step": 5060 }, { "epoch": 1.2799798030800302, "grad_norm": 0.4349898099899292, "learning_rate": 2.232012118151982e-05, "loss": 0.0804, "step": 5070 }, { "epoch": 1.2825044180762433, "grad_norm": 0.4229235351085663, "learning_rate": 2.230497349154254e-05, "loss": 0.0583, "step": 5080 }, { "epoch": 1.2850290330724565, "grad_norm": 0.4786253869533539, "learning_rate": 2.2289825801565262e-05, "loss": 0.0802, "step": 5090 }, { "epoch": 1.2875536480686696, "grad_norm": 0.5271996259689331, "learning_rate": 2.2274678111587983e-05, "loss": 0.0824, "step": 5100 }, { "epoch": 1.2900782630648826, "grad_norm": 0.4490508735179901, "learning_rate": 2.2259530421610705e-05, "loss": 0.0748, "step": 5110 }, { "epoch": 1.2926028780610956, "grad_norm": 0.49381911754608154, "learning_rate": 2.2244382731633427e-05, "loss": 0.0701, "step": 5120 }, { "epoch": 1.2951274930573087, "grad_norm": 0.4700550436973572, "learning_rate": 2.2229235041656148e-05, "loss": 0.061, "step": 5130 }, { "epoch": 1.297652108053522, "grad_norm": 0.2748670279979706, "learning_rate": 2.221408735167887e-05, "loss": 0.0638, "step": 5140 }, { "epoch": 1.300176723049735, "grad_norm": 0.32839298248291016, "learning_rate": 2.219893966170159e-05, "loss": 0.0627, "step": 5150 }, { "epoch": 1.302701338045948, "grad_norm": 0.40593937039375305, "learning_rate": 2.2183791971724313e-05, "loss": 0.0664, "step": 5160 }, { "epoch": 1.305225953042161, "grad_norm": 0.43036961555480957, "learning_rate": 2.2168644281747035e-05, "loss": 0.0861, "step": 5170 }, { "epoch": 1.307750568038374, "grad_norm": 0.7976852655410767, "learning_rate": 2.2153496591769753e-05, "loss": 0.0869, "step": 5180 }, { "epoch": 1.3102751830345873, "grad_norm": 1.0304032564163208, "learning_rate": 2.2138348901792478e-05, "loss": 0.0774, "step": 5190 }, { "epoch": 1.3127997980308004, "grad_norm": 0.38359397649765015, "learning_rate": 2.21232012118152e-05, "loss": 0.0821, "step": 5200 }, { "epoch": 1.3153244130270134, "grad_norm": 0.3385170102119446, "learning_rate": 2.2108053521837918e-05, "loss": 0.0747, "step": 5210 }, { "epoch": 1.3178490280232265, "grad_norm": 0.8735803365707397, "learning_rate": 2.2092905831860643e-05, "loss": 0.0691, "step": 5220 }, { "epoch": 1.3203736430194395, "grad_norm": 0.5266577005386353, "learning_rate": 2.2077758141883364e-05, "loss": 0.071, "step": 5230 }, { "epoch": 1.3228982580156528, "grad_norm": 0.46573153138160706, "learning_rate": 2.2062610451906082e-05, "loss": 0.0876, "step": 5240 }, { "epoch": 1.3254228730118656, "grad_norm": 0.6112514138221741, "learning_rate": 2.2047462761928807e-05, "loss": 0.0691, "step": 5250 }, { "epoch": 1.3279474880080788, "grad_norm": 0.4857766032218933, "learning_rate": 2.203231507195153e-05, "loss": 0.0743, "step": 5260 }, { "epoch": 1.3304721030042919, "grad_norm": 0.3289374113082886, "learning_rate": 2.2017167381974247e-05, "loss": 0.0679, "step": 5270 }, { "epoch": 1.332996718000505, "grad_norm": 0.6034137606620789, "learning_rate": 2.2002019691996972e-05, "loss": 0.0664, "step": 5280 }, { "epoch": 1.335521332996718, "grad_norm": 0.6763460636138916, "learning_rate": 2.1986872002019694e-05, "loss": 0.0758, "step": 5290 }, { "epoch": 1.338045947992931, "grad_norm": 0.4285227060317993, "learning_rate": 2.1971724312042412e-05, "loss": 0.0709, "step": 5300 }, { "epoch": 1.3405705629891442, "grad_norm": 0.3583575189113617, "learning_rate": 2.1956576622065137e-05, "loss": 0.0752, "step": 5310 }, { "epoch": 1.3430951779853573, "grad_norm": 0.5029326677322388, "learning_rate": 2.194142893208786e-05, "loss": 0.0748, "step": 5320 }, { "epoch": 1.3456197929815703, "grad_norm": 0.6658989191055298, "learning_rate": 2.1926281242110577e-05, "loss": 0.0655, "step": 5330 }, { "epoch": 1.3481444079777833, "grad_norm": 0.520709216594696, "learning_rate": 2.1911133552133302e-05, "loss": 0.0781, "step": 5340 }, { "epoch": 1.3506690229739964, "grad_norm": 0.534546971321106, "learning_rate": 2.189598586215602e-05, "loss": 0.0678, "step": 5350 }, { "epoch": 1.3531936379702096, "grad_norm": 0.3448280096054077, "learning_rate": 2.188083817217874e-05, "loss": 0.046, "step": 5360 }, { "epoch": 1.3557182529664227, "grad_norm": 0.47474193572998047, "learning_rate": 2.1865690482201466e-05, "loss": 0.0619, "step": 5370 }, { "epoch": 1.3582428679626357, "grad_norm": 0.3935701847076416, "learning_rate": 2.1850542792224185e-05, "loss": 0.0655, "step": 5380 }, { "epoch": 1.3607674829588488, "grad_norm": 0.5216870903968811, "learning_rate": 2.1835395102246906e-05, "loss": 0.0727, "step": 5390 }, { "epoch": 1.3632920979550618, "grad_norm": 0.40178000926971436, "learning_rate": 2.182024741226963e-05, "loss": 0.0686, "step": 5400 }, { "epoch": 1.365816712951275, "grad_norm": 0.8555682301521301, "learning_rate": 2.180509972229235e-05, "loss": 0.0744, "step": 5410 }, { "epoch": 1.368341327947488, "grad_norm": 0.5784962773323059, "learning_rate": 2.178995203231507e-05, "loss": 0.0762, "step": 5420 }, { "epoch": 1.3708659429437011, "grad_norm": 0.476243793964386, "learning_rate": 2.1774804342337796e-05, "loss": 0.0712, "step": 5430 }, { "epoch": 1.3733905579399142, "grad_norm": 0.4363716244697571, "learning_rate": 2.1759656652360514e-05, "loss": 0.0664, "step": 5440 }, { "epoch": 1.3759151729361272, "grad_norm": 0.533042848110199, "learning_rate": 2.1744508962383236e-05, "loss": 0.0881, "step": 5450 }, { "epoch": 1.3784397879323405, "grad_norm": 0.34616196155548096, "learning_rate": 2.172936127240596e-05, "loss": 0.0749, "step": 5460 }, { "epoch": 1.3809644029285533, "grad_norm": 0.31354981660842896, "learning_rate": 2.171421358242868e-05, "loss": 0.0634, "step": 5470 }, { "epoch": 1.3834890179247665, "grad_norm": 0.5656862854957581, "learning_rate": 2.16990658924514e-05, "loss": 0.0716, "step": 5480 }, { "epoch": 1.3860136329209796, "grad_norm": 0.516463041305542, "learning_rate": 2.1683918202474126e-05, "loss": 0.0614, "step": 5490 }, { "epoch": 1.3885382479171926, "grad_norm": 0.3807201385498047, "learning_rate": 2.1668770512496844e-05, "loss": 0.0608, "step": 5500 }, { "epoch": 1.3910628629134056, "grad_norm": 0.3986429274082184, "learning_rate": 2.1653622822519565e-05, "loss": 0.0701, "step": 5510 }, { "epoch": 1.3935874779096187, "grad_norm": 0.58119797706604, "learning_rate": 2.163847513254229e-05, "loss": 0.0692, "step": 5520 }, { "epoch": 1.396112092905832, "grad_norm": 0.5791721940040588, "learning_rate": 2.162332744256501e-05, "loss": 0.0713, "step": 5530 }, { "epoch": 1.398636707902045, "grad_norm": 0.39115121960639954, "learning_rate": 2.160817975258773e-05, "loss": 0.0678, "step": 5540 }, { "epoch": 1.401161322898258, "grad_norm": 0.37049493193626404, "learning_rate": 2.1593032062610455e-05, "loss": 0.0579, "step": 5550 }, { "epoch": 1.403685937894471, "grad_norm": 0.7497106194496155, "learning_rate": 2.1577884372633173e-05, "loss": 0.0738, "step": 5560 }, { "epoch": 1.406210552890684, "grad_norm": 0.4488617777824402, "learning_rate": 2.1562736682655895e-05, "loss": 0.0826, "step": 5570 }, { "epoch": 1.4087351678868973, "grad_norm": 0.42779994010925293, "learning_rate": 2.154758899267862e-05, "loss": 0.0782, "step": 5580 }, { "epoch": 1.4112597828831104, "grad_norm": 0.6836367249488831, "learning_rate": 2.1532441302701338e-05, "loss": 0.0673, "step": 5590 }, { "epoch": 1.4137843978793234, "grad_norm": 0.38072410225868225, "learning_rate": 2.151729361272406e-05, "loss": 0.0695, "step": 5600 }, { "epoch": 1.4163090128755365, "grad_norm": 0.41650840640068054, "learning_rate": 2.1502145922746785e-05, "loss": 0.0757, "step": 5610 }, { "epoch": 1.4188336278717495, "grad_norm": 0.6885185241699219, "learning_rate": 2.1486998232769503e-05, "loss": 0.0662, "step": 5620 }, { "epoch": 1.4213582428679628, "grad_norm": 0.5995170474052429, "learning_rate": 2.1471850542792225e-05, "loss": 0.0723, "step": 5630 }, { "epoch": 1.4238828578641758, "grad_norm": 0.6088976263999939, "learning_rate": 2.1456702852814946e-05, "loss": 0.0684, "step": 5640 }, { "epoch": 1.4264074728603888, "grad_norm": 0.5369215607643127, "learning_rate": 2.1441555162837668e-05, "loss": 0.0682, "step": 5650 }, { "epoch": 1.4289320878566019, "grad_norm": 0.7823798060417175, "learning_rate": 2.142640747286039e-05, "loss": 0.0675, "step": 5660 }, { "epoch": 1.431456702852815, "grad_norm": 0.39496108889579773, "learning_rate": 2.141125978288311e-05, "loss": 0.0727, "step": 5670 }, { "epoch": 1.4339813178490282, "grad_norm": 0.509624183177948, "learning_rate": 2.1396112092905833e-05, "loss": 0.0736, "step": 5680 }, { "epoch": 1.436505932845241, "grad_norm": 0.4820767641067505, "learning_rate": 2.1380964402928554e-05, "loss": 0.0683, "step": 5690 }, { "epoch": 1.4390305478414542, "grad_norm": 0.5313420295715332, "learning_rate": 2.1365816712951272e-05, "loss": 0.0678, "step": 5700 }, { "epoch": 1.4415551628376673, "grad_norm": 0.4537731409072876, "learning_rate": 2.1350669022973997e-05, "loss": 0.0765, "step": 5710 }, { "epoch": 1.4440797778338803, "grad_norm": 0.6362118721008301, "learning_rate": 2.133552133299672e-05, "loss": 0.0552, "step": 5720 }, { "epoch": 1.4466043928300933, "grad_norm": 0.3806234300136566, "learning_rate": 2.1320373643019437e-05, "loss": 0.0748, "step": 5730 }, { "epoch": 1.4491290078263064, "grad_norm": 0.4131557047367096, "learning_rate": 2.1305225953042162e-05, "loss": 0.0589, "step": 5740 }, { "epoch": 1.4516536228225196, "grad_norm": 0.5624988675117493, "learning_rate": 2.1290078263064884e-05, "loss": 0.0642, "step": 5750 }, { "epoch": 1.4541782378187327, "grad_norm": 0.29660847783088684, "learning_rate": 2.1274930573087602e-05, "loss": 0.0629, "step": 5760 }, { "epoch": 1.4567028528149457, "grad_norm": 0.5936652421951294, "learning_rate": 2.1259782883110327e-05, "loss": 0.0698, "step": 5770 }, { "epoch": 1.4592274678111588, "grad_norm": 0.46520429849624634, "learning_rate": 2.124463519313305e-05, "loss": 0.0637, "step": 5780 }, { "epoch": 1.4617520828073718, "grad_norm": 0.5674166679382324, "learning_rate": 2.1229487503155767e-05, "loss": 0.0744, "step": 5790 }, { "epoch": 1.464276697803585, "grad_norm": 0.894939661026001, "learning_rate": 2.121433981317849e-05, "loss": 0.0704, "step": 5800 }, { "epoch": 1.466801312799798, "grad_norm": 0.482416570186615, "learning_rate": 2.1199192123201213e-05, "loss": 0.0628, "step": 5810 }, { "epoch": 1.4693259277960111, "grad_norm": 0.48222440481185913, "learning_rate": 2.118404443322393e-05, "loss": 0.0788, "step": 5820 }, { "epoch": 1.4718505427922242, "grad_norm": 0.527004063129425, "learning_rate": 2.1168896743246656e-05, "loss": 0.0658, "step": 5830 }, { "epoch": 1.4743751577884372, "grad_norm": 0.5293876528739929, "learning_rate": 2.1153749053269378e-05, "loss": 0.063, "step": 5840 }, { "epoch": 1.4768997727846505, "grad_norm": 0.33477652072906494, "learning_rate": 2.1138601363292096e-05, "loss": 0.072, "step": 5850 }, { "epoch": 1.4794243877808635, "grad_norm": 0.5224368572235107, "learning_rate": 2.112345367331482e-05, "loss": 0.0644, "step": 5860 }, { "epoch": 1.4819490027770765, "grad_norm": 0.31001779437065125, "learning_rate": 2.1108305983337543e-05, "loss": 0.0698, "step": 5870 }, { "epoch": 1.4844736177732896, "grad_norm": 0.8478286862373352, "learning_rate": 2.109315829336026e-05, "loss": 0.0787, "step": 5880 }, { "epoch": 1.4869982327695026, "grad_norm": 0.5729703903198242, "learning_rate": 2.1078010603382986e-05, "loss": 0.0689, "step": 5890 }, { "epoch": 1.4895228477657159, "grad_norm": 0.5856155753135681, "learning_rate": 2.1062862913405708e-05, "loss": 0.0642, "step": 5900 }, { "epoch": 1.4920474627619287, "grad_norm": 0.27725616097450256, "learning_rate": 2.1047715223428426e-05, "loss": 0.0663, "step": 5910 }, { "epoch": 1.494572077758142, "grad_norm": 0.6175593733787537, "learning_rate": 2.103256753345115e-05, "loss": 0.0742, "step": 5920 }, { "epoch": 1.497096692754355, "grad_norm": 0.6941758990287781, "learning_rate": 2.101741984347387e-05, "loss": 0.068, "step": 5930 }, { "epoch": 1.499621307750568, "grad_norm": 0.48814696073532104, "learning_rate": 2.100227215349659e-05, "loss": 0.0775, "step": 5940 }, { "epoch": 1.5021459227467813, "grad_norm": 0.5020901560783386, "learning_rate": 2.0987124463519316e-05, "loss": 0.0679, "step": 5950 }, { "epoch": 1.504670537742994, "grad_norm": 0.4444803297519684, "learning_rate": 2.0971976773542034e-05, "loss": 0.0706, "step": 5960 }, { "epoch": 1.5071951527392073, "grad_norm": 0.34238091111183167, "learning_rate": 2.0956829083564755e-05, "loss": 0.0577, "step": 5970 }, { "epoch": 1.5097197677354204, "grad_norm": 0.8788526654243469, "learning_rate": 2.094168139358748e-05, "loss": 0.066, "step": 5980 }, { "epoch": 1.5122443827316334, "grad_norm": 1.2382627725601196, "learning_rate": 2.09265337036102e-05, "loss": 0.0768, "step": 5990 }, { "epoch": 1.5147689977278465, "grad_norm": 0.47514763474464417, "learning_rate": 2.091138601363292e-05, "loss": 0.0797, "step": 6000 }, { "epoch": 1.5172936127240595, "grad_norm": 0.8225613236427307, "learning_rate": 2.0896238323655645e-05, "loss": 0.0755, "step": 6010 }, { "epoch": 1.5198182277202728, "grad_norm": 0.31106212735176086, "learning_rate": 2.0881090633678363e-05, "loss": 0.0682, "step": 6020 }, { "epoch": 1.5223428427164858, "grad_norm": 0.45431169867515564, "learning_rate": 2.0865942943701085e-05, "loss": 0.0703, "step": 6030 }, { "epoch": 1.5248674577126988, "grad_norm": 0.3643419146537781, "learning_rate": 2.085079525372381e-05, "loss": 0.0719, "step": 6040 }, { "epoch": 1.5273920727089119, "grad_norm": 0.42831987142562866, "learning_rate": 2.0835647563746528e-05, "loss": 0.0858, "step": 6050 }, { "epoch": 1.529916687705125, "grad_norm": 0.5199233889579773, "learning_rate": 2.082049987376925e-05, "loss": 0.0674, "step": 6060 }, { "epoch": 1.5324413027013382, "grad_norm": 0.3392798900604248, "learning_rate": 2.0805352183791975e-05, "loss": 0.0812, "step": 6070 }, { "epoch": 1.534965917697551, "grad_norm": 0.32933366298675537, "learning_rate": 2.0790204493814693e-05, "loss": 0.0639, "step": 6080 }, { "epoch": 1.5374905326937642, "grad_norm": 0.5892539024353027, "learning_rate": 2.0775056803837415e-05, "loss": 0.0734, "step": 6090 }, { "epoch": 1.5400151476899773, "grad_norm": 0.6368609070777893, "learning_rate": 2.075990911386014e-05, "loss": 0.0668, "step": 6100 }, { "epoch": 1.5425397626861903, "grad_norm": 0.6262642741203308, "learning_rate": 2.0744761423882858e-05, "loss": 0.069, "step": 6110 }, { "epoch": 1.5450643776824036, "grad_norm": 0.6016247272491455, "learning_rate": 2.072961373390558e-05, "loss": 0.0602, "step": 6120 }, { "epoch": 1.5475889926786164, "grad_norm": 0.45948028564453125, "learning_rate": 2.0714466043928304e-05, "loss": 0.0799, "step": 6130 }, { "epoch": 1.5501136076748296, "grad_norm": 0.30423131585121155, "learning_rate": 2.0699318353951022e-05, "loss": 0.0658, "step": 6140 }, { "epoch": 1.5526382226710427, "grad_norm": 0.6848326921463013, "learning_rate": 2.0684170663973744e-05, "loss": 0.0737, "step": 6150 }, { "epoch": 1.5551628376672557, "grad_norm": 0.6539986729621887, "learning_rate": 2.066902297399647e-05, "loss": 0.0709, "step": 6160 }, { "epoch": 1.557687452663469, "grad_norm": 0.49498459696769714, "learning_rate": 2.0653875284019187e-05, "loss": 0.0653, "step": 6170 }, { "epoch": 1.5602120676596818, "grad_norm": 0.43628498911857605, "learning_rate": 2.063872759404191e-05, "loss": 0.0655, "step": 6180 }, { "epoch": 1.562736682655895, "grad_norm": 0.350460022687912, "learning_rate": 2.0623579904064634e-05, "loss": 0.0563, "step": 6190 }, { "epoch": 1.565261297652108, "grad_norm": 0.4810716509819031, "learning_rate": 2.0608432214087352e-05, "loss": 0.0709, "step": 6200 }, { "epoch": 1.5677859126483211, "grad_norm": 0.421172171831131, "learning_rate": 2.0593284524110074e-05, "loss": 0.0732, "step": 6210 }, { "epoch": 1.5703105276445342, "grad_norm": 0.5165485143661499, "learning_rate": 2.0578136834132795e-05, "loss": 0.0628, "step": 6220 }, { "epoch": 1.5728351426407472, "grad_norm": 0.41548728942871094, "learning_rate": 2.0562989144155517e-05, "loss": 0.0634, "step": 6230 }, { "epoch": 1.5753597576369605, "grad_norm": 0.4678884744644165, "learning_rate": 2.054784145417824e-05, "loss": 0.0735, "step": 6240 }, { "epoch": 1.5778843726331735, "grad_norm": 0.6086229085922241, "learning_rate": 2.0532693764200957e-05, "loss": 0.0705, "step": 6250 }, { "epoch": 1.5804089876293865, "grad_norm": 0.5168741941452026, "learning_rate": 2.051754607422368e-05, "loss": 0.0763, "step": 6260 }, { "epoch": 1.5829336026255996, "grad_norm": 0.3280368745326996, "learning_rate": 2.0502398384246403e-05, "loss": 0.07, "step": 6270 }, { "epoch": 1.5854582176218126, "grad_norm": 0.545002818107605, "learning_rate": 2.048725069426912e-05, "loss": 0.0777, "step": 6280 }, { "epoch": 1.5879828326180259, "grad_norm": 0.2854851186275482, "learning_rate": 2.0472103004291846e-05, "loss": 0.0566, "step": 6290 }, { "epoch": 1.5905074476142387, "grad_norm": 0.44336074590682983, "learning_rate": 2.0456955314314568e-05, "loss": 0.063, "step": 6300 }, { "epoch": 1.593032062610452, "grad_norm": 0.4831758141517639, "learning_rate": 2.0441807624337286e-05, "loss": 0.0621, "step": 6310 }, { "epoch": 1.595556677606665, "grad_norm": 0.7535089254379272, "learning_rate": 2.042665993436001e-05, "loss": 0.0761, "step": 6320 }, { "epoch": 1.598081292602878, "grad_norm": 0.417810320854187, "learning_rate": 2.0411512244382733e-05, "loss": 0.0812, "step": 6330 }, { "epoch": 1.6006059075990913, "grad_norm": 0.5797942876815796, "learning_rate": 2.039636455440545e-05, "loss": 0.0715, "step": 6340 }, { "epoch": 1.603130522595304, "grad_norm": 0.4397270083427429, "learning_rate": 2.0381216864428176e-05, "loss": 0.0801, "step": 6350 }, { "epoch": 1.6056551375915173, "grad_norm": 0.39924344420433044, "learning_rate": 2.0366069174450898e-05, "loss": 0.0582, "step": 6360 }, { "epoch": 1.6081797525877304, "grad_norm": 0.3915840685367584, "learning_rate": 2.0350921484473616e-05, "loss": 0.0647, "step": 6370 }, { "epoch": 1.6107043675839434, "grad_norm": 0.4041799008846283, "learning_rate": 2.033577379449634e-05, "loss": 0.0604, "step": 6380 }, { "epoch": 1.6132289825801567, "grad_norm": 0.45373886823654175, "learning_rate": 2.0320626104519062e-05, "loss": 0.0729, "step": 6390 }, { "epoch": 1.6157535975763695, "grad_norm": 0.4872368276119232, "learning_rate": 2.030547841454178e-05, "loss": 0.0735, "step": 6400 }, { "epoch": 1.6182782125725828, "grad_norm": 0.6927065253257751, "learning_rate": 2.0290330724564506e-05, "loss": 0.0598, "step": 6410 }, { "epoch": 1.6208028275687958, "grad_norm": 0.6960461139678955, "learning_rate": 2.0275183034587227e-05, "loss": 0.069, "step": 6420 }, { "epoch": 1.6233274425650088, "grad_norm": 0.8550765514373779, "learning_rate": 2.0260035344609945e-05, "loss": 0.0718, "step": 6430 }, { "epoch": 1.6258520575612219, "grad_norm": 0.4117189049720764, "learning_rate": 2.024488765463267e-05, "loss": 0.0673, "step": 6440 }, { "epoch": 1.628376672557435, "grad_norm": 0.3952585458755493, "learning_rate": 2.0229739964655392e-05, "loss": 0.0743, "step": 6450 }, { "epoch": 1.6309012875536482, "grad_norm": 0.30420053005218506, "learning_rate": 2.021459227467811e-05, "loss": 0.0682, "step": 6460 }, { "epoch": 1.6334259025498612, "grad_norm": 0.8514861464500427, "learning_rate": 2.0199444584700835e-05, "loss": 0.068, "step": 6470 }, { "epoch": 1.6359505175460742, "grad_norm": 0.2670237421989441, "learning_rate": 2.0184296894723557e-05, "loss": 0.0704, "step": 6480 }, { "epoch": 1.6384751325422873, "grad_norm": 0.5565549731254578, "learning_rate": 2.0169149204746275e-05, "loss": 0.0726, "step": 6490 }, { "epoch": 1.6409997475385003, "grad_norm": 0.36750999093055725, "learning_rate": 2.0154001514769e-05, "loss": 0.0626, "step": 6500 }, { "epoch": 1.6435243625347136, "grad_norm": 0.35153648257255554, "learning_rate": 2.0138853824791718e-05, "loss": 0.0712, "step": 6510 }, { "epoch": 1.6460489775309264, "grad_norm": 0.6402739882469177, "learning_rate": 2.012370613481444e-05, "loss": 0.0639, "step": 6520 }, { "epoch": 1.6485735925271396, "grad_norm": 0.37415197491645813, "learning_rate": 2.0108558444837165e-05, "loss": 0.0691, "step": 6530 }, { "epoch": 1.6510982075233527, "grad_norm": 0.2915560305118561, "learning_rate": 2.0093410754859883e-05, "loss": 0.071, "step": 6540 }, { "epoch": 1.6536228225195657, "grad_norm": 0.47494032979011536, "learning_rate": 2.0078263064882604e-05, "loss": 0.0677, "step": 6550 }, { "epoch": 1.656147437515779, "grad_norm": 0.4690226912498474, "learning_rate": 2.006311537490533e-05, "loss": 0.0656, "step": 6560 }, { "epoch": 1.6586720525119918, "grad_norm": 0.38596364855766296, "learning_rate": 2.0047967684928048e-05, "loss": 0.0546, "step": 6570 }, { "epoch": 1.661196667508205, "grad_norm": 0.5573495626449585, "learning_rate": 2.003281999495077e-05, "loss": 0.0693, "step": 6580 }, { "epoch": 1.663721282504418, "grad_norm": 0.3469643294811249, "learning_rate": 2.0017672304973494e-05, "loss": 0.0696, "step": 6590 }, { "epoch": 1.6662458975006311, "grad_norm": 0.6143271923065186, "learning_rate": 2.0002524614996212e-05, "loss": 0.0685, "step": 6600 }, { "epoch": 1.6687705124968444, "grad_norm": 0.44451045989990234, "learning_rate": 1.9987376925018934e-05, "loss": 0.0621, "step": 6610 }, { "epoch": 1.6712951274930572, "grad_norm": 0.39059174060821533, "learning_rate": 1.997222923504166e-05, "loss": 0.0689, "step": 6620 }, { "epoch": 1.6738197424892705, "grad_norm": 0.5144139528274536, "learning_rate": 1.9957081545064377e-05, "loss": 0.0734, "step": 6630 }, { "epoch": 1.6763443574854835, "grad_norm": 0.3920697867870331, "learning_rate": 1.99419338550871e-05, "loss": 0.0734, "step": 6640 }, { "epoch": 1.6788689724816965, "grad_norm": 0.5351212024688721, "learning_rate": 1.9926786165109824e-05, "loss": 0.0702, "step": 6650 }, { "epoch": 1.6813935874779096, "grad_norm": 0.6645495295524597, "learning_rate": 1.9911638475132542e-05, "loss": 0.0828, "step": 6660 }, { "epoch": 1.6839182024741226, "grad_norm": 0.3733726739883423, "learning_rate": 1.9896490785155264e-05, "loss": 0.0651, "step": 6670 }, { "epoch": 1.6864428174703359, "grad_norm": 0.3866395950317383, "learning_rate": 1.988134309517799e-05, "loss": 0.0677, "step": 6680 }, { "epoch": 1.688967432466549, "grad_norm": 0.5303260684013367, "learning_rate": 1.9866195405200707e-05, "loss": 0.0717, "step": 6690 }, { "epoch": 1.691492047462762, "grad_norm": 0.24624003469944, "learning_rate": 1.985104771522343e-05, "loss": 0.0655, "step": 6700 }, { "epoch": 1.694016662458975, "grad_norm": 0.5795212984085083, "learning_rate": 1.9835900025246153e-05, "loss": 0.0757, "step": 6710 }, { "epoch": 1.696541277455188, "grad_norm": 0.39747050404548645, "learning_rate": 1.982075233526887e-05, "loss": 0.0753, "step": 6720 }, { "epoch": 1.6990658924514013, "grad_norm": 0.611092209815979, "learning_rate": 1.9805604645291593e-05, "loss": 0.065, "step": 6730 }, { "epoch": 1.701590507447614, "grad_norm": 0.4604199230670929, "learning_rate": 1.9790456955314318e-05, "loss": 0.0661, "step": 6740 }, { "epoch": 1.7041151224438273, "grad_norm": 1.0068440437316895, "learning_rate": 1.9775309265337036e-05, "loss": 0.0748, "step": 6750 }, { "epoch": 1.7066397374400404, "grad_norm": 0.2989295721054077, "learning_rate": 1.9760161575359758e-05, "loss": 0.0593, "step": 6760 }, { "epoch": 1.7091643524362534, "grad_norm": 0.6437642574310303, "learning_rate": 1.9745013885382483e-05, "loss": 0.0646, "step": 6770 }, { "epoch": 1.7116889674324667, "grad_norm": 0.40779128670692444, "learning_rate": 1.97298661954052e-05, "loss": 0.0727, "step": 6780 }, { "epoch": 1.7142135824286795, "grad_norm": 0.5573729872703552, "learning_rate": 1.9714718505427923e-05, "loss": 0.0684, "step": 6790 }, { "epoch": 1.7167381974248928, "grad_norm": 0.41327282786369324, "learning_rate": 1.9699570815450644e-05, "loss": 0.0738, "step": 6800 }, { "epoch": 1.7192628124211058, "grad_norm": 0.6371347904205322, "learning_rate": 1.9684423125473366e-05, "loss": 0.0729, "step": 6810 }, { "epoch": 1.7217874274173188, "grad_norm": 0.6604834198951721, "learning_rate": 1.9669275435496088e-05, "loss": 0.0561, "step": 6820 }, { "epoch": 1.724312042413532, "grad_norm": 0.4899774193763733, "learning_rate": 1.9654127745518806e-05, "loss": 0.0701, "step": 6830 }, { "epoch": 1.726836657409745, "grad_norm": 0.6170947551727295, "learning_rate": 1.963898005554153e-05, "loss": 0.0899, "step": 6840 }, { "epoch": 1.7293612724059582, "grad_norm": 0.5308715105056763, "learning_rate": 1.9623832365564252e-05, "loss": 0.0713, "step": 6850 }, { "epoch": 1.7318858874021712, "grad_norm": 0.4299701750278473, "learning_rate": 1.960868467558697e-05, "loss": 0.0642, "step": 6860 }, { "epoch": 1.7344105023983842, "grad_norm": 0.4577876329421997, "learning_rate": 1.9593536985609695e-05, "loss": 0.0776, "step": 6870 }, { "epoch": 1.7369351173945973, "grad_norm": 0.220742866396904, "learning_rate": 1.9578389295632417e-05, "loss": 0.0643, "step": 6880 }, { "epoch": 1.7394597323908103, "grad_norm": 0.6673031449317932, "learning_rate": 1.9563241605655135e-05, "loss": 0.0546, "step": 6890 }, { "epoch": 1.7419843473870236, "grad_norm": 0.37436506152153015, "learning_rate": 1.954809391567786e-05, "loss": 0.0931, "step": 6900 }, { "epoch": 1.7445089623832366, "grad_norm": 0.4209068715572357, "learning_rate": 1.9532946225700582e-05, "loss": 0.0625, "step": 6910 }, { "epoch": 1.7470335773794496, "grad_norm": 0.5889585018157959, "learning_rate": 1.95177985357233e-05, "loss": 0.0562, "step": 6920 }, { "epoch": 1.7495581923756627, "grad_norm": 0.6516574621200562, "learning_rate": 1.9502650845746025e-05, "loss": 0.0692, "step": 6930 }, { "epoch": 1.7520828073718757, "grad_norm": 0.5210825800895691, "learning_rate": 1.9487503155768747e-05, "loss": 0.0573, "step": 6940 }, { "epoch": 1.754607422368089, "grad_norm": 0.5769010186195374, "learning_rate": 1.9472355465791465e-05, "loss": 0.0726, "step": 6950 }, { "epoch": 1.7571320373643018, "grad_norm": 0.24784168601036072, "learning_rate": 1.945720777581419e-05, "loss": 0.0607, "step": 6960 }, { "epoch": 1.759656652360515, "grad_norm": 0.7145076394081116, "learning_rate": 1.944206008583691e-05, "loss": 0.0723, "step": 6970 }, { "epoch": 1.762181267356728, "grad_norm": 0.5195138454437256, "learning_rate": 1.942691239585963e-05, "loss": 0.072, "step": 6980 }, { "epoch": 1.7647058823529411, "grad_norm": 0.7395075559616089, "learning_rate": 1.9411764705882355e-05, "loss": 0.059, "step": 6990 }, { "epoch": 1.7672304973491544, "grad_norm": 0.4518623948097229, "learning_rate": 1.9396617015905076e-05, "loss": 0.064, "step": 7000 }, { "epoch": 1.7697551123453672, "grad_norm": 0.5325851440429688, "learning_rate": 1.9381469325927794e-05, "loss": 0.0735, "step": 7010 }, { "epoch": 1.7722797273415805, "grad_norm": 0.342540442943573, "learning_rate": 1.936632163595052e-05, "loss": 0.0572, "step": 7020 }, { "epoch": 1.7748043423377935, "grad_norm": 0.513810932636261, "learning_rate": 1.935117394597324e-05, "loss": 0.0651, "step": 7030 }, { "epoch": 1.7773289573340065, "grad_norm": 0.4068731665611267, "learning_rate": 1.933602625599596e-05, "loss": 0.0642, "step": 7040 }, { "epoch": 1.7798535723302198, "grad_norm": 0.4329892098903656, "learning_rate": 1.9320878566018684e-05, "loss": 0.0639, "step": 7050 }, { "epoch": 1.7823781873264326, "grad_norm": 0.5116180777549744, "learning_rate": 1.9305730876041406e-05, "loss": 0.0647, "step": 7060 }, { "epoch": 1.7849028023226459, "grad_norm": 0.5484455823898315, "learning_rate": 1.9290583186064124e-05, "loss": 0.0597, "step": 7070 }, { "epoch": 1.787427417318859, "grad_norm": 0.3904184103012085, "learning_rate": 1.927543549608685e-05, "loss": 0.0594, "step": 7080 }, { "epoch": 1.789952032315072, "grad_norm": 0.7509499192237854, "learning_rate": 1.9260287806109567e-05, "loss": 0.0644, "step": 7090 }, { "epoch": 1.792476647311285, "grad_norm": 0.3884585499763489, "learning_rate": 1.924514011613229e-05, "loss": 0.0784, "step": 7100 }, { "epoch": 1.795001262307498, "grad_norm": 0.39316463470458984, "learning_rate": 1.9229992426155014e-05, "loss": 0.0629, "step": 7110 }, { "epoch": 1.7975258773037113, "grad_norm": 0.3391354978084564, "learning_rate": 1.9214844736177732e-05, "loss": 0.0541, "step": 7120 }, { "epoch": 1.8000504922999243, "grad_norm": 0.5223536491394043, "learning_rate": 1.9199697046200454e-05, "loss": 0.0687, "step": 7130 }, { "epoch": 1.8025751072961373, "grad_norm": 0.40146803855895996, "learning_rate": 1.918454935622318e-05, "loss": 0.0738, "step": 7140 }, { "epoch": 1.8050997222923504, "grad_norm": 0.7420483231544495, "learning_rate": 1.9169401666245897e-05, "loss": 0.0642, "step": 7150 }, { "epoch": 1.8076243372885634, "grad_norm": 0.5140613913536072, "learning_rate": 1.915425397626862e-05, "loss": 0.0648, "step": 7160 }, { "epoch": 1.8101489522847767, "grad_norm": 0.3334696292877197, "learning_rate": 1.9139106286291343e-05, "loss": 0.0693, "step": 7170 }, { "epoch": 1.8126735672809895, "grad_norm": 0.5419024229049683, "learning_rate": 1.912395859631406e-05, "loss": 0.0746, "step": 7180 }, { "epoch": 1.8151981822772028, "grad_norm": 0.4140032231807709, "learning_rate": 1.9108810906336783e-05, "loss": 0.067, "step": 7190 }, { "epoch": 1.8177227972734158, "grad_norm": 0.7290335297584534, "learning_rate": 1.9093663216359508e-05, "loss": 0.0688, "step": 7200 }, { "epoch": 1.8202474122696288, "grad_norm": 0.47243237495422363, "learning_rate": 1.9078515526382226e-05, "loss": 0.0566, "step": 7210 }, { "epoch": 1.822772027265842, "grad_norm": 0.4763794541358948, "learning_rate": 1.9063367836404948e-05, "loss": 0.0736, "step": 7220 }, { "epoch": 1.825296642262055, "grad_norm": 0.5379777550697327, "learning_rate": 1.9048220146427673e-05, "loss": 0.0673, "step": 7230 }, { "epoch": 1.8278212572582682, "grad_norm": 0.6144044399261475, "learning_rate": 1.903307245645039e-05, "loss": 0.077, "step": 7240 }, { "epoch": 1.8303458722544812, "grad_norm": 0.2949241101741791, "learning_rate": 1.9017924766473113e-05, "loss": 0.0617, "step": 7250 }, { "epoch": 1.8328704872506942, "grad_norm": 0.4626986086368561, "learning_rate": 1.9002777076495838e-05, "loss": 0.0774, "step": 7260 }, { "epoch": 1.8353951022469075, "grad_norm": 0.3882731795310974, "learning_rate": 1.8987629386518556e-05, "loss": 0.0695, "step": 7270 }, { "epoch": 1.8379197172431203, "grad_norm": 0.49298906326293945, "learning_rate": 1.8972481696541277e-05, "loss": 0.0836, "step": 7280 }, { "epoch": 1.8404443322393336, "grad_norm": 0.5465462803840637, "learning_rate": 1.8957334006564002e-05, "loss": 0.0673, "step": 7290 }, { "epoch": 1.8429689472355466, "grad_norm": 0.6325553059577942, "learning_rate": 1.894218631658672e-05, "loss": 0.0761, "step": 7300 }, { "epoch": 1.8454935622317596, "grad_norm": 0.3699894845485687, "learning_rate": 1.8927038626609442e-05, "loss": 0.0604, "step": 7310 }, { "epoch": 1.8480181772279727, "grad_norm": 0.3315562903881073, "learning_rate": 1.8911890936632167e-05, "loss": 0.0728, "step": 7320 }, { "epoch": 1.8505427922241857, "grad_norm": 0.3164065480232239, "learning_rate": 1.8896743246654885e-05, "loss": 0.0667, "step": 7330 }, { "epoch": 1.853067407220399, "grad_norm": 0.3007540702819824, "learning_rate": 1.8881595556677607e-05, "loss": 0.0654, "step": 7340 }, { "epoch": 1.855592022216612, "grad_norm": 0.30912989377975464, "learning_rate": 1.886644786670033e-05, "loss": 0.0647, "step": 7350 }, { "epoch": 1.858116637212825, "grad_norm": 0.51572185754776, "learning_rate": 1.885130017672305e-05, "loss": 0.0687, "step": 7360 }, { "epoch": 1.860641252209038, "grad_norm": 0.3540509343147278, "learning_rate": 1.8836152486745772e-05, "loss": 0.0678, "step": 7370 }, { "epoch": 1.8631658672052511, "grad_norm": 0.46048682928085327, "learning_rate": 1.882100479676849e-05, "loss": 0.0595, "step": 7380 }, { "epoch": 1.8656904822014644, "grad_norm": 0.3078758418560028, "learning_rate": 1.8805857106791215e-05, "loss": 0.053, "step": 7390 }, { "epoch": 1.8682150971976772, "grad_norm": 0.500586211681366, "learning_rate": 1.8790709416813937e-05, "loss": 0.0732, "step": 7400 }, { "epoch": 1.8707397121938905, "grad_norm": 0.5178824663162231, "learning_rate": 1.8775561726836655e-05, "loss": 0.0592, "step": 7410 }, { "epoch": 1.8732643271901035, "grad_norm": 0.6682724952697754, "learning_rate": 1.876041403685938e-05, "loss": 0.073, "step": 7420 }, { "epoch": 1.8757889421863165, "grad_norm": 0.5322150588035583, "learning_rate": 1.87452663468821e-05, "loss": 0.0699, "step": 7430 }, { "epoch": 1.8783135571825298, "grad_norm": 0.6210611462593079, "learning_rate": 1.873011865690482e-05, "loss": 0.0613, "step": 7440 }, { "epoch": 1.8808381721787426, "grad_norm": 0.7251406311988831, "learning_rate": 1.8714970966927545e-05, "loss": 0.0616, "step": 7450 }, { "epoch": 1.8833627871749559, "grad_norm": 0.5758464932441711, "learning_rate": 1.8699823276950266e-05, "loss": 0.0643, "step": 7460 }, { "epoch": 1.885887402171169, "grad_norm": 0.36390137672424316, "learning_rate": 1.8684675586972984e-05, "loss": 0.0612, "step": 7470 }, { "epoch": 1.888412017167382, "grad_norm": 0.41576454043388367, "learning_rate": 1.866952789699571e-05, "loss": 0.0682, "step": 7480 }, { "epoch": 1.8909366321635952, "grad_norm": 0.3395916819572449, "learning_rate": 1.865438020701843e-05, "loss": 0.0628, "step": 7490 }, { "epoch": 1.893461247159808, "grad_norm": 0.49120160937309265, "learning_rate": 1.863923251704115e-05, "loss": 0.068, "step": 7500 }, { "epoch": 1.8959858621560213, "grad_norm": 0.2704300284385681, "learning_rate": 1.8624084827063874e-05, "loss": 0.0494, "step": 7510 }, { "epoch": 1.8985104771522343, "grad_norm": 0.4225012958049774, "learning_rate": 1.8608937137086596e-05, "loss": 0.0594, "step": 7520 }, { "epoch": 1.9010350921484473, "grad_norm": 0.4334900975227356, "learning_rate": 1.8593789447109314e-05, "loss": 0.0765, "step": 7530 }, { "epoch": 1.9035597071446604, "grad_norm": 0.6376916766166687, "learning_rate": 1.857864175713204e-05, "loss": 0.0779, "step": 7540 }, { "epoch": 1.9060843221408734, "grad_norm": 0.426049143075943, "learning_rate": 1.856349406715476e-05, "loss": 0.0688, "step": 7550 }, { "epoch": 1.9086089371370867, "grad_norm": 0.840045690536499, "learning_rate": 1.854834637717748e-05, "loss": 0.0718, "step": 7560 }, { "epoch": 1.9111335521332997, "grad_norm": 0.8169825673103333, "learning_rate": 1.8533198687200204e-05, "loss": 0.0772, "step": 7570 }, { "epoch": 1.9136581671295128, "grad_norm": 0.4841653108596802, "learning_rate": 1.8518050997222925e-05, "loss": 0.0697, "step": 7580 }, { "epoch": 1.9161827821257258, "grad_norm": 0.5061050057411194, "learning_rate": 1.8502903307245644e-05, "loss": 0.0771, "step": 7590 }, { "epoch": 1.9187073971219388, "grad_norm": 0.5344434380531311, "learning_rate": 1.848775561726837e-05, "loss": 0.0708, "step": 7600 }, { "epoch": 1.921232012118152, "grad_norm": 0.3740493655204773, "learning_rate": 1.847260792729109e-05, "loss": 0.0541, "step": 7610 }, { "epoch": 1.923756627114365, "grad_norm": 0.33391043543815613, "learning_rate": 1.8457460237313808e-05, "loss": 0.067, "step": 7620 }, { "epoch": 1.9262812421105782, "grad_norm": 0.6397750377655029, "learning_rate": 1.8442312547336533e-05, "loss": 0.0568, "step": 7630 }, { "epoch": 1.9288058571067912, "grad_norm": 0.32809019088745117, "learning_rate": 1.8427164857359255e-05, "loss": 0.0688, "step": 7640 }, { "epoch": 1.9313304721030042, "grad_norm": 0.45072248578071594, "learning_rate": 1.8412017167381973e-05, "loss": 0.0729, "step": 7650 }, { "epoch": 1.9338550870992175, "grad_norm": 0.5833106637001038, "learning_rate": 1.8396869477404698e-05, "loss": 0.0714, "step": 7660 }, { "epoch": 1.9363797020954303, "grad_norm": 0.6556414365768433, "learning_rate": 1.8381721787427416e-05, "loss": 0.0835, "step": 7670 }, { "epoch": 1.9389043170916436, "grad_norm": 0.6509954333305359, "learning_rate": 1.8366574097450138e-05, "loss": 0.0648, "step": 7680 }, { "epoch": 1.9414289320878566, "grad_norm": 0.35850790143013, "learning_rate": 1.8351426407472863e-05, "loss": 0.0742, "step": 7690 }, { "epoch": 1.9439535470840696, "grad_norm": 0.6519030332565308, "learning_rate": 1.833627871749558e-05, "loss": 0.0599, "step": 7700 }, { "epoch": 1.946478162080283, "grad_norm": 0.36720070242881775, "learning_rate": 1.8321131027518303e-05, "loss": 0.0694, "step": 7710 }, { "epoch": 1.9490027770764957, "grad_norm": 0.5095828771591187, "learning_rate": 1.8305983337541028e-05, "loss": 0.0717, "step": 7720 }, { "epoch": 1.951527392072709, "grad_norm": 0.5592676997184753, "learning_rate": 1.8290835647563746e-05, "loss": 0.0728, "step": 7730 }, { "epoch": 1.954052007068922, "grad_norm": 0.5880316495895386, "learning_rate": 1.8275687957586467e-05, "loss": 0.0705, "step": 7740 }, { "epoch": 1.956576622065135, "grad_norm": 0.26635172963142395, "learning_rate": 1.8260540267609192e-05, "loss": 0.0736, "step": 7750 }, { "epoch": 1.9591012370613483, "grad_norm": 0.4297441840171814, "learning_rate": 1.824539257763191e-05, "loss": 0.063, "step": 7760 }, { "epoch": 1.9616258520575611, "grad_norm": 0.4150530993938446, "learning_rate": 1.8230244887654632e-05, "loss": 0.0733, "step": 7770 }, { "epoch": 1.9641504670537744, "grad_norm": 0.36856597661972046, "learning_rate": 1.8215097197677357e-05, "loss": 0.0586, "step": 7780 }, { "epoch": 1.9666750820499874, "grad_norm": 0.36772435903549194, "learning_rate": 1.8199949507700075e-05, "loss": 0.0646, "step": 7790 }, { "epoch": 1.9691996970462005, "grad_norm": 0.34384581446647644, "learning_rate": 1.8184801817722797e-05, "loss": 0.0623, "step": 7800 }, { "epoch": 1.9717243120424135, "grad_norm": 0.5583040118217468, "learning_rate": 1.8169654127745522e-05, "loss": 0.0574, "step": 7810 }, { "epoch": 1.9742489270386265, "grad_norm": 0.8212075233459473, "learning_rate": 1.815450643776824e-05, "loss": 0.0764, "step": 7820 }, { "epoch": 1.9767735420348398, "grad_norm": 0.529600203037262, "learning_rate": 1.8139358747790962e-05, "loss": 0.0738, "step": 7830 }, { "epoch": 1.9792981570310526, "grad_norm": 0.3406949043273926, "learning_rate": 1.8124211057813687e-05, "loss": 0.0584, "step": 7840 }, { "epoch": 1.9818227720272659, "grad_norm": 0.538336455821991, "learning_rate": 1.8109063367836405e-05, "loss": 0.0735, "step": 7850 }, { "epoch": 1.984347387023479, "grad_norm": 0.47139012813568115, "learning_rate": 1.8093915677859127e-05, "loss": 0.0671, "step": 7860 }, { "epoch": 1.986872002019692, "grad_norm": 0.5175936222076416, "learning_rate": 1.8078767987881848e-05, "loss": 0.0744, "step": 7870 }, { "epoch": 1.9893966170159052, "grad_norm": 0.44927799701690674, "learning_rate": 1.806362029790457e-05, "loss": 0.0758, "step": 7880 }, { "epoch": 1.991921232012118, "grad_norm": 0.6851469278335571, "learning_rate": 1.804847260792729e-05, "loss": 0.0634, "step": 7890 }, { "epoch": 1.9944458470083313, "grad_norm": 0.5457276701927185, "learning_rate": 1.8033324917950013e-05, "loss": 0.063, "step": 7900 }, { "epoch": 1.9969704620045443, "grad_norm": 0.3204804062843323, "learning_rate": 1.8018177227972735e-05, "loss": 0.073, "step": 7910 }, { "epoch": 1.9994950770007573, "grad_norm": 0.27762261033058167, "learning_rate": 1.8003029537995456e-05, "loss": 0.08, "step": 7920 }, { "epoch": 2.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.051420293748378754, "eval_runtime": 913.9012, "eval_samples_per_second": 225.694, "eval_steps_per_second": 3.527, "step": 7922 }, { "epoch": 2.0020196919969706, "grad_norm": 0.3685190975666046, "learning_rate": 1.7987881848018178e-05, "loss": 0.0506, "step": 7930 }, { "epoch": 2.0045443069931834, "grad_norm": 0.420768141746521, "learning_rate": 1.79727341580409e-05, "loss": 0.0702, "step": 7940 }, { "epoch": 2.0070689219893967, "grad_norm": 0.24624113738536835, "learning_rate": 1.795758646806362e-05, "loss": 0.0616, "step": 7950 }, { "epoch": 2.0095935369856095, "grad_norm": 0.37558090686798096, "learning_rate": 1.794243877808634e-05, "loss": 0.0597, "step": 7960 }, { "epoch": 2.0121181519818228, "grad_norm": 0.27011531591415405, "learning_rate": 1.7927291088109064e-05, "loss": 0.0686, "step": 7970 }, { "epoch": 2.014642766978036, "grad_norm": 0.4313770830631256, "learning_rate": 1.7912143398131786e-05, "loss": 0.0632, "step": 7980 }, { "epoch": 2.017167381974249, "grad_norm": 0.28532424569129944, "learning_rate": 1.7896995708154504e-05, "loss": 0.0509, "step": 7990 }, { "epoch": 2.019691996970462, "grad_norm": 0.4658902883529663, "learning_rate": 1.788184801817723e-05, "loss": 0.0736, "step": 8000 }, { "epoch": 2.022216611966675, "grad_norm": 0.3360334634780884, "learning_rate": 1.786670032819995e-05, "loss": 0.0625, "step": 8010 }, { "epoch": 2.024741226962888, "grad_norm": 0.41845816373825073, "learning_rate": 1.785155263822267e-05, "loss": 0.0694, "step": 8020 }, { "epoch": 2.0272658419591014, "grad_norm": 0.39248284697532654, "learning_rate": 1.7836404948245394e-05, "loss": 0.0717, "step": 8030 }, { "epoch": 2.0297904569553142, "grad_norm": 0.36341196298599243, "learning_rate": 1.7821257258268115e-05, "loss": 0.0678, "step": 8040 }, { "epoch": 2.0323150719515275, "grad_norm": 0.6026912927627563, "learning_rate": 1.7806109568290833e-05, "loss": 0.0735, "step": 8050 }, { "epoch": 2.0348396869477403, "grad_norm": 0.44328150153160095, "learning_rate": 1.779096187831356e-05, "loss": 0.0735, "step": 8060 }, { "epoch": 2.0373643019439536, "grad_norm": 0.43636953830718994, "learning_rate": 1.777581418833628e-05, "loss": 0.068, "step": 8070 }, { "epoch": 2.039888916940167, "grad_norm": 0.5925287008285522, "learning_rate": 1.7760666498358998e-05, "loss": 0.0694, "step": 8080 }, { "epoch": 2.0424135319363796, "grad_norm": 0.5771965384483337, "learning_rate": 1.7745518808381723e-05, "loss": 0.0677, "step": 8090 }, { "epoch": 2.044938146932593, "grad_norm": 0.5764312744140625, "learning_rate": 1.7730371118404445e-05, "loss": 0.0716, "step": 8100 }, { "epoch": 2.0474627619288057, "grad_norm": 0.5081468820571899, "learning_rate": 1.7715223428427163e-05, "loss": 0.0604, "step": 8110 }, { "epoch": 2.049987376925019, "grad_norm": 0.3322690427303314, "learning_rate": 1.7700075738449888e-05, "loss": 0.0612, "step": 8120 }, { "epoch": 2.0525119919212322, "grad_norm": 0.4772360622882843, "learning_rate": 1.768492804847261e-05, "loss": 0.0672, "step": 8130 }, { "epoch": 2.055036606917445, "grad_norm": 0.4442533850669861, "learning_rate": 1.7669780358495328e-05, "loss": 0.0537, "step": 8140 }, { "epoch": 2.0575612219136583, "grad_norm": 0.4396134614944458, "learning_rate": 1.7654632668518053e-05, "loss": 0.0592, "step": 8150 }, { "epoch": 2.060085836909871, "grad_norm": 1.0153625011444092, "learning_rate": 1.7639484978540774e-05, "loss": 0.068, "step": 8160 }, { "epoch": 2.0626104519060844, "grad_norm": 0.4387151896953583, "learning_rate": 1.7624337288563493e-05, "loss": 0.0737, "step": 8170 }, { "epoch": 2.065135066902297, "grad_norm": 0.6584615111351013, "learning_rate": 1.7609189598586218e-05, "loss": 0.0623, "step": 8180 }, { "epoch": 2.0676596818985105, "grad_norm": 0.5479759573936462, "learning_rate": 1.759404190860894e-05, "loss": 0.0728, "step": 8190 }, { "epoch": 2.0701842968947237, "grad_norm": 0.7336176633834839, "learning_rate": 1.7578894218631657e-05, "loss": 0.0735, "step": 8200 }, { "epoch": 2.0727089118909365, "grad_norm": 0.41931140422821045, "learning_rate": 1.7563746528654382e-05, "loss": 0.0687, "step": 8210 }, { "epoch": 2.07523352688715, "grad_norm": 0.48326900601387024, "learning_rate": 1.7548598838677104e-05, "loss": 0.071, "step": 8220 }, { "epoch": 2.0777581418833626, "grad_norm": 0.5461397767066956, "learning_rate": 1.7533451148699822e-05, "loss": 0.0588, "step": 8230 }, { "epoch": 2.080282756879576, "grad_norm": 0.5842398405075073, "learning_rate": 1.7518303458722547e-05, "loss": 0.0687, "step": 8240 }, { "epoch": 2.082807371875789, "grad_norm": 0.4675542116165161, "learning_rate": 1.7503155768745265e-05, "loss": 0.0789, "step": 8250 }, { "epoch": 2.085331986872002, "grad_norm": 0.38348913192749023, "learning_rate": 1.7488008078767987e-05, "loss": 0.0605, "step": 8260 }, { "epoch": 2.087856601868215, "grad_norm": 0.608074963092804, "learning_rate": 1.7472860388790712e-05, "loss": 0.068, "step": 8270 }, { "epoch": 2.090381216864428, "grad_norm": 0.27725252509117126, "learning_rate": 1.745771269881343e-05, "loss": 0.0684, "step": 8280 }, { "epoch": 2.0929058318606413, "grad_norm": 0.5501505732536316, "learning_rate": 1.7442565008836152e-05, "loss": 0.081, "step": 8290 }, { "epoch": 2.0954304468568545, "grad_norm": 0.28557008504867554, "learning_rate": 1.7427417318858877e-05, "loss": 0.06, "step": 8300 }, { "epoch": 2.0979550618530673, "grad_norm": 0.6145514249801636, "learning_rate": 1.7412269628881595e-05, "loss": 0.0684, "step": 8310 }, { "epoch": 2.1004796768492806, "grad_norm": 0.2920602262020111, "learning_rate": 1.7397121938904317e-05, "loss": 0.0693, "step": 8320 }, { "epoch": 2.1030042918454934, "grad_norm": 0.48144832253456116, "learning_rate": 1.738197424892704e-05, "loss": 0.0705, "step": 8330 }, { "epoch": 2.1055289068417067, "grad_norm": 0.30737602710723877, "learning_rate": 1.736682655894976e-05, "loss": 0.068, "step": 8340 }, { "epoch": 2.1080535218379195, "grad_norm": 0.5024394989013672, "learning_rate": 1.735167886897248e-05, "loss": 0.0662, "step": 8350 }, { "epoch": 2.1105781368341328, "grad_norm": 0.3547489643096924, "learning_rate": 1.7336531178995206e-05, "loss": 0.0586, "step": 8360 }, { "epoch": 2.113102751830346, "grad_norm": 0.41034796833992004, "learning_rate": 1.7321383489017924e-05, "loss": 0.074, "step": 8370 }, { "epoch": 2.115627366826559, "grad_norm": 0.6292237639427185, "learning_rate": 1.7306235799040646e-05, "loss": 0.0741, "step": 8380 }, { "epoch": 2.118151981822772, "grad_norm": 0.6585634350776672, "learning_rate": 1.729108810906337e-05, "loss": 0.0647, "step": 8390 }, { "epoch": 2.120676596818985, "grad_norm": 0.44634106755256653, "learning_rate": 1.727594041908609e-05, "loss": 0.0599, "step": 8400 }, { "epoch": 2.123201211815198, "grad_norm": 0.570796549320221, "learning_rate": 1.726079272910881e-05, "loss": 0.0692, "step": 8410 }, { "epoch": 2.1257258268114114, "grad_norm": 0.8458355069160461, "learning_rate": 1.7245645039131532e-05, "loss": 0.0707, "step": 8420 }, { "epoch": 2.1282504418076242, "grad_norm": 0.41282087564468384, "learning_rate": 1.7230497349154254e-05, "loss": 0.0647, "step": 8430 }, { "epoch": 2.1307750568038375, "grad_norm": 0.39141160249710083, "learning_rate": 1.7215349659176976e-05, "loss": 0.0547, "step": 8440 }, { "epoch": 2.1332996718000503, "grad_norm": 0.564751923084259, "learning_rate": 1.7200201969199697e-05, "loss": 0.0632, "step": 8450 }, { "epoch": 2.1358242867962636, "grad_norm": 0.7247843146324158, "learning_rate": 1.718505427922242e-05, "loss": 0.0639, "step": 8460 }, { "epoch": 2.138348901792477, "grad_norm": 0.5174043774604797, "learning_rate": 1.716990658924514e-05, "loss": 0.062, "step": 8470 }, { "epoch": 2.1408735167886896, "grad_norm": 0.4329341650009155, "learning_rate": 1.7154758899267862e-05, "loss": 0.0651, "step": 8480 }, { "epoch": 2.143398131784903, "grad_norm": 0.41230252385139465, "learning_rate": 1.7139611209290584e-05, "loss": 0.0633, "step": 8490 }, { "epoch": 2.1459227467811157, "grad_norm": 0.5256580114364624, "learning_rate": 1.7124463519313305e-05, "loss": 0.0631, "step": 8500 }, { "epoch": 2.148447361777329, "grad_norm": 0.4602107107639313, "learning_rate": 1.7109315829336027e-05, "loss": 0.0732, "step": 8510 }, { "epoch": 2.1509719767735422, "grad_norm": 0.6555882096290588, "learning_rate": 1.709416813935875e-05, "loss": 0.0704, "step": 8520 }, { "epoch": 2.153496591769755, "grad_norm": 0.4370688199996948, "learning_rate": 1.707902044938147e-05, "loss": 0.0681, "step": 8530 }, { "epoch": 2.1560212067659683, "grad_norm": 0.7623379230499268, "learning_rate": 1.7063872759404188e-05, "loss": 0.0683, "step": 8540 }, { "epoch": 2.158545821762181, "grad_norm": 0.437193363904953, "learning_rate": 1.7048725069426913e-05, "loss": 0.063, "step": 8550 }, { "epoch": 2.1610704367583944, "grad_norm": 0.32816392183303833, "learning_rate": 1.7033577379449635e-05, "loss": 0.0637, "step": 8560 }, { "epoch": 2.1635950517546076, "grad_norm": 0.2817254960536957, "learning_rate": 1.7018429689472353e-05, "loss": 0.0563, "step": 8570 }, { "epoch": 2.1661196667508205, "grad_norm": 0.5059931874275208, "learning_rate": 1.7003281999495078e-05, "loss": 0.0582, "step": 8580 }, { "epoch": 2.1686442817470337, "grad_norm": 0.4076451063156128, "learning_rate": 1.69881343095178e-05, "loss": 0.0521, "step": 8590 }, { "epoch": 2.1711688967432465, "grad_norm": 0.5454453229904175, "learning_rate": 1.6972986619540518e-05, "loss": 0.0615, "step": 8600 }, { "epoch": 2.17369351173946, "grad_norm": 0.39881065487861633, "learning_rate": 1.6957838929563243e-05, "loss": 0.0607, "step": 8610 }, { "epoch": 2.1762181267356726, "grad_norm": 0.38210931420326233, "learning_rate": 1.6942691239585964e-05, "loss": 0.073, "step": 8620 }, { "epoch": 2.178742741731886, "grad_norm": 0.4913787245750427, "learning_rate": 1.6927543549608683e-05, "loss": 0.069, "step": 8630 }, { "epoch": 2.181267356728099, "grad_norm": 0.4121997356414795, "learning_rate": 1.6912395859631408e-05, "loss": 0.0643, "step": 8640 }, { "epoch": 2.183791971724312, "grad_norm": 0.6225160360336304, "learning_rate": 1.689724816965413e-05, "loss": 0.0742, "step": 8650 }, { "epoch": 2.186316586720525, "grad_norm": 0.40772151947021484, "learning_rate": 1.6882100479676847e-05, "loss": 0.0767, "step": 8660 }, { "epoch": 2.188841201716738, "grad_norm": 0.38705331087112427, "learning_rate": 1.6866952789699572e-05, "loss": 0.0692, "step": 8670 }, { "epoch": 2.1913658167129513, "grad_norm": 0.4707449674606323, "learning_rate": 1.6851805099722294e-05, "loss": 0.0754, "step": 8680 }, { "epoch": 2.1938904317091645, "grad_norm": 0.35473600029945374, "learning_rate": 1.6836657409745012e-05, "loss": 0.0524, "step": 8690 }, { "epoch": 2.1964150467053773, "grad_norm": 0.2633616626262665, "learning_rate": 1.6821509719767737e-05, "loss": 0.0625, "step": 8700 }, { "epoch": 2.1989396617015906, "grad_norm": 0.6730284690856934, "learning_rate": 1.680636202979046e-05, "loss": 0.0647, "step": 8710 }, { "epoch": 2.2014642766978034, "grad_norm": 0.4892052412033081, "learning_rate": 1.6791214339813177e-05, "loss": 0.0567, "step": 8720 }, { "epoch": 2.2039888916940167, "grad_norm": 0.6040000319480896, "learning_rate": 1.6776066649835902e-05, "loss": 0.0647, "step": 8730 }, { "epoch": 2.20651350669023, "grad_norm": 0.43167611956596375, "learning_rate": 1.6760918959858623e-05, "loss": 0.0664, "step": 8740 }, { "epoch": 2.2090381216864428, "grad_norm": 0.651911735534668, "learning_rate": 1.6745771269881342e-05, "loss": 0.0746, "step": 8750 }, { "epoch": 2.211562736682656, "grad_norm": 0.6243721842765808, "learning_rate": 1.6730623579904067e-05, "loss": 0.0572, "step": 8760 }, { "epoch": 2.214087351678869, "grad_norm": 0.6300668120384216, "learning_rate": 1.6715475889926788e-05, "loss": 0.0592, "step": 8770 }, { "epoch": 2.216611966675082, "grad_norm": 0.529558002948761, "learning_rate": 1.6700328199949506e-05, "loss": 0.0665, "step": 8780 }, { "epoch": 2.219136581671295, "grad_norm": 0.3494657278060913, "learning_rate": 1.668518050997223e-05, "loss": 0.0748, "step": 8790 }, { "epoch": 2.221661196667508, "grad_norm": 0.4483802914619446, "learning_rate": 1.6670032819994953e-05, "loss": 0.0617, "step": 8800 }, { "epoch": 2.2241858116637214, "grad_norm": 0.5772615075111389, "learning_rate": 1.665488513001767e-05, "loss": 0.0618, "step": 8810 }, { "epoch": 2.2267104266599342, "grad_norm": 0.5170813202857971, "learning_rate": 1.6639737440040396e-05, "loss": 0.0626, "step": 8820 }, { "epoch": 2.2292350416561475, "grad_norm": 0.4822421669960022, "learning_rate": 1.6624589750063114e-05, "loss": 0.0715, "step": 8830 }, { "epoch": 2.2317596566523603, "grad_norm": 0.5152460336685181, "learning_rate": 1.6609442060085836e-05, "loss": 0.0628, "step": 8840 }, { "epoch": 2.2342842716485736, "grad_norm": 0.6732852458953857, "learning_rate": 1.659429437010856e-05, "loss": 0.0772, "step": 8850 }, { "epoch": 2.236808886644787, "grad_norm": 0.6237836480140686, "learning_rate": 1.657914668013128e-05, "loss": 0.0759, "step": 8860 }, { "epoch": 2.2393335016409996, "grad_norm": 0.3084138035774231, "learning_rate": 1.6563998990154e-05, "loss": 0.0588, "step": 8870 }, { "epoch": 2.241858116637213, "grad_norm": 0.4294467568397522, "learning_rate": 1.6548851300176726e-05, "loss": 0.0659, "step": 8880 }, { "epoch": 2.2443827316334257, "grad_norm": 0.42405208945274353, "learning_rate": 1.6533703610199444e-05, "loss": 0.0611, "step": 8890 }, { "epoch": 2.246907346629639, "grad_norm": 0.30669859051704407, "learning_rate": 1.6518555920222166e-05, "loss": 0.0729, "step": 8900 }, { "epoch": 2.2494319616258522, "grad_norm": 0.6405044794082642, "learning_rate": 1.650340823024489e-05, "loss": 0.0644, "step": 8910 }, { "epoch": 2.251956576622065, "grad_norm": 0.5801793336868286, "learning_rate": 1.648826054026761e-05, "loss": 0.0726, "step": 8920 }, { "epoch": 2.2544811916182783, "grad_norm": 0.5352018475532532, "learning_rate": 1.647311285029033e-05, "loss": 0.0505, "step": 8930 }, { "epoch": 2.257005806614491, "grad_norm": 0.4289718568325043, "learning_rate": 1.6457965160313055e-05, "loss": 0.0557, "step": 8940 }, { "epoch": 2.2595304216107044, "grad_norm": 0.6818227767944336, "learning_rate": 1.6442817470335774e-05, "loss": 0.061, "step": 8950 }, { "epoch": 2.2620550366069176, "grad_norm": 0.5462810397148132, "learning_rate": 1.6427669780358495e-05, "loss": 0.0623, "step": 8960 }, { "epoch": 2.2645796516031305, "grad_norm": 0.48088541626930237, "learning_rate": 1.6412522090381217e-05, "loss": 0.0768, "step": 8970 }, { "epoch": 2.2671042665993437, "grad_norm": 0.43914252519607544, "learning_rate": 1.639737440040394e-05, "loss": 0.0611, "step": 8980 }, { "epoch": 2.2696288815955565, "grad_norm": 0.3516700267791748, "learning_rate": 1.638222671042666e-05, "loss": 0.0766, "step": 8990 }, { "epoch": 2.27215349659177, "grad_norm": 0.38341203331947327, "learning_rate": 1.636707902044938e-05, "loss": 0.0631, "step": 9000 }, { "epoch": 2.274678111587983, "grad_norm": 0.5238111615180969, "learning_rate": 1.6351931330472103e-05, "loss": 0.0568, "step": 9010 }, { "epoch": 2.277202726584196, "grad_norm": 0.31196850538253784, "learning_rate": 1.6336783640494825e-05, "loss": 0.0671, "step": 9020 }, { "epoch": 2.279727341580409, "grad_norm": 0.4655681550502777, "learning_rate": 1.6321635950517546e-05, "loss": 0.0671, "step": 9030 }, { "epoch": 2.282251956576622, "grad_norm": 0.4430312514305115, "learning_rate": 1.6306488260540268e-05, "loss": 0.0603, "step": 9040 }, { "epoch": 2.284776571572835, "grad_norm": 0.5572938323020935, "learning_rate": 1.629134057056299e-05, "loss": 0.0516, "step": 9050 }, { "epoch": 2.2873011865690485, "grad_norm": 0.48111096024513245, "learning_rate": 1.627619288058571e-05, "loss": 0.0658, "step": 9060 }, { "epoch": 2.2898258015652613, "grad_norm": 0.40908750891685486, "learning_rate": 1.6261045190608433e-05, "loss": 0.0599, "step": 9070 }, { "epoch": 2.2923504165614745, "grad_norm": 0.45796236395835876, "learning_rate": 1.6245897500631154e-05, "loss": 0.0617, "step": 9080 }, { "epoch": 2.2948750315576874, "grad_norm": 0.2645781934261322, "learning_rate": 1.6230749810653876e-05, "loss": 0.0573, "step": 9090 }, { "epoch": 2.2973996465539006, "grad_norm": 0.45168253779411316, "learning_rate": 1.6215602120676598e-05, "loss": 0.0613, "step": 9100 }, { "epoch": 2.2999242615501134, "grad_norm": 0.3695938289165497, "learning_rate": 1.620045443069932e-05, "loss": 0.0652, "step": 9110 }, { "epoch": 2.3024488765463267, "grad_norm": 0.36523544788360596, "learning_rate": 1.6185306740722037e-05, "loss": 0.0634, "step": 9120 }, { "epoch": 2.30497349154254, "grad_norm": 0.5313533544540405, "learning_rate": 1.6170159050744762e-05, "loss": 0.0646, "step": 9130 }, { "epoch": 2.3074981065387528, "grad_norm": 0.47745630145072937, "learning_rate": 1.6155011360767484e-05, "loss": 0.0643, "step": 9140 }, { "epoch": 2.310022721534966, "grad_norm": 0.4751470685005188, "learning_rate": 1.6139863670790202e-05, "loss": 0.0549, "step": 9150 }, { "epoch": 2.312547336531179, "grad_norm": 0.5816293358802795, "learning_rate": 1.6124715980812927e-05, "loss": 0.0616, "step": 9160 }, { "epoch": 2.315071951527392, "grad_norm": 0.44545724987983704, "learning_rate": 1.610956829083565e-05, "loss": 0.0659, "step": 9170 }, { "epoch": 2.317596566523605, "grad_norm": 0.3668888509273529, "learning_rate": 1.6094420600858367e-05, "loss": 0.0632, "step": 9180 }, { "epoch": 2.320121181519818, "grad_norm": 0.40825653076171875, "learning_rate": 1.6079272910881092e-05, "loss": 0.0794, "step": 9190 }, { "epoch": 2.3226457965160314, "grad_norm": 0.4783599376678467, "learning_rate": 1.6064125220903813e-05, "loss": 0.0709, "step": 9200 }, { "epoch": 2.3251704115122442, "grad_norm": 0.5379131436347961, "learning_rate": 1.604897753092653e-05, "loss": 0.0587, "step": 9210 }, { "epoch": 2.3276950265084575, "grad_norm": 0.5364894866943359, "learning_rate": 1.6033829840949257e-05, "loss": 0.0846, "step": 9220 }, { "epoch": 2.3302196415046703, "grad_norm": 0.4032694399356842, "learning_rate": 1.6018682150971978e-05, "loss": 0.0683, "step": 9230 }, { "epoch": 2.3327442565008836, "grad_norm": 0.44544950127601624, "learning_rate": 1.6003534460994696e-05, "loss": 0.0609, "step": 9240 }, { "epoch": 2.335268871497097, "grad_norm": 0.39711642265319824, "learning_rate": 1.598838677101742e-05, "loss": 0.0531, "step": 9250 }, { "epoch": 2.3377934864933096, "grad_norm": 0.7067275643348694, "learning_rate": 1.5973239081040143e-05, "loss": 0.0645, "step": 9260 }, { "epoch": 2.340318101489523, "grad_norm": 0.404411256313324, "learning_rate": 1.595809139106286e-05, "loss": 0.0843, "step": 9270 }, { "epoch": 2.3428427164857357, "grad_norm": 0.3514604866504669, "learning_rate": 1.5942943701085586e-05, "loss": 0.0607, "step": 9280 }, { "epoch": 2.345367331481949, "grad_norm": 0.5159276723861694, "learning_rate": 1.5927796011108308e-05, "loss": 0.081, "step": 9290 }, { "epoch": 2.3478919464781622, "grad_norm": 0.3323509395122528, "learning_rate": 1.5912648321131026e-05, "loss": 0.0694, "step": 9300 }, { "epoch": 2.350416561474375, "grad_norm": 0.32046014070510864, "learning_rate": 1.589750063115375e-05, "loss": 0.0596, "step": 9310 }, { "epoch": 2.3529411764705883, "grad_norm": 0.7215676307678223, "learning_rate": 1.5882352941176473e-05, "loss": 0.0699, "step": 9320 }, { "epoch": 2.355465791466801, "grad_norm": 0.2936984896659851, "learning_rate": 1.586720525119919e-05, "loss": 0.0669, "step": 9330 }, { "epoch": 2.3579904064630144, "grad_norm": 0.4268344044685364, "learning_rate": 1.5852057561221916e-05, "loss": 0.0814, "step": 9340 }, { "epoch": 2.3605150214592276, "grad_norm": 0.61527419090271, "learning_rate": 1.5836909871244637e-05, "loss": 0.0766, "step": 9350 }, { "epoch": 2.3630396364554405, "grad_norm": 0.6297392249107361, "learning_rate": 1.5821762181267356e-05, "loss": 0.0697, "step": 9360 }, { "epoch": 2.3655642514516537, "grad_norm": 0.5995168685913086, "learning_rate": 1.580661449129008e-05, "loss": 0.0556, "step": 9370 }, { "epoch": 2.3680888664478665, "grad_norm": 0.246127650141716, "learning_rate": 1.5791466801312802e-05, "loss": 0.0651, "step": 9380 }, { "epoch": 2.37061348144408, "grad_norm": 0.5713209509849548, "learning_rate": 1.577631911133552e-05, "loss": 0.0544, "step": 9390 }, { "epoch": 2.373138096440293, "grad_norm": 0.6119291186332703, "learning_rate": 1.5761171421358245e-05, "loss": 0.0598, "step": 9400 }, { "epoch": 2.375662711436506, "grad_norm": 0.612404465675354, "learning_rate": 1.5746023731380964e-05, "loss": 0.0605, "step": 9410 }, { "epoch": 2.378187326432719, "grad_norm": 0.5621523857116699, "learning_rate": 1.5730876041403685e-05, "loss": 0.0637, "step": 9420 }, { "epoch": 2.380711941428932, "grad_norm": 0.24175520241260529, "learning_rate": 1.571572835142641e-05, "loss": 0.0576, "step": 9430 }, { "epoch": 2.383236556425145, "grad_norm": 0.5484057664871216, "learning_rate": 1.570058066144913e-05, "loss": 0.0633, "step": 9440 }, { "epoch": 2.3857611714213585, "grad_norm": 0.4739590287208557, "learning_rate": 1.568543297147185e-05, "loss": 0.0768, "step": 9450 }, { "epoch": 2.3882857864175713, "grad_norm": 0.3890620172023773, "learning_rate": 1.5670285281494575e-05, "loss": 0.07, "step": 9460 }, { "epoch": 2.3908104014137845, "grad_norm": 0.47377142310142517, "learning_rate": 1.5655137591517293e-05, "loss": 0.0611, "step": 9470 }, { "epoch": 2.3933350164099974, "grad_norm": 0.6585646867752075, "learning_rate": 1.5639989901540015e-05, "loss": 0.0543, "step": 9480 }, { "epoch": 2.3958596314062106, "grad_norm": 0.28793618083000183, "learning_rate": 1.562484221156274e-05, "loss": 0.0696, "step": 9490 }, { "epoch": 2.398384246402424, "grad_norm": 0.3433607816696167, "learning_rate": 1.5609694521585458e-05, "loss": 0.0618, "step": 9500 }, { "epoch": 2.4009088613986367, "grad_norm": 0.3919399678707123, "learning_rate": 1.559454683160818e-05, "loss": 0.0628, "step": 9510 }, { "epoch": 2.40343347639485, "grad_norm": 0.6731687784194946, "learning_rate": 1.55793991416309e-05, "loss": 0.0657, "step": 9520 }, { "epoch": 2.4059580913910628, "grad_norm": 0.25971853733062744, "learning_rate": 1.5564251451653623e-05, "loss": 0.0754, "step": 9530 }, { "epoch": 2.408482706387276, "grad_norm": 0.4266602396965027, "learning_rate": 1.5549103761676344e-05, "loss": 0.0527, "step": 9540 }, { "epoch": 2.411007321383489, "grad_norm": 0.29480141401290894, "learning_rate": 1.5533956071699066e-05, "loss": 0.0815, "step": 9550 }, { "epoch": 2.413531936379702, "grad_norm": 0.6023832559585571, "learning_rate": 1.5518808381721787e-05, "loss": 0.0796, "step": 9560 }, { "epoch": 2.4160565513759154, "grad_norm": 0.4575349688529968, "learning_rate": 1.550366069174451e-05, "loss": 0.0694, "step": 9570 }, { "epoch": 2.418581166372128, "grad_norm": 0.7292725443840027, "learning_rate": 1.548851300176723e-05, "loss": 0.0545, "step": 9580 }, { "epoch": 2.4211057813683414, "grad_norm": 1.021568775177002, "learning_rate": 1.5473365311789952e-05, "loss": 0.0528, "step": 9590 }, { "epoch": 2.4236303963645542, "grad_norm": 0.434799462556839, "learning_rate": 1.5458217621812674e-05, "loss": 0.0642, "step": 9600 }, { "epoch": 2.4261550113607675, "grad_norm": 0.5191155672073364, "learning_rate": 1.5443069931835395e-05, "loss": 0.0694, "step": 9610 }, { "epoch": 2.4286796263569803, "grad_norm": 0.5991169214248657, "learning_rate": 1.5427922241858117e-05, "loss": 0.062, "step": 9620 }, { "epoch": 2.4312042413531936, "grad_norm": 0.3980846107006073, "learning_rate": 1.541277455188084e-05, "loss": 0.0576, "step": 9630 }, { "epoch": 2.433728856349407, "grad_norm": 0.7166936993598938, "learning_rate": 1.539762686190356e-05, "loss": 0.0624, "step": 9640 }, { "epoch": 2.4362534713456196, "grad_norm": 0.5147587656974792, "learning_rate": 1.5382479171926282e-05, "loss": 0.0656, "step": 9650 }, { "epoch": 2.438778086341833, "grad_norm": 0.6524437665939331, "learning_rate": 1.5367331481949003e-05, "loss": 0.0603, "step": 9660 }, { "epoch": 2.4413027013380457, "grad_norm": 0.2982568144798279, "learning_rate": 1.5352183791971725e-05, "loss": 0.0597, "step": 9670 }, { "epoch": 2.443827316334259, "grad_norm": 0.3818541169166565, "learning_rate": 1.5337036101994447e-05, "loss": 0.0646, "step": 9680 }, { "epoch": 2.4463519313304722, "grad_norm": 0.5956404209136963, "learning_rate": 1.5321888412017168e-05, "loss": 0.0632, "step": 9690 }, { "epoch": 2.448876546326685, "grad_norm": 0.6520951986312866, "learning_rate": 1.5306740722039886e-05, "loss": 0.0646, "step": 9700 }, { "epoch": 2.4514011613228983, "grad_norm": 0.4105677008628845, "learning_rate": 1.529159303206261e-05, "loss": 0.0562, "step": 9710 }, { "epoch": 2.453925776319111, "grad_norm": 0.4286990165710449, "learning_rate": 1.5276445342085333e-05, "loss": 0.0624, "step": 9720 }, { "epoch": 2.4564503913153244, "grad_norm": 0.4525969922542572, "learning_rate": 1.526129765210805e-05, "loss": 0.0661, "step": 9730 }, { "epoch": 2.4589750063115376, "grad_norm": 0.39278095960617065, "learning_rate": 1.5246149962130776e-05, "loss": 0.0597, "step": 9740 }, { "epoch": 2.4614996213077505, "grad_norm": 0.3804191052913666, "learning_rate": 1.5231002272153498e-05, "loss": 0.0581, "step": 9750 }, { "epoch": 2.4640242363039637, "grad_norm": 0.6127219796180725, "learning_rate": 1.5215854582176218e-05, "loss": 0.0525, "step": 9760 }, { "epoch": 2.4665488513001765, "grad_norm": 0.6907774209976196, "learning_rate": 1.5200706892198941e-05, "loss": 0.065, "step": 9770 }, { "epoch": 2.46907346629639, "grad_norm": 0.5706852674484253, "learning_rate": 1.5185559202221661e-05, "loss": 0.0556, "step": 9780 }, { "epoch": 2.471598081292603, "grad_norm": 0.5961136221885681, "learning_rate": 1.5170411512244382e-05, "loss": 0.0598, "step": 9790 }, { "epoch": 2.474122696288816, "grad_norm": 0.4143712818622589, "learning_rate": 1.5155263822267106e-05, "loss": 0.0605, "step": 9800 }, { "epoch": 2.476647311285029, "grad_norm": 0.4803559184074402, "learning_rate": 1.5140116132289826e-05, "loss": 0.0568, "step": 9810 }, { "epoch": 2.479171926281242, "grad_norm": 0.45310842990875244, "learning_rate": 1.5124968442312547e-05, "loss": 0.0728, "step": 9820 }, { "epoch": 2.481696541277455, "grad_norm": 0.45321986079216003, "learning_rate": 1.510982075233527e-05, "loss": 0.0597, "step": 9830 }, { "epoch": 2.4842211562736685, "grad_norm": 0.5330350995063782, "learning_rate": 1.509467306235799e-05, "loss": 0.0598, "step": 9840 }, { "epoch": 2.4867457712698813, "grad_norm": 0.5140849947929382, "learning_rate": 1.5079525372380712e-05, "loss": 0.0658, "step": 9850 }, { "epoch": 2.4892703862660945, "grad_norm": 0.4241473972797394, "learning_rate": 1.5064377682403435e-05, "loss": 0.0586, "step": 9860 }, { "epoch": 2.4917950012623074, "grad_norm": 0.28790709376335144, "learning_rate": 1.5049229992426155e-05, "loss": 0.0634, "step": 9870 }, { "epoch": 2.4943196162585206, "grad_norm": 0.5911887288093567, "learning_rate": 1.5034082302448877e-05, "loss": 0.069, "step": 9880 }, { "epoch": 2.496844231254734, "grad_norm": 0.5213605761528015, "learning_rate": 1.50189346124716e-05, "loss": 0.0699, "step": 9890 }, { "epoch": 2.4993688462509467, "grad_norm": 0.5791930556297302, "learning_rate": 1.500378692249432e-05, "loss": 0.0546, "step": 9900 }, { "epoch": 2.50189346124716, "grad_norm": 0.5724365711212158, "learning_rate": 1.498863923251704e-05, "loss": 0.0554, "step": 9910 }, { "epoch": 2.5044180762433728, "grad_norm": 0.5653268098831177, "learning_rate": 1.4973491542539763e-05, "loss": 0.0745, "step": 9920 }, { "epoch": 2.506942691239586, "grad_norm": 0.6643403768539429, "learning_rate": 1.4958343852562485e-05, "loss": 0.0595, "step": 9930 }, { "epoch": 2.5094673062357993, "grad_norm": 0.5853692293167114, "learning_rate": 1.4943196162585205e-05, "loss": 0.066, "step": 9940 }, { "epoch": 2.511991921232012, "grad_norm": 0.3577601909637451, "learning_rate": 1.4928048472607928e-05, "loss": 0.0661, "step": 9950 }, { "epoch": 2.5145165362282254, "grad_norm": 0.44483545422554016, "learning_rate": 1.491290078263065e-05, "loss": 0.0679, "step": 9960 }, { "epoch": 2.517041151224438, "grad_norm": 0.7027893662452698, "learning_rate": 1.489775309265337e-05, "loss": 0.065, "step": 9970 }, { "epoch": 2.5195657662206514, "grad_norm": 0.8064749240875244, "learning_rate": 1.4882605402676093e-05, "loss": 0.0532, "step": 9980 }, { "epoch": 2.5220903812168647, "grad_norm": 0.3491119146347046, "learning_rate": 1.4867457712698814e-05, "loss": 0.0586, "step": 9990 }, { "epoch": 2.5246149962130775, "grad_norm": 0.3388938307762146, "learning_rate": 1.4852310022721534e-05, "loss": 0.0663, "step": 10000 }, { "epoch": 2.5271396112092903, "grad_norm": 0.4186761975288391, "learning_rate": 1.4837162332744258e-05, "loss": 0.0611, "step": 10010 }, { "epoch": 2.5296642262055036, "grad_norm": 0.4181320071220398, "learning_rate": 1.4822014642766979e-05, "loss": 0.0554, "step": 10020 }, { "epoch": 2.532188841201717, "grad_norm": 0.555503785610199, "learning_rate": 1.4806866952789699e-05, "loss": 0.0749, "step": 10030 }, { "epoch": 2.53471345619793, "grad_norm": 0.4351447522640228, "learning_rate": 1.4791719262812422e-05, "loss": 0.0635, "step": 10040 }, { "epoch": 2.537238071194143, "grad_norm": 0.3531211018562317, "learning_rate": 1.4776571572835144e-05, "loss": 0.065, "step": 10050 }, { "epoch": 2.5397626861903557, "grad_norm": 0.3265394866466522, "learning_rate": 1.4761423882857864e-05, "loss": 0.0569, "step": 10060 }, { "epoch": 2.542287301186569, "grad_norm": 0.30541831254959106, "learning_rate": 1.4746276192880585e-05, "loss": 0.0704, "step": 10070 }, { "epoch": 2.5448119161827822, "grad_norm": 0.5428284406661987, "learning_rate": 1.4731128502903309e-05, "loss": 0.0591, "step": 10080 }, { "epoch": 2.547336531178995, "grad_norm": 0.6441836357116699, "learning_rate": 1.4715980812926029e-05, "loss": 0.0717, "step": 10090 }, { "epoch": 2.5498611461752083, "grad_norm": 0.35294800996780396, "learning_rate": 1.470083312294875e-05, "loss": 0.0502, "step": 10100 }, { "epoch": 2.552385761171421, "grad_norm": 0.36238688230514526, "learning_rate": 1.4685685432971473e-05, "loss": 0.073, "step": 10110 }, { "epoch": 2.5549103761676344, "grad_norm": 0.42970699071884155, "learning_rate": 1.4670537742994193e-05, "loss": 0.0505, "step": 10120 }, { "epoch": 2.5574349911638476, "grad_norm": 0.6428592205047607, "learning_rate": 1.4655390053016915e-05, "loss": 0.0779, "step": 10130 }, { "epoch": 2.5599596061600605, "grad_norm": 0.6469938158988953, "learning_rate": 1.4640242363039638e-05, "loss": 0.0654, "step": 10140 }, { "epoch": 2.5624842211562737, "grad_norm": 0.5174428224563599, "learning_rate": 1.4625094673062358e-05, "loss": 0.0723, "step": 10150 }, { "epoch": 2.5650088361524865, "grad_norm": 0.4240271747112274, "learning_rate": 1.460994698308508e-05, "loss": 0.0593, "step": 10160 }, { "epoch": 2.5675334511487, "grad_norm": 0.763982892036438, "learning_rate": 1.4594799293107803e-05, "loss": 0.0576, "step": 10170 }, { "epoch": 2.570058066144913, "grad_norm": 0.6305162906646729, "learning_rate": 1.4579651603130523e-05, "loss": 0.0742, "step": 10180 }, { "epoch": 2.572582681141126, "grad_norm": 0.4666813910007477, "learning_rate": 1.4564503913153245e-05, "loss": 0.0645, "step": 10190 }, { "epoch": 2.575107296137339, "grad_norm": 0.5371018052101135, "learning_rate": 1.4549356223175964e-05, "loss": 0.0706, "step": 10200 }, { "epoch": 2.577631911133552, "grad_norm": 0.38728779554367065, "learning_rate": 1.4534208533198688e-05, "loss": 0.0695, "step": 10210 }, { "epoch": 2.580156526129765, "grad_norm": 0.537994921207428, "learning_rate": 1.451906084322141e-05, "loss": 0.0714, "step": 10220 }, { "epoch": 2.5826811411259785, "grad_norm": 0.41289886832237244, "learning_rate": 1.450391315324413e-05, "loss": 0.0703, "step": 10230 }, { "epoch": 2.5852057561221913, "grad_norm": 0.5759279727935791, "learning_rate": 1.4488765463266852e-05, "loss": 0.066, "step": 10240 }, { "epoch": 2.5877303711184045, "grad_norm": 0.5359590649604797, "learning_rate": 1.4473617773289574e-05, "loss": 0.0704, "step": 10250 }, { "epoch": 2.5902549861146174, "grad_norm": 0.3472817540168762, "learning_rate": 1.4458470083312294e-05, "loss": 0.0545, "step": 10260 }, { "epoch": 2.5927796011108306, "grad_norm": 0.5667416453361511, "learning_rate": 1.4443322393335017e-05, "loss": 0.0577, "step": 10270 }, { "epoch": 2.595304216107044, "grad_norm": 0.5809875726699829, "learning_rate": 1.4428174703357739e-05, "loss": 0.0551, "step": 10280 }, { "epoch": 2.5978288311032567, "grad_norm": 0.4938434362411499, "learning_rate": 1.4413027013380459e-05, "loss": 0.056, "step": 10290 }, { "epoch": 2.60035344609947, "grad_norm": 0.33972424268722534, "learning_rate": 1.4397879323403182e-05, "loss": 0.0595, "step": 10300 }, { "epoch": 2.6028780610956828, "grad_norm": 0.42544251680374146, "learning_rate": 1.4382731633425904e-05, "loss": 0.0608, "step": 10310 }, { "epoch": 2.605402676091896, "grad_norm": 0.3733600378036499, "learning_rate": 1.4367583943448624e-05, "loss": 0.0496, "step": 10320 }, { "epoch": 2.6079272910881093, "grad_norm": 0.313872754573822, "learning_rate": 1.4352436253471347e-05, "loss": 0.0641, "step": 10330 }, { "epoch": 2.610451906084322, "grad_norm": 0.43706610798835754, "learning_rate": 1.4337288563494068e-05, "loss": 0.0546, "step": 10340 }, { "epoch": 2.6129765210805354, "grad_norm": 0.5997895002365112, "learning_rate": 1.4322140873516788e-05, "loss": 0.0645, "step": 10350 }, { "epoch": 2.615501136076748, "grad_norm": 0.4480789601802826, "learning_rate": 1.430699318353951e-05, "loss": 0.0563, "step": 10360 }, { "epoch": 2.6180257510729614, "grad_norm": 0.41993817687034607, "learning_rate": 1.4291845493562233e-05, "loss": 0.0578, "step": 10370 }, { "epoch": 2.6205503660691747, "grad_norm": 4.695851802825928, "learning_rate": 1.4276697803584953e-05, "loss": 0.0712, "step": 10380 }, { "epoch": 2.6230749810653875, "grad_norm": 0.5636560320854187, "learning_rate": 1.4261550113607675e-05, "loss": 0.0627, "step": 10390 }, { "epoch": 2.6255995960616008, "grad_norm": 0.2798742949962616, "learning_rate": 1.4246402423630398e-05, "loss": 0.0707, "step": 10400 }, { "epoch": 2.6281242110578136, "grad_norm": 0.5203710794448853, "learning_rate": 1.4231254733653118e-05, "loss": 0.0637, "step": 10410 }, { "epoch": 2.630648826054027, "grad_norm": 0.6120650768280029, "learning_rate": 1.421610704367584e-05, "loss": 0.0615, "step": 10420 }, { "epoch": 2.63317344105024, "grad_norm": 0.3410748541355133, "learning_rate": 1.4200959353698563e-05, "loss": 0.068, "step": 10430 }, { "epoch": 2.635698056046453, "grad_norm": 0.4275479018688202, "learning_rate": 1.4185811663721283e-05, "loss": 0.0638, "step": 10440 }, { "epoch": 2.6382226710426657, "grad_norm": 0.5067179799079895, "learning_rate": 1.4170663973744004e-05, "loss": 0.062, "step": 10450 }, { "epoch": 2.640747286038879, "grad_norm": 0.2900165617465973, "learning_rate": 1.4155516283766726e-05, "loss": 0.0603, "step": 10460 }, { "epoch": 2.6432719010350922, "grad_norm": 0.4279478192329407, "learning_rate": 1.4140368593789447e-05, "loss": 0.085, "step": 10470 }, { "epoch": 2.6457965160313055, "grad_norm": 0.5949720144271851, "learning_rate": 1.4125220903812169e-05, "loss": 0.0642, "step": 10480 }, { "epoch": 2.6483211310275183, "grad_norm": 0.4991612732410431, "learning_rate": 1.4110073213834889e-05, "loss": 0.0563, "step": 10490 }, { "epoch": 2.650845746023731, "grad_norm": 1.0148282051086426, "learning_rate": 1.4094925523857612e-05, "loss": 0.0675, "step": 10500 }, { "epoch": 2.6533703610199444, "grad_norm": 0.5165835022926331, "learning_rate": 1.4079777833880334e-05, "loss": 0.0713, "step": 10510 }, { "epoch": 2.6558949760161576, "grad_norm": 0.7783892154693604, "learning_rate": 1.4064630143903054e-05, "loss": 0.0625, "step": 10520 }, { "epoch": 2.6584195910123705, "grad_norm": 0.24568358063697815, "learning_rate": 1.4049482453925777e-05, "loss": 0.0578, "step": 10530 }, { "epoch": 2.6609442060085837, "grad_norm": 0.46469801664352417, "learning_rate": 1.4034334763948499e-05, "loss": 0.0734, "step": 10540 }, { "epoch": 2.6634688210047965, "grad_norm": 0.5760740637779236, "learning_rate": 1.4019187073971219e-05, "loss": 0.0677, "step": 10550 }, { "epoch": 2.66599343600101, "grad_norm": 0.6759510040283203, "learning_rate": 1.4004039383993942e-05, "loss": 0.064, "step": 10560 }, { "epoch": 2.668518050997223, "grad_norm": 0.6074597239494324, "learning_rate": 1.3988891694016663e-05, "loss": 0.0609, "step": 10570 }, { "epoch": 2.671042665993436, "grad_norm": 0.3964291214942932, "learning_rate": 1.3973744004039383e-05, "loss": 0.0589, "step": 10580 }, { "epoch": 2.673567280989649, "grad_norm": 0.4285549521446228, "learning_rate": 1.3958596314062107e-05, "loss": 0.0607, "step": 10590 }, { "epoch": 2.676091895985862, "grad_norm": 0.4889032244682312, "learning_rate": 1.3943448624084828e-05, "loss": 0.0664, "step": 10600 }, { "epoch": 2.678616510982075, "grad_norm": 0.5509054660797119, "learning_rate": 1.3928300934107548e-05, "loss": 0.0657, "step": 10610 }, { "epoch": 2.6811411259782885, "grad_norm": 0.5647663474082947, "learning_rate": 1.3913153244130271e-05, "loss": 0.0712, "step": 10620 }, { "epoch": 2.6836657409745013, "grad_norm": 0.558576762676239, "learning_rate": 1.3898005554152993e-05, "loss": 0.0713, "step": 10630 }, { "epoch": 2.6861903559707145, "grad_norm": 0.4516654908657074, "learning_rate": 1.3882857864175713e-05, "loss": 0.0631, "step": 10640 }, { "epoch": 2.6887149709669274, "grad_norm": 0.4486042559146881, "learning_rate": 1.3867710174198434e-05, "loss": 0.0653, "step": 10650 }, { "epoch": 2.6912395859631406, "grad_norm": 0.6781389117240906, "learning_rate": 1.3852562484221158e-05, "loss": 0.0632, "step": 10660 }, { "epoch": 2.693764200959354, "grad_norm": 0.49867871403694153, "learning_rate": 1.3837414794243878e-05, "loss": 0.069, "step": 10670 }, { "epoch": 2.6962888159555667, "grad_norm": 0.35217198729515076, "learning_rate": 1.38222671042666e-05, "loss": 0.0658, "step": 10680 }, { "epoch": 2.69881343095178, "grad_norm": 0.6483206152915955, "learning_rate": 1.3807119414289323e-05, "loss": 0.0632, "step": 10690 }, { "epoch": 2.7013380459479928, "grad_norm": 0.3511044979095459, "learning_rate": 1.3791971724312042e-05, "loss": 0.0785, "step": 10700 }, { "epoch": 2.703862660944206, "grad_norm": 0.38118109107017517, "learning_rate": 1.3776824034334764e-05, "loss": 0.0622, "step": 10710 }, { "epoch": 2.7063872759404193, "grad_norm": 0.6023518443107605, "learning_rate": 1.3761676344357486e-05, "loss": 0.0665, "step": 10720 }, { "epoch": 2.708911890936632, "grad_norm": 0.87665855884552, "learning_rate": 1.3746528654380207e-05, "loss": 0.0527, "step": 10730 }, { "epoch": 2.7114365059328454, "grad_norm": 0.24186670780181885, "learning_rate": 1.3731380964402929e-05, "loss": 0.0605, "step": 10740 }, { "epoch": 2.713961120929058, "grad_norm": 0.40543287992477417, "learning_rate": 1.371623327442565e-05, "loss": 0.0714, "step": 10750 }, { "epoch": 2.7164857359252714, "grad_norm": 0.5654604434967041, "learning_rate": 1.3701085584448372e-05, "loss": 0.0649, "step": 10760 }, { "epoch": 2.7190103509214847, "grad_norm": 0.4096258580684662, "learning_rate": 1.3685937894471094e-05, "loss": 0.0575, "step": 10770 }, { "epoch": 2.7215349659176975, "grad_norm": 0.28115513920783997, "learning_rate": 1.3670790204493814e-05, "loss": 0.0506, "step": 10780 }, { "epoch": 2.7240595809139108, "grad_norm": 0.4821475148200989, "learning_rate": 1.3655642514516537e-05, "loss": 0.0572, "step": 10790 }, { "epoch": 2.7265841959101236, "grad_norm": 0.6362419128417969, "learning_rate": 1.3640494824539258e-05, "loss": 0.07, "step": 10800 }, { "epoch": 2.729108810906337, "grad_norm": 0.33636218309402466, "learning_rate": 1.3625347134561978e-05, "loss": 0.0565, "step": 10810 }, { "epoch": 2.73163342590255, "grad_norm": 0.5278832912445068, "learning_rate": 1.3610199444584702e-05, "loss": 0.0742, "step": 10820 }, { "epoch": 2.734158040898763, "grad_norm": 0.6074432134628296, "learning_rate": 1.3595051754607423e-05, "loss": 0.0684, "step": 10830 }, { "epoch": 2.736682655894976, "grad_norm": 0.37134847044944763, "learning_rate": 1.3579904064630143e-05, "loss": 0.0757, "step": 10840 }, { "epoch": 2.739207270891189, "grad_norm": 0.3969337046146393, "learning_rate": 1.3564756374652866e-05, "loss": 0.0566, "step": 10850 }, { "epoch": 2.7417318858874022, "grad_norm": 0.8758741617202759, "learning_rate": 1.3549608684675588e-05, "loss": 0.0537, "step": 10860 }, { "epoch": 2.7442565008836155, "grad_norm": 0.619987428188324, "learning_rate": 1.3534460994698308e-05, "loss": 0.0597, "step": 10870 }, { "epoch": 2.7467811158798283, "grad_norm": 0.4209834635257721, "learning_rate": 1.3519313304721031e-05, "loss": 0.0666, "step": 10880 }, { "epoch": 2.749305730876041, "grad_norm": 0.44348636269569397, "learning_rate": 1.3504165614743753e-05, "loss": 0.0577, "step": 10890 }, { "epoch": 2.7518303458722544, "grad_norm": 0.3939560651779175, "learning_rate": 1.3489017924766473e-05, "loss": 0.0748, "step": 10900 }, { "epoch": 2.7543549608684676, "grad_norm": 0.44665518403053284, "learning_rate": 1.3473870234789196e-05, "loss": 0.05, "step": 10910 }, { "epoch": 2.756879575864681, "grad_norm": 0.42193907499313354, "learning_rate": 1.3458722544811918e-05, "loss": 0.0683, "step": 10920 }, { "epoch": 2.7594041908608937, "grad_norm": 0.17752446234226227, "learning_rate": 1.3443574854834637e-05, "loss": 0.0561, "step": 10930 }, { "epoch": 2.7619288058571065, "grad_norm": 0.3520565927028656, "learning_rate": 1.3428427164857359e-05, "loss": 0.0618, "step": 10940 }, { "epoch": 2.76445342085332, "grad_norm": 0.4644409418106079, "learning_rate": 1.3413279474880082e-05, "loss": 0.0565, "step": 10950 }, { "epoch": 2.766978035849533, "grad_norm": 0.5722434520721436, "learning_rate": 1.3398131784902802e-05, "loss": 0.0669, "step": 10960 }, { "epoch": 2.769502650845746, "grad_norm": 0.42250752449035645, "learning_rate": 1.3382984094925524e-05, "loss": 0.0536, "step": 10970 }, { "epoch": 2.772027265841959, "grad_norm": 0.35409343242645264, "learning_rate": 1.3367836404948247e-05, "loss": 0.0632, "step": 10980 }, { "epoch": 2.774551880838172, "grad_norm": 0.32060950994491577, "learning_rate": 1.3352688714970967e-05, "loss": 0.0533, "step": 10990 }, { "epoch": 2.777076495834385, "grad_norm": 0.5148407220840454, "learning_rate": 1.3337541024993689e-05, "loss": 0.063, "step": 11000 }, { "epoch": 2.7796011108305985, "grad_norm": 0.33185675740242004, "learning_rate": 1.332239333501641e-05, "loss": 0.0622, "step": 11010 }, { "epoch": 2.7821257258268113, "grad_norm": 0.6653043031692505, "learning_rate": 1.3307245645039132e-05, "loss": 0.0626, "step": 11020 }, { "epoch": 2.7846503408230245, "grad_norm": 0.8676443099975586, "learning_rate": 1.3292097955061853e-05, "loss": 0.0692, "step": 11030 }, { "epoch": 2.7871749558192374, "grad_norm": 0.6054368615150452, "learning_rate": 1.3276950265084575e-05, "loss": 0.0617, "step": 11040 }, { "epoch": 2.7896995708154506, "grad_norm": 0.416385680437088, "learning_rate": 1.3261802575107297e-05, "loss": 0.0576, "step": 11050 }, { "epoch": 2.792224185811664, "grad_norm": 0.37582552433013916, "learning_rate": 1.3246654885130018e-05, "loss": 0.0624, "step": 11060 }, { "epoch": 2.7947488008078767, "grad_norm": 0.44736284017562866, "learning_rate": 1.3231507195152738e-05, "loss": 0.0737, "step": 11070 }, { "epoch": 2.79727341580409, "grad_norm": 0.5286217927932739, "learning_rate": 1.3216359505175461e-05, "loss": 0.0644, "step": 11080 }, { "epoch": 2.7997980308003028, "grad_norm": 0.3506781756877899, "learning_rate": 1.3201211815198183e-05, "loss": 0.0661, "step": 11090 }, { "epoch": 2.802322645796516, "grad_norm": 0.3819282054901123, "learning_rate": 1.3186064125220903e-05, "loss": 0.0506, "step": 11100 }, { "epoch": 2.8048472607927293, "grad_norm": 0.4702123999595642, "learning_rate": 1.3170916435243626e-05, "loss": 0.0613, "step": 11110 }, { "epoch": 2.807371875788942, "grad_norm": 0.5164020657539368, "learning_rate": 1.3155768745266348e-05, "loss": 0.0569, "step": 11120 }, { "epoch": 2.8098964907851554, "grad_norm": 0.6037926077842712, "learning_rate": 1.3140621055289068e-05, "loss": 0.0697, "step": 11130 }, { "epoch": 2.812421105781368, "grad_norm": 0.5212513208389282, "learning_rate": 1.3125473365311791e-05, "loss": 0.0482, "step": 11140 }, { "epoch": 2.8149457207775814, "grad_norm": 0.46721410751342773, "learning_rate": 1.3110325675334513e-05, "loss": 0.0607, "step": 11150 }, { "epoch": 2.8174703357737947, "grad_norm": 0.48438936471939087, "learning_rate": 1.3095177985357232e-05, "loss": 0.0685, "step": 11160 }, { "epoch": 2.8199949507700075, "grad_norm": 0.40378254652023315, "learning_rate": 1.3080030295379956e-05, "loss": 0.0684, "step": 11170 }, { "epoch": 2.8225195657662208, "grad_norm": 0.4804742634296417, "learning_rate": 1.3064882605402677e-05, "loss": 0.074, "step": 11180 }, { "epoch": 2.8250441807624336, "grad_norm": 0.4946554899215698, "learning_rate": 1.3049734915425397e-05, "loss": 0.0694, "step": 11190 }, { "epoch": 2.827568795758647, "grad_norm": 0.3673482835292816, "learning_rate": 1.303458722544812e-05, "loss": 0.0694, "step": 11200 }, { "epoch": 2.83009341075486, "grad_norm": 0.4023125171661377, "learning_rate": 1.3019439535470842e-05, "loss": 0.0622, "step": 11210 }, { "epoch": 2.832618025751073, "grad_norm": 0.7260475158691406, "learning_rate": 1.3004291845493562e-05, "loss": 0.0761, "step": 11220 }, { "epoch": 2.835142640747286, "grad_norm": 0.5414501428604126, "learning_rate": 1.2989144155516284e-05, "loss": 0.0548, "step": 11230 }, { "epoch": 2.837667255743499, "grad_norm": 0.48935428261756897, "learning_rate": 1.2973996465539007e-05, "loss": 0.0551, "step": 11240 }, { "epoch": 2.8401918707397122, "grad_norm": 0.3654029071331024, "learning_rate": 1.2958848775561727e-05, "loss": 0.0526, "step": 11250 }, { "epoch": 2.8427164857359255, "grad_norm": 0.42648911476135254, "learning_rate": 1.2943701085584448e-05, "loss": 0.0666, "step": 11260 }, { "epoch": 2.8452411007321383, "grad_norm": 0.49149778485298157, "learning_rate": 1.292855339560717e-05, "loss": 0.0688, "step": 11270 }, { "epoch": 2.8477657157283516, "grad_norm": 0.5606464147567749, "learning_rate": 1.2913405705629892e-05, "loss": 0.0693, "step": 11280 }, { "epoch": 2.8502903307245644, "grad_norm": 0.7137352228164673, "learning_rate": 1.2898258015652613e-05, "loss": 0.0458, "step": 11290 }, { "epoch": 2.8528149457207777, "grad_norm": 0.5822766423225403, "learning_rate": 1.2883110325675335e-05, "loss": 0.065, "step": 11300 }, { "epoch": 2.855339560716991, "grad_norm": 0.5194171071052551, "learning_rate": 1.2867962635698056e-05, "loss": 0.0605, "step": 11310 }, { "epoch": 2.8578641757132037, "grad_norm": 0.35691502690315247, "learning_rate": 1.2852814945720778e-05, "loss": 0.0463, "step": 11320 }, { "epoch": 2.8603887907094165, "grad_norm": 0.4269840121269226, "learning_rate": 1.28376672557435e-05, "loss": 0.0554, "step": 11330 }, { "epoch": 2.86291340570563, "grad_norm": 0.45535925030708313, "learning_rate": 1.2822519565766221e-05, "loss": 0.0542, "step": 11340 }, { "epoch": 2.865438020701843, "grad_norm": 0.4456949532032013, "learning_rate": 1.2807371875788943e-05, "loss": 0.0555, "step": 11350 }, { "epoch": 2.8679626356980563, "grad_norm": 0.373177707195282, "learning_rate": 1.2792224185811663e-05, "loss": 0.0548, "step": 11360 }, { "epoch": 2.870487250694269, "grad_norm": 0.33291095495224, "learning_rate": 1.2777076495834386e-05, "loss": 0.0567, "step": 11370 }, { "epoch": 2.873011865690482, "grad_norm": 0.34440621733665466, "learning_rate": 1.2761928805857107e-05, "loss": 0.0664, "step": 11380 }, { "epoch": 2.875536480686695, "grad_norm": 0.7178020477294922, "learning_rate": 1.2746781115879827e-05, "loss": 0.0661, "step": 11390 }, { "epoch": 2.8780610956829085, "grad_norm": 0.6484971642494202, "learning_rate": 1.273163342590255e-05, "loss": 0.0661, "step": 11400 }, { "epoch": 2.8805857106791213, "grad_norm": 0.44024258852005005, "learning_rate": 1.2716485735925272e-05, "loss": 0.0525, "step": 11410 }, { "epoch": 2.8831103256753345, "grad_norm": 0.42825838923454285, "learning_rate": 1.2701338045947992e-05, "loss": 0.058, "step": 11420 }, { "epoch": 2.8856349406715474, "grad_norm": 0.2894943952560425, "learning_rate": 1.2686190355970715e-05, "loss": 0.0613, "step": 11430 }, { "epoch": 2.8881595556677606, "grad_norm": 0.5295472741127014, "learning_rate": 1.2671042665993437e-05, "loss": 0.0623, "step": 11440 }, { "epoch": 2.890684170663974, "grad_norm": 0.25687211751937866, "learning_rate": 1.2655894976016157e-05, "loss": 0.0495, "step": 11450 }, { "epoch": 2.8932087856601867, "grad_norm": 0.6008526682853699, "learning_rate": 1.264074728603888e-05, "loss": 0.0604, "step": 11460 }, { "epoch": 2.8957334006564, "grad_norm": 0.3679234981536865, "learning_rate": 1.2625599596061602e-05, "loss": 0.0489, "step": 11470 }, { "epoch": 2.8982580156526128, "grad_norm": 0.34721651673316956, "learning_rate": 1.2610451906084322e-05, "loss": 0.0619, "step": 11480 }, { "epoch": 2.900782630648826, "grad_norm": 0.2352529764175415, "learning_rate": 1.2595304216107045e-05, "loss": 0.0641, "step": 11490 }, { "epoch": 2.9033072456450393, "grad_norm": 0.621487557888031, "learning_rate": 1.2580156526129767e-05, "loss": 0.0636, "step": 11500 }, { "epoch": 2.905831860641252, "grad_norm": 0.5426394939422607, "learning_rate": 1.2565008836152487e-05, "loss": 0.0694, "step": 11510 }, { "epoch": 2.9083564756374654, "grad_norm": 0.5092729926109314, "learning_rate": 1.2549861146175208e-05, "loss": 0.0536, "step": 11520 }, { "epoch": 2.910881090633678, "grad_norm": 0.4754561185836792, "learning_rate": 1.2534713456197931e-05, "loss": 0.0605, "step": 11530 }, { "epoch": 2.9134057056298914, "grad_norm": 0.4031394124031067, "learning_rate": 1.2519565766220651e-05, "loss": 0.0765, "step": 11540 }, { "epoch": 2.9159303206261047, "grad_norm": 0.46325090527534485, "learning_rate": 1.2504418076243373e-05, "loss": 0.0691, "step": 11550 }, { "epoch": 2.9184549356223175, "grad_norm": 0.5663125514984131, "learning_rate": 1.2489270386266094e-05, "loss": 0.0651, "step": 11560 }, { "epoch": 2.9209795506185308, "grad_norm": 0.4145601987838745, "learning_rate": 1.2474122696288816e-05, "loss": 0.0546, "step": 11570 }, { "epoch": 2.9235041656147436, "grad_norm": 0.42057228088378906, "learning_rate": 1.2458975006311538e-05, "loss": 0.056, "step": 11580 }, { "epoch": 2.926028780610957, "grad_norm": 0.4702089726924896, "learning_rate": 1.244382731633426e-05, "loss": 0.0526, "step": 11590 }, { "epoch": 2.92855339560717, "grad_norm": 0.6012133359909058, "learning_rate": 1.2428679626356981e-05, "loss": 0.0588, "step": 11600 }, { "epoch": 2.931078010603383, "grad_norm": 0.7933056950569153, "learning_rate": 1.2413531936379702e-05, "loss": 0.0706, "step": 11610 }, { "epoch": 2.933602625599596, "grad_norm": 0.49585798382759094, "learning_rate": 1.2398384246402424e-05, "loss": 0.0561, "step": 11620 }, { "epoch": 2.936127240595809, "grad_norm": 0.460102379322052, "learning_rate": 1.2383236556425146e-05, "loss": 0.0726, "step": 11630 }, { "epoch": 2.9386518555920222, "grad_norm": 0.5320731401443481, "learning_rate": 1.2368088866447867e-05, "loss": 0.0572, "step": 11640 }, { "epoch": 2.9411764705882355, "grad_norm": 0.5159802436828613, "learning_rate": 1.2352941176470587e-05, "loss": 0.0555, "step": 11650 }, { "epoch": 2.9437010855844483, "grad_norm": 0.5879008173942566, "learning_rate": 1.233779348649331e-05, "loss": 0.0504, "step": 11660 }, { "epoch": 2.9462257005806616, "grad_norm": 0.8441261053085327, "learning_rate": 1.2322645796516032e-05, "loss": 0.0631, "step": 11670 }, { "epoch": 2.9487503155768744, "grad_norm": 0.4483579695224762, "learning_rate": 1.2307498106538752e-05, "loss": 0.0639, "step": 11680 }, { "epoch": 2.9512749305730877, "grad_norm": 0.41955748200416565, "learning_rate": 1.2292350416561475e-05, "loss": 0.0641, "step": 11690 }, { "epoch": 2.953799545569301, "grad_norm": 0.5988761782646179, "learning_rate": 1.2277202726584197e-05, "loss": 0.0631, "step": 11700 }, { "epoch": 2.9563241605655137, "grad_norm": 0.9389402270317078, "learning_rate": 1.2262055036606917e-05, "loss": 0.0659, "step": 11710 }, { "epoch": 2.958848775561727, "grad_norm": 0.5522141456604004, "learning_rate": 1.224690734662964e-05, "loss": 0.062, "step": 11720 }, { "epoch": 2.96137339055794, "grad_norm": 0.3770173490047455, "learning_rate": 1.2231759656652362e-05, "loss": 0.0573, "step": 11730 }, { "epoch": 2.963898005554153, "grad_norm": 0.4838135540485382, "learning_rate": 1.2216611966675082e-05, "loss": 0.0605, "step": 11740 }, { "epoch": 2.9664226205503663, "grad_norm": 0.4662052094936371, "learning_rate": 1.2201464276697805e-05, "loss": 0.0716, "step": 11750 }, { "epoch": 2.968947235546579, "grad_norm": 0.42140257358551025, "learning_rate": 1.2186316586720526e-05, "loss": 0.0711, "step": 11760 }, { "epoch": 2.971471850542792, "grad_norm": 0.41893675923347473, "learning_rate": 1.2171168896743246e-05, "loss": 0.0743, "step": 11770 }, { "epoch": 2.973996465539005, "grad_norm": 0.36561962962150574, "learning_rate": 1.215602120676597e-05, "loss": 0.0715, "step": 11780 }, { "epoch": 2.9765210805352185, "grad_norm": 0.338679701089859, "learning_rate": 1.2140873516788691e-05, "loss": 0.0669, "step": 11790 }, { "epoch": 2.9790456955314317, "grad_norm": 0.18762636184692383, "learning_rate": 1.2125725826811411e-05, "loss": 0.0695, "step": 11800 }, { "epoch": 2.9815703105276445, "grad_norm": 0.32193130254745483, "learning_rate": 1.2110578136834133e-05, "loss": 0.0641, "step": 11810 }, { "epoch": 2.9840949255238574, "grad_norm": 0.5772337317466736, "learning_rate": 1.2095430446856854e-05, "loss": 0.0551, "step": 11820 }, { "epoch": 2.9866195405200706, "grad_norm": 0.58876633644104, "learning_rate": 1.2080282756879576e-05, "loss": 0.0702, "step": 11830 }, { "epoch": 2.989144155516284, "grad_norm": 0.3960408568382263, "learning_rate": 1.2065135066902297e-05, "loss": 0.0608, "step": 11840 }, { "epoch": 2.9916687705124967, "grad_norm": 0.4262355864048004, "learning_rate": 1.2049987376925019e-05, "loss": 0.0552, "step": 11850 }, { "epoch": 2.99419338550871, "grad_norm": 0.5799320340156555, "learning_rate": 1.203483968694774e-05, "loss": 0.0492, "step": 11860 }, { "epoch": 2.9967180005049228, "grad_norm": 0.3781181573867798, "learning_rate": 1.2019691996970462e-05, "loss": 0.0693, "step": 11870 }, { "epoch": 2.999242615501136, "grad_norm": 0.5994321703910828, "learning_rate": 1.2004544306993184e-05, "loss": 0.0598, "step": 11880 }, { "epoch": 3.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.05238291248679161, "eval_runtime": 905.9676, "eval_samples_per_second": 227.67, "eval_steps_per_second": 3.558, "step": 11883 }, { "epoch": 3.0017672304973493, "grad_norm": 0.819588840007782, "learning_rate": 1.1989396617015905e-05, "loss": 0.0492, "step": 11890 }, { "epoch": 3.004291845493562, "grad_norm": 0.41840866208076477, "learning_rate": 1.1974248927038627e-05, "loss": 0.0439, "step": 11900 }, { "epoch": 3.0068164604897754, "grad_norm": 0.4053829312324524, "learning_rate": 1.1959101237061349e-05, "loss": 0.0586, "step": 11910 }, { "epoch": 3.009341075485988, "grad_norm": 0.4883507788181305, "learning_rate": 1.194395354708407e-05, "loss": 0.0559, "step": 11920 }, { "epoch": 3.0118656904822014, "grad_norm": 0.18890917301177979, "learning_rate": 1.1928805857106792e-05, "loss": 0.0629, "step": 11930 }, { "epoch": 3.0143903054784147, "grad_norm": 0.46093079447746277, "learning_rate": 1.1913658167129512e-05, "loss": 0.0534, "step": 11940 }, { "epoch": 3.0169149204746275, "grad_norm": 0.6295234560966492, "learning_rate": 1.1898510477152235e-05, "loss": 0.0696, "step": 11950 }, { "epoch": 3.0194395354708408, "grad_norm": 0.6419150233268738, "learning_rate": 1.1883362787174957e-05, "loss": 0.0664, "step": 11960 }, { "epoch": 3.0219641504670536, "grad_norm": 0.7133572101593018, "learning_rate": 1.1868215097197676e-05, "loss": 0.0761, "step": 11970 }, { "epoch": 3.024488765463267, "grad_norm": 0.7136935591697693, "learning_rate": 1.18530674072204e-05, "loss": 0.0688, "step": 11980 }, { "epoch": 3.02701338045948, "grad_norm": 0.33323633670806885, "learning_rate": 1.1837919717243121e-05, "loss": 0.0556, "step": 11990 }, { "epoch": 3.029537995455693, "grad_norm": 0.52031409740448, "learning_rate": 1.1822772027265841e-05, "loss": 0.0606, "step": 12000 }, { "epoch": 3.032062610451906, "grad_norm": 0.5969738364219666, "learning_rate": 1.1807624337288565e-05, "loss": 0.0625, "step": 12010 }, { "epoch": 3.034587225448119, "grad_norm": 0.38949260115623474, "learning_rate": 1.1792476647311286e-05, "loss": 0.0525, "step": 12020 }, { "epoch": 3.0371118404443322, "grad_norm": 0.3799903988838196, "learning_rate": 1.1777328957334006e-05, "loss": 0.0561, "step": 12030 }, { "epoch": 3.0396364554405455, "grad_norm": 0.44943466782569885, "learning_rate": 1.176218126735673e-05, "loss": 0.0513, "step": 12040 }, { "epoch": 3.0421610704367583, "grad_norm": 0.5962013602256775, "learning_rate": 1.1747033577379451e-05, "loss": 0.0551, "step": 12050 }, { "epoch": 3.0446856854329716, "grad_norm": 0.500968873500824, "learning_rate": 1.173188588740217e-05, "loss": 0.0569, "step": 12060 }, { "epoch": 3.0472103004291844, "grad_norm": 0.46429353952407837, "learning_rate": 1.1716738197424892e-05, "loss": 0.0655, "step": 12070 }, { "epoch": 3.0497349154253977, "grad_norm": 0.49735480546951294, "learning_rate": 1.1701590507447614e-05, "loss": 0.057, "step": 12080 }, { "epoch": 3.052259530421611, "grad_norm": 0.6153069138526917, "learning_rate": 1.1686442817470336e-05, "loss": 0.0585, "step": 12090 }, { "epoch": 3.0547841454178237, "grad_norm": 0.5006738305091858, "learning_rate": 1.1671295127493057e-05, "loss": 0.0589, "step": 12100 }, { "epoch": 3.057308760414037, "grad_norm": 0.4022675156593323, "learning_rate": 1.1656147437515779e-05, "loss": 0.0704, "step": 12110 }, { "epoch": 3.05983337541025, "grad_norm": 0.4161219000816345, "learning_rate": 1.16409997475385e-05, "loss": 0.0582, "step": 12120 }, { "epoch": 3.062357990406463, "grad_norm": 0.33736857771873474, "learning_rate": 1.1625852057561222e-05, "loss": 0.0578, "step": 12130 }, { "epoch": 3.0648826054026763, "grad_norm": 0.5995674729347229, "learning_rate": 1.1610704367583944e-05, "loss": 0.0556, "step": 12140 }, { "epoch": 3.067407220398889, "grad_norm": 0.5381020903587341, "learning_rate": 1.1595556677606665e-05, "loss": 0.0616, "step": 12150 }, { "epoch": 3.0699318353951024, "grad_norm": 0.23393908143043518, "learning_rate": 1.1580408987629387e-05, "loss": 0.0784, "step": 12160 }, { "epoch": 3.072456450391315, "grad_norm": 0.4823366105556488, "learning_rate": 1.1565261297652108e-05, "loss": 0.0698, "step": 12170 }, { "epoch": 3.0749810653875285, "grad_norm": 0.5678715109825134, "learning_rate": 1.155011360767483e-05, "loss": 0.0576, "step": 12180 }, { "epoch": 3.0775056803837413, "grad_norm": 0.42829954624176025, "learning_rate": 1.1534965917697552e-05, "loss": 0.0657, "step": 12190 }, { "epoch": 3.0800302953799545, "grad_norm": 0.31741514801979065, "learning_rate": 1.1519818227720273e-05, "loss": 0.0513, "step": 12200 }, { "epoch": 3.082554910376168, "grad_norm": 0.4757698178291321, "learning_rate": 1.1504670537742995e-05, "loss": 0.0458, "step": 12210 }, { "epoch": 3.0850795253723806, "grad_norm": 0.49062609672546387, "learning_rate": 1.1489522847765716e-05, "loss": 0.0626, "step": 12220 }, { "epoch": 3.087604140368594, "grad_norm": 0.40323606133461, "learning_rate": 1.1474375157788436e-05, "loss": 0.0466, "step": 12230 }, { "epoch": 3.0901287553648067, "grad_norm": 0.5706951022148132, "learning_rate": 1.145922746781116e-05, "loss": 0.0706, "step": 12240 }, { "epoch": 3.09265337036102, "grad_norm": 0.5950903296470642, "learning_rate": 1.1444079777833881e-05, "loss": 0.0506, "step": 12250 }, { "epoch": 3.095177985357233, "grad_norm": 0.21615612506866455, "learning_rate": 1.1428932087856601e-05, "loss": 0.0474, "step": 12260 }, { "epoch": 3.097702600353446, "grad_norm": 0.3570312261581421, "learning_rate": 1.1413784397879324e-05, "loss": 0.0654, "step": 12270 }, { "epoch": 3.1002272153496593, "grad_norm": 0.5387312769889832, "learning_rate": 1.1398636707902046e-05, "loss": 0.0669, "step": 12280 }, { "epoch": 3.102751830345872, "grad_norm": 0.36091628670692444, "learning_rate": 1.1383489017924766e-05, "loss": 0.0603, "step": 12290 }, { "epoch": 3.1052764453420854, "grad_norm": 0.49484896659851074, "learning_rate": 1.1368341327947489e-05, "loss": 0.0503, "step": 12300 }, { "epoch": 3.1078010603382986, "grad_norm": 0.6939244270324707, "learning_rate": 1.135319363797021e-05, "loss": 0.0578, "step": 12310 }, { "epoch": 3.1103256753345114, "grad_norm": 0.6148266196250916, "learning_rate": 1.133804594799293e-05, "loss": 0.055, "step": 12320 }, { "epoch": 3.1128502903307247, "grad_norm": 0.5472010374069214, "learning_rate": 1.1322898258015654e-05, "loss": 0.0527, "step": 12330 }, { "epoch": 3.1153749053269375, "grad_norm": 0.40635547041893005, "learning_rate": 1.1307750568038375e-05, "loss": 0.0734, "step": 12340 }, { "epoch": 3.1178995203231508, "grad_norm": 0.49441322684288025, "learning_rate": 1.1292602878061095e-05, "loss": 0.0686, "step": 12350 }, { "epoch": 3.1204241353193636, "grad_norm": 0.41052699089050293, "learning_rate": 1.1277455188083817e-05, "loss": 0.0636, "step": 12360 }, { "epoch": 3.122948750315577, "grad_norm": 0.5868958234786987, "learning_rate": 1.1262307498106539e-05, "loss": 0.0546, "step": 12370 }, { "epoch": 3.12547336531179, "grad_norm": 0.3751147389411926, "learning_rate": 1.124715980812926e-05, "loss": 0.064, "step": 12380 }, { "epoch": 3.127997980308003, "grad_norm": 0.5202364325523376, "learning_rate": 1.1232012118151982e-05, "loss": 0.0536, "step": 12390 }, { "epoch": 3.130522595304216, "grad_norm": 0.5483293533325195, "learning_rate": 1.1216864428174703e-05, "loss": 0.0656, "step": 12400 }, { "epoch": 3.133047210300429, "grad_norm": 0.6402902603149414, "learning_rate": 1.1201716738197425e-05, "loss": 0.0643, "step": 12410 }, { "epoch": 3.1355718252966422, "grad_norm": 0.609380841255188, "learning_rate": 1.1186569048220147e-05, "loss": 0.0763, "step": 12420 }, { "epoch": 3.1380964402928555, "grad_norm": 0.49389705061912537, "learning_rate": 1.1171421358242868e-05, "loss": 0.0727, "step": 12430 }, { "epoch": 3.1406210552890683, "grad_norm": 0.40313732624053955, "learning_rate": 1.115627366826559e-05, "loss": 0.0514, "step": 12440 }, { "epoch": 3.1431456702852816, "grad_norm": 0.5067439675331116, "learning_rate": 1.1141125978288311e-05, "loss": 0.0592, "step": 12450 }, { "epoch": 3.1456702852814944, "grad_norm": 0.39174848794937134, "learning_rate": 1.1125978288311033e-05, "loss": 0.0668, "step": 12460 }, { "epoch": 3.1481949002777077, "grad_norm": 0.6642642617225647, "learning_rate": 1.1110830598333755e-05, "loss": 0.0566, "step": 12470 }, { "epoch": 3.150719515273921, "grad_norm": 0.5557689070701599, "learning_rate": 1.1095682908356476e-05, "loss": 0.0706, "step": 12480 }, { "epoch": 3.1532441302701337, "grad_norm": 0.3642650246620178, "learning_rate": 1.1080535218379198e-05, "loss": 0.0457, "step": 12490 }, { "epoch": 3.155768745266347, "grad_norm": 0.43238964676856995, "learning_rate": 1.106538752840192e-05, "loss": 0.0629, "step": 12500 }, { "epoch": 3.15829336026256, "grad_norm": 0.4752700626850128, "learning_rate": 1.1050239838424641e-05, "loss": 0.0666, "step": 12510 }, { "epoch": 3.160817975258773, "grad_norm": 0.6083767414093018, "learning_rate": 1.103509214844736e-05, "loss": 0.0621, "step": 12520 }, { "epoch": 3.1633425902549863, "grad_norm": 0.5357356071472168, "learning_rate": 1.1019944458470084e-05, "loss": 0.0581, "step": 12530 }, { "epoch": 3.165867205251199, "grad_norm": 0.3257083594799042, "learning_rate": 1.1004796768492806e-05, "loss": 0.063, "step": 12540 }, { "epoch": 3.1683918202474124, "grad_norm": 0.357324481010437, "learning_rate": 1.0989649078515526e-05, "loss": 0.0612, "step": 12550 }, { "epoch": 3.170916435243625, "grad_norm": 0.4535214304924011, "learning_rate": 1.0974501388538249e-05, "loss": 0.064, "step": 12560 }, { "epoch": 3.1734410502398385, "grad_norm": 0.5646650791168213, "learning_rate": 1.095935369856097e-05, "loss": 0.0572, "step": 12570 }, { "epoch": 3.1759656652360517, "grad_norm": 0.8542249202728271, "learning_rate": 1.094420600858369e-05, "loss": 0.0692, "step": 12580 }, { "epoch": 3.1784902802322645, "grad_norm": 0.5563963651657104, "learning_rate": 1.0929058318606414e-05, "loss": 0.0705, "step": 12590 }, { "epoch": 3.181014895228478, "grad_norm": 0.5933377742767334, "learning_rate": 1.0913910628629135e-05, "loss": 0.0489, "step": 12600 }, { "epoch": 3.1835395102246906, "grad_norm": 0.6337215900421143, "learning_rate": 1.0898762938651855e-05, "loss": 0.0757, "step": 12610 }, { "epoch": 3.186064125220904, "grad_norm": 0.5588740110397339, "learning_rate": 1.0883615248674578e-05, "loss": 0.0617, "step": 12620 }, { "epoch": 3.1885887402171167, "grad_norm": 0.5218625068664551, "learning_rate": 1.0868467558697298e-05, "loss": 0.0604, "step": 12630 }, { "epoch": 3.19111335521333, "grad_norm": 0.3570559620857239, "learning_rate": 1.085331986872002e-05, "loss": 0.0571, "step": 12640 }, { "epoch": 3.193637970209543, "grad_norm": 0.48153752088546753, "learning_rate": 1.0838172178742742e-05, "loss": 0.0601, "step": 12650 }, { "epoch": 3.196162585205756, "grad_norm": 0.9248821139335632, "learning_rate": 1.0823024488765463e-05, "loss": 0.0651, "step": 12660 }, { "epoch": 3.1986872002019693, "grad_norm": 0.5911086797714233, "learning_rate": 1.0807876798788185e-05, "loss": 0.0566, "step": 12670 }, { "epoch": 3.201211815198182, "grad_norm": 0.5010080337524414, "learning_rate": 1.0792729108810906e-05, "loss": 0.0568, "step": 12680 }, { "epoch": 3.2037364301943954, "grad_norm": 0.3672632873058319, "learning_rate": 1.0777581418833628e-05, "loss": 0.0699, "step": 12690 }, { "epoch": 3.2062610451906086, "grad_norm": 0.38973551988601685, "learning_rate": 1.076243372885635e-05, "loss": 0.0593, "step": 12700 }, { "epoch": 3.2087856601868214, "grad_norm": 0.32008224725723267, "learning_rate": 1.0747286038879071e-05, "loss": 0.0752, "step": 12710 }, { "epoch": 3.2113102751830347, "grad_norm": 0.2462305873632431, "learning_rate": 1.0732138348901793e-05, "loss": 0.0585, "step": 12720 }, { "epoch": 3.2138348901792475, "grad_norm": 0.34616610407829285, "learning_rate": 1.0716990658924514e-05, "loss": 0.054, "step": 12730 }, { "epoch": 3.2163595051754608, "grad_norm": 0.5276474952697754, "learning_rate": 1.0701842968947236e-05, "loss": 0.0641, "step": 12740 }, { "epoch": 3.218884120171674, "grad_norm": 0.36549025774002075, "learning_rate": 1.0686695278969957e-05, "loss": 0.0632, "step": 12750 }, { "epoch": 3.221408735167887, "grad_norm": 0.5248700380325317, "learning_rate": 1.0671547588992679e-05, "loss": 0.0706, "step": 12760 }, { "epoch": 3.2239333501641, "grad_norm": 0.3740836977958679, "learning_rate": 1.06563998990154e-05, "loss": 0.0594, "step": 12770 }, { "epoch": 3.226457965160313, "grad_norm": 0.31425297260284424, "learning_rate": 1.0641252209038122e-05, "loss": 0.0608, "step": 12780 }, { "epoch": 3.228982580156526, "grad_norm": 0.5715880393981934, "learning_rate": 1.0626104519060844e-05, "loss": 0.0604, "step": 12790 }, { "epoch": 3.231507195152739, "grad_norm": 0.3549630343914032, "learning_rate": 1.0610956829083565e-05, "loss": 0.0471, "step": 12800 }, { "epoch": 3.2340318101489522, "grad_norm": 0.6027510166168213, "learning_rate": 1.0595809139106285e-05, "loss": 0.0672, "step": 12810 }, { "epoch": 3.2365564251451655, "grad_norm": 0.6292756795883179, "learning_rate": 1.0580661449129009e-05, "loss": 0.0522, "step": 12820 }, { "epoch": 3.2390810401413783, "grad_norm": 0.4945664405822754, "learning_rate": 1.056551375915173e-05, "loss": 0.0606, "step": 12830 }, { "epoch": 3.2416056551375916, "grad_norm": 0.3837689757347107, "learning_rate": 1.055036606917445e-05, "loss": 0.058, "step": 12840 }, { "epoch": 3.2441302701338044, "grad_norm": 0.41095155477523804, "learning_rate": 1.0535218379197173e-05, "loss": 0.0632, "step": 12850 }, { "epoch": 3.2466548851300177, "grad_norm": 0.48984506726264954, "learning_rate": 1.0520070689219895e-05, "loss": 0.0628, "step": 12860 }, { "epoch": 3.249179500126231, "grad_norm": 0.6692824959754944, "learning_rate": 1.0504922999242615e-05, "loss": 0.0586, "step": 12870 }, { "epoch": 3.2517041151224437, "grad_norm": 0.5213368535041809, "learning_rate": 1.0489775309265338e-05, "loss": 0.0497, "step": 12880 }, { "epoch": 3.254228730118657, "grad_norm": 0.696983277797699, "learning_rate": 1.0474627619288058e-05, "loss": 0.0635, "step": 12890 }, { "epoch": 3.25675334511487, "grad_norm": 0.29254981875419617, "learning_rate": 1.045947992931078e-05, "loss": 0.0527, "step": 12900 }, { "epoch": 3.259277960111083, "grad_norm": 0.3891927897930145, "learning_rate": 1.0444332239333503e-05, "loss": 0.067, "step": 12910 }, { "epoch": 3.2618025751072963, "grad_norm": 0.49604347348213196, "learning_rate": 1.0429184549356223e-05, "loss": 0.0578, "step": 12920 }, { "epoch": 3.264327190103509, "grad_norm": 0.5001896619796753, "learning_rate": 1.0414036859378944e-05, "loss": 0.0575, "step": 12930 }, { "epoch": 3.2668518050997224, "grad_norm": 0.24430322647094727, "learning_rate": 1.0398889169401666e-05, "loss": 0.0682, "step": 12940 }, { "epoch": 3.269376420095935, "grad_norm": 0.4671231508255005, "learning_rate": 1.0383741479424388e-05, "loss": 0.0628, "step": 12950 }, { "epoch": 3.2719010350921485, "grad_norm": 0.32760515809059143, "learning_rate": 1.036859378944711e-05, "loss": 0.0594, "step": 12960 }, { "epoch": 3.2744256500883617, "grad_norm": 0.46056973934173584, "learning_rate": 1.035344609946983e-05, "loss": 0.0562, "step": 12970 }, { "epoch": 3.2769502650845745, "grad_norm": 0.3852224349975586, "learning_rate": 1.0338298409492552e-05, "loss": 0.064, "step": 12980 }, { "epoch": 3.279474880080788, "grad_norm": 0.6717817187309265, "learning_rate": 1.0323150719515274e-05, "loss": 0.0585, "step": 12990 }, { "epoch": 3.2819994950770006, "grad_norm": 0.6648727655410767, "learning_rate": 1.0308003029537996e-05, "loss": 0.0667, "step": 13000 }, { "epoch": 3.284524110073214, "grad_norm": 0.4613960385322571, "learning_rate": 1.0292855339560717e-05, "loss": 0.0537, "step": 13010 }, { "epoch": 3.287048725069427, "grad_norm": 0.3950752913951874, "learning_rate": 1.0277707649583439e-05, "loss": 0.0574, "step": 13020 }, { "epoch": 3.28957334006564, "grad_norm": 0.4458863437175751, "learning_rate": 1.026255995960616e-05, "loss": 0.0576, "step": 13030 }, { "epoch": 3.292097955061853, "grad_norm": 0.5602406859397888, "learning_rate": 1.0247412269628882e-05, "loss": 0.0635, "step": 13040 }, { "epoch": 3.294622570058066, "grad_norm": 0.49716469645500183, "learning_rate": 1.0232264579651604e-05, "loss": 0.0537, "step": 13050 }, { "epoch": 3.2971471850542793, "grad_norm": 0.9081646800041199, "learning_rate": 1.0217116889674325e-05, "loss": 0.0572, "step": 13060 }, { "epoch": 3.2996718000504925, "grad_norm": 0.36501345038414, "learning_rate": 1.0201969199697047e-05, "loss": 0.063, "step": 13070 }, { "epoch": 3.3021964150467054, "grad_norm": 0.419605553150177, "learning_rate": 1.0186821509719768e-05, "loss": 0.0564, "step": 13080 }, { "epoch": 3.3047210300429186, "grad_norm": 0.4859483242034912, "learning_rate": 1.017167381974249e-05, "loss": 0.0581, "step": 13090 }, { "epoch": 3.3072456450391314, "grad_norm": 0.6135731339454651, "learning_rate": 1.015652612976521e-05, "loss": 0.0562, "step": 13100 }, { "epoch": 3.3097702600353447, "grad_norm": 0.5303016304969788, "learning_rate": 1.0141378439787933e-05, "loss": 0.0573, "step": 13110 }, { "epoch": 3.3122948750315575, "grad_norm": 0.34575608372688293, "learning_rate": 1.0126230749810655e-05, "loss": 0.0652, "step": 13120 }, { "epoch": 3.3148194900277708, "grad_norm": 0.3637102544307709, "learning_rate": 1.0111083059833375e-05, "loss": 0.0558, "step": 13130 }, { "epoch": 3.317344105023984, "grad_norm": 0.3905535936355591, "learning_rate": 1.0095935369856098e-05, "loss": 0.0553, "step": 13140 }, { "epoch": 3.319868720020197, "grad_norm": 0.3310402035713196, "learning_rate": 1.008078767987882e-05, "loss": 0.0748, "step": 13150 }, { "epoch": 3.32239333501641, "grad_norm": 0.6091330647468567, "learning_rate": 1.006563998990154e-05, "loss": 0.0561, "step": 13160 }, { "epoch": 3.324917950012623, "grad_norm": 0.4984453320503235, "learning_rate": 1.0050492299924263e-05, "loss": 0.0666, "step": 13170 }, { "epoch": 3.327442565008836, "grad_norm": 0.5523614883422852, "learning_rate": 1.0035344609946983e-05, "loss": 0.0521, "step": 13180 }, { "epoch": 3.329967180005049, "grad_norm": 0.28940534591674805, "learning_rate": 1.0020196919969704e-05, "loss": 0.0661, "step": 13190 }, { "epoch": 3.3324917950012622, "grad_norm": 0.4787939488887787, "learning_rate": 1.0005049229992428e-05, "loss": 0.0585, "step": 13200 }, { "epoch": 3.3350164099974755, "grad_norm": 0.8407096266746521, "learning_rate": 9.989901540015147e-06, "loss": 0.0562, "step": 13210 }, { "epoch": 3.3375410249936883, "grad_norm": 0.4084523618221283, "learning_rate": 9.974753850037869e-06, "loss": 0.0528, "step": 13220 }, { "epoch": 3.3400656399899016, "grad_norm": 0.5200213193893433, "learning_rate": 9.95960616006059e-06, "loss": 0.0648, "step": 13230 }, { "epoch": 3.3425902549861144, "grad_norm": 0.34778347611427307, "learning_rate": 9.944458470083312e-06, "loss": 0.0495, "step": 13240 }, { "epoch": 3.3451148699823277, "grad_norm": 0.5605157017707825, "learning_rate": 9.929310780106034e-06, "loss": 0.0706, "step": 13250 }, { "epoch": 3.347639484978541, "grad_norm": 0.5080039501190186, "learning_rate": 9.914163090128755e-06, "loss": 0.0583, "step": 13260 }, { "epoch": 3.3501640999747537, "grad_norm": 0.35784703493118286, "learning_rate": 9.899015400151477e-06, "loss": 0.0599, "step": 13270 }, { "epoch": 3.352688714970967, "grad_norm": 0.44402334094047546, "learning_rate": 9.883867710174199e-06, "loss": 0.0429, "step": 13280 }, { "epoch": 3.35521332996718, "grad_norm": 0.26014208793640137, "learning_rate": 9.86872002019692e-06, "loss": 0.0654, "step": 13290 }, { "epoch": 3.357737944963393, "grad_norm": 0.5210455060005188, "learning_rate": 9.853572330219642e-06, "loss": 0.0524, "step": 13300 }, { "epoch": 3.3602625599596063, "grad_norm": 0.5096918344497681, "learning_rate": 9.838424640242363e-06, "loss": 0.0627, "step": 13310 }, { "epoch": 3.362787174955819, "grad_norm": 0.46019914746284485, "learning_rate": 9.823276950265085e-06, "loss": 0.0534, "step": 13320 }, { "epoch": 3.3653117899520324, "grad_norm": 0.6467626690864563, "learning_rate": 9.808129260287807e-06, "loss": 0.0728, "step": 13330 }, { "epoch": 3.367836404948245, "grad_norm": 0.6337939500808716, "learning_rate": 9.792981570310528e-06, "loss": 0.0601, "step": 13340 }, { "epoch": 3.3703610199444585, "grad_norm": 0.36074209213256836, "learning_rate": 9.77783388033325e-06, "loss": 0.0602, "step": 13350 }, { "epoch": 3.3728856349406717, "grad_norm": 0.5853038430213928, "learning_rate": 9.762686190355971e-06, "loss": 0.0488, "step": 13360 }, { "epoch": 3.3754102499368845, "grad_norm": 0.46384280920028687, "learning_rate": 9.747538500378693e-06, "loss": 0.0609, "step": 13370 }, { "epoch": 3.377934864933098, "grad_norm": 0.38020265102386475, "learning_rate": 9.732390810401415e-06, "loss": 0.0515, "step": 13380 }, { "epoch": 3.3804594799293106, "grad_norm": 0.5325652956962585, "learning_rate": 9.717243120424134e-06, "loss": 0.0609, "step": 13390 }, { "epoch": 3.382984094925524, "grad_norm": 0.3673112392425537, "learning_rate": 9.702095430446858e-06, "loss": 0.0729, "step": 13400 }, { "epoch": 3.385508709921737, "grad_norm": 0.5106756091117859, "learning_rate": 9.68694774046958e-06, "loss": 0.0545, "step": 13410 }, { "epoch": 3.38803332491795, "grad_norm": 0.39151692390441895, "learning_rate": 9.6718000504923e-06, "loss": 0.0588, "step": 13420 }, { "epoch": 3.390557939914163, "grad_norm": 0.3701721131801605, "learning_rate": 9.656652360515022e-06, "loss": 0.0576, "step": 13430 }, { "epoch": 3.393082554910376, "grad_norm": 0.5336679816246033, "learning_rate": 9.641504670537742e-06, "loss": 0.0597, "step": 13440 }, { "epoch": 3.3956071699065893, "grad_norm": 0.4533694088459015, "learning_rate": 9.626356980560464e-06, "loss": 0.0708, "step": 13450 }, { "epoch": 3.3981317849028025, "grad_norm": 0.5501024127006531, "learning_rate": 9.611209290583187e-06, "loss": 0.059, "step": 13460 }, { "epoch": 3.4006563998990154, "grad_norm": 0.38329634070396423, "learning_rate": 9.596061600605907e-06, "loss": 0.0563, "step": 13470 }, { "epoch": 3.4031810148952286, "grad_norm": 0.4871230721473694, "learning_rate": 9.580913910628629e-06, "loss": 0.0657, "step": 13480 }, { "epoch": 3.4057056298914414, "grad_norm": 0.6076348423957825, "learning_rate": 9.565766220651352e-06, "loss": 0.0664, "step": 13490 }, { "epoch": 3.4082302448876547, "grad_norm": 0.44044622778892517, "learning_rate": 9.550618530674072e-06, "loss": 0.0657, "step": 13500 }, { "epoch": 3.410754859883868, "grad_norm": 0.7991306781768799, "learning_rate": 9.535470840696794e-06, "loss": 0.0455, "step": 13510 }, { "epoch": 3.4132794748800808, "grad_norm": 0.41269394755363464, "learning_rate": 9.520323150719515e-06, "loss": 0.0586, "step": 13520 }, { "epoch": 3.415804089876294, "grad_norm": 0.4661392867565155, "learning_rate": 9.505175460742237e-06, "loss": 0.066, "step": 13530 }, { "epoch": 3.418328704872507, "grad_norm": 0.7089284062385559, "learning_rate": 9.490027770764958e-06, "loss": 0.0605, "step": 13540 }, { "epoch": 3.42085331986872, "grad_norm": 0.3510993421077728, "learning_rate": 9.47488008078768e-06, "loss": 0.0541, "step": 13550 }, { "epoch": 3.423377934864933, "grad_norm": 0.3425975739955902, "learning_rate": 9.459732390810402e-06, "loss": 0.0623, "step": 13560 }, { "epoch": 3.425902549861146, "grad_norm": 0.3647609353065491, "learning_rate": 9.444584700833123e-06, "loss": 0.0571, "step": 13570 }, { "epoch": 3.4284271648573594, "grad_norm": 0.46789881587028503, "learning_rate": 9.429437010855845e-06, "loss": 0.0537, "step": 13580 }, { "epoch": 3.4309517798535722, "grad_norm": 0.6661184430122375, "learning_rate": 9.414289320878566e-06, "loss": 0.0544, "step": 13590 }, { "epoch": 3.4334763948497855, "grad_norm": 0.6467635035514832, "learning_rate": 9.399141630901288e-06, "loss": 0.0633, "step": 13600 }, { "epoch": 3.4360010098459983, "grad_norm": 0.5681377649307251, "learning_rate": 9.38399394092401e-06, "loss": 0.0613, "step": 13610 }, { "epoch": 3.4385256248422116, "grad_norm": 0.49820029735565186, "learning_rate": 9.368846250946731e-06, "loss": 0.0477, "step": 13620 }, { "epoch": 3.4410502398384244, "grad_norm": 0.5915389060974121, "learning_rate": 9.353698560969453e-06, "loss": 0.0599, "step": 13630 }, { "epoch": 3.4435748548346377, "grad_norm": 0.540043830871582, "learning_rate": 9.338550870992174e-06, "loss": 0.0658, "step": 13640 }, { "epoch": 3.446099469830851, "grad_norm": 0.3997081220149994, "learning_rate": 9.323403181014896e-06, "loss": 0.054, "step": 13650 }, { "epoch": 3.4486240848270637, "grad_norm": 0.45656365156173706, "learning_rate": 9.308255491037617e-06, "loss": 0.058, "step": 13660 }, { "epoch": 3.451148699823277, "grad_norm": 0.417140394449234, "learning_rate": 9.293107801060339e-06, "loss": 0.0652, "step": 13670 }, { "epoch": 3.45367331481949, "grad_norm": 0.5246079564094543, "learning_rate": 9.277960111083059e-06, "loss": 0.0512, "step": 13680 }, { "epoch": 3.456197929815703, "grad_norm": 0.3504631519317627, "learning_rate": 9.262812421105782e-06, "loss": 0.0721, "step": 13690 }, { "epoch": 3.4587225448119163, "grad_norm": 0.29381871223449707, "learning_rate": 9.247664731128504e-06, "loss": 0.0553, "step": 13700 }, { "epoch": 3.461247159808129, "grad_norm": 0.41762417554855347, "learning_rate": 9.232517041151224e-06, "loss": 0.0734, "step": 13710 }, { "epoch": 3.4637717748043424, "grad_norm": 0.6137571930885315, "learning_rate": 9.217369351173947e-06, "loss": 0.0588, "step": 13720 }, { "epoch": 3.466296389800555, "grad_norm": 0.6194366812705994, "learning_rate": 9.202221661196667e-06, "loss": 0.0568, "step": 13730 }, { "epoch": 3.4688210047967685, "grad_norm": 0.7250993251800537, "learning_rate": 9.187073971219389e-06, "loss": 0.056, "step": 13740 }, { "epoch": 3.4713456197929817, "grad_norm": 0.5507886409759521, "learning_rate": 9.171926281242112e-06, "loss": 0.0707, "step": 13750 }, { "epoch": 3.4738702347891945, "grad_norm": 0.39389169216156006, "learning_rate": 9.156778591264832e-06, "loss": 0.064, "step": 13760 }, { "epoch": 3.476394849785408, "grad_norm": 0.47429707646369934, "learning_rate": 9.141630901287553e-06, "loss": 0.0618, "step": 13770 }, { "epoch": 3.4789194647816206, "grad_norm": 0.3083738088607788, "learning_rate": 9.126483211310277e-06, "loss": 0.0538, "step": 13780 }, { "epoch": 3.481444079777834, "grad_norm": 0.6611940860748291, "learning_rate": 9.111335521332997e-06, "loss": 0.0754, "step": 13790 }, { "epoch": 3.483968694774047, "grad_norm": 0.3021230697631836, "learning_rate": 9.096187831355718e-06, "loss": 0.0522, "step": 13800 }, { "epoch": 3.48649330977026, "grad_norm": 0.34516650438308716, "learning_rate": 9.08104014137844e-06, "loss": 0.0532, "step": 13810 }, { "epoch": 3.489017924766473, "grad_norm": 0.2838568687438965, "learning_rate": 9.065892451401161e-06, "loss": 0.0555, "step": 13820 }, { "epoch": 3.491542539762686, "grad_norm": 0.4129784405231476, "learning_rate": 9.050744761423883e-06, "loss": 0.0552, "step": 13830 }, { "epoch": 3.4940671547588993, "grad_norm": 0.539181113243103, "learning_rate": 9.035597071446604e-06, "loss": 0.0594, "step": 13840 }, { "epoch": 3.4965917697551125, "grad_norm": 0.2918561100959778, "learning_rate": 9.020449381469326e-06, "loss": 0.0496, "step": 13850 }, { "epoch": 3.4991163847513254, "grad_norm": 0.39629873633384705, "learning_rate": 9.005301691492048e-06, "loss": 0.0451, "step": 13860 }, { "epoch": 3.5016409997475386, "grad_norm": 0.842644214630127, "learning_rate": 8.99015400151477e-06, "loss": 0.0595, "step": 13870 }, { "epoch": 3.5041656147437514, "grad_norm": 0.3173305094242096, "learning_rate": 8.975006311537491e-06, "loss": 0.0517, "step": 13880 }, { "epoch": 3.5066902297399647, "grad_norm": 0.19913332164287567, "learning_rate": 8.959858621560212e-06, "loss": 0.0617, "step": 13890 }, { "epoch": 3.509214844736178, "grad_norm": 0.42569276690483093, "learning_rate": 8.944710931582934e-06, "loss": 0.0543, "step": 13900 }, { "epoch": 3.5117394597323908, "grad_norm": 0.3855074346065521, "learning_rate": 8.929563241605656e-06, "loss": 0.0479, "step": 13910 }, { "epoch": 3.514264074728604, "grad_norm": 0.7564893960952759, "learning_rate": 8.914415551628377e-06, "loss": 0.07, "step": 13920 }, { "epoch": 3.516788689724817, "grad_norm": 0.4746794104576111, "learning_rate": 8.899267861651099e-06, "loss": 0.0615, "step": 13930 }, { "epoch": 3.51931330472103, "grad_norm": 0.5218245387077332, "learning_rate": 8.88412017167382e-06, "loss": 0.0603, "step": 13940 }, { "epoch": 3.5218379197172434, "grad_norm": 0.3240072429180145, "learning_rate": 8.868972481696542e-06, "loss": 0.0728, "step": 13950 }, { "epoch": 3.524362534713456, "grad_norm": 0.4840814471244812, "learning_rate": 8.853824791719264e-06, "loss": 0.0586, "step": 13960 }, { "epoch": 3.5268871497096694, "grad_norm": 0.4556601643562317, "learning_rate": 8.838677101741984e-06, "loss": 0.0612, "step": 13970 }, { "epoch": 3.5294117647058822, "grad_norm": 0.6337325572967529, "learning_rate": 8.823529411764707e-06, "loss": 0.0695, "step": 13980 }, { "epoch": 3.5319363797020955, "grad_norm": 0.5485546588897705, "learning_rate": 8.808381721787427e-06, "loss": 0.0599, "step": 13990 }, { "epoch": 3.5344609946983088, "grad_norm": 0.33449289202690125, "learning_rate": 8.793234031810148e-06, "loss": 0.058, "step": 14000 }, { "epoch": 3.5369856096945216, "grad_norm": 0.40033623576164246, "learning_rate": 8.778086341832872e-06, "loss": 0.0595, "step": 14010 }, { "epoch": 3.5395102246907344, "grad_norm": 0.31661325693130493, "learning_rate": 8.762938651855591e-06, "loss": 0.0552, "step": 14020 }, { "epoch": 3.5420348396869477, "grad_norm": 0.32345449924468994, "learning_rate": 8.747790961878313e-06, "loss": 0.057, "step": 14030 }, { "epoch": 3.544559454683161, "grad_norm": 0.455110639333725, "learning_rate": 8.732643271901036e-06, "loss": 0.0639, "step": 14040 }, { "epoch": 3.5470840696793737, "grad_norm": 0.47227227687835693, "learning_rate": 8.717495581923756e-06, "loss": 0.0646, "step": 14050 }, { "epoch": 3.549608684675587, "grad_norm": 0.7046650648117065, "learning_rate": 8.702347891946478e-06, "loss": 0.0452, "step": 14060 }, { "epoch": 3.5521332996718, "grad_norm": 0.28317102789878845, "learning_rate": 8.687200201969201e-06, "loss": 0.0657, "step": 14070 }, { "epoch": 3.554657914668013, "grad_norm": 0.5077089071273804, "learning_rate": 8.672052511991921e-06, "loss": 0.048, "step": 14080 }, { "epoch": 3.5571825296642263, "grad_norm": 0.657687783241272, "learning_rate": 8.656904822014643e-06, "loss": 0.0745, "step": 14090 }, { "epoch": 3.559707144660439, "grad_norm": 0.4756108820438385, "learning_rate": 8.641757132037364e-06, "loss": 0.0565, "step": 14100 }, { "epoch": 3.5622317596566524, "grad_norm": 0.36463871598243713, "learning_rate": 8.626609442060086e-06, "loss": 0.0632, "step": 14110 }, { "epoch": 3.564756374652865, "grad_norm": 0.514712393283844, "learning_rate": 8.611461752082807e-06, "loss": 0.0609, "step": 14120 }, { "epoch": 3.5672809896490785, "grad_norm": 0.4450944662094116, "learning_rate": 8.596314062105529e-06, "loss": 0.0515, "step": 14130 }, { "epoch": 3.5698056046452917, "grad_norm": 0.4317336976528168, "learning_rate": 8.58116637212825e-06, "loss": 0.0641, "step": 14140 }, { "epoch": 3.5723302196415045, "grad_norm": 0.46943795680999756, "learning_rate": 8.566018682150972e-06, "loss": 0.0495, "step": 14150 }, { "epoch": 3.574854834637718, "grad_norm": 0.42267486453056335, "learning_rate": 8.550870992173694e-06, "loss": 0.0525, "step": 14160 }, { "epoch": 3.5773794496339306, "grad_norm": 0.4673076272010803, "learning_rate": 8.535723302196415e-06, "loss": 0.0621, "step": 14170 }, { "epoch": 3.579904064630144, "grad_norm": 0.549739420413971, "learning_rate": 8.520575612219137e-06, "loss": 0.0711, "step": 14180 }, { "epoch": 3.582428679626357, "grad_norm": 0.5667104125022888, "learning_rate": 8.505427922241859e-06, "loss": 0.0607, "step": 14190 }, { "epoch": 3.58495329462257, "grad_norm": 0.5759524703025818, "learning_rate": 8.49028023226458e-06, "loss": 0.054, "step": 14200 }, { "epoch": 3.587477909618783, "grad_norm": 0.5231726765632629, "learning_rate": 8.475132542287302e-06, "loss": 0.0651, "step": 14210 }, { "epoch": 3.590002524614996, "grad_norm": 0.5827665328979492, "learning_rate": 8.459984852310023e-06, "loss": 0.069, "step": 14220 }, { "epoch": 3.5925271396112093, "grad_norm": 0.35911238193511963, "learning_rate": 8.444837162332745e-06, "loss": 0.0602, "step": 14230 }, { "epoch": 3.5950517546074225, "grad_norm": 0.5111239552497864, "learning_rate": 8.429689472355467e-06, "loss": 0.0549, "step": 14240 }, { "epoch": 3.5975763696036354, "grad_norm": 0.39395228028297424, "learning_rate": 8.414541782378186e-06, "loss": 0.0704, "step": 14250 }, { "epoch": 3.6001009845998486, "grad_norm": 0.30438244342803955, "learning_rate": 8.399394092400908e-06, "loss": 0.0545, "step": 14260 }, { "epoch": 3.6026255995960614, "grad_norm": 0.6330828666687012, "learning_rate": 8.384246402423631e-06, "loss": 0.0691, "step": 14270 }, { "epoch": 3.6051502145922747, "grad_norm": 0.6127652525901794, "learning_rate": 8.369098712446351e-06, "loss": 0.061, "step": 14280 }, { "epoch": 3.607674829588488, "grad_norm": 0.5500686764717102, "learning_rate": 8.353951022469073e-06, "loss": 0.0565, "step": 14290 }, { "epoch": 3.6101994445847008, "grad_norm": 0.3351685404777527, "learning_rate": 8.338803332491796e-06, "loss": 0.0463, "step": 14300 }, { "epoch": 3.612724059580914, "grad_norm": 0.6388446092605591, "learning_rate": 8.323655642514516e-06, "loss": 0.0567, "step": 14310 }, { "epoch": 3.615248674577127, "grad_norm": 0.311712384223938, "learning_rate": 8.308507952537238e-06, "loss": 0.0499, "step": 14320 }, { "epoch": 3.61777328957334, "grad_norm": 0.5051207542419434, "learning_rate": 8.293360262559961e-06, "loss": 0.0612, "step": 14330 }, { "epoch": 3.6202979045695534, "grad_norm": 0.6908370852470398, "learning_rate": 8.27821257258268e-06, "loss": 0.0677, "step": 14340 }, { "epoch": 3.622822519565766, "grad_norm": 0.5216169953346252, "learning_rate": 8.263064882605402e-06, "loss": 0.0646, "step": 14350 }, { "epoch": 3.6253471345619794, "grad_norm": 0.35106194019317627, "learning_rate": 8.247917192628126e-06, "loss": 0.0689, "step": 14360 }, { "epoch": 3.6278717495581922, "grad_norm": 0.5394479036331177, "learning_rate": 8.232769502650846e-06, "loss": 0.0534, "step": 14370 }, { "epoch": 3.6303963645544055, "grad_norm": 0.5065087676048279, "learning_rate": 8.217621812673567e-06, "loss": 0.0588, "step": 14380 }, { "epoch": 3.6329209795506188, "grad_norm": 0.6077365875244141, "learning_rate": 8.202474122696289e-06, "loss": 0.062, "step": 14390 }, { "epoch": 3.6354455945468316, "grad_norm": 0.5733447074890137, "learning_rate": 8.18732643271901e-06, "loss": 0.0664, "step": 14400 }, { "epoch": 3.637970209543045, "grad_norm": 0.3470951020717621, "learning_rate": 8.172178742741732e-06, "loss": 0.0619, "step": 14410 }, { "epoch": 3.6404948245392577, "grad_norm": 0.7638422250747681, "learning_rate": 8.157031052764454e-06, "loss": 0.0622, "step": 14420 }, { "epoch": 3.643019439535471, "grad_norm": 0.5586839914321899, "learning_rate": 8.141883362787175e-06, "loss": 0.0616, "step": 14430 }, { "epoch": 3.645544054531684, "grad_norm": 0.28789180517196655, "learning_rate": 8.126735672809897e-06, "loss": 0.0548, "step": 14440 }, { "epoch": 3.648068669527897, "grad_norm": 0.351252943277359, "learning_rate": 8.111587982832618e-06, "loss": 0.0554, "step": 14450 }, { "epoch": 3.65059328452411, "grad_norm": 0.5091794729232788, "learning_rate": 8.09644029285534e-06, "loss": 0.0624, "step": 14460 }, { "epoch": 3.653117899520323, "grad_norm": 0.5550070405006409, "learning_rate": 8.081292602878062e-06, "loss": 0.0479, "step": 14470 }, { "epoch": 3.6556425145165363, "grad_norm": 0.5677917003631592, "learning_rate": 8.066144912900783e-06, "loss": 0.0508, "step": 14480 }, { "epoch": 3.6581671295127496, "grad_norm": 0.38157495856285095, "learning_rate": 8.050997222923505e-06, "loss": 0.06, "step": 14490 }, { "epoch": 3.6606917445089624, "grad_norm": 0.4893856942653656, "learning_rate": 8.035849532946226e-06, "loss": 0.0583, "step": 14500 }, { "epoch": 3.663216359505175, "grad_norm": 0.5423852801322937, "learning_rate": 8.020701842968948e-06, "loss": 0.0569, "step": 14510 }, { "epoch": 3.6657409745013885, "grad_norm": 0.2972621023654938, "learning_rate": 8.00555415299167e-06, "loss": 0.0471, "step": 14520 }, { "epoch": 3.6682655894976017, "grad_norm": 0.324066162109375, "learning_rate": 7.990406463014391e-06, "loss": 0.0552, "step": 14530 }, { "epoch": 3.6707902044938145, "grad_norm": 0.5666695237159729, "learning_rate": 7.975258773037111e-06, "loss": 0.0493, "step": 14540 }, { "epoch": 3.673314819490028, "grad_norm": 0.4640502333641052, "learning_rate": 7.960111083059833e-06, "loss": 0.0564, "step": 14550 }, { "epoch": 3.6758394344862406, "grad_norm": 0.2752133905887604, "learning_rate": 7.944963393082556e-06, "loss": 0.0634, "step": 14560 }, { "epoch": 3.678364049482454, "grad_norm": 0.6435825824737549, "learning_rate": 7.929815703105276e-06, "loss": 0.0499, "step": 14570 }, { "epoch": 3.680888664478667, "grad_norm": 0.3424232006072998, "learning_rate": 7.914668013127997e-06, "loss": 0.0558, "step": 14580 }, { "epoch": 3.68341327947488, "grad_norm": 0.4150161147117615, "learning_rate": 7.89952032315072e-06, "loss": 0.0573, "step": 14590 }, { "epoch": 3.685937894471093, "grad_norm": 0.47552725672721863, "learning_rate": 7.88437263317344e-06, "loss": 0.0609, "step": 14600 }, { "epoch": 3.688462509467306, "grad_norm": 0.2726339101791382, "learning_rate": 7.869224943196162e-06, "loss": 0.0565, "step": 14610 }, { "epoch": 3.6909871244635193, "grad_norm": 0.41144663095474243, "learning_rate": 7.854077253218885e-06, "loss": 0.0659, "step": 14620 }, { "epoch": 3.6935117394597325, "grad_norm": 0.4347810447216034, "learning_rate": 7.838929563241605e-06, "loss": 0.0634, "step": 14630 }, { "epoch": 3.6960363544559454, "grad_norm": 0.6239407658576965, "learning_rate": 7.823781873264327e-06, "loss": 0.0612, "step": 14640 }, { "epoch": 3.6985609694521586, "grad_norm": 0.43966999650001526, "learning_rate": 7.80863418328705e-06, "loss": 0.0541, "step": 14650 }, { "epoch": 3.7010855844483714, "grad_norm": 0.3874231278896332, "learning_rate": 7.79348649330977e-06, "loss": 0.0514, "step": 14660 }, { "epoch": 3.7036101994445847, "grad_norm": 0.5817126035690308, "learning_rate": 7.778338803332492e-06, "loss": 0.0631, "step": 14670 }, { "epoch": 3.706134814440798, "grad_norm": 0.5537297129631042, "learning_rate": 7.763191113355213e-06, "loss": 0.0709, "step": 14680 }, { "epoch": 3.7086594294370108, "grad_norm": 0.5339685678482056, "learning_rate": 7.748043423377935e-06, "loss": 0.0665, "step": 14690 }, { "epoch": 3.711184044433224, "grad_norm": 0.33265355229377747, "learning_rate": 7.732895733400657e-06, "loss": 0.0544, "step": 14700 }, { "epoch": 3.713708659429437, "grad_norm": 0.6044656038284302, "learning_rate": 7.717748043423378e-06, "loss": 0.051, "step": 14710 }, { "epoch": 3.71623327442565, "grad_norm": 0.6232115626335144, "learning_rate": 7.7026003534461e-06, "loss": 0.0707, "step": 14720 }, { "epoch": 3.7187578894218634, "grad_norm": 0.43664541840553284, "learning_rate": 7.687452663468821e-06, "loss": 0.0607, "step": 14730 }, { "epoch": 3.721282504418076, "grad_norm": 0.36150795221328735, "learning_rate": 7.672304973491543e-06, "loss": 0.0556, "step": 14740 }, { "epoch": 3.7238071194142894, "grad_norm": 0.49992161989212036, "learning_rate": 7.657157283514264e-06, "loss": 0.0624, "step": 14750 }, { "epoch": 3.7263317344105022, "grad_norm": 0.7528823614120483, "learning_rate": 7.642009593536986e-06, "loss": 0.0724, "step": 14760 }, { "epoch": 3.7288563494067155, "grad_norm": 0.5596957206726074, "learning_rate": 7.6268619035597085e-06, "loss": 0.0539, "step": 14770 }, { "epoch": 3.7313809644029288, "grad_norm": 0.532844603061676, "learning_rate": 7.6117142135824284e-06, "loss": 0.0656, "step": 14780 }, { "epoch": 3.7339055793991416, "grad_norm": 0.568172812461853, "learning_rate": 7.596566523605151e-06, "loss": 0.0753, "step": 14790 }, { "epoch": 3.736430194395355, "grad_norm": 0.7042198181152344, "learning_rate": 7.581418833627871e-06, "loss": 0.072, "step": 14800 }, { "epoch": 3.7389548093915677, "grad_norm": 0.46920961141586304, "learning_rate": 7.566271143650593e-06, "loss": 0.0563, "step": 14810 }, { "epoch": 3.741479424387781, "grad_norm": 0.655121922492981, "learning_rate": 7.551123453673316e-06, "loss": 0.0644, "step": 14820 }, { "epoch": 3.744004039383994, "grad_norm": 0.4194955825805664, "learning_rate": 7.5359757636960356e-06, "loss": 0.0552, "step": 14830 }, { "epoch": 3.746528654380207, "grad_norm": 0.44179287552833557, "learning_rate": 7.520828073718758e-06, "loss": 0.0618, "step": 14840 }, { "epoch": 3.7490532693764202, "grad_norm": 0.576785147190094, "learning_rate": 7.5056803837414804e-06, "loss": 0.0603, "step": 14850 }, { "epoch": 3.751577884372633, "grad_norm": 0.5226455330848694, "learning_rate": 7.490532693764201e-06, "loss": 0.0549, "step": 14860 }, { "epoch": 3.7541024993688463, "grad_norm": 0.5605834722518921, "learning_rate": 7.475385003786923e-06, "loss": 0.0614, "step": 14870 }, { "epoch": 3.7566271143650596, "grad_norm": 0.4599710702896118, "learning_rate": 7.4602373138096435e-06, "loss": 0.0579, "step": 14880 }, { "epoch": 3.7591517293612724, "grad_norm": 0.3665611445903778, "learning_rate": 7.445089623832366e-06, "loss": 0.0503, "step": 14890 }, { "epoch": 3.761676344357485, "grad_norm": 0.5790762901306152, "learning_rate": 7.4299419338550876e-06, "loss": 0.0584, "step": 14900 }, { "epoch": 3.7642009593536985, "grad_norm": 0.4075251817703247, "learning_rate": 7.414794243877808e-06, "loss": 0.0451, "step": 14910 }, { "epoch": 3.7667255743499117, "grad_norm": 0.3984999656677246, "learning_rate": 7.399646553900531e-06, "loss": 0.0604, "step": 14920 }, { "epoch": 3.769250189346125, "grad_norm": 0.5887860059738159, "learning_rate": 7.3844988639232515e-06, "loss": 0.0585, "step": 14930 }, { "epoch": 3.771774804342338, "grad_norm": 0.516468346118927, "learning_rate": 7.369351173945973e-06, "loss": 0.0646, "step": 14940 }, { "epoch": 3.7742994193385506, "grad_norm": 0.455007404088974, "learning_rate": 7.3542034839686955e-06, "loss": 0.0615, "step": 14950 }, { "epoch": 3.776824034334764, "grad_norm": 0.5999513864517212, "learning_rate": 7.339055793991416e-06, "loss": 0.0566, "step": 14960 }, { "epoch": 3.779348649330977, "grad_norm": 0.5931031703948975, "learning_rate": 7.323908104014138e-06, "loss": 0.0552, "step": 14970 }, { "epoch": 3.78187326432719, "grad_norm": 0.3761138319969177, "learning_rate": 7.30876041403686e-06, "loss": 0.0728, "step": 14980 }, { "epoch": 3.784397879323403, "grad_norm": 0.5019908547401428, "learning_rate": 7.293612724059581e-06, "loss": 0.0564, "step": 14990 }, { "epoch": 3.786922494319616, "grad_norm": 0.4987894296646118, "learning_rate": 7.278465034082303e-06, "loss": 0.0403, "step": 15000 }, { "epoch": 3.7894471093158293, "grad_norm": 0.6358135342597961, "learning_rate": 7.263317344105023e-06, "loss": 0.0575, "step": 15010 }, { "epoch": 3.7919717243120425, "grad_norm": 0.3665125072002411, "learning_rate": 7.248169654127746e-06, "loss": 0.0626, "step": 15020 }, { "epoch": 3.7944963393082554, "grad_norm": 0.7710779309272766, "learning_rate": 7.2330219641504674e-06, "loss": 0.0558, "step": 15030 }, { "epoch": 3.7970209543044686, "grad_norm": 0.7254796624183655, "learning_rate": 7.217874274173188e-06, "loss": 0.059, "step": 15040 }, { "epoch": 3.7995455693006814, "grad_norm": 0.6218250393867493, "learning_rate": 7.202726584195911e-06, "loss": 0.0594, "step": 15050 }, { "epoch": 3.8020701842968947, "grad_norm": 0.4998759627342224, "learning_rate": 7.187578894218632e-06, "loss": 0.0708, "step": 15060 }, { "epoch": 3.804594799293108, "grad_norm": 0.31853801012039185, "learning_rate": 7.172431204241353e-06, "loss": 0.0613, "step": 15070 }, { "epoch": 3.8071194142893208, "grad_norm": 0.5865182876586914, "learning_rate": 7.157283514264075e-06, "loss": 0.0539, "step": 15080 }, { "epoch": 3.809644029285534, "grad_norm": 0.46650758385658264, "learning_rate": 7.142135824286796e-06, "loss": 0.052, "step": 15090 }, { "epoch": 3.812168644281747, "grad_norm": 0.49982815980911255, "learning_rate": 7.126988134309518e-06, "loss": 0.053, "step": 15100 }, { "epoch": 3.81469325927796, "grad_norm": 0.3356648087501526, "learning_rate": 7.11184044433224e-06, "loss": 0.0554, "step": 15110 }, { "epoch": 3.8172178742741734, "grad_norm": 0.42580679059028625, "learning_rate": 7.096692754354961e-06, "loss": 0.0588, "step": 15120 }, { "epoch": 3.819742489270386, "grad_norm": 0.5860910415649414, "learning_rate": 7.0815450643776825e-06, "loss": 0.0571, "step": 15130 }, { "epoch": 3.8222671042665994, "grad_norm": 0.7389113306999207, "learning_rate": 7.066397374400404e-06, "loss": 0.0635, "step": 15140 }, { "epoch": 3.8247917192628123, "grad_norm": 0.8048897385597229, "learning_rate": 7.051249684423126e-06, "loss": 0.0639, "step": 15150 }, { "epoch": 3.8273163342590255, "grad_norm": 0.43790996074676514, "learning_rate": 7.036101994445847e-06, "loss": 0.0469, "step": 15160 }, { "epoch": 3.8298409492552388, "grad_norm": 0.633665919303894, "learning_rate": 7.020954304468568e-06, "loss": 0.0525, "step": 15170 }, { "epoch": 3.8323655642514516, "grad_norm": 0.48216986656188965, "learning_rate": 7.0058066144912905e-06, "loss": 0.0604, "step": 15180 }, { "epoch": 3.834890179247665, "grad_norm": 0.4421907663345337, "learning_rate": 6.990658924514012e-06, "loss": 0.0632, "step": 15190 }, { "epoch": 3.8374147942438777, "grad_norm": 0.41442814469337463, "learning_rate": 6.975511234536733e-06, "loss": 0.051, "step": 15200 }, { "epoch": 3.839939409240091, "grad_norm": 0.35029637813568115, "learning_rate": 6.960363544559455e-06, "loss": 0.0415, "step": 15210 }, { "epoch": 3.842464024236304, "grad_norm": 0.5078822374343872, "learning_rate": 6.945215854582176e-06, "loss": 0.0536, "step": 15220 }, { "epoch": 3.844988639232517, "grad_norm": 0.5782067179679871, "learning_rate": 6.930068164604898e-06, "loss": 0.0663, "step": 15230 }, { "epoch": 3.8475132542287303, "grad_norm": 0.38581833243370056, "learning_rate": 6.91492047462762e-06, "loss": 0.0623, "step": 15240 }, { "epoch": 3.850037869224943, "grad_norm": 0.4432540535926819, "learning_rate": 6.899772784650341e-06, "loss": 0.0608, "step": 15250 }, { "epoch": 3.8525624842211563, "grad_norm": 0.2796371877193451, "learning_rate": 6.8846250946730624e-06, "loss": 0.0515, "step": 15260 }, { "epoch": 3.8550870992173696, "grad_norm": 0.38160884380340576, "learning_rate": 6.869477404695784e-06, "loss": 0.0619, "step": 15270 }, { "epoch": 3.8576117142135824, "grad_norm": 0.4360808730125427, "learning_rate": 6.854329714718506e-06, "loss": 0.0686, "step": 15280 }, { "epoch": 3.8601363292097957, "grad_norm": 0.5405408143997192, "learning_rate": 6.839182024741227e-06, "loss": 0.0493, "step": 15290 }, { "epoch": 3.8626609442060085, "grad_norm": 0.6223629713058472, "learning_rate": 6.824034334763948e-06, "loss": 0.0592, "step": 15300 }, { "epoch": 3.8651855592022217, "grad_norm": 0.6864993572235107, "learning_rate": 6.80888664478667e-06, "loss": 0.0597, "step": 15310 }, { "epoch": 3.867710174198435, "grad_norm": 0.5163440108299255, "learning_rate": 6.793738954809392e-06, "loss": 0.052, "step": 15320 }, { "epoch": 3.870234789194648, "grad_norm": 0.3290441334247589, "learning_rate": 6.778591264832113e-06, "loss": 0.0454, "step": 15330 }, { "epoch": 3.8727594041908606, "grad_norm": 0.5202774405479431, "learning_rate": 6.763443574854835e-06, "loss": 0.0494, "step": 15340 }, { "epoch": 3.875284019187074, "grad_norm": 0.8797051906585693, "learning_rate": 6.748295884877557e-06, "loss": 0.0664, "step": 15350 }, { "epoch": 3.877808634183287, "grad_norm": 0.5602105855941772, "learning_rate": 6.7331481949002775e-06, "loss": 0.0639, "step": 15360 }, { "epoch": 3.8803332491795004, "grad_norm": 0.33669281005859375, "learning_rate": 6.718000504923e-06, "loss": 0.0542, "step": 15370 }, { "epoch": 3.882857864175713, "grad_norm": 0.41936057806015015, "learning_rate": 6.702852814945721e-06, "loss": 0.0545, "step": 15380 }, { "epoch": 3.885382479171926, "grad_norm": 0.3497774302959442, "learning_rate": 6.687705124968442e-06, "loss": 0.0515, "step": 15390 }, { "epoch": 3.8879070941681393, "grad_norm": 0.8730018734931946, "learning_rate": 6.672557434991164e-06, "loss": 0.0539, "step": 15400 }, { "epoch": 3.8904317091643525, "grad_norm": 0.3379184305667877, "learning_rate": 6.6574097450138855e-06, "loss": 0.0553, "step": 15410 }, { "epoch": 3.8929563241605654, "grad_norm": 0.7440978288650513, "learning_rate": 6.642262055036607e-06, "loss": 0.0585, "step": 15420 }, { "epoch": 3.8954809391567786, "grad_norm": 0.3775716722011566, "learning_rate": 6.627114365059329e-06, "loss": 0.0607, "step": 15430 }, { "epoch": 3.8980055541529914, "grad_norm": 0.5544441342353821, "learning_rate": 6.61196667508205e-06, "loss": 0.0638, "step": 15440 }, { "epoch": 3.9005301691492047, "grad_norm": 0.5136420726776123, "learning_rate": 6.596818985104772e-06, "loss": 0.0642, "step": 15450 }, { "epoch": 3.903054784145418, "grad_norm": 0.3824068307876587, "learning_rate": 6.581671295127493e-06, "loss": 0.0616, "step": 15460 }, { "epoch": 3.9055793991416308, "grad_norm": 0.6130052208900452, "learning_rate": 6.566523605150215e-06, "loss": 0.0558, "step": 15470 }, { "epoch": 3.908104014137844, "grad_norm": 0.48635825514793396, "learning_rate": 6.551375915172937e-06, "loss": 0.0539, "step": 15480 }, { "epoch": 3.910628629134057, "grad_norm": 0.4900931417942047, "learning_rate": 6.536228225195657e-06, "loss": 0.0488, "step": 15490 }, { "epoch": 3.91315324413027, "grad_norm": 0.3312920331954956, "learning_rate": 6.52108053521838e-06, "loss": 0.0577, "step": 15500 }, { "epoch": 3.9156778591264834, "grad_norm": 0.38757237792015076, "learning_rate": 6.505932845241101e-06, "loss": 0.052, "step": 15510 }, { "epoch": 3.918202474122696, "grad_norm": 0.5382483601570129, "learning_rate": 6.490785155263822e-06, "loss": 0.0617, "step": 15520 }, { "epoch": 3.9207270891189094, "grad_norm": 0.7959476113319397, "learning_rate": 6.475637465286545e-06, "loss": 0.0619, "step": 15530 }, { "epoch": 3.9232517041151223, "grad_norm": 0.5656881332397461, "learning_rate": 6.460489775309265e-06, "loss": 0.057, "step": 15540 }, { "epoch": 3.9257763191113355, "grad_norm": 0.32945945858955383, "learning_rate": 6.445342085331987e-06, "loss": 0.0467, "step": 15550 }, { "epoch": 3.9283009341075488, "grad_norm": 0.5367522835731506, "learning_rate": 6.4301943953547086e-06, "loss": 0.0495, "step": 15560 }, { "epoch": 3.9308255491037616, "grad_norm": 0.6505579948425293, "learning_rate": 6.41504670537743e-06, "loss": 0.0573, "step": 15570 }, { "epoch": 3.933350164099975, "grad_norm": 0.39535531401634216, "learning_rate": 6.399899015400152e-06, "loss": 0.0668, "step": 15580 }, { "epoch": 3.9358747790961877, "grad_norm": 0.47247302532196045, "learning_rate": 6.3847513254228725e-06, "loss": 0.0602, "step": 15590 }, { "epoch": 3.938399394092401, "grad_norm": 0.45602869987487793, "learning_rate": 6.369603635445595e-06, "loss": 0.0623, "step": 15600 }, { "epoch": 3.940924009088614, "grad_norm": 0.5000142455101013, "learning_rate": 6.3544559454683165e-06, "loss": 0.0492, "step": 15610 }, { "epoch": 3.943448624084827, "grad_norm": 0.22823497653007507, "learning_rate": 6.339308255491037e-06, "loss": 0.0425, "step": 15620 }, { "epoch": 3.9459732390810403, "grad_norm": 0.7233946323394775, "learning_rate": 6.32416056551376e-06, "loss": 0.0517, "step": 15630 }, { "epoch": 3.948497854077253, "grad_norm": 0.4734395444393158, "learning_rate": 6.309012875536481e-06, "loss": 0.0548, "step": 15640 }, { "epoch": 3.9510224690734663, "grad_norm": 0.5519753098487854, "learning_rate": 6.293865185559202e-06, "loss": 0.054, "step": 15650 }, { "epoch": 3.9535470840696796, "grad_norm": 0.8592543601989746, "learning_rate": 6.2787174955819245e-06, "loss": 0.0622, "step": 15660 }, { "epoch": 3.9560716990658924, "grad_norm": 0.716918408870697, "learning_rate": 6.263569805604645e-06, "loss": 0.0651, "step": 15670 }, { "epoch": 3.9585963140621057, "grad_norm": 0.5099024176597595, "learning_rate": 6.248422115627367e-06, "loss": 0.0668, "step": 15680 }, { "epoch": 3.9611209290583185, "grad_norm": 0.4536076784133911, "learning_rate": 6.2332744256500884e-06, "loss": 0.0477, "step": 15690 }, { "epoch": 3.9636455440545317, "grad_norm": 0.6459512114524841, "learning_rate": 6.21812673567281e-06, "loss": 0.057, "step": 15700 }, { "epoch": 3.966170159050745, "grad_norm": 0.6501273512840271, "learning_rate": 6.202979045695532e-06, "loss": 0.0526, "step": 15710 }, { "epoch": 3.968694774046958, "grad_norm": 0.40765637159347534, "learning_rate": 6.187831355718253e-06, "loss": 0.0639, "step": 15720 }, { "epoch": 3.971219389043171, "grad_norm": 0.564132571220398, "learning_rate": 6.172683665740975e-06, "loss": 0.0631, "step": 15730 }, { "epoch": 3.973744004039384, "grad_norm": 0.5302206873893738, "learning_rate": 6.157535975763696e-06, "loss": 0.0569, "step": 15740 }, { "epoch": 3.976268619035597, "grad_norm": 0.5838475227355957, "learning_rate": 6.142388285786417e-06, "loss": 0.0633, "step": 15750 }, { "epoch": 3.9787932340318104, "grad_norm": 0.46987318992614746, "learning_rate": 6.12724059580914e-06, "loss": 0.0603, "step": 15760 }, { "epoch": 3.981317849028023, "grad_norm": 0.5803635716438293, "learning_rate": 6.112092905831861e-06, "loss": 0.0631, "step": 15770 }, { "epoch": 3.983842464024236, "grad_norm": 0.34555870294570923, "learning_rate": 6.096945215854582e-06, "loss": 0.059, "step": 15780 }, { "epoch": 3.9863670790204493, "grad_norm": 0.3343939781188965, "learning_rate": 6.081797525877304e-06, "loss": 0.0625, "step": 15790 }, { "epoch": 3.9888916940166625, "grad_norm": 0.6097005605697632, "learning_rate": 6.066649835900025e-06, "loss": 0.0617, "step": 15800 }, { "epoch": 3.991416309012876, "grad_norm": 0.495924174785614, "learning_rate": 6.051502145922747e-06, "loss": 0.0441, "step": 15810 }, { "epoch": 3.9939409240090886, "grad_norm": 0.19013169407844543, "learning_rate": 6.036354455945468e-06, "loss": 0.0661, "step": 15820 }, { "epoch": 3.9964655390053014, "grad_norm": 0.4961543083190918, "learning_rate": 6.02120676596819e-06, "loss": 0.0661, "step": 15830 }, { "epoch": 3.9989901540015147, "grad_norm": 0.4737859070301056, "learning_rate": 6.0060590759909115e-06, "loss": 0.0627, "step": 15840 }, { "epoch": 4.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.04224640876054764, "eval_runtime": 902.072, "eval_samples_per_second": 228.654, "eval_steps_per_second": 3.573, "step": 15844 }, { "epoch": 4.001514768997728, "grad_norm": 0.5551168322563171, "learning_rate": 5.990911386013633e-06, "loss": 0.0627, "step": 15850 }, { "epoch": 4.004039383993941, "grad_norm": 0.5756297707557678, "learning_rate": 5.975763696036355e-06, "loss": 0.0502, "step": 15860 }, { "epoch": 4.006563998990154, "grad_norm": 0.3061296045780182, "learning_rate": 5.960616006059076e-06, "loss": 0.0582, "step": 15870 }, { "epoch": 4.009088613986367, "grad_norm": 0.3693884313106537, "learning_rate": 5.945468316081797e-06, "loss": 0.055, "step": 15880 }, { "epoch": 4.01161322898258, "grad_norm": 0.4693322777748108, "learning_rate": 5.9303206261045195e-06, "loss": 0.0596, "step": 15890 }, { "epoch": 4.014137843978793, "grad_norm": 0.36858975887298584, "learning_rate": 5.915172936127241e-06, "loss": 0.048, "step": 15900 }, { "epoch": 4.016662458975007, "grad_norm": 0.4288581907749176, "learning_rate": 5.900025246149962e-06, "loss": 0.0523, "step": 15910 }, { "epoch": 4.019187073971219, "grad_norm": 0.5312148928642273, "learning_rate": 5.884877556172684e-06, "loss": 0.0549, "step": 15920 }, { "epoch": 4.021711688967432, "grad_norm": 0.3998830020427704, "learning_rate": 5.869729866195406e-06, "loss": 0.0671, "step": 15930 }, { "epoch": 4.0242363039636455, "grad_norm": 0.40000516176223755, "learning_rate": 5.854582176218127e-06, "loss": 0.0635, "step": 15940 }, { "epoch": 4.026760918959859, "grad_norm": 0.273234486579895, "learning_rate": 5.839434486240848e-06, "loss": 0.0569, "step": 15950 }, { "epoch": 4.029285533956072, "grad_norm": 0.6919571161270142, "learning_rate": 5.82428679626357e-06, "loss": 0.0547, "step": 15960 }, { "epoch": 4.031810148952284, "grad_norm": 0.508310079574585, "learning_rate": 5.809139106286291e-06, "loss": 0.0623, "step": 15970 }, { "epoch": 4.034334763948498, "grad_norm": 0.5494049787521362, "learning_rate": 5.793991416309013e-06, "loss": 0.0612, "step": 15980 }, { "epoch": 4.036859378944711, "grad_norm": 0.4908718168735504, "learning_rate": 5.778843726331735e-06, "loss": 0.051, "step": 15990 }, { "epoch": 4.039383993940924, "grad_norm": 0.40417003631591797, "learning_rate": 5.763696036354456e-06, "loss": 0.0651, "step": 16000 }, { "epoch": 4.041908608937137, "grad_norm": 0.3193455934524536, "learning_rate": 5.748548346377178e-06, "loss": 0.0623, "step": 16010 }, { "epoch": 4.04443322393335, "grad_norm": 0.629706859588623, "learning_rate": 5.733400656399899e-06, "loss": 0.0538, "step": 16020 }, { "epoch": 4.046957838929563, "grad_norm": 0.5846245884895325, "learning_rate": 5.718252966422621e-06, "loss": 0.0633, "step": 16030 }, { "epoch": 4.049482453925776, "grad_norm": 0.4376593828201294, "learning_rate": 5.703105276445342e-06, "loss": 0.0537, "step": 16040 }, { "epoch": 4.05200706892199, "grad_norm": 0.44141125679016113, "learning_rate": 5.687957586468064e-06, "loss": 0.0563, "step": 16050 }, { "epoch": 4.054531683918203, "grad_norm": 0.466325581073761, "learning_rate": 5.672809896490786e-06, "loss": 0.0666, "step": 16060 }, { "epoch": 4.057056298914415, "grad_norm": 0.4581875205039978, "learning_rate": 5.6576622065135065e-06, "loss": 0.0635, "step": 16070 }, { "epoch": 4.0595809139106285, "grad_norm": 0.31961241364479065, "learning_rate": 5.642514516536228e-06, "loss": 0.045, "step": 16080 }, { "epoch": 4.062105528906842, "grad_norm": 0.31504639983177185, "learning_rate": 5.62736682655895e-06, "loss": 0.0663, "step": 16090 }, { "epoch": 4.064630143903055, "grad_norm": 0.19217759370803833, "learning_rate": 5.612219136581671e-06, "loss": 0.0684, "step": 16100 }, { "epoch": 4.067154758899268, "grad_norm": 0.3876609802246094, "learning_rate": 5.597071446604393e-06, "loss": 0.0473, "step": 16110 }, { "epoch": 4.069679373895481, "grad_norm": 0.8723250031471252, "learning_rate": 5.5819237566271145e-06, "loss": 0.0634, "step": 16120 }, { "epoch": 4.072203988891694, "grad_norm": 0.41938892006874084, "learning_rate": 5.566776066649836e-06, "loss": 0.0576, "step": 16130 }, { "epoch": 4.074728603887907, "grad_norm": 0.39180928468704224, "learning_rate": 5.551628376672558e-06, "loss": 0.0562, "step": 16140 }, { "epoch": 4.07725321888412, "grad_norm": 0.5083798766136169, "learning_rate": 5.536480686695279e-06, "loss": 0.0487, "step": 16150 }, { "epoch": 4.079777833880334, "grad_norm": 0.7230374217033386, "learning_rate": 5.521332996718001e-06, "loss": 0.0606, "step": 16160 }, { "epoch": 4.082302448876546, "grad_norm": 0.8608932495117188, "learning_rate": 5.506185306740722e-06, "loss": 0.0479, "step": 16170 }, { "epoch": 4.084827063872759, "grad_norm": 0.3399522006511688, "learning_rate": 5.491037616763444e-06, "loss": 0.062, "step": 16180 }, { "epoch": 4.0873516788689725, "grad_norm": 0.6174184083938599, "learning_rate": 5.475889926786166e-06, "loss": 0.0567, "step": 16190 }, { "epoch": 4.089876293865186, "grad_norm": 0.5198509693145752, "learning_rate": 5.460742236808886e-06, "loss": 0.0572, "step": 16200 }, { "epoch": 4.092400908861399, "grad_norm": 0.807162344455719, "learning_rate": 5.445594546831609e-06, "loss": 0.0707, "step": 16210 }, { "epoch": 4.094925523857611, "grad_norm": 0.4432941973209381, "learning_rate": 5.43044685685433e-06, "loss": 0.0505, "step": 16220 }, { "epoch": 4.097450138853825, "grad_norm": 0.5982592701911926, "learning_rate": 5.415299166877051e-06, "loss": 0.0521, "step": 16230 }, { "epoch": 4.099974753850038, "grad_norm": 0.5292986631393433, "learning_rate": 5.400151476899773e-06, "loss": 0.0481, "step": 16240 }, { "epoch": 4.102499368846251, "grad_norm": 0.42435184121131897, "learning_rate": 5.385003786922494e-06, "loss": 0.0549, "step": 16250 }, { "epoch": 4.1050239838424645, "grad_norm": 0.5759508013725281, "learning_rate": 5.369856096945216e-06, "loss": 0.0574, "step": 16260 }, { "epoch": 4.107548598838677, "grad_norm": 0.7036067843437195, "learning_rate": 5.3547084069679375e-06, "loss": 0.0633, "step": 16270 }, { "epoch": 4.11007321383489, "grad_norm": 0.45638132095336914, "learning_rate": 5.339560716990659e-06, "loss": 0.0472, "step": 16280 }, { "epoch": 4.112597828831103, "grad_norm": 0.5197755098342896, "learning_rate": 5.324413027013381e-06, "loss": 0.0598, "step": 16290 }, { "epoch": 4.115122443827317, "grad_norm": 0.5755794048309326, "learning_rate": 5.309265337036102e-06, "loss": 0.0513, "step": 16300 }, { "epoch": 4.117647058823529, "grad_norm": 0.470956414937973, "learning_rate": 5.294117647058824e-06, "loss": 0.0685, "step": 16310 }, { "epoch": 4.120171673819742, "grad_norm": 0.4178047478199005, "learning_rate": 5.2789699570815455e-06, "loss": 0.0638, "step": 16320 }, { "epoch": 4.1226962888159555, "grad_norm": 0.3784389793872833, "learning_rate": 5.263822267104266e-06, "loss": 0.0605, "step": 16330 }, { "epoch": 4.125220903812169, "grad_norm": 0.5465134382247925, "learning_rate": 5.248674577126989e-06, "loss": 0.0627, "step": 16340 }, { "epoch": 4.127745518808382, "grad_norm": 0.7110384106636047, "learning_rate": 5.23352688714971e-06, "loss": 0.0623, "step": 16350 }, { "epoch": 4.130270133804594, "grad_norm": 0.38853955268859863, "learning_rate": 5.218379197172431e-06, "loss": 0.0595, "step": 16360 }, { "epoch": 4.132794748800808, "grad_norm": 0.44513699412345886, "learning_rate": 5.203231507195153e-06, "loss": 0.0487, "step": 16370 }, { "epoch": 4.135319363797021, "grad_norm": 0.4682462513446808, "learning_rate": 5.188083817217874e-06, "loss": 0.0694, "step": 16380 }, { "epoch": 4.137843978793234, "grad_norm": 0.599108099937439, "learning_rate": 5.172936127240596e-06, "loss": 0.047, "step": 16390 }, { "epoch": 4.140368593789447, "grad_norm": 0.49504604935646057, "learning_rate": 5.157788437263317e-06, "loss": 0.0502, "step": 16400 }, { "epoch": 4.14289320878566, "grad_norm": 0.42351964116096497, "learning_rate": 5.142640747286039e-06, "loss": 0.0599, "step": 16410 }, { "epoch": 4.145417823781873, "grad_norm": 0.1978461593389511, "learning_rate": 5.127493057308761e-06, "loss": 0.0521, "step": 16420 }, { "epoch": 4.147942438778086, "grad_norm": 0.5165350437164307, "learning_rate": 5.112345367331482e-06, "loss": 0.0536, "step": 16430 }, { "epoch": 4.1504670537743, "grad_norm": 0.4052433669567108, "learning_rate": 5.097197677354204e-06, "loss": 0.0575, "step": 16440 }, { "epoch": 4.152991668770513, "grad_norm": 0.6475027203559875, "learning_rate": 5.082049987376925e-06, "loss": 0.0541, "step": 16450 }, { "epoch": 4.155516283766725, "grad_norm": 0.4289510250091553, "learning_rate": 5.066902297399646e-06, "loss": 0.0513, "step": 16460 }, { "epoch": 4.1580408987629385, "grad_norm": 0.38766923546791077, "learning_rate": 5.051754607422369e-06, "loss": 0.0487, "step": 16470 }, { "epoch": 4.160565513759152, "grad_norm": 0.7618467211723328, "learning_rate": 5.03660691744509e-06, "loss": 0.0519, "step": 16480 }, { "epoch": 4.163090128755365, "grad_norm": 0.3821544945240021, "learning_rate": 5.021459227467811e-06, "loss": 0.0628, "step": 16490 }, { "epoch": 4.165614743751578, "grad_norm": 0.436176061630249, "learning_rate": 5.0063115374905325e-06, "loss": 0.0524, "step": 16500 }, { "epoch": 4.168139358747791, "grad_norm": 0.6463722586631775, "learning_rate": 4.991163847513255e-06, "loss": 0.0535, "step": 16510 }, { "epoch": 4.170663973744004, "grad_norm": 0.35036715865135193, "learning_rate": 4.976016157535976e-06, "loss": 0.0631, "step": 16520 }, { "epoch": 4.173188588740217, "grad_norm": 0.6320012211799622, "learning_rate": 4.960868467558697e-06, "loss": 0.0593, "step": 16530 }, { "epoch": 4.17571320373643, "grad_norm": 0.29677146673202515, "learning_rate": 4.945720777581419e-06, "loss": 0.0512, "step": 16540 }, { "epoch": 4.178237818732644, "grad_norm": 0.4986904263496399, "learning_rate": 4.9305730876041405e-06, "loss": 0.0526, "step": 16550 }, { "epoch": 4.180762433728856, "grad_norm": 0.5562833547592163, "learning_rate": 4.915425397626862e-06, "loss": 0.0522, "step": 16560 }, { "epoch": 4.183287048725069, "grad_norm": 0.6436367630958557, "learning_rate": 4.900277707649584e-06, "loss": 0.0688, "step": 16570 }, { "epoch": 4.1858116637212825, "grad_norm": 0.5076253414154053, "learning_rate": 4.885130017672305e-06, "loss": 0.0685, "step": 16580 }, { "epoch": 4.188336278717496, "grad_norm": 0.4076981544494629, "learning_rate": 4.869982327695026e-06, "loss": 0.0537, "step": 16590 }, { "epoch": 4.190860893713709, "grad_norm": 0.44264063239097595, "learning_rate": 4.8548346377177485e-06, "loss": 0.0579, "step": 16600 }, { "epoch": 4.193385508709921, "grad_norm": 0.3889271020889282, "learning_rate": 4.83968694774047e-06, "loss": 0.0555, "step": 16610 }, { "epoch": 4.195910123706135, "grad_norm": 0.5801649689674377, "learning_rate": 4.824539257763191e-06, "loss": 0.0371, "step": 16620 }, { "epoch": 4.198434738702348, "grad_norm": 0.43185538053512573, "learning_rate": 4.809391567785912e-06, "loss": 0.0711, "step": 16630 }, { "epoch": 4.200959353698561, "grad_norm": 0.7169790267944336, "learning_rate": 4.794243877808635e-06, "loss": 0.0692, "step": 16640 }, { "epoch": 4.2034839686947745, "grad_norm": 0.3428625464439392, "learning_rate": 4.779096187831356e-06, "loss": 0.0659, "step": 16650 }, { "epoch": 4.206008583690987, "grad_norm": 0.39647018909454346, "learning_rate": 4.763948497854077e-06, "loss": 0.055, "step": 16660 }, { "epoch": 4.2085331986872, "grad_norm": 0.40620213747024536, "learning_rate": 4.748800807876799e-06, "loss": 0.058, "step": 16670 }, { "epoch": 4.211057813683413, "grad_norm": 0.5194025635719299, "learning_rate": 4.73365311789952e-06, "loss": 0.0574, "step": 16680 }, { "epoch": 4.213582428679627, "grad_norm": 0.4074387848377228, "learning_rate": 4.718505427922242e-06, "loss": 0.0525, "step": 16690 }, { "epoch": 4.216107043675839, "grad_norm": 0.670364260673523, "learning_rate": 4.7033577379449636e-06, "loss": 0.064, "step": 16700 }, { "epoch": 4.218631658672052, "grad_norm": 0.5439589023590088, "learning_rate": 4.688210047967685e-06, "loss": 0.0553, "step": 16710 }, { "epoch": 4.2211562736682655, "grad_norm": 0.5016559362411499, "learning_rate": 4.673062357990407e-06, "loss": 0.0526, "step": 16720 }, { "epoch": 4.223680888664479, "grad_norm": 0.5297515988349915, "learning_rate": 4.657914668013128e-06, "loss": 0.0533, "step": 16730 }, { "epoch": 4.226205503660692, "grad_norm": 0.5738322734832764, "learning_rate": 4.64276697803585e-06, "loss": 0.0653, "step": 16740 }, { "epoch": 4.228730118656904, "grad_norm": 2.0166008472442627, "learning_rate": 4.627619288058571e-06, "loss": 0.0644, "step": 16750 }, { "epoch": 4.231254733653118, "grad_norm": 0.44286590814590454, "learning_rate": 4.612471598081292e-06, "loss": 0.0734, "step": 16760 }, { "epoch": 4.233779348649331, "grad_norm": 0.3186633288860321, "learning_rate": 4.597323908104015e-06, "loss": 0.044, "step": 16770 }, { "epoch": 4.236303963645544, "grad_norm": 0.7005212306976318, "learning_rate": 4.5821762181267355e-06, "loss": 0.0586, "step": 16780 }, { "epoch": 4.238828578641757, "grad_norm": 0.858946681022644, "learning_rate": 4.567028528149457e-06, "loss": 0.0579, "step": 16790 }, { "epoch": 4.24135319363797, "grad_norm": 0.3274882137775421, "learning_rate": 4.5518808381721795e-06, "loss": 0.0578, "step": 16800 }, { "epoch": 4.243877808634183, "grad_norm": 0.5176664590835571, "learning_rate": 4.5367331481949e-06, "loss": 0.0637, "step": 16810 }, { "epoch": 4.246402423630396, "grad_norm": 0.6378607749938965, "learning_rate": 4.521585458217622e-06, "loss": 0.0527, "step": 16820 }, { "epoch": 4.24892703862661, "grad_norm": 0.3820461928844452, "learning_rate": 4.5064377682403434e-06, "loss": 0.0648, "step": 16830 }, { "epoch": 4.251451653622823, "grad_norm": 0.34068912267684937, "learning_rate": 4.491290078263065e-06, "loss": 0.0542, "step": 16840 }, { "epoch": 4.253976268619035, "grad_norm": 0.4906207323074341, "learning_rate": 4.476142388285787e-06, "loss": 0.0547, "step": 16850 }, { "epoch": 4.2565008836152485, "grad_norm": 0.7165696024894714, "learning_rate": 4.460994698308508e-06, "loss": 0.0496, "step": 16860 }, { "epoch": 4.259025498611462, "grad_norm": 0.3498016595840454, "learning_rate": 4.44584700833123e-06, "loss": 0.051, "step": 16870 }, { "epoch": 4.261550113607675, "grad_norm": 0.6449806094169617, "learning_rate": 4.4306993183539506e-06, "loss": 0.0574, "step": 16880 }, { "epoch": 4.264074728603888, "grad_norm": 0.719499945640564, "learning_rate": 4.415551628376672e-06, "loss": 0.0619, "step": 16890 }, { "epoch": 4.266599343600101, "grad_norm": 0.630200207233429, "learning_rate": 4.400403938399395e-06, "loss": 0.0521, "step": 16900 }, { "epoch": 4.269123958596314, "grad_norm": 0.982899010181427, "learning_rate": 4.385256248422115e-06, "loss": 0.0658, "step": 16910 }, { "epoch": 4.271648573592527, "grad_norm": 0.555623471736908, "learning_rate": 4.370108558444837e-06, "loss": 0.0485, "step": 16920 }, { "epoch": 4.27417318858874, "grad_norm": 0.5891562700271606, "learning_rate": 4.354960868467559e-06, "loss": 0.0513, "step": 16930 }, { "epoch": 4.276697803584954, "grad_norm": 0.5145660042762756, "learning_rate": 4.33981317849028e-06, "loss": 0.0509, "step": 16940 }, { "epoch": 4.279222418581166, "grad_norm": 0.33446022868156433, "learning_rate": 4.324665488513002e-06, "loss": 0.0481, "step": 16950 }, { "epoch": 4.281747033577379, "grad_norm": 0.493359237909317, "learning_rate": 4.309517798535723e-06, "loss": 0.0564, "step": 16960 }, { "epoch": 4.2842716485735926, "grad_norm": 0.3073669970035553, "learning_rate": 4.294370108558445e-06, "loss": 0.062, "step": 16970 }, { "epoch": 4.286796263569806, "grad_norm": 0.609738826751709, "learning_rate": 4.2792224185811665e-06, "loss": 0.0577, "step": 16980 }, { "epoch": 4.289320878566019, "grad_norm": 0.5454846620559692, "learning_rate": 4.264074728603888e-06, "loss": 0.0553, "step": 16990 }, { "epoch": 4.291845493562231, "grad_norm": 0.5008363723754883, "learning_rate": 4.24892703862661e-06, "loss": 0.0517, "step": 17000 }, { "epoch": 4.294370108558445, "grad_norm": 0.4839312434196472, "learning_rate": 4.233779348649331e-06, "loss": 0.0588, "step": 17010 }, { "epoch": 4.296894723554658, "grad_norm": 0.47222980856895447, "learning_rate": 4.218631658672053e-06, "loss": 0.0474, "step": 17020 }, { "epoch": 4.299419338550871, "grad_norm": 0.29100242257118225, "learning_rate": 4.2034839686947745e-06, "loss": 0.0479, "step": 17030 }, { "epoch": 4.3019439535470845, "grad_norm": 0.9103265404701233, "learning_rate": 4.188336278717495e-06, "loss": 0.0499, "step": 17040 }, { "epoch": 4.304468568543297, "grad_norm": 0.4569959044456482, "learning_rate": 4.173188588740217e-06, "loss": 0.0439, "step": 17050 }, { "epoch": 4.30699318353951, "grad_norm": 0.5705316662788391, "learning_rate": 4.158040898762939e-06, "loss": 0.0577, "step": 17060 }, { "epoch": 4.309517798535723, "grad_norm": 0.6814060211181641, "learning_rate": 4.14289320878566e-06, "loss": 0.0535, "step": 17070 }, { "epoch": 4.312042413531937, "grad_norm": 0.5100272297859192, "learning_rate": 4.127745518808382e-06, "loss": 0.0582, "step": 17080 }, { "epoch": 4.314567028528149, "grad_norm": 0.520350456237793, "learning_rate": 4.112597828831104e-06, "loss": 0.049, "step": 17090 }, { "epoch": 4.317091643524362, "grad_norm": 0.44908711314201355, "learning_rate": 4.097450138853825e-06, "loss": 0.059, "step": 17100 }, { "epoch": 4.3196162585205755, "grad_norm": 0.3135251998901367, "learning_rate": 4.082302448876546e-06, "loss": 0.0627, "step": 17110 }, { "epoch": 4.322140873516789, "grad_norm": 0.5100424885749817, "learning_rate": 4.067154758899268e-06, "loss": 0.0762, "step": 17120 }, { "epoch": 4.324665488513002, "grad_norm": 1.0668565034866333, "learning_rate": 4.0520070689219896e-06, "loss": 0.0628, "step": 17130 }, { "epoch": 4.327190103509215, "grad_norm": 0.3573605418205261, "learning_rate": 4.036859378944711e-06, "loss": 0.0647, "step": 17140 }, { "epoch": 4.329714718505428, "grad_norm": 0.5911782383918762, "learning_rate": 4.021711688967433e-06, "loss": 0.0554, "step": 17150 }, { "epoch": 4.332239333501641, "grad_norm": 0.7212706804275513, "learning_rate": 4.006563998990154e-06, "loss": 0.0595, "step": 17160 }, { "epoch": 4.334763948497854, "grad_norm": 0.6161842346191406, "learning_rate": 3.991416309012875e-06, "loss": 0.0585, "step": 17170 }, { "epoch": 4.337288563494067, "grad_norm": 0.5191289782524109, "learning_rate": 3.976268619035597e-06, "loss": 0.0534, "step": 17180 }, { "epoch": 4.33981317849028, "grad_norm": 0.4344318211078644, "learning_rate": 3.961120929058319e-06, "loss": 0.0569, "step": 17190 }, { "epoch": 4.342337793486493, "grad_norm": 0.8711338043212891, "learning_rate": 3.94597323908104e-06, "loss": 0.0489, "step": 17200 }, { "epoch": 4.344862408482706, "grad_norm": 0.4723449647426605, "learning_rate": 3.9308255491037615e-06, "loss": 0.0572, "step": 17210 }, { "epoch": 4.34738702347892, "grad_norm": 0.48766812682151794, "learning_rate": 3.915677859126484e-06, "loss": 0.0609, "step": 17220 }, { "epoch": 4.349911638475133, "grad_norm": 0.41383126378059387, "learning_rate": 3.900530169149205e-06, "loss": 0.0518, "step": 17230 }, { "epoch": 4.352436253471345, "grad_norm": 0.6354776620864868, "learning_rate": 3.885382479171926e-06, "loss": 0.0564, "step": 17240 }, { "epoch": 4.3549608684675585, "grad_norm": 0.44549328088760376, "learning_rate": 3.870234789194648e-06, "loss": 0.0643, "step": 17250 }, { "epoch": 4.357485483463772, "grad_norm": 0.5504968762397766, "learning_rate": 3.8550870992173695e-06, "loss": 0.0601, "step": 17260 }, { "epoch": 4.360010098459985, "grad_norm": 0.4032590389251709, "learning_rate": 3.839939409240091e-06, "loss": 0.0435, "step": 17270 }, { "epoch": 4.362534713456198, "grad_norm": 0.3765304982662201, "learning_rate": 3.824791719262813e-06, "loss": 0.0514, "step": 17280 }, { "epoch": 4.365059328452411, "grad_norm": 0.4660661220550537, "learning_rate": 3.8096440292855342e-06, "loss": 0.0565, "step": 17290 }, { "epoch": 4.367583943448624, "grad_norm": 0.6698967814445496, "learning_rate": 3.7944963393082554e-06, "loss": 0.0531, "step": 17300 }, { "epoch": 4.370108558444837, "grad_norm": 0.7271828651428223, "learning_rate": 3.7793486493309766e-06, "loss": 0.057, "step": 17310 }, { "epoch": 4.37263317344105, "grad_norm": 0.2889387607574463, "learning_rate": 3.764200959353699e-06, "loss": 0.05, "step": 17320 }, { "epoch": 4.375157788437264, "grad_norm": 0.8666712045669556, "learning_rate": 3.74905326937642e-06, "loss": 0.0638, "step": 17330 }, { "epoch": 4.377682403433476, "grad_norm": 0.3147795498371124, "learning_rate": 3.733905579399142e-06, "loss": 0.0583, "step": 17340 }, { "epoch": 4.380207018429689, "grad_norm": 0.6090699434280396, "learning_rate": 3.718757889421863e-06, "loss": 0.058, "step": 17350 }, { "epoch": 4.3827316334259026, "grad_norm": 0.3823327422142029, "learning_rate": 3.703610199444585e-06, "loss": 0.0578, "step": 17360 }, { "epoch": 4.385256248422116, "grad_norm": 0.43010056018829346, "learning_rate": 3.6884625094673066e-06, "loss": 0.0537, "step": 17370 }, { "epoch": 4.387780863418329, "grad_norm": 0.5161568522453308, "learning_rate": 3.6733148194900277e-06, "loss": 0.0578, "step": 17380 }, { "epoch": 4.390305478414541, "grad_norm": 0.3812313675880432, "learning_rate": 3.6581671295127493e-06, "loss": 0.0586, "step": 17390 }, { "epoch": 4.392830093410755, "grad_norm": 0.4599802494049072, "learning_rate": 3.643019439535471e-06, "loss": 0.0554, "step": 17400 }, { "epoch": 4.395354708406968, "grad_norm": 0.4805625379085541, "learning_rate": 3.6278717495581925e-06, "loss": 0.055, "step": 17410 }, { "epoch": 4.397879323403181, "grad_norm": 0.5854561924934387, "learning_rate": 3.612724059580914e-06, "loss": 0.0554, "step": 17420 }, { "epoch": 4.4004039383993945, "grad_norm": 0.48171231150627136, "learning_rate": 3.5975763696036353e-06, "loss": 0.0532, "step": 17430 }, { "epoch": 4.402928553395607, "grad_norm": 0.532574474811554, "learning_rate": 3.582428679626357e-06, "loss": 0.0617, "step": 17440 }, { "epoch": 4.40545316839182, "grad_norm": 0.33348578214645386, "learning_rate": 3.567280989649079e-06, "loss": 0.0514, "step": 17450 }, { "epoch": 4.407977783388033, "grad_norm": 0.49266380071640015, "learning_rate": 3.5521332996718e-06, "loss": 0.0564, "step": 17460 }, { "epoch": 4.410502398384247, "grad_norm": 0.8559315800666809, "learning_rate": 3.5369856096945217e-06, "loss": 0.0499, "step": 17470 }, { "epoch": 4.41302701338046, "grad_norm": 0.4928590953350067, "learning_rate": 3.521837919717243e-06, "loss": 0.0422, "step": 17480 }, { "epoch": 4.415551628376672, "grad_norm": 0.37242400646209717, "learning_rate": 3.506690229739965e-06, "loss": 0.0569, "step": 17490 }, { "epoch": 4.4180762433728855, "grad_norm": 0.5479409694671631, "learning_rate": 3.4915425397626865e-06, "loss": 0.0498, "step": 17500 }, { "epoch": 4.420600858369099, "grad_norm": 0.5095922350883484, "learning_rate": 3.4763948497854076e-06, "loss": 0.0629, "step": 17510 }, { "epoch": 4.423125473365312, "grad_norm": 0.49946799874305725, "learning_rate": 3.4612471598081292e-06, "loss": 0.0719, "step": 17520 }, { "epoch": 4.425650088361525, "grad_norm": 0.34545761346817017, "learning_rate": 3.4460994698308512e-06, "loss": 0.0471, "step": 17530 }, { "epoch": 4.428174703357738, "grad_norm": 0.3934885263442993, "learning_rate": 3.4309517798535724e-06, "loss": 0.0645, "step": 17540 }, { "epoch": 4.430699318353951, "grad_norm": 0.5036693215370178, "learning_rate": 3.415804089876294e-06, "loss": 0.0725, "step": 17550 }, { "epoch": 4.433223933350164, "grad_norm": 0.5618218779563904, "learning_rate": 3.400656399899015e-06, "loss": 0.0535, "step": 17560 }, { "epoch": 4.435748548346377, "grad_norm": 0.41341492533683777, "learning_rate": 3.385508709921737e-06, "loss": 0.0559, "step": 17570 }, { "epoch": 4.43827316334259, "grad_norm": 0.5488719940185547, "learning_rate": 3.3703610199444588e-06, "loss": 0.0619, "step": 17580 }, { "epoch": 4.440797778338803, "grad_norm": 0.6855219006538391, "learning_rate": 3.35521332996718e-06, "loss": 0.061, "step": 17590 }, { "epoch": 4.443322393335016, "grad_norm": 0.5198519229888916, "learning_rate": 3.3400656399899016e-06, "loss": 0.0484, "step": 17600 }, { "epoch": 4.44584700833123, "grad_norm": 0.4923730790615082, "learning_rate": 3.324917950012623e-06, "loss": 0.052, "step": 17610 }, { "epoch": 4.448371623327443, "grad_norm": 0.60850590467453, "learning_rate": 3.3097702600353447e-06, "loss": 0.0587, "step": 17620 }, { "epoch": 4.450896238323656, "grad_norm": 0.280381441116333, "learning_rate": 3.2946225700580663e-06, "loss": 0.0626, "step": 17630 }, { "epoch": 4.4534208533198685, "grad_norm": 0.6930276155471802, "learning_rate": 3.2794748800807875e-06, "loss": 0.051, "step": 17640 }, { "epoch": 4.455945468316082, "grad_norm": 0.46625441312789917, "learning_rate": 3.2643271901035095e-06, "loss": 0.0573, "step": 17650 }, { "epoch": 4.458470083312295, "grad_norm": 0.6228309869766235, "learning_rate": 3.249179500126231e-06, "loss": 0.0503, "step": 17660 }, { "epoch": 4.460994698308508, "grad_norm": 0.33836373686790466, "learning_rate": 3.2340318101489523e-06, "loss": 0.0495, "step": 17670 }, { "epoch": 4.463519313304721, "grad_norm": 0.2794092893600464, "learning_rate": 3.218884120171674e-06, "loss": 0.0465, "step": 17680 }, { "epoch": 4.466043928300934, "grad_norm": 0.5994489192962646, "learning_rate": 3.2037364301943955e-06, "loss": 0.0588, "step": 17690 }, { "epoch": 4.468568543297147, "grad_norm": 0.7377051711082458, "learning_rate": 3.188588740217117e-06, "loss": 0.0563, "step": 17700 }, { "epoch": 4.47109315829336, "grad_norm": 0.6417201161384583, "learning_rate": 3.1734410502398387e-06, "loss": 0.0491, "step": 17710 }, { "epoch": 4.473617773289574, "grad_norm": 0.8163862228393555, "learning_rate": 3.15829336026256e-06, "loss": 0.0593, "step": 17720 }, { "epoch": 4.476142388285786, "grad_norm": 0.4559797942638397, "learning_rate": 3.1431456702852814e-06, "loss": 0.0553, "step": 17730 }, { "epoch": 4.478667003281999, "grad_norm": 0.26883384585380554, "learning_rate": 3.127997980308003e-06, "loss": 0.0763, "step": 17740 }, { "epoch": 4.4811916182782126, "grad_norm": 0.5811520218849182, "learning_rate": 3.1128502903307246e-06, "loss": 0.0466, "step": 17750 }, { "epoch": 4.483716233274426, "grad_norm": 0.5414985418319702, "learning_rate": 3.0977026003534462e-06, "loss": 0.0472, "step": 17760 }, { "epoch": 4.486240848270639, "grad_norm": 0.2951160669326782, "learning_rate": 3.0825549103761674e-06, "loss": 0.0573, "step": 17770 }, { "epoch": 4.488765463266851, "grad_norm": 0.5405964851379395, "learning_rate": 3.0674072203988894e-06, "loss": 0.0562, "step": 17780 }, { "epoch": 4.491290078263065, "grad_norm": 0.6366590261459351, "learning_rate": 3.052259530421611e-06, "loss": 0.0531, "step": 17790 }, { "epoch": 4.493814693259278, "grad_norm": 0.4919568598270416, "learning_rate": 3.037111840444332e-06, "loss": 0.0679, "step": 17800 }, { "epoch": 4.496339308255491, "grad_norm": 0.4891291856765747, "learning_rate": 3.0219641504670538e-06, "loss": 0.057, "step": 17810 }, { "epoch": 4.4988639232517045, "grad_norm": 0.5892491340637207, "learning_rate": 3.0068164604897754e-06, "loss": 0.0519, "step": 17820 }, { "epoch": 4.501388538247917, "grad_norm": 0.3350531756877899, "learning_rate": 2.991668770512497e-06, "loss": 0.0572, "step": 17830 }, { "epoch": 4.50391315324413, "grad_norm": 0.4016503095626831, "learning_rate": 2.9765210805352185e-06, "loss": 0.0486, "step": 17840 }, { "epoch": 4.506437768240343, "grad_norm": 0.41794684529304504, "learning_rate": 2.9613733905579397e-06, "loss": 0.0423, "step": 17850 }, { "epoch": 4.508962383236557, "grad_norm": 0.5687925815582275, "learning_rate": 2.9462257005806617e-06, "loss": 0.052, "step": 17860 }, { "epoch": 4.51148699823277, "grad_norm": 0.42348799109458923, "learning_rate": 2.9310780106033833e-06, "loss": 0.0478, "step": 17870 }, { "epoch": 4.514011613228982, "grad_norm": 1.0323283672332764, "learning_rate": 2.9159303206261045e-06, "loss": 0.0659, "step": 17880 }, { "epoch": 4.5165362282251955, "grad_norm": 0.5422408580780029, "learning_rate": 2.900782630648826e-06, "loss": 0.0584, "step": 17890 }, { "epoch": 4.519060843221409, "grad_norm": 0.3516719341278076, "learning_rate": 2.8856349406715477e-06, "loss": 0.0651, "step": 17900 }, { "epoch": 4.521585458217622, "grad_norm": 0.5614657402038574, "learning_rate": 2.8704872506942693e-06, "loss": 0.0513, "step": 17910 }, { "epoch": 4.524110073213835, "grad_norm": 0.43666282296180725, "learning_rate": 2.855339560716991e-06, "loss": 0.0607, "step": 17920 }, { "epoch": 4.526634688210048, "grad_norm": 0.7072877883911133, "learning_rate": 2.840191870739712e-06, "loss": 0.0625, "step": 17930 }, { "epoch": 4.529159303206261, "grad_norm": 0.5805211663246155, "learning_rate": 2.825044180762434e-06, "loss": 0.0541, "step": 17940 }, { "epoch": 4.531683918202474, "grad_norm": 0.35863280296325684, "learning_rate": 2.8098964907851552e-06, "loss": 0.0525, "step": 17950 }, { "epoch": 4.534208533198687, "grad_norm": 0.6086267232894897, "learning_rate": 2.794748800807877e-06, "loss": 0.0673, "step": 17960 }, { "epoch": 4.5367331481949, "grad_norm": 0.33207792043685913, "learning_rate": 2.7796011108305984e-06, "loss": 0.0628, "step": 17970 }, { "epoch": 4.539257763191113, "grad_norm": 0.3145606517791748, "learning_rate": 2.76445342085332e-06, "loss": 0.056, "step": 17980 }, { "epoch": 4.541782378187326, "grad_norm": 0.3434918224811554, "learning_rate": 2.7493057308760416e-06, "loss": 0.0569, "step": 17990 }, { "epoch": 4.54430699318354, "grad_norm": 0.6462843418121338, "learning_rate": 2.734158040898763e-06, "loss": 0.0615, "step": 18000 }, { "epoch": 4.546831608179753, "grad_norm": 0.5379069447517395, "learning_rate": 2.7190103509214844e-06, "loss": 0.0618, "step": 18010 }, { "epoch": 4.549356223175966, "grad_norm": 0.36423930525779724, "learning_rate": 2.703862660944206e-06, "loss": 0.0528, "step": 18020 }, { "epoch": 4.5518808381721785, "grad_norm": 0.5888210535049438, "learning_rate": 2.6887149709669276e-06, "loss": 0.0581, "step": 18030 }, { "epoch": 4.554405453168392, "grad_norm": 0.3849516808986664, "learning_rate": 2.673567280989649e-06, "loss": 0.0518, "step": 18040 }, { "epoch": 4.556930068164605, "grad_norm": 0.4130467474460602, "learning_rate": 2.6584195910123708e-06, "loss": 0.0486, "step": 18050 }, { "epoch": 4.559454683160818, "grad_norm": 0.4366299510002136, "learning_rate": 2.643271901035092e-06, "loss": 0.0449, "step": 18060 }, { "epoch": 4.561979298157031, "grad_norm": 0.6537075042724609, "learning_rate": 2.628124211057814e-06, "loss": 0.0607, "step": 18070 }, { "epoch": 4.564503913153244, "grad_norm": 0.633960485458374, "learning_rate": 2.612976521080535e-06, "loss": 0.0599, "step": 18080 }, { "epoch": 4.567028528149457, "grad_norm": 0.3362826406955719, "learning_rate": 2.5978288311032567e-06, "loss": 0.044, "step": 18090 }, { "epoch": 4.56955314314567, "grad_norm": 0.4045182764530182, "learning_rate": 2.5826811411259783e-06, "loss": 0.0486, "step": 18100 }, { "epoch": 4.572077758141884, "grad_norm": 0.4667646884918213, "learning_rate": 2.5675334511487e-06, "loss": 0.05, "step": 18110 }, { "epoch": 4.574602373138097, "grad_norm": 0.24448010325431824, "learning_rate": 2.5523857611714215e-06, "loss": 0.05, "step": 18120 }, { "epoch": 4.577126988134309, "grad_norm": 0.5539112091064453, "learning_rate": 2.537238071194143e-06, "loss": 0.0607, "step": 18130 }, { "epoch": 4.5796516031305226, "grad_norm": 0.6582273244857788, "learning_rate": 2.5220903812168643e-06, "loss": 0.0711, "step": 18140 }, { "epoch": 4.582176218126736, "grad_norm": 0.42286741733551025, "learning_rate": 2.5069426912395863e-06, "loss": 0.0504, "step": 18150 }, { "epoch": 4.584700833122949, "grad_norm": 0.4873209595680237, "learning_rate": 2.4917950012623075e-06, "loss": 0.0522, "step": 18160 }, { "epoch": 4.587225448119161, "grad_norm": 0.4046306014060974, "learning_rate": 2.476647311285029e-06, "loss": 0.0562, "step": 18170 }, { "epoch": 4.589750063115375, "grad_norm": 0.3709363043308258, "learning_rate": 2.4614996213077506e-06, "loss": 0.0635, "step": 18180 }, { "epoch": 4.592274678111588, "grad_norm": 0.2947100102901459, "learning_rate": 2.4463519313304722e-06, "loss": 0.0619, "step": 18190 }, { "epoch": 4.594799293107801, "grad_norm": 0.6312539577484131, "learning_rate": 2.431204241353194e-06, "loss": 0.0455, "step": 18200 }, { "epoch": 4.5973239081040145, "grad_norm": 0.5368882417678833, "learning_rate": 2.4160565513759154e-06, "loss": 0.0613, "step": 18210 }, { "epoch": 4.599848523100227, "grad_norm": 0.760257363319397, "learning_rate": 2.4009088613986366e-06, "loss": 0.0493, "step": 18220 }, { "epoch": 4.60237313809644, "grad_norm": 0.5237419009208679, "learning_rate": 2.3857611714213586e-06, "loss": 0.0602, "step": 18230 }, { "epoch": 4.604897753092653, "grad_norm": 0.6729007959365845, "learning_rate": 2.3706134814440798e-06, "loss": 0.0576, "step": 18240 }, { "epoch": 4.607422368088867, "grad_norm": 0.30550557374954224, "learning_rate": 2.3554657914668014e-06, "loss": 0.0431, "step": 18250 }, { "epoch": 4.60994698308508, "grad_norm": 0.6244345903396606, "learning_rate": 2.340318101489523e-06, "loss": 0.0657, "step": 18260 }, { "epoch": 4.612471598081292, "grad_norm": 0.6878501772880554, "learning_rate": 2.325170411512244e-06, "loss": 0.056, "step": 18270 }, { "epoch": 4.6149962130775055, "grad_norm": 0.40880918502807617, "learning_rate": 2.310022721534966e-06, "loss": 0.0512, "step": 18280 }, { "epoch": 4.617520828073719, "grad_norm": 0.5096030235290527, "learning_rate": 2.2948750315576873e-06, "loss": 0.0471, "step": 18290 }, { "epoch": 4.620045443069932, "grad_norm": 0.3214782178401947, "learning_rate": 2.279727341580409e-06, "loss": 0.0517, "step": 18300 }, { "epoch": 4.622570058066145, "grad_norm": 0.3744233250617981, "learning_rate": 2.2645796516031305e-06, "loss": 0.0517, "step": 18310 }, { "epoch": 4.625094673062358, "grad_norm": 0.45410263538360596, "learning_rate": 2.249431961625852e-06, "loss": 0.0592, "step": 18320 }, { "epoch": 4.627619288058571, "grad_norm": 0.4763607680797577, "learning_rate": 2.2342842716485737e-06, "loss": 0.0547, "step": 18330 }, { "epoch": 4.630143903054784, "grad_norm": 0.36201509833335876, "learning_rate": 2.2191365816712953e-06, "loss": 0.0486, "step": 18340 }, { "epoch": 4.632668518050997, "grad_norm": 0.5421953797340393, "learning_rate": 2.2039888916940165e-06, "loss": 0.0491, "step": 18350 }, { "epoch": 4.63519313304721, "grad_norm": 0.39594170451164246, "learning_rate": 2.1888412017167385e-06, "loss": 0.0479, "step": 18360 }, { "epoch": 4.637717748043423, "grad_norm": 0.6088730692863464, "learning_rate": 2.1736935117394597e-06, "loss": 0.0594, "step": 18370 }, { "epoch": 4.640242363039636, "grad_norm": 0.3005998432636261, "learning_rate": 2.1585458217621813e-06, "loss": 0.0537, "step": 18380 }, { "epoch": 4.64276697803585, "grad_norm": 0.5010228753089905, "learning_rate": 2.143398131784903e-06, "loss": 0.0609, "step": 18390 }, { "epoch": 4.645291593032063, "grad_norm": 0.32724529504776, "learning_rate": 2.1282504418076244e-06, "loss": 0.0639, "step": 18400 }, { "epoch": 4.647816208028276, "grad_norm": 0.6042733192443848, "learning_rate": 2.113102751830346e-06, "loss": 0.0618, "step": 18410 }, { "epoch": 4.6503408230244885, "grad_norm": 0.3925885558128357, "learning_rate": 2.0979550618530672e-06, "loss": 0.0482, "step": 18420 }, { "epoch": 4.652865438020702, "grad_norm": 0.3752647936344147, "learning_rate": 2.082807371875789e-06, "loss": 0.0607, "step": 18430 }, { "epoch": 4.655390053016915, "grad_norm": 0.5499758720397949, "learning_rate": 2.067659681898511e-06, "loss": 0.0629, "step": 18440 }, { "epoch": 4.657914668013128, "grad_norm": 0.46264201402664185, "learning_rate": 2.052511991921232e-06, "loss": 0.0515, "step": 18450 }, { "epoch": 4.660439283009341, "grad_norm": 0.4616946876049042, "learning_rate": 2.0373643019439536e-06, "loss": 0.0563, "step": 18460 }, { "epoch": 4.662963898005554, "grad_norm": 0.5082404017448425, "learning_rate": 2.022216611966675e-06, "loss": 0.0538, "step": 18470 }, { "epoch": 4.665488513001767, "grad_norm": 0.42462125420570374, "learning_rate": 2.0070689219893968e-06, "loss": 0.05, "step": 18480 }, { "epoch": 4.66801312799798, "grad_norm": 0.3045310080051422, "learning_rate": 1.9919212320121184e-06, "loss": 0.054, "step": 18490 }, { "epoch": 4.670537742994194, "grad_norm": 0.4136578142642975, "learning_rate": 1.9767735420348395e-06, "loss": 0.0675, "step": 18500 }, { "epoch": 4.673062357990407, "grad_norm": 0.644835352897644, "learning_rate": 1.961625852057561e-06, "loss": 0.0574, "step": 18510 }, { "epoch": 4.675586972986619, "grad_norm": 0.4754480719566345, "learning_rate": 1.946478162080283e-06, "loss": 0.0496, "step": 18520 }, { "epoch": 4.6781115879828326, "grad_norm": 0.6022984981536865, "learning_rate": 1.9313304721030043e-06, "loss": 0.0586, "step": 18530 }, { "epoch": 4.680636202979046, "grad_norm": 0.5673237442970276, "learning_rate": 1.916182782125726e-06, "loss": 0.0565, "step": 18540 }, { "epoch": 4.683160817975259, "grad_norm": 0.42253974080085754, "learning_rate": 1.9010350921484475e-06, "loss": 0.0555, "step": 18550 }, { "epoch": 4.685685432971471, "grad_norm": 0.1553160846233368, "learning_rate": 1.885887402171169e-06, "loss": 0.0469, "step": 18560 }, { "epoch": 4.688210047967685, "grad_norm": 0.5742431282997131, "learning_rate": 1.8707397121938903e-06, "loss": 0.0466, "step": 18570 }, { "epoch": 4.690734662963898, "grad_norm": 0.6274034380912781, "learning_rate": 1.855592022216612e-06, "loss": 0.0513, "step": 18580 }, { "epoch": 4.693259277960111, "grad_norm": 0.42444777488708496, "learning_rate": 1.8404443322393335e-06, "loss": 0.0516, "step": 18590 }, { "epoch": 4.6957838929563245, "grad_norm": 0.8592566251754761, "learning_rate": 1.825296642262055e-06, "loss": 0.0557, "step": 18600 }, { "epoch": 4.698308507952537, "grad_norm": 0.37311816215515137, "learning_rate": 1.8101489522847765e-06, "loss": 0.0566, "step": 18610 }, { "epoch": 4.70083312294875, "grad_norm": 0.5313109159469604, "learning_rate": 1.7950012623074983e-06, "loss": 0.0535, "step": 18620 }, { "epoch": 4.703357737944963, "grad_norm": 0.5695617198944092, "learning_rate": 1.7798535723302196e-06, "loss": 0.0524, "step": 18630 }, { "epoch": 4.705882352941177, "grad_norm": 0.4023250937461853, "learning_rate": 1.7647058823529412e-06, "loss": 0.0598, "step": 18640 }, { "epoch": 4.70840696793739, "grad_norm": 0.36584198474884033, "learning_rate": 1.7495581923756626e-06, "loss": 0.0547, "step": 18650 }, { "epoch": 4.710931582933602, "grad_norm": 0.6956744194030762, "learning_rate": 1.7344105023983844e-06, "loss": 0.0551, "step": 18660 }, { "epoch": 4.7134561979298155, "grad_norm": 0.5592136979103088, "learning_rate": 1.7192628124211058e-06, "loss": 0.0428, "step": 18670 }, { "epoch": 4.715980812926029, "grad_norm": 0.7112722396850586, "learning_rate": 1.7041151224438274e-06, "loss": 0.0538, "step": 18680 }, { "epoch": 4.718505427922242, "grad_norm": 0.44742482900619507, "learning_rate": 1.6889674324665488e-06, "loss": 0.0583, "step": 18690 }, { "epoch": 4.721030042918455, "grad_norm": 0.7442821264266968, "learning_rate": 1.6738197424892706e-06, "loss": 0.066, "step": 18700 }, { "epoch": 4.723554657914668, "grad_norm": 0.5363455414772034, "learning_rate": 1.658672052511992e-06, "loss": 0.0537, "step": 18710 }, { "epoch": 4.726079272910881, "grad_norm": 0.3161306381225586, "learning_rate": 1.6435243625347136e-06, "loss": 0.0438, "step": 18720 }, { "epoch": 4.728603887907094, "grad_norm": 0.5690962076187134, "learning_rate": 1.628376672557435e-06, "loss": 0.064, "step": 18730 }, { "epoch": 4.731128502903307, "grad_norm": 0.28230124711990356, "learning_rate": 1.6132289825801565e-06, "loss": 0.0576, "step": 18740 }, { "epoch": 4.733653117899521, "grad_norm": 0.3042181134223938, "learning_rate": 1.5980812926028781e-06, "loss": 0.0589, "step": 18750 }, { "epoch": 4.736177732895733, "grad_norm": 0.42241615056991577, "learning_rate": 1.5829336026255997e-06, "loss": 0.0532, "step": 18760 }, { "epoch": 4.738702347891946, "grad_norm": 0.6024519801139832, "learning_rate": 1.5677859126483211e-06, "loss": 0.0586, "step": 18770 }, { "epoch": 4.74122696288816, "grad_norm": 0.36207395792007446, "learning_rate": 1.5526382226710427e-06, "loss": 0.0604, "step": 18780 }, { "epoch": 4.743751577884373, "grad_norm": 0.5053508877754211, "learning_rate": 1.5374905326937643e-06, "loss": 0.0523, "step": 18790 }, { "epoch": 4.746276192880586, "grad_norm": 0.48265892267227173, "learning_rate": 1.522342842716486e-06, "loss": 0.0594, "step": 18800 }, { "epoch": 4.7488008078767985, "grad_norm": 0.5995743274688721, "learning_rate": 1.5071951527392073e-06, "loss": 0.0548, "step": 18810 }, { "epoch": 4.751325422873012, "grad_norm": 0.45999419689178467, "learning_rate": 1.4920474627619289e-06, "loss": 0.0478, "step": 18820 }, { "epoch": 4.753850037869225, "grad_norm": 0.24280232191085815, "learning_rate": 1.4768997727846505e-06, "loss": 0.0485, "step": 18830 }, { "epoch": 4.756374652865438, "grad_norm": 0.41278672218322754, "learning_rate": 1.4617520828073719e-06, "loss": 0.0644, "step": 18840 }, { "epoch": 4.758899267861651, "grad_norm": 0.41428983211517334, "learning_rate": 1.4466043928300934e-06, "loss": 0.0483, "step": 18850 }, { "epoch": 4.761423882857864, "grad_norm": 0.6497278809547424, "learning_rate": 1.4314567028528148e-06, "loss": 0.0596, "step": 18860 }, { "epoch": 4.763948497854077, "grad_norm": 0.49583032727241516, "learning_rate": 1.4163090128755366e-06, "loss": 0.0636, "step": 18870 }, { "epoch": 4.76647311285029, "grad_norm": 0.37153711915016174, "learning_rate": 1.401161322898258e-06, "loss": 0.0581, "step": 18880 }, { "epoch": 4.768997727846504, "grad_norm": 0.14342792332172394, "learning_rate": 1.3860136329209796e-06, "loss": 0.057, "step": 18890 }, { "epoch": 4.771522342842717, "grad_norm": 0.5583682656288147, "learning_rate": 1.370865942943701e-06, "loss": 0.0539, "step": 18900 }, { "epoch": 4.774046957838929, "grad_norm": 0.45823413133621216, "learning_rate": 1.3557182529664226e-06, "loss": 0.052, "step": 18910 }, { "epoch": 4.7765715728351426, "grad_norm": 0.5518306493759155, "learning_rate": 1.3405705629891442e-06, "loss": 0.0494, "step": 18920 }, { "epoch": 4.779096187831356, "grad_norm": 0.3273313343524933, "learning_rate": 1.3254228730118658e-06, "loss": 0.0567, "step": 18930 }, { "epoch": 4.781620802827569, "grad_norm": 0.862960159778595, "learning_rate": 1.3102751830345872e-06, "loss": 0.0562, "step": 18940 }, { "epoch": 4.784145417823781, "grad_norm": 0.590394914150238, "learning_rate": 1.2951274930573088e-06, "loss": 0.0538, "step": 18950 }, { "epoch": 4.786670032819995, "grad_norm": 0.2639109194278717, "learning_rate": 1.2799798030800304e-06, "loss": 0.0514, "step": 18960 }, { "epoch": 4.789194647816208, "grad_norm": 0.4883747100830078, "learning_rate": 1.264832113102752e-06, "loss": 0.0621, "step": 18970 }, { "epoch": 4.791719262812421, "grad_norm": 0.6707940697669983, "learning_rate": 1.2496844231254733e-06, "loss": 0.0583, "step": 18980 }, { "epoch": 4.7942438778086345, "grad_norm": 0.4944520890712738, "learning_rate": 1.234536733148195e-06, "loss": 0.0536, "step": 18990 }, { "epoch": 4.796768492804848, "grad_norm": 0.5789937973022461, "learning_rate": 1.2193890431709165e-06, "loss": 0.0616, "step": 19000 }, { "epoch": 4.79929310780106, "grad_norm": 0.3722054958343506, "learning_rate": 1.2042413531936381e-06, "loss": 0.0358, "step": 19010 }, { "epoch": 4.801817722797273, "grad_norm": 0.5968698859214783, "learning_rate": 1.1890936632163595e-06, "loss": 0.0621, "step": 19020 }, { "epoch": 4.804342337793487, "grad_norm": 0.41492170095443726, "learning_rate": 1.173945973239081e-06, "loss": 0.0455, "step": 19030 }, { "epoch": 4.8068669527897, "grad_norm": 0.5387087464332581, "learning_rate": 1.1587982832618027e-06, "loss": 0.0549, "step": 19040 }, { "epoch": 4.809391567785912, "grad_norm": 0.7498044967651367, "learning_rate": 1.1436505932845243e-06, "loss": 0.056, "step": 19050 }, { "epoch": 4.8119161827821255, "grad_norm": 0.30329465866088867, "learning_rate": 1.1285029033072457e-06, "loss": 0.0619, "step": 19060 }, { "epoch": 4.814440797778339, "grad_norm": 0.6062590479850769, "learning_rate": 1.1133552133299673e-06, "loss": 0.0619, "step": 19070 }, { "epoch": 4.816965412774552, "grad_norm": 0.36201271414756775, "learning_rate": 1.0982075233526886e-06, "loss": 0.0615, "step": 19080 }, { "epoch": 4.819490027770765, "grad_norm": 0.612701416015625, "learning_rate": 1.0830598333754104e-06, "loss": 0.0393, "step": 19090 }, { "epoch": 4.822014642766978, "grad_norm": 0.48930907249450684, "learning_rate": 1.0679121433981318e-06, "loss": 0.0508, "step": 19100 }, { "epoch": 4.824539257763191, "grad_norm": 0.5168036818504333, "learning_rate": 1.0527644534208532e-06, "loss": 0.0613, "step": 19110 }, { "epoch": 4.827063872759404, "grad_norm": 0.7766786217689514, "learning_rate": 1.0376167634435748e-06, "loss": 0.0644, "step": 19120 }, { "epoch": 4.829588487755617, "grad_norm": 0.504666268825531, "learning_rate": 1.0224690734662964e-06, "loss": 0.0494, "step": 19130 }, { "epoch": 4.832113102751831, "grad_norm": 0.5199721455574036, "learning_rate": 1.007321383489018e-06, "loss": 0.0508, "step": 19140 }, { "epoch": 4.834637717748043, "grad_norm": 0.31914207339286804, "learning_rate": 9.921736935117394e-07, "loss": 0.0478, "step": 19150 }, { "epoch": 4.837162332744256, "grad_norm": 0.3759099841117859, "learning_rate": 9.77026003534461e-07, "loss": 0.0457, "step": 19160 }, { "epoch": 4.83968694774047, "grad_norm": 0.5880575180053711, "learning_rate": 9.618783135571826e-07, "loss": 0.0573, "step": 19170 }, { "epoch": 4.842211562736683, "grad_norm": 0.6293249726295471, "learning_rate": 9.467306235799042e-07, "loss": 0.0545, "step": 19180 }, { "epoch": 4.844736177732896, "grad_norm": 0.5560355186462402, "learning_rate": 9.315829336026256e-07, "loss": 0.0524, "step": 19190 }, { "epoch": 4.8472607927291085, "grad_norm": 0.48308777809143066, "learning_rate": 9.164352436253472e-07, "loss": 0.054, "step": 19200 }, { "epoch": 4.849785407725322, "grad_norm": 0.6635215282440186, "learning_rate": 9.012875536480687e-07, "loss": 0.066, "step": 19210 }, { "epoch": 4.852310022721535, "grad_norm": 0.637913703918457, "learning_rate": 8.861398636707903e-07, "loss": 0.0654, "step": 19220 }, { "epoch": 4.854834637717748, "grad_norm": 0.3939951956272125, "learning_rate": 8.709921736935118e-07, "loss": 0.0638, "step": 19230 }, { "epoch": 4.857359252713961, "grad_norm": 0.6704174876213074, "learning_rate": 8.558444837162333e-07, "loss": 0.0572, "step": 19240 }, { "epoch": 4.859883867710174, "grad_norm": 0.7500046491622925, "learning_rate": 8.406967937389548e-07, "loss": 0.0581, "step": 19250 }, { "epoch": 4.862408482706387, "grad_norm": 0.5089032649993896, "learning_rate": 8.255491037616763e-07, "loss": 0.0607, "step": 19260 }, { "epoch": 4.8649330977026, "grad_norm": 0.4917643368244171, "learning_rate": 8.104014137843979e-07, "loss": 0.0484, "step": 19270 }, { "epoch": 4.867457712698814, "grad_norm": 0.4157859683036804, "learning_rate": 7.952537238071194e-07, "loss": 0.0562, "step": 19280 }, { "epoch": 4.869982327695027, "grad_norm": 0.8012222647666931, "learning_rate": 7.80106033829841e-07, "loss": 0.0565, "step": 19290 }, { "epoch": 4.872506942691239, "grad_norm": 0.4449537694454193, "learning_rate": 7.649583438525624e-07, "loss": 0.0579, "step": 19300 }, { "epoch": 4.8750315576874526, "grad_norm": 0.6649788022041321, "learning_rate": 7.49810653875284e-07, "loss": 0.0713, "step": 19310 }, { "epoch": 4.877556172683666, "grad_norm": 0.3748777508735657, "learning_rate": 7.346629638980055e-07, "loss": 0.0552, "step": 19320 }, { "epoch": 4.880080787679879, "grad_norm": 0.6150493621826172, "learning_rate": 7.195152739207271e-07, "loss": 0.0476, "step": 19330 }, { "epoch": 4.882605402676091, "grad_norm": 0.5284005999565125, "learning_rate": 7.043675839434486e-07, "loss": 0.0556, "step": 19340 }, { "epoch": 4.885130017672305, "grad_norm": 0.3671182692050934, "learning_rate": 6.892198939661702e-07, "loss": 0.0674, "step": 19350 }, { "epoch": 4.887654632668518, "grad_norm": 0.6458919644355774, "learning_rate": 6.740722039888917e-07, "loss": 0.0611, "step": 19360 }, { "epoch": 4.890179247664731, "grad_norm": 0.47282344102859497, "learning_rate": 6.589245140116133e-07, "loss": 0.0537, "step": 19370 }, { "epoch": 4.8927038626609445, "grad_norm": 0.8219795823097229, "learning_rate": 6.437768240343348e-07, "loss": 0.0607, "step": 19380 }, { "epoch": 4.895228477657158, "grad_norm": 0.7187753319740295, "learning_rate": 6.286291340570563e-07, "loss": 0.0509, "step": 19390 }, { "epoch": 4.89775309265337, "grad_norm": 0.3172253668308258, "learning_rate": 6.134814440797779e-07, "loss": 0.0483, "step": 19400 }, { "epoch": 4.900277707649583, "grad_norm": 0.5177561044692993, "learning_rate": 5.983337541024993e-07, "loss": 0.056, "step": 19410 }, { "epoch": 4.902802322645797, "grad_norm": 0.27060073614120483, "learning_rate": 5.831860641252209e-07, "loss": 0.0564, "step": 19420 }, { "epoch": 4.90532693764201, "grad_norm": 0.5498465895652771, "learning_rate": 5.680383741479424e-07, "loss": 0.0472, "step": 19430 }, { "epoch": 4.907851552638222, "grad_norm": 0.48039332032203674, "learning_rate": 5.52890684170664e-07, "loss": 0.0469, "step": 19440 }, { "epoch": 4.9103761676344355, "grad_norm": 0.27178895473480225, "learning_rate": 5.377429941933855e-07, "loss": 0.0609, "step": 19450 }, { "epoch": 4.912900782630649, "grad_norm": 0.6592223048210144, "learning_rate": 5.225953042161071e-07, "loss": 0.0593, "step": 19460 }, { "epoch": 4.915425397626862, "grad_norm": 0.547122061252594, "learning_rate": 5.074476142388286e-07, "loss": 0.0509, "step": 19470 }, { "epoch": 4.917950012623075, "grad_norm": 0.44897332787513733, "learning_rate": 4.922999242615502e-07, "loss": 0.0456, "step": 19480 }, { "epoch": 4.9204746276192886, "grad_norm": 0.6687337756156921, "learning_rate": 4.771522342842717e-07, "loss": 0.0444, "step": 19490 }, { "epoch": 4.922999242615501, "grad_norm": 0.4966764450073242, "learning_rate": 4.6200454430699317e-07, "loss": 0.0505, "step": 19500 }, { "epoch": 4.925523857611714, "grad_norm": 0.7039111256599426, "learning_rate": 4.468568543297147e-07, "loss": 0.0537, "step": 19510 }, { "epoch": 4.928048472607927, "grad_norm": 0.5682207345962524, "learning_rate": 4.3170916435243625e-07, "loss": 0.0457, "step": 19520 }, { "epoch": 4.930573087604141, "grad_norm": 0.3622889220714569, "learning_rate": 4.165614743751578e-07, "loss": 0.0466, "step": 19530 }, { "epoch": 4.933097702600353, "grad_norm": 0.38879507780075073, "learning_rate": 4.0141378439787934e-07, "loss": 0.0453, "step": 19540 }, { "epoch": 4.935622317596566, "grad_norm": 0.185908704996109, "learning_rate": 3.862660944206009e-07, "loss": 0.041, "step": 19550 }, { "epoch": 4.93814693259278, "grad_norm": 0.4912923276424408, "learning_rate": 3.711184044433224e-07, "loss": 0.0568, "step": 19560 }, { "epoch": 4.940671547588993, "grad_norm": 0.22671930491924286, "learning_rate": 3.5597071446604396e-07, "loss": 0.055, "step": 19570 }, { "epoch": 4.943196162585206, "grad_norm": 0.43126681447029114, "learning_rate": 3.408230244887655e-07, "loss": 0.0638, "step": 19580 }, { "epoch": 4.9457207775814185, "grad_norm": 0.42626145482063293, "learning_rate": 3.2567533451148704e-07, "loss": 0.0517, "step": 19590 }, { "epoch": 4.948245392577632, "grad_norm": 0.6885790228843689, "learning_rate": 3.1052764453420853e-07, "loss": 0.0595, "step": 19600 }, { "epoch": 4.950770007573845, "grad_norm": 0.5854836702346802, "learning_rate": 2.9537995455693007e-07, "loss": 0.0653, "step": 19610 }, { "epoch": 4.953294622570058, "grad_norm": 0.4316968321800232, "learning_rate": 2.802322645796516e-07, "loss": 0.0575, "step": 19620 }, { "epoch": 4.9558192375662715, "grad_norm": 0.31033676862716675, "learning_rate": 2.6508457460237316e-07, "loss": 0.0416, "step": 19630 }, { "epoch": 4.958343852562484, "grad_norm": 0.37612465023994446, "learning_rate": 2.4993688462509464e-07, "loss": 0.0531, "step": 19640 }, { "epoch": 4.960868467558697, "grad_norm": 0.5763729810714722, "learning_rate": 2.3478919464781619e-07, "loss": 0.0602, "step": 19650 }, { "epoch": 4.96339308255491, "grad_norm": 0.8366661071777344, "learning_rate": 2.1964150467053775e-07, "loss": 0.0678, "step": 19660 }, { "epoch": 4.965917697551124, "grad_norm": 0.517417848110199, "learning_rate": 2.044938146932593e-07, "loss": 0.0524, "step": 19670 }, { "epoch": 4.968442312547337, "grad_norm": 0.6865304708480835, "learning_rate": 1.893461247159808e-07, "loss": 0.0574, "step": 19680 }, { "epoch": 4.970966927543549, "grad_norm": 0.3418879508972168, "learning_rate": 1.7419843473870235e-07, "loss": 0.0531, "step": 19690 }, { "epoch": 4.9734915425397626, "grad_norm": 1.0368373394012451, "learning_rate": 1.5905074476142387e-07, "loss": 0.0613, "step": 19700 }, { "epoch": 4.976016157535976, "grad_norm": 0.6324265599250793, "learning_rate": 1.439030547841454e-07, "loss": 0.0472, "step": 19710 }, { "epoch": 4.978540772532189, "grad_norm": 0.44766560196876526, "learning_rate": 1.2875536480686695e-07, "loss": 0.0465, "step": 19720 }, { "epoch": 4.9810653875284014, "grad_norm": 0.1590045988559723, "learning_rate": 1.1360767482958849e-07, "loss": 0.0439, "step": 19730 }, { "epoch": 4.983590002524615, "grad_norm": 0.5014932155609131, "learning_rate": 9.845998485231003e-08, "loss": 0.052, "step": 19740 }, { "epoch": 4.986114617520828, "grad_norm": 0.703318178653717, "learning_rate": 8.331229487503156e-08, "loss": 0.0569, "step": 19750 }, { "epoch": 4.988639232517041, "grad_norm": 0.5787055492401123, "learning_rate": 6.816460489775309e-08, "loss": 0.0561, "step": 19760 }, { "epoch": 4.9911638475132545, "grad_norm": 0.5603996515274048, "learning_rate": 5.301691492047463e-08, "loss": 0.0532, "step": 19770 }, { "epoch": 4.993688462509468, "grad_norm": 0.7136797904968262, "learning_rate": 3.786922494319616e-08, "loss": 0.0509, "step": 19780 }, { "epoch": 4.99621307750568, "grad_norm": 0.5492009520530701, "learning_rate": 2.2721534965917698e-08, "loss": 0.0573, "step": 19790 }, { "epoch": 4.998737692501893, "grad_norm": 0.364566445350647, "learning_rate": 7.573844988639233e-09, "loss": 0.0622, "step": 19800 }, { "epoch": 5.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.04460228607058525, "eval_runtime": 901.9248, "eval_samples_per_second": 228.691, "eval_steps_per_second": 3.573, "step": 19805 }, { "epoch": 5.0, "step": 19805, "total_flos": 9.820471825285631e+19, "train_loss": 0.06684475449172272, "train_runtime": 18247.0343, "train_samples_per_second": 69.452, "train_steps_per_second": 1.085 } ], "logging_steps": 10, "max_steps": 19805, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.820471825285631e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }