diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.04224640876054764, - "best_model_checkpoint": "./testVal_default_model/checkpoint-15844", + "best_metric": 0.04290741682052612, + "best_model_checkpoint": "./test_default_model/checkpoint-19805", "epoch": 5.0, "eval_steps": 500, "global_step": 19805, @@ -9,13918 +9,13927 @@ "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0025246149962130774, - "grad_norm": 2.1267971992492676, - "learning_rate": 2.9984852310022722e-05, - "loss": 0.3068, + "epoch": 0.00297000297000297, + "grad_norm": 1.7466098070144653, + "learning_rate": 2.9982179982179983e-05, + "loss": 0.2849, "step": 10 }, { - "epoch": 0.005049229992426155, - "grad_norm": 0.8344613909721375, - "learning_rate": 2.9969704620045444e-05, - "loss": 0.1692, + "epoch": 0.00594000594000594, + "grad_norm": 1.1830675601959229, + "learning_rate": 2.9964359964359965e-05, + "loss": 0.165, "step": 20 }, { - "epoch": 0.007573844988639233, - "grad_norm": 1.6312780380249023, - "learning_rate": 2.9954556930068166e-05, - "loss": 0.1614, + "epoch": 0.00891000891000891, + "grad_norm": 1.1286729574203491, + "learning_rate": 2.9946539946539947e-05, + "loss": 0.1477, "step": 30 }, { - "epoch": 0.01009845998485231, - "grad_norm": 2.278165102005005, - "learning_rate": 2.9939409240090887e-05, - "loss": 0.1488, + "epoch": 0.01188001188001188, + "grad_norm": 1.1905314922332764, + "learning_rate": 2.992871992871993e-05, + "loss": 0.1291, "step": 40 }, { - "epoch": 0.012623074981065387, - "grad_norm": 0.7996057868003845, - "learning_rate": 2.992426155011361e-05, - "loss": 0.1342, + "epoch": 0.01485001485001485, + "grad_norm": 1.1131917238235474, + "learning_rate": 2.991089991089991e-05, + "loss": 0.1261, "step": 50 }, { - "epoch": 0.015147689977278465, - "grad_norm": 0.9300896525382996, - "learning_rate": 2.990911386013633e-05, - "loss": 0.1341, + "epoch": 0.01782001782001782, + "grad_norm": 1.267146348953247, + "learning_rate": 2.9893079893079894e-05, + "loss": 0.1202, "step": 60 }, { - "epoch": 0.017672304973491544, - "grad_norm": 1.3064889907836914, - "learning_rate": 2.9893966170159052e-05, - "loss": 0.1469, + "epoch": 0.02079002079002079, + "grad_norm": 1.0151565074920654, + "learning_rate": 2.9875259875259876e-05, + "loss": 0.1209, "step": 70 }, { - "epoch": 0.02019691996970462, - "grad_norm": 0.842812180519104, - "learning_rate": 2.9878818480181774e-05, - "loss": 0.1306, + "epoch": 0.02376002376002376, + "grad_norm": 0.8143087029457092, + "learning_rate": 2.9857439857439858e-05, + "loss": 0.1142, "step": 80 }, { - "epoch": 0.022721534965917698, - "grad_norm": 1.0661952495574951, - "learning_rate": 2.9863670790204495e-05, - "loss": 0.1243, + "epoch": 0.02673002673002673, + "grad_norm": 1.4103150367736816, + "learning_rate": 2.983961983961984e-05, + "loss": 0.1123, "step": 90 }, { - "epoch": 0.025246149962130773, - "grad_norm": 0.8375621438026428, - "learning_rate": 2.9848523100227217e-05, - "loss": 0.1137, + "epoch": 0.0297000297000297, + "grad_norm": 1.6572177410125732, + "learning_rate": 2.9821799821799822e-05, + "loss": 0.1077, "step": 100 }, { - "epoch": 0.027770764958343852, - "grad_norm": 0.6965815424919128, - "learning_rate": 2.9833375410249938e-05, - "loss": 0.097, + "epoch": 0.03267003267003267, + "grad_norm": 0.7510947585105896, + "learning_rate": 2.9803979803979805e-05, + "loss": 0.1209, "step": 110 }, { - "epoch": 0.03029537995455693, - "grad_norm": 1.5321495532989502, - "learning_rate": 2.981822772027266e-05, - "loss": 0.1218, + "epoch": 0.03564003564003564, + "grad_norm": 1.0185679197311401, + "learning_rate": 2.9786159786159787e-05, + "loss": 0.1073, "step": 120 }, { - "epoch": 0.03281999495077001, - "grad_norm": 0.7771267890930176, - "learning_rate": 2.980308003029538e-05, - "loss": 0.1044, + "epoch": 0.03861003861003861, + "grad_norm": 1.214509129524231, + "learning_rate": 2.976833976833977e-05, + "loss": 0.1243, "step": 130 }, { - "epoch": 0.03534460994698309, - "grad_norm": 1.137905478477478, - "learning_rate": 2.9787932340318103e-05, - "loss": 0.1219, + "epoch": 0.04158004158004158, + "grad_norm": 0.7015527486801147, + "learning_rate": 2.975051975051975e-05, + "loss": 0.1073, "step": 140 }, { - "epoch": 0.03786922494319616, - "grad_norm": 0.6131516098976135, - "learning_rate": 2.9772784650340825e-05, - "loss": 0.1163, + "epoch": 0.04455004455004455, + "grad_norm": 0.7993521690368652, + "learning_rate": 2.9732699732699733e-05, + "loss": 0.1162, "step": 150 }, { - "epoch": 0.04039383993940924, - "grad_norm": 0.6713071465492249, - "learning_rate": 2.9757636960363543e-05, - "loss": 0.1046, + "epoch": 0.04752004752004752, + "grad_norm": 1.4627108573913574, + "learning_rate": 2.9714879714879715e-05, + "loss": 0.1094, "step": 160 }, { - "epoch": 0.04291845493562232, - "grad_norm": 0.7513856887817383, - "learning_rate": 2.9742489270386268e-05, - "loss": 0.1186, + "epoch": 0.05049005049005049, + "grad_norm": 0.5512596368789673, + "learning_rate": 2.9697059697059698e-05, + "loss": 0.1105, "step": 170 }, { - "epoch": 0.045443069931835396, - "grad_norm": 0.9550772905349731, - "learning_rate": 2.972734158040899e-05, - "loss": 0.1053, + "epoch": 0.05346005346005346, + "grad_norm": 0.6437152028083801, + "learning_rate": 2.967923967923968e-05, + "loss": 0.1165, "step": 180 }, { - "epoch": 0.047967684928048475, - "grad_norm": 1.595443606376648, - "learning_rate": 2.9712193890431708e-05, - "loss": 0.1229, + "epoch": 0.05643005643005643, + "grad_norm": 0.9050448536872864, + "learning_rate": 2.9661419661419662e-05, + "loss": 0.1158, "step": 190 }, { - "epoch": 0.05049229992426155, - "grad_norm": 0.4370063543319702, - "learning_rate": 2.9697046200454433e-05, - "loss": 0.1324, + "epoch": 0.0594000594000594, + "grad_norm": 0.7845998406410217, + "learning_rate": 2.9643599643599644e-05, + "loss": 0.1046, "step": 200 }, { - "epoch": 0.053016914920474625, - "grad_norm": 1.201798439025879, - "learning_rate": 2.9681898510477154e-05, - "loss": 0.1222, + "epoch": 0.062370062370062374, + "grad_norm": 1.2148687839508057, + "learning_rate": 2.9625779625779626e-05, + "loss": 0.1048, "step": 210 }, { - "epoch": 0.055541529916687704, - "grad_norm": 0.6518549919128418, - "learning_rate": 2.9666750820499872e-05, - "loss": 0.1039, + "epoch": 0.06534006534006534, + "grad_norm": 0.5540338754653931, + "learning_rate": 2.960795960795961e-05, + "loss": 0.0989, "step": 220 }, { - "epoch": 0.05806614491290078, - "grad_norm": 0.9148812294006348, - "learning_rate": 2.9651603130522597e-05, - "loss": 0.1213, + "epoch": 0.0683100683100683, + "grad_norm": 1.147627830505371, + "learning_rate": 2.959013959013959e-05, + "loss": 0.0948, "step": 230 }, { - "epoch": 0.06059075990911386, - "grad_norm": 0.9625220894813538, - "learning_rate": 2.963645544054532e-05, - "loss": 0.0982, + "epoch": 0.07128007128007129, + "grad_norm": 0.6427733898162842, + "learning_rate": 2.9572319572319573e-05, + "loss": 0.1132, "step": 240 }, { - "epoch": 0.06311537490532694, - "grad_norm": 1.0822298526763916, - "learning_rate": 2.9621307750568037e-05, - "loss": 0.1058, + "epoch": 0.07425007425007425, + "grad_norm": 0.48930391669273376, + "learning_rate": 2.9554499554499555e-05, + "loss": 0.1077, "step": 250 }, { - "epoch": 0.06563998990154002, - "grad_norm": 0.5486002564430237, - "learning_rate": 2.9606160060590762e-05, - "loss": 0.1103, + "epoch": 0.07722007722007722, + "grad_norm": 0.5915262699127197, + "learning_rate": 2.9536679536679537e-05, + "loss": 0.0988, "step": 260 }, { - "epoch": 0.0681646048977531, - "grad_norm": 0.6268118619918823, - "learning_rate": 2.9591012370613484e-05, - "loss": 0.1082, + "epoch": 0.08019008019008018, + "grad_norm": 0.9437302350997925, + "learning_rate": 2.951885951885952e-05, + "loss": 0.0943, "step": 270 }, { - "epoch": 0.07068921989396618, - "grad_norm": 0.950860321521759, - "learning_rate": 2.9575864680636202e-05, - "loss": 0.1153, + "epoch": 0.08316008316008316, + "grad_norm": 0.8880864381790161, + "learning_rate": 2.95010395010395e-05, + "loss": 0.0874, "step": 280 }, { - "epoch": 0.07321383489017924, - "grad_norm": 0.5327135324478149, - "learning_rate": 2.9560716990658924e-05, - "loss": 0.1022, + "epoch": 0.08613008613008613, + "grad_norm": 0.5144450068473816, + "learning_rate": 2.9483219483219484e-05, + "loss": 0.1055, "step": 290 }, { - "epoch": 0.07573844988639232, - "grad_norm": 0.5201209187507629, - "learning_rate": 2.954556930068165e-05, - "loss": 0.1222, + "epoch": 0.0891000891000891, + "grad_norm": 0.718809187412262, + "learning_rate": 2.9465399465399466e-05, + "loss": 0.1052, "step": 300 }, { - "epoch": 0.0782630648826054, - "grad_norm": 0.5655984282493591, - "learning_rate": 2.9530421610704367e-05, - "loss": 0.0962, + "epoch": 0.09207009207009206, + "grad_norm": 1.3571438789367676, + "learning_rate": 2.9447579447579448e-05, + "loss": 0.1063, "step": 310 }, { - "epoch": 0.08078767987881848, - "grad_norm": 0.48904240131378174, - "learning_rate": 2.951527392072709e-05, - "loss": 0.0881, + "epoch": 0.09504009504009504, + "grad_norm": 1.6258764266967773, + "learning_rate": 2.942975942975943e-05, + "loss": 0.1162, "step": 320 }, { - "epoch": 0.08331229487503156, - "grad_norm": 0.8683993220329285, - "learning_rate": 2.9500126230749813e-05, - "loss": 0.1149, + "epoch": 0.09801009801009801, + "grad_norm": 0.46299999952316284, + "learning_rate": 2.9411939411939412e-05, + "loss": 0.0883, "step": 330 }, { - "epoch": 0.08583690987124463, - "grad_norm": 0.8078688383102417, - "learning_rate": 2.948497854077253e-05, - "loss": 0.0921, + "epoch": 0.10098010098010098, + "grad_norm": 0.6494696140289307, + "learning_rate": 2.9394119394119395e-05, + "loss": 0.112, "step": 340 }, { - "epoch": 0.08836152486745771, - "grad_norm": 2.072779893875122, - "learning_rate": 2.9469830850795253e-05, - "loss": 0.0912, + "epoch": 0.10395010395010396, + "grad_norm": 0.6922862529754639, + "learning_rate": 2.9376299376299377e-05, + "loss": 0.099, "step": 350 }, { - "epoch": 0.09088613986367079, - "grad_norm": 0.41440829634666443, - "learning_rate": 2.9454683160817978e-05, - "loss": 0.094, + "epoch": 0.10692010692010692, + "grad_norm": 0.6345096230506897, + "learning_rate": 2.935847935847936e-05, + "loss": 0.0879, "step": 360 }, { - "epoch": 0.09341075485988387, - "grad_norm": 0.5622707605361938, - "learning_rate": 2.9439535470840696e-05, - "loss": 0.098, + "epoch": 0.10989010989010989, + "grad_norm": 1.3332024812698364, + "learning_rate": 2.934065934065934e-05, + "loss": 0.1001, "step": 370 }, { - "epoch": 0.09593536985609695, - "grad_norm": 0.9337944388389587, - "learning_rate": 2.9424387780863418e-05, - "loss": 0.0998, + "epoch": 0.11286011286011285, + "grad_norm": 1.0127153396606445, + "learning_rate": 2.9322839322839323e-05, + "loss": 0.0997, "step": 380 }, { - "epoch": 0.09845998485231003, - "grad_norm": 0.6288129091262817, - "learning_rate": 2.9409240090886143e-05, - "loss": 0.1113, + "epoch": 0.11583011583011583, + "grad_norm": 0.48219984769821167, + "learning_rate": 2.930501930501931e-05, + "loss": 0.0875, "step": 390 }, { - "epoch": 0.1009845998485231, - "grad_norm": 0.5012751817703247, - "learning_rate": 2.939409240090886e-05, - "loss": 0.1035, + "epoch": 0.1188001188001188, + "grad_norm": 0.8579444289207458, + "learning_rate": 2.9287199287199288e-05, + "loss": 0.1045, "step": 400 }, { - "epoch": 0.10350921484473617, - "grad_norm": 0.5585261583328247, - "learning_rate": 2.9378944710931583e-05, - "loss": 0.1073, + "epoch": 0.12177012177012177, + "grad_norm": 0.5488039255142212, + "learning_rate": 2.926937926937927e-05, + "loss": 0.1084, "step": 410 }, { - "epoch": 0.10603382984094925, - "grad_norm": 0.8108246922492981, - "learning_rate": 2.9363797020954308e-05, - "loss": 0.0965, + "epoch": 0.12474012474012475, + "grad_norm": 1.2597718238830566, + "learning_rate": 2.9251559251559252e-05, + "loss": 0.0853, "step": 420 }, { - "epoch": 0.10855844483716233, - "grad_norm": 0.9611131548881531, - "learning_rate": 2.9348649330977026e-05, - "loss": 0.1109, + "epoch": 0.1277101277101277, + "grad_norm": 1.077631950378418, + "learning_rate": 2.9233739233739234e-05, + "loss": 0.0962, "step": 430 }, { - "epoch": 0.11108305983337541, - "grad_norm": 0.6837782859802246, - "learning_rate": 2.9333501640999748e-05, - "loss": 0.1084, + "epoch": 0.13068013068013068, + "grad_norm": 0.5581513047218323, + "learning_rate": 2.9215919215919216e-05, + "loss": 0.0922, "step": 440 }, { - "epoch": 0.11360767482958849, - "grad_norm": 1.0511689186096191, - "learning_rate": 2.931835395102247e-05, - "loss": 0.0979, + "epoch": 0.13365013365013365, + "grad_norm": 0.6805756092071533, + "learning_rate": 2.91980991980992e-05, + "loss": 0.1083, "step": 450 }, { - "epoch": 0.11613228982580157, - "grad_norm": 1.4842995405197144, - "learning_rate": 2.930320626104519e-05, - "loss": 0.0915, + "epoch": 0.1366201366201366, + "grad_norm": 0.860261857509613, + "learning_rate": 2.9180279180279184e-05, + "loss": 0.0852, "step": 460 }, { - "epoch": 0.11865690482201464, - "grad_norm": 0.948907732963562, - "learning_rate": 2.9288058571067912e-05, - "loss": 0.0774, + "epoch": 0.13959013959013958, + "grad_norm": 1.9232168197631836, + "learning_rate": 2.9162459162459163e-05, + "loss": 0.0933, "step": 470 }, { - "epoch": 0.12118151981822772, - "grad_norm": 0.8867286443710327, - "learning_rate": 2.9272910881090634e-05, - "loss": 0.0858, + "epoch": 0.14256014256014257, + "grad_norm": 0.8232311606407166, + "learning_rate": 2.9144639144639145e-05, + "loss": 0.0928, "step": 480 }, { - "epoch": 0.1237061348144408, - "grad_norm": 0.5692402720451355, - "learning_rate": 2.9257763191113356e-05, - "loss": 0.0939, + "epoch": 0.14553014553014554, + "grad_norm": 0.8007870316505432, + "learning_rate": 2.9126819126819127e-05, + "loss": 0.0906, "step": 490 }, { - "epoch": 0.12623074981065388, - "grad_norm": 0.4716489613056183, - "learning_rate": 2.9242615501136077e-05, - "loss": 0.108, + "epoch": 0.1485001485001485, + "grad_norm": 1.1848207712173462, + "learning_rate": 2.910899910899911e-05, + "loss": 0.1052, "step": 500 }, { - "epoch": 0.12875536480686695, - "grad_norm": 0.4867834448814392, - "learning_rate": 2.92274678111588e-05, - "loss": 0.0858, + "epoch": 0.15147015147015147, + "grad_norm": 0.5605499744415283, + "learning_rate": 2.909117909117909e-05, + "loss": 0.093, "step": 510 }, { - "epoch": 0.13127997980308004, - "grad_norm": 0.4366174638271332, - "learning_rate": 2.921232012118152e-05, - "loss": 0.0695, + "epoch": 0.15444015444015444, + "grad_norm": 0.6382190585136414, + "learning_rate": 2.9073359073359074e-05, + "loss": 0.0997, "step": 520 }, { - "epoch": 0.1338045947992931, - "grad_norm": 1.3339869976043701, - "learning_rate": 2.9197172431204242e-05, - "loss": 0.0955, + "epoch": 0.1574101574101574, + "grad_norm": 0.5192627310752869, + "learning_rate": 2.905553905553906e-05, + "loss": 0.1098, "step": 530 }, { - "epoch": 0.1363292097955062, - "grad_norm": 0.9120462536811829, - "learning_rate": 2.9182024741226963e-05, - "loss": 0.103, + "epoch": 0.16038016038016037, + "grad_norm": 0.5898168683052063, + "learning_rate": 2.9037719037719038e-05, + "loss": 0.0951, "step": 540 }, { - "epoch": 0.13885382479171926, - "grad_norm": 0.3417504131793976, - "learning_rate": 2.9166877051249685e-05, - "loss": 0.0878, + "epoch": 0.16335016335016336, + "grad_norm": 0.465077668428421, + "learning_rate": 2.901989901989902e-05, + "loss": 0.0929, "step": 550 }, { - "epoch": 0.14137843978793235, - "grad_norm": 0.5997135043144226, - "learning_rate": 2.9151729361272407e-05, - "loss": 0.0944, + "epoch": 0.16632016632016633, + "grad_norm": 0.6358753442764282, + "learning_rate": 2.9002079002079002e-05, + "loss": 0.0999, "step": 560 }, { - "epoch": 0.14390305478414542, - "grad_norm": 0.7205860018730164, - "learning_rate": 2.9136581671295128e-05, - "loss": 0.0877, + "epoch": 0.1692901692901693, + "grad_norm": 0.7714558839797974, + "learning_rate": 2.8984258984258984e-05, + "loss": 0.1031, "step": 570 }, { - "epoch": 0.14642766978035848, - "grad_norm": 0.5023823976516724, - "learning_rate": 2.912143398131785e-05, - "loss": 0.091, + "epoch": 0.17226017226017226, + "grad_norm": 0.865616500377655, + "learning_rate": 2.8966438966438967e-05, + "loss": 0.0932, "step": 580 }, { - "epoch": 0.14895228477657158, - "grad_norm": 0.6862772703170776, - "learning_rate": 2.910628629134057e-05, - "loss": 0.0964, + "epoch": 0.17523017523017523, + "grad_norm": 0.621036171913147, + "learning_rate": 2.894861894861895e-05, + "loss": 0.094, "step": 590 }, { - "epoch": 0.15147689977278464, - "grad_norm": 0.7713685035705566, - "learning_rate": 2.9091138601363293e-05, - "loss": 0.0962, + "epoch": 0.1782001782001782, + "grad_norm": 0.5007760524749756, + "learning_rate": 2.8930798930798934e-05, + "loss": 0.099, "step": 600 }, { - "epoch": 0.15400151476899773, - "grad_norm": 0.45930472016334534, - "learning_rate": 2.9075990911386015e-05, - "loss": 0.1, + "epoch": 0.18117018117018116, + "grad_norm": 0.47733157873153687, + "learning_rate": 2.8912978912978913e-05, + "loss": 0.0795, "step": 610 }, { - "epoch": 0.1565261297652108, - "grad_norm": 0.6255579590797424, - "learning_rate": 2.9060843221408736e-05, - "loss": 0.0957, + "epoch": 0.18414018414018413, + "grad_norm": 0.40642765164375305, + "learning_rate": 2.8895158895158895e-05, + "loss": 0.0829, "step": 620 }, { - "epoch": 0.1590507447614239, - "grad_norm": 0.7254714369773865, - "learning_rate": 2.9045695531431458e-05, - "loss": 0.1, + "epoch": 0.18711018711018712, + "grad_norm": 1.1361258029937744, + "learning_rate": 2.8877338877338877e-05, + "loss": 0.0893, "step": 630 }, { - "epoch": 0.16157535975763695, - "grad_norm": 0.9384628534317017, - "learning_rate": 2.903054784145418e-05, - "loss": 0.0923, + "epoch": 0.1900801900801901, + "grad_norm": 0.7784861922264099, + "learning_rate": 2.885951885951886e-05, + "loss": 0.0888, "step": 640 }, { - "epoch": 0.16409997475385005, - "grad_norm": 0.5457489490509033, - "learning_rate": 2.90154001514769e-05, - "loss": 0.1059, + "epoch": 0.19305019305019305, + "grad_norm": 0.43066325783729553, + "learning_rate": 2.8841698841698842e-05, + "loss": 0.0966, "step": 650 }, { - "epoch": 0.1666245897500631, - "grad_norm": 0.5641859769821167, - "learning_rate": 2.9000252461499623e-05, - "loss": 0.0952, + "epoch": 0.19602019602019602, + "grad_norm": 0.36752209067344666, + "learning_rate": 2.8823878823878824e-05, + "loss": 0.0999, "step": 660 }, { - "epoch": 0.1691492047462762, - "grad_norm": 0.5558760166168213, - "learning_rate": 2.8985104771522344e-05, - "loss": 0.1043, + "epoch": 0.19899019899019899, + "grad_norm": 0.9712108969688416, + "learning_rate": 2.880605880605881e-05, + "loss": 0.0906, "step": 670 }, { - "epoch": 0.17167381974248927, - "grad_norm": 0.6601101756095886, - "learning_rate": 2.8969957081545066e-05, - "loss": 0.0875, + "epoch": 0.20196020196020195, + "grad_norm": 0.714443564414978, + "learning_rate": 2.878823878823879e-05, + "loss": 0.1049, "step": 680 }, { - "epoch": 0.17419843473870233, - "grad_norm": 0.6963163614273071, - "learning_rate": 2.8954809391567787e-05, - "loss": 0.0937, + "epoch": 0.20493020493020492, + "grad_norm": 0.3934662640094757, + "learning_rate": 2.877041877041877e-05, + "loss": 0.09, "step": 690 }, { - "epoch": 0.17672304973491543, - "grad_norm": 1.265081524848938, - "learning_rate": 2.893966170159051e-05, - "loss": 0.0816, + "epoch": 0.2079002079002079, + "grad_norm": 1.9262911081314087, + "learning_rate": 2.8752598752598753e-05, + "loss": 0.1051, "step": 700 }, { - "epoch": 0.1792476647311285, - "grad_norm": 0.5968114137649536, - "learning_rate": 2.892451401161323e-05, - "loss": 0.0842, + "epoch": 0.21087021087021088, + "grad_norm": 0.6336867809295654, + "learning_rate": 2.8734778734778735e-05, + "loss": 0.0852, "step": 710 }, { - "epoch": 0.18177227972734158, - "grad_norm": 0.9470769762992859, - "learning_rate": 2.8909366321635952e-05, - "loss": 0.0863, + "epoch": 0.21384021384021384, + "grad_norm": 0.45155736804008484, + "learning_rate": 2.8716958716958717e-05, + "loss": 0.0928, "step": 720 }, { - "epoch": 0.18429689472355465, - "grad_norm": 0.4141634702682495, - "learning_rate": 2.8894218631658674e-05, - "loss": 0.0825, + "epoch": 0.2168102168102168, + "grad_norm": 0.6008352041244507, + "learning_rate": 2.86991386991387e-05, + "loss": 0.0829, "step": 730 }, { - "epoch": 0.18682150971976774, - "grad_norm": 0.5291838645935059, - "learning_rate": 2.8879070941681392e-05, - "loss": 0.1073, + "epoch": 0.21978021978021978, + "grad_norm": 0.4825937747955322, + "learning_rate": 2.8681318681318685e-05, + "loss": 0.0995, "step": 740 }, { - "epoch": 0.1893461247159808, - "grad_norm": 0.6476059556007385, - "learning_rate": 2.8863923251704117e-05, - "loss": 0.0852, + "epoch": 0.22275022275022274, + "grad_norm": 1.0774333477020264, + "learning_rate": 2.8663498663498664e-05, + "loss": 0.1031, "step": 750 }, { - "epoch": 0.1918707397121939, - "grad_norm": 1.3100022077560425, - "learning_rate": 2.884877556172684e-05, - "loss": 0.0987, + "epoch": 0.2257202257202257, + "grad_norm": 0.7147405743598938, + "learning_rate": 2.8645678645678646e-05, + "loss": 0.1018, "step": 760 }, { - "epoch": 0.19439535470840696, - "grad_norm": 0.4951756000518799, - "learning_rate": 2.8833627871749557e-05, - "loss": 0.0842, + "epoch": 0.2286902286902287, + "grad_norm": 0.6777707934379578, + "learning_rate": 2.8627858627858628e-05, + "loss": 0.0901, "step": 770 }, { - "epoch": 0.19691996970462006, - "grad_norm": 0.9130496978759766, - "learning_rate": 2.8818480181772282e-05, - "loss": 0.0877, + "epoch": 0.23166023166023167, + "grad_norm": 0.4215840697288513, + "learning_rate": 2.861003861003861e-05, + "loss": 0.0862, "step": 780 }, { - "epoch": 0.19944458470083312, - "grad_norm": 1.0746809244155884, - "learning_rate": 2.8803332491795003e-05, - "loss": 0.0784, + "epoch": 0.23463023463023464, + "grad_norm": 0.4555210471153259, + "learning_rate": 2.8592218592218592e-05, + "loss": 0.0825, "step": 790 }, { - "epoch": 0.2019691996970462, - "grad_norm": 0.5604913830757141, - "learning_rate": 2.878818480181772e-05, - "loss": 0.0991, + "epoch": 0.2376002376002376, + "grad_norm": 0.7088650465011597, + "learning_rate": 2.8574398574398574e-05, + "loss": 0.0932, "step": 800 }, { - "epoch": 0.20449381469325928, - "grad_norm": 0.512355625629425, - "learning_rate": 2.8773037111840447e-05, - "loss": 0.0974, + "epoch": 0.24057024057024057, + "grad_norm": 0.6595791578292847, + "learning_rate": 2.855657855657856e-05, + "loss": 0.098, "step": 810 }, { - "epoch": 0.20701842968947234, - "grad_norm": 0.6891873478889465, - "learning_rate": 2.8757889421863168e-05, - "loss": 0.0803, + "epoch": 0.24354024354024353, + "grad_norm": 0.5375499725341797, + "learning_rate": 2.853875853875854e-05, + "loss": 0.0875, "step": 820 }, { - "epoch": 0.20954304468568544, - "grad_norm": 0.4630875289440155, - "learning_rate": 2.8742741731885886e-05, - "loss": 0.0803, + "epoch": 0.2465102465102465, + "grad_norm": 0.4199369549751282, + "learning_rate": 2.852093852093852e-05, + "loss": 0.0807, "step": 830 }, { - "epoch": 0.2120676596818985, - "grad_norm": 0.9669208526611328, - "learning_rate": 2.8727594041908608e-05, - "loss": 0.1017, + "epoch": 0.2494802494802495, + "grad_norm": 0.41728097200393677, + "learning_rate": 2.8503118503118503e-05, + "loss": 0.0925, "step": 840 }, { - "epoch": 0.2145922746781116, - "grad_norm": 0.36673077940940857, - "learning_rate": 2.8712446351931333e-05, - "loss": 0.0864, + "epoch": 0.25245025245025243, + "grad_norm": 0.6526634693145752, + "learning_rate": 2.8485298485298485e-05, + "loss": 0.093, "step": 850 }, { - "epoch": 0.21711688967432466, - "grad_norm": 0.587645947933197, - "learning_rate": 2.869729866195405e-05, - "loss": 0.0832, + "epoch": 0.2554202554202554, + "grad_norm": 0.6086540222167969, + "learning_rate": 2.8467478467478467e-05, + "loss": 0.1036, "step": 860 }, { - "epoch": 0.21964150467053775, - "grad_norm": 0.7566137909889221, - "learning_rate": 2.8682150971976773e-05, - "loss": 0.0991, + "epoch": 0.25839025839025836, + "grad_norm": 0.8363798260688782, + "learning_rate": 2.844965844965845e-05, + "loss": 0.0871, "step": 870 }, { - "epoch": 0.22216611966675082, - "grad_norm": 0.9127172827720642, - "learning_rate": 2.8667003281999498e-05, - "loss": 0.0725, + "epoch": 0.26136026136026136, + "grad_norm": 0.49175241589546204, + "learning_rate": 2.8431838431838435e-05, + "loss": 0.0953, "step": 880 }, { - "epoch": 0.2246907346629639, - "grad_norm": 0.44545140862464905, - "learning_rate": 2.8651855592022216e-05, - "loss": 0.1114, + "epoch": 0.26433026433026435, + "grad_norm": 0.6891732811927795, + "learning_rate": 2.8414018414018414e-05, + "loss": 0.0789, "step": 890 }, { - "epoch": 0.22721534965917697, - "grad_norm": 0.6126995086669922, - "learning_rate": 2.8636707902044937e-05, - "loss": 0.0939, + "epoch": 0.2673002673002673, + "grad_norm": 0.7982739210128784, + "learning_rate": 2.8396198396198396e-05, + "loss": 0.1105, "step": 900 }, { - "epoch": 0.22973996465539007, - "grad_norm": 0.5383595824241638, - "learning_rate": 2.8621560212067662e-05, - "loss": 0.0956, + "epoch": 0.2702702702702703, + "grad_norm": 1.5775654315948486, + "learning_rate": 2.8378378378378378e-05, + "loss": 0.0902, "step": 910 }, { - "epoch": 0.23226457965160313, - "grad_norm": 0.6282172203063965, - "learning_rate": 2.860641252209038e-05, - "loss": 0.087, + "epoch": 0.2732402732402732, + "grad_norm": 0.42223745584487915, + "learning_rate": 2.836055836055836e-05, + "loss": 0.0937, "step": 920 }, { - "epoch": 0.2347891946478162, - "grad_norm": 0.8885302543640137, - "learning_rate": 2.8591264832113102e-05, - "loss": 0.0951, + "epoch": 0.2762102762102762, + "grad_norm": 0.8138239979743958, + "learning_rate": 2.8342738342738343e-05, + "loss": 0.0989, "step": 930 }, { - "epoch": 0.2373138096440293, - "grad_norm": 0.7967470288276672, - "learning_rate": 2.8576117142135827e-05, - "loss": 0.1023, + "epoch": 0.27918027918027916, + "grad_norm": 0.5486122965812683, + "learning_rate": 2.8324918324918325e-05, + "loss": 0.0901, "step": 940 }, { - "epoch": 0.23983842464024235, - "grad_norm": 0.7743595838546753, - "learning_rate": 2.8560969452158545e-05, - "loss": 0.0797, + "epoch": 0.28215028215028215, + "grad_norm": 0.5096667408943176, + "learning_rate": 2.830709830709831e-05, + "loss": 0.0799, "step": 950 }, { - "epoch": 0.24236303963645545, - "grad_norm": 0.5401091575622559, - "learning_rate": 2.8545821762181267e-05, - "loss": 0.1063, + "epoch": 0.28512028512028514, + "grad_norm": 0.5797027945518494, + "learning_rate": 2.8289278289278293e-05, + "loss": 0.0907, "step": 960 }, { - "epoch": 0.2448876546326685, - "grad_norm": 0.8554522395133972, - "learning_rate": 2.8530674072203992e-05, - "loss": 0.0833, + "epoch": 0.2880902880902881, + "grad_norm": 0.7815655469894409, + "learning_rate": 2.827145827145827e-05, + "loss": 0.0793, "step": 970 }, { - "epoch": 0.2474122696288816, - "grad_norm": 0.7112722396850586, - "learning_rate": 2.851552638222671e-05, - "loss": 0.0891, + "epoch": 0.2910602910602911, + "grad_norm": 0.5682644248008728, + "learning_rate": 2.8253638253638253e-05, + "loss": 0.0799, "step": 980 }, { - "epoch": 0.24993688462509467, - "grad_norm": 0.5287074446678162, - "learning_rate": 2.8500378692249432e-05, - "loss": 0.0877, + "epoch": 0.294030294030294, + "grad_norm": 0.5554261207580566, + "learning_rate": 2.8235818235818236e-05, + "loss": 0.0949, "step": 990 }, { - "epoch": 0.25246149962130776, - "grad_norm": 0.6009781956672668, - "learning_rate": 2.8485231002272157e-05, - "loss": 0.0882, + "epoch": 0.297000297000297, + "grad_norm": 0.5728469491004944, + "learning_rate": 2.8217998217998218e-05, + "loss": 0.1111, "step": 1000 }, { - "epoch": 0.25498611461752085, - "grad_norm": 0.3697584271430969, - "learning_rate": 2.8470083312294875e-05, - "loss": 0.0659, + "epoch": 0.29997029997029995, + "grad_norm": 0.5483665466308594, + "learning_rate": 2.82001782001782e-05, + "loss": 0.0901, "step": 1010 }, { - "epoch": 0.2575107296137339, - "grad_norm": 0.6940627694129944, - "learning_rate": 2.8454935622317597e-05, - "loss": 0.0897, + "epoch": 0.30294030294030294, + "grad_norm": 0.7061681151390076, + "learning_rate": 2.8182358182358186e-05, + "loss": 0.0942, "step": 1020 }, { - "epoch": 0.260035344609947, - "grad_norm": 0.43336808681488037, - "learning_rate": 2.8439787932340318e-05, - "loss": 0.0721, + "epoch": 0.30591030591030594, + "grad_norm": 0.4503157436847687, + "learning_rate": 2.8164538164538168e-05, + "loss": 0.0837, "step": 1030 }, { - "epoch": 0.2625599596061601, - "grad_norm": 0.7710297703742981, - "learning_rate": 2.842464024236304e-05, - "loss": 0.0886, + "epoch": 0.3088803088803089, + "grad_norm": 1.187880277633667, + "learning_rate": 2.8146718146718146e-05, + "loss": 0.0866, "step": 1040 }, { - "epoch": 0.2650845746023731, - "grad_norm": 1.1904844045639038, - "learning_rate": 2.840949255238576e-05, - "loss": 0.0903, + "epoch": 0.31185031185031187, + "grad_norm": 1.120139718055725, + "learning_rate": 2.812889812889813e-05, + "loss": 0.1058, "step": 1050 }, { - "epoch": 0.2676091895985862, - "grad_norm": 0.6289048194885254, - "learning_rate": 2.8394344862408483e-05, - "loss": 0.0782, + "epoch": 0.3148203148203148, + "grad_norm": 0.7681704759597778, + "learning_rate": 2.811107811107811e-05, + "loss": 0.0864, "step": 1060 }, { - "epoch": 0.2701338045947993, - "grad_norm": 0.5819368958473206, - "learning_rate": 2.8379197172431205e-05, - "loss": 0.0841, + "epoch": 0.3177903177903178, + "grad_norm": 0.6372396349906921, + "learning_rate": 2.8093258093258093e-05, + "loss": 0.0886, "step": 1070 }, { - "epoch": 0.2726584195910124, - "grad_norm": 0.9544229507446289, - "learning_rate": 2.8364049482453926e-05, - "loss": 0.0813, + "epoch": 0.32076032076032074, + "grad_norm": 0.7018745541572571, + "learning_rate": 2.8075438075438075e-05, + "loss": 0.0786, "step": 1080 }, { - "epoch": 0.27518303458722543, - "grad_norm": 0.5838118195533752, - "learning_rate": 2.8348901792476648e-05, - "loss": 0.1027, + "epoch": 0.32373032373032373, + "grad_norm": 0.8289116621017456, + "learning_rate": 2.805761805761806e-05, + "loss": 0.0964, "step": 1090 }, { - "epoch": 0.2777076495834385, - "grad_norm": 0.5665274858474731, - "learning_rate": 2.833375410249937e-05, - "loss": 0.0932, + "epoch": 0.3267003267003267, + "grad_norm": 0.7211658954620361, + "learning_rate": 2.8039798039798043e-05, + "loss": 0.1066, "step": 1100 }, { - "epoch": 0.2802322645796516, - "grad_norm": 0.7049827575683594, - "learning_rate": 2.831860641252209e-05, - "loss": 0.0895, + "epoch": 0.32967032967032966, + "grad_norm": 0.677126407623291, + "learning_rate": 2.802197802197802e-05, + "loss": 0.081, "step": 1110 }, { - "epoch": 0.2827568795758647, - "grad_norm": 0.7130704522132874, - "learning_rate": 2.8303458722544813e-05, - "loss": 0.0922, + "epoch": 0.33264033264033266, + "grad_norm": 0.3897887170314789, + "learning_rate": 2.8004158004158004e-05, + "loss": 0.0937, "step": 1120 }, { - "epoch": 0.28528149457207774, - "grad_norm": 0.606171727180481, - "learning_rate": 2.8288311032567534e-05, - "loss": 0.0763, + "epoch": 0.3356103356103356, + "grad_norm": 0.5881434679031372, + "learning_rate": 2.7986337986337986e-05, + "loss": 0.0852, "step": 1130 }, { - "epoch": 0.28780610956829084, - "grad_norm": 0.6686602830886841, - "learning_rate": 2.8273163342590256e-05, - "loss": 0.0853, + "epoch": 0.3385803385803386, + "grad_norm": 0.6897678971290588, + "learning_rate": 2.7968517968517968e-05, + "loss": 0.0873, "step": 1140 }, { - "epoch": 0.29033072456450393, - "grad_norm": 0.555953860282898, - "learning_rate": 2.8258015652612977e-05, - "loss": 0.0844, + "epoch": 0.34155034155034153, + "grad_norm": 0.6038883328437805, + "learning_rate": 2.795069795069795e-05, + "loss": 0.0805, "step": 1150 }, { - "epoch": 0.29285533956071697, - "grad_norm": 0.763724684715271, - "learning_rate": 2.82428679626357e-05, - "loss": 0.0927, + "epoch": 0.3445203445203445, + "grad_norm": 0.4414396286010742, + "learning_rate": 2.7932877932877936e-05, + "loss": 0.0981, "step": 1160 }, { - "epoch": 0.29537995455693006, - "grad_norm": 0.6845389008522034, - "learning_rate": 2.822772027265842e-05, - "loss": 0.082, + "epoch": 0.3474903474903475, + "grad_norm": 0.48170387744903564, + "learning_rate": 2.7915057915057918e-05, + "loss": 0.0938, "step": 1170 }, { - "epoch": 0.29790456955314315, - "grad_norm": 0.5240347385406494, - "learning_rate": 2.8212572582681142e-05, - "loss": 0.0791, + "epoch": 0.35046035046035046, + "grad_norm": 0.5567618012428284, + "learning_rate": 2.7897237897237897e-05, + "loss": 0.0897, "step": 1180 }, { - "epoch": 0.30042918454935624, - "grad_norm": 0.7150965332984924, - "learning_rate": 2.8197424892703864e-05, - "loss": 0.0898, + "epoch": 0.35343035343035345, + "grad_norm": 0.6452346444129944, + "learning_rate": 2.787941787941788e-05, + "loss": 0.0948, "step": 1190 }, { - "epoch": 0.3029537995455693, - "grad_norm": 0.45540598034858704, - "learning_rate": 2.8182277202726585e-05, - "loss": 0.0802, + "epoch": 0.3564003564003564, + "grad_norm": 0.4139314889907837, + "learning_rate": 2.786159786159786e-05, + "loss": 0.0849, "step": 1200 }, { - "epoch": 0.3054784145417824, - "grad_norm": 0.8016244173049927, - "learning_rate": 2.8167129512749307e-05, - "loss": 0.078, + "epoch": 0.3593703593703594, + "grad_norm": 0.5524829030036926, + "learning_rate": 2.7843777843777843e-05, + "loss": 0.0742, "step": 1210 }, { - "epoch": 0.30800302953799547, - "grad_norm": 0.5537816286087036, - "learning_rate": 2.815198182277203e-05, - "loss": 0.0809, + "epoch": 0.3623403623403623, + "grad_norm": 1.0731943845748901, + "learning_rate": 2.7825957825957826e-05, + "loss": 0.0834, "step": 1220 }, { - "epoch": 0.31052764453420856, - "grad_norm": 0.6857221722602844, - "learning_rate": 2.813683413279475e-05, - "loss": 0.0905, + "epoch": 0.3653103653103653, + "grad_norm": 0.6787437796592712, + "learning_rate": 2.780813780813781e-05, + "loss": 0.0921, "step": 1230 }, { - "epoch": 0.3130522595304216, - "grad_norm": 0.5408879518508911, - "learning_rate": 2.8121686442817472e-05, - "loss": 0.0829, + "epoch": 0.36828036828036825, + "grad_norm": 0.536044716835022, + "learning_rate": 2.7790317790317793e-05, + "loss": 0.0837, "step": 1240 }, { - "epoch": 0.3155768745266347, - "grad_norm": 0.3268249034881592, - "learning_rate": 2.8106538752840193e-05, - "loss": 0.0928, + "epoch": 0.37125037125037125, + "grad_norm": 0.4149301052093506, + "learning_rate": 2.7772497772497772e-05, + "loss": 0.0847, "step": 1250 }, { - "epoch": 0.3181014895228478, - "grad_norm": 0.5064918398857117, - "learning_rate": 2.8091391062862915e-05, - "loss": 0.0805, + "epoch": 0.37422037422037424, + "grad_norm": 0.6760357618331909, + "learning_rate": 2.7754677754677754e-05, + "loss": 0.0965, "step": 1260 }, { - "epoch": 0.3206261045190608, - "grad_norm": 1.134167194366455, - "learning_rate": 2.8076243372885636e-05, - "loss": 0.091, + "epoch": 0.3771903771903772, + "grad_norm": 0.8695538640022278, + "learning_rate": 2.7736857736857736e-05, + "loss": 0.0862, "step": 1270 }, { - "epoch": 0.3231507195152739, - "grad_norm": 0.6449709534645081, - "learning_rate": 2.8061095682908358e-05, - "loss": 0.0926, + "epoch": 0.3801603801603802, + "grad_norm": 1.1023316383361816, + "learning_rate": 2.771903771903772e-05, + "loss": 0.0818, "step": 1280 }, { - "epoch": 0.325675334511487, - "grad_norm": 0.4470975399017334, - "learning_rate": 2.804594799293108e-05, - "loss": 0.0888, + "epoch": 0.3831303831303831, + "grad_norm": 1.0046688318252563, + "learning_rate": 2.77012177012177e-05, + "loss": 0.0923, "step": 1290 }, { - "epoch": 0.3281999495077001, - "grad_norm": 0.6847784519195557, - "learning_rate": 2.80308003029538e-05, - "loss": 0.0872, + "epoch": 0.3861003861003861, + "grad_norm": 0.4843716323375702, + "learning_rate": 2.7683397683397686e-05, + "loss": 0.0774, "step": 1300 }, { - "epoch": 0.33072456450391313, - "grad_norm": 0.5377852320671082, - "learning_rate": 2.8015652612976523e-05, - "loss": 0.0802, + "epoch": 0.38907038907038904, + "grad_norm": 0.6335024833679199, + "learning_rate": 2.766557766557767e-05, + "loss": 0.083, "step": 1310 }, { - "epoch": 0.3332491795001262, - "grad_norm": 0.7764335870742798, - "learning_rate": 2.800050492299924e-05, - "loss": 0.083, + "epoch": 0.39204039204039204, + "grad_norm": 0.5234698057174683, + "learning_rate": 2.7647757647757647e-05, + "loss": 0.0755, "step": 1320 }, { - "epoch": 0.3357737944963393, - "grad_norm": 0.6579951047897339, - "learning_rate": 2.7985357233021966e-05, - "loss": 0.0828, + "epoch": 0.39501039501039503, + "grad_norm": 0.477662056684494, + "learning_rate": 2.762993762993763e-05, + "loss": 0.0756, "step": 1330 }, { - "epoch": 0.3382984094925524, - "grad_norm": 0.7836153507232666, - "learning_rate": 2.7970209543044688e-05, - "loss": 0.0716, + "epoch": 0.39798039798039797, + "grad_norm": 0.5107772350311279, + "learning_rate": 2.761211761211761e-05, + "loss": 0.0688, "step": 1340 }, { - "epoch": 0.34082302448876545, - "grad_norm": 1.0645248889923096, - "learning_rate": 2.7955061853067406e-05, - "loss": 0.0828, + "epoch": 0.40095040095040096, + "grad_norm": 0.6898319125175476, + "learning_rate": 2.7594297594297594e-05, + "loss": 0.0856, "step": 1350 }, { - "epoch": 0.34334763948497854, - "grad_norm": 0.4852674603462219, - "learning_rate": 2.7939914163090127e-05, - "loss": 0.0797, + "epoch": 0.4039204039204039, + "grad_norm": 0.5590442419052124, + "learning_rate": 2.7576477576477576e-05, + "loss": 0.0857, "step": 1360 }, { - "epoch": 0.34587225448119163, - "grad_norm": 0.7467886805534363, - "learning_rate": 2.7924766473112852e-05, - "loss": 0.0865, + "epoch": 0.4068904068904069, + "grad_norm": 0.6682615280151367, + "learning_rate": 2.755865755865756e-05, + "loss": 0.0828, "step": 1370 }, { - "epoch": 0.34839686947740467, - "grad_norm": 0.8338529467582703, - "learning_rate": 2.790961878313557e-05, - "loss": 0.075, + "epoch": 0.40986040986040984, + "grad_norm": 0.27072158455848694, + "learning_rate": 2.7540837540837544e-05, + "loss": 0.0775, "step": 1380 }, { - "epoch": 0.35092148447361776, - "grad_norm": 0.9489770531654358, - "learning_rate": 2.7894471093158292e-05, - "loss": 0.0799, + "epoch": 0.41283041283041283, + "grad_norm": 0.6918196082115173, + "learning_rate": 2.7523017523017522e-05, + "loss": 0.0734, "step": 1390 }, { - "epoch": 0.35344609946983085, - "grad_norm": 0.4182811677455902, - "learning_rate": 2.7879323403181017e-05, - "loss": 0.0953, + "epoch": 0.4158004158004158, + "grad_norm": 0.6403471827507019, + "learning_rate": 2.7505197505197505e-05, + "loss": 0.0814, "step": 1400 }, { - "epoch": 0.35597071446604395, - "grad_norm": 0.472494900226593, - "learning_rate": 2.7864175713203735e-05, - "loss": 0.0939, + "epoch": 0.41877041877041876, + "grad_norm": 0.7018643617630005, + "learning_rate": 2.7487377487377487e-05, + "loss": 0.0811, "step": 1410 }, { - "epoch": 0.358495329462257, - "grad_norm": 0.6279084086418152, - "learning_rate": 2.7849028023226457e-05, - "loss": 0.0977, + "epoch": 0.42174042174042176, + "grad_norm": 0.6571378111839294, + "learning_rate": 2.746955746955747e-05, + "loss": 0.0786, "step": 1420 }, { - "epoch": 0.3610199444584701, - "grad_norm": 0.5314123630523682, - "learning_rate": 2.7833880333249182e-05, - "loss": 0.0905, + "epoch": 0.4247104247104247, + "grad_norm": 0.7818433046340942, + "learning_rate": 2.745173745173745e-05, + "loss": 0.0743, "step": 1430 }, { - "epoch": 0.36354455945468317, - "grad_norm": 0.3234538435935974, - "learning_rate": 2.78187326432719e-05, - "loss": 0.0792, + "epoch": 0.4276804276804277, + "grad_norm": 0.7524327635765076, + "learning_rate": 2.7433917433917437e-05, + "loss": 0.0757, "step": 1440 }, { - "epoch": 0.36606917445089626, - "grad_norm": 0.7543266415596008, - "learning_rate": 2.7803584953294622e-05, - "loss": 0.0828, + "epoch": 0.4306504306504306, + "grad_norm": 0.8632511496543884, + "learning_rate": 2.741609741609742e-05, + "loss": 0.084, "step": 1450 }, { - "epoch": 0.3685937894471093, - "grad_norm": 0.38049548864364624, - "learning_rate": 2.7788437263317347e-05, - "loss": 0.0783, + "epoch": 0.4336204336204336, + "grad_norm": 0.6295231580734253, + "learning_rate": 2.7398277398277398e-05, + "loss": 0.0892, "step": 1460 }, { - "epoch": 0.3711184044433224, - "grad_norm": 0.3925035893917084, - "learning_rate": 2.7773289573340065e-05, - "loss": 0.0738, + "epoch": 0.4365904365904366, + "grad_norm": 0.6907210946083069, + "learning_rate": 2.738045738045738e-05, + "loss": 0.1006, "step": 1470 }, { - "epoch": 0.3736430194395355, - "grad_norm": 0.6012548208236694, - "learning_rate": 2.7758141883362787e-05, - "loss": 0.0767, + "epoch": 0.43956043956043955, + "grad_norm": 0.617152988910675, + "learning_rate": 2.7362637362637362e-05, + "loss": 0.097, "step": 1480 }, { - "epoch": 0.3761676344357485, - "grad_norm": 0.6800134181976318, - "learning_rate": 2.774299419338551e-05, - "loss": 0.0778, + "epoch": 0.44253044253044255, + "grad_norm": 0.6373753547668457, + "learning_rate": 2.7344817344817344e-05, + "loss": 0.0784, "step": 1490 }, { - "epoch": 0.3786922494319616, - "grad_norm": 0.561687707901001, - "learning_rate": 2.772784650340823e-05, - "loss": 0.079, + "epoch": 0.4455004455004455, + "grad_norm": 0.7640069723129272, + "learning_rate": 2.7326997326997326e-05, + "loss": 0.0729, "step": 1500 }, { - "epoch": 0.3812168644281747, - "grad_norm": 0.48705849051475525, - "learning_rate": 2.771269881343095e-05, - "loss": 0.0813, + "epoch": 0.4484704484704485, + "grad_norm": 0.5482354164123535, + "learning_rate": 2.7309177309177312e-05, + "loss": 0.0876, "step": 1510 }, { - "epoch": 0.3837414794243878, - "grad_norm": 0.5197842121124268, - "learning_rate": 2.7697551123453676e-05, - "loss": 0.0872, + "epoch": 0.4514404514404514, + "grad_norm": 0.7966523766517639, + "learning_rate": 2.7291357291357294e-05, + "loss": 0.0833, "step": 1520 }, { - "epoch": 0.38626609442060084, - "grad_norm": 0.476550430059433, - "learning_rate": 2.7682403433476395e-05, - "loss": 0.0684, + "epoch": 0.4544104544104544, + "grad_norm": 0.6484697461128235, + "learning_rate": 2.7273537273537276e-05, + "loss": 0.0854, "step": 1530 }, { - "epoch": 0.38879070941681393, - "grad_norm": 0.7136000394821167, - "learning_rate": 2.7667255743499116e-05, - "loss": 0.0787, + "epoch": 0.4573804573804574, + "grad_norm": 0.43090665340423584, + "learning_rate": 2.7255717255717255e-05, + "loss": 0.0914, "step": 1540 }, { - "epoch": 0.391315324413027, - "grad_norm": 0.8119826912879944, - "learning_rate": 2.765210805352184e-05, - "loss": 0.0834, + "epoch": 0.46035046035046034, + "grad_norm": 0.5118837356567383, + "learning_rate": 2.7237897237897237e-05, + "loss": 0.0819, "step": 1550 }, { - "epoch": 0.3938399394092401, - "grad_norm": 0.7646836638450623, - "learning_rate": 2.763696036354456e-05, - "loss": 0.0867, + "epoch": 0.46332046332046334, + "grad_norm": 0.9723702669143677, + "learning_rate": 2.722007722007722e-05, + "loss": 0.088, "step": 1560 }, { - "epoch": 0.39636455440545315, - "grad_norm": 0.5930790901184082, - "learning_rate": 2.762181267356728e-05, - "loss": 0.0731, + "epoch": 0.4662904662904663, + "grad_norm": 1.0589011907577515, + "learning_rate": 2.72022572022572e-05, + "loss": 0.0963, "step": 1570 }, { - "epoch": 0.39888916940166624, - "grad_norm": 0.3663583993911743, - "learning_rate": 2.7606664983590006e-05, - "loss": 0.0727, + "epoch": 0.46926046926046927, + "grad_norm": 0.6201198697090149, + "learning_rate": 2.7184437184437187e-05, + "loss": 0.0702, "step": 1580 }, { - "epoch": 0.40141378439787934, - "grad_norm": 0.3528522551059723, - "learning_rate": 2.7591517293612724e-05, - "loss": 0.0779, + "epoch": 0.4722304722304722, + "grad_norm": 0.40020257234573364, + "learning_rate": 2.716661716661717e-05, + "loss": 0.0752, "step": 1590 }, { - "epoch": 0.4039383993940924, - "grad_norm": 0.3986479938030243, - "learning_rate": 2.7576369603635446e-05, - "loss": 0.0876, + "epoch": 0.4752004752004752, + "grad_norm": 0.8229923844337463, + "learning_rate": 2.714879714879715e-05, + "loss": 0.1031, "step": 1600 }, { - "epoch": 0.40646301439030547, - "grad_norm": 0.5565474033355713, - "learning_rate": 2.7561221913658167e-05, - "loss": 0.0884, + "epoch": 0.4781704781704782, + "grad_norm": 0.5380883812904358, + "learning_rate": 2.713097713097713e-05, + "loss": 0.0911, "step": 1610 }, { - "epoch": 0.40898762938651856, - "grad_norm": 0.49433985352516174, - "learning_rate": 2.754607422368089e-05, - "loss": 0.0772, + "epoch": 0.48114048114048114, + "grad_norm": 0.507243812084198, + "learning_rate": 2.7113157113157112e-05, + "loss": 0.079, "step": 1620 }, { - "epoch": 0.41151224438273165, - "grad_norm": 0.6990385055541992, - "learning_rate": 2.753092653370361e-05, - "loss": 0.0718, + "epoch": 0.48411048411048413, + "grad_norm": 0.6244765520095825, + "learning_rate": 2.7095337095337095e-05, + "loss": 0.0643, "step": 1630 }, { - "epoch": 0.4140368593789447, - "grad_norm": 1.0656036138534546, - "learning_rate": 2.7515778843726332e-05, - "loss": 0.071, + "epoch": 0.48708048708048707, + "grad_norm": 1.1058402061462402, + "learning_rate": 2.7077517077517077e-05, + "loss": 0.0928, "step": 1640 }, { - "epoch": 0.4165614743751578, - "grad_norm": 0.4659973978996277, - "learning_rate": 2.7500631153749054e-05, - "loss": 0.0885, + "epoch": 0.49005049005049006, + "grad_norm": 0.8316872715950012, + "learning_rate": 2.7059697059697062e-05, + "loss": 0.0729, "step": 1650 }, { - "epoch": 0.4190860893713709, - "grad_norm": 0.6749240159988403, - "learning_rate": 2.7485483463771775e-05, - "loss": 0.0854, + "epoch": 0.493020493020493, + "grad_norm": 0.6039434671401978, + "learning_rate": 2.7041877041877044e-05, + "loss": 0.0907, "step": 1660 }, { - "epoch": 0.42161070436758397, - "grad_norm": 0.4509848952293396, - "learning_rate": 2.7470335773794497e-05, - "loss": 0.082, + "epoch": 0.495990495990496, + "grad_norm": 0.47073495388031006, + "learning_rate": 2.7024057024057027e-05, + "loss": 0.0806, "step": 1670 }, { - "epoch": 0.424135319363797, - "grad_norm": 0.6541846394538879, - "learning_rate": 2.745518808381722e-05, - "loss": 0.0989, + "epoch": 0.498960498960499, + "grad_norm": 0.4234858453273773, + "learning_rate": 2.7006237006237005e-05, + "loss": 0.0796, "step": 1680 }, { - "epoch": 0.4266599343600101, - "grad_norm": 0.603756308555603, - "learning_rate": 2.744004039383994e-05, - "loss": 0.0834, + "epoch": 0.5019305019305019, + "grad_norm": 0.7585604190826416, + "learning_rate": 2.6988416988416988e-05, + "loss": 0.0922, "step": 1690 }, { - "epoch": 0.4291845493562232, - "grad_norm": 0.4919886589050293, - "learning_rate": 2.742489270386266e-05, - "loss": 0.0847, + "epoch": 0.5049005049005049, + "grad_norm": 0.5006585717201233, + "learning_rate": 2.697059697059697e-05, + "loss": 0.0826, "step": 1700 }, { - "epoch": 0.4317091643524363, - "grad_norm": 0.8659531474113464, - "learning_rate": 2.7409745013885383e-05, - "loss": 0.0878, + "epoch": 0.5078705078705079, + "grad_norm": 0.6841594576835632, + "learning_rate": 2.6952776952776952e-05, + "loss": 0.0879, "step": 1710 }, { - "epoch": 0.4342337793486493, - "grad_norm": 0.6441717743873596, - "learning_rate": 2.7394597323908105e-05, - "loss": 0.0704, + "epoch": 0.5108405108405109, + "grad_norm": 0.6505159139633179, + "learning_rate": 2.6934956934956937e-05, + "loss": 0.097, "step": 1720 }, { - "epoch": 0.4367583943448624, - "grad_norm": 0.5323107838630676, - "learning_rate": 2.7379449633930826e-05, - "loss": 0.0676, + "epoch": 0.5138105138105138, + "grad_norm": 0.48233747482299805, + "learning_rate": 2.691713691713692e-05, + "loss": 0.079, "step": 1730 }, { - "epoch": 0.4392830093410755, - "grad_norm": 0.5645779967308044, - "learning_rate": 2.7364301943953548e-05, - "loss": 0.0745, + "epoch": 0.5167805167805167, + "grad_norm": 0.5792484879493713, + "learning_rate": 2.6899316899316902e-05, + "loss": 0.0847, "step": 1740 }, { - "epoch": 0.44180762433728854, - "grad_norm": 0.6338810920715332, - "learning_rate": 2.734915425397627e-05, - "loss": 0.083, + "epoch": 0.5197505197505198, + "grad_norm": 0.6649707555770874, + "learning_rate": 2.688149688149688e-05, + "loss": 0.0584, "step": 1750 }, { - "epoch": 0.44433223933350163, - "grad_norm": 0.8070118427276611, - "learning_rate": 2.733400656399899e-05, - "loss": 0.0679, + "epoch": 0.5227205227205227, + "grad_norm": 0.6543247699737549, + "learning_rate": 2.6863676863676863e-05, + "loss": 0.0558, "step": 1760 }, { - "epoch": 0.4468568543297147, - "grad_norm": 0.5226871967315674, - "learning_rate": 2.7318858874021713e-05, - "loss": 0.0966, + "epoch": 0.5256905256905257, + "grad_norm": 0.6927476525306702, + "learning_rate": 2.6845856845856845e-05, + "loss": 0.0828, "step": 1770 }, { - "epoch": 0.4493814693259278, - "grad_norm": 0.4776107668876648, - "learning_rate": 2.7303711184044434e-05, - "loss": 0.0861, + "epoch": 0.5286605286605287, + "grad_norm": 0.9066148996353149, + "learning_rate": 2.6828036828036827e-05, + "loss": 0.0827, "step": 1780 }, { - "epoch": 0.45190608432214086, - "grad_norm": 0.5352398753166199, - "learning_rate": 2.7288563494067156e-05, - "loss": 0.0743, + "epoch": 0.5316305316305316, + "grad_norm": 0.6122345924377441, + "learning_rate": 2.6810216810216813e-05, + "loss": 0.0831, "step": 1790 }, { - "epoch": 0.45443069931835395, - "grad_norm": 0.6204835772514343, - "learning_rate": 2.7273415804089878e-05, - "loss": 0.0852, + "epoch": 0.5346005346005346, + "grad_norm": 0.5523887872695923, + "learning_rate": 2.6792396792396795e-05, + "loss": 0.0925, "step": 1800 }, { - "epoch": 0.45695531431456704, - "grad_norm": 0.7186635732650757, - "learning_rate": 2.72582681141126e-05, - "loss": 0.0832, + "epoch": 0.5375705375705375, + "grad_norm": 0.9167420268058777, + "learning_rate": 2.6774576774576777e-05, + "loss": 0.0756, "step": 1810 }, { - "epoch": 0.45947992931078013, - "grad_norm": 0.3791876435279846, - "learning_rate": 2.724312042413532e-05, - "loss": 0.083, + "epoch": 0.5405405405405406, + "grad_norm": 0.4656206965446472, + "learning_rate": 2.6756756756756756e-05, + "loss": 0.0827, "step": 1820 }, { - "epoch": 0.46200454430699317, - "grad_norm": 0.6428934931755066, - "learning_rate": 2.7227972734158042e-05, - "loss": 0.0872, + "epoch": 0.5435105435105435, + "grad_norm": 0.49738115072250366, + "learning_rate": 2.6738936738936738e-05, + "loss": 0.0996, "step": 1830 }, { - "epoch": 0.46452915930320626, - "grad_norm": 0.3516116440296173, - "learning_rate": 2.7212825044180764e-05, - "loss": 0.0659, + "epoch": 0.5464805464805464, + "grad_norm": 0.7212559580802917, + "learning_rate": 2.672111672111672e-05, + "loss": 0.0791, "step": 1840 }, { - "epoch": 0.46705377429941936, - "grad_norm": 0.44267144799232483, - "learning_rate": 2.7197677354203486e-05, - "loss": 0.0873, + "epoch": 0.5494505494505495, + "grad_norm": 0.6626265645027161, + "learning_rate": 2.6703296703296702e-05, + "loss": 0.091, "step": 1850 }, { - "epoch": 0.4695783892956324, - "grad_norm": 0.5157018899917603, - "learning_rate": 2.7182529664226207e-05, - "loss": 0.0841, + "epoch": 0.5524205524205524, + "grad_norm": 0.38933899998664856, + "learning_rate": 2.6685476685476688e-05, + "loss": 0.0788, "step": 1860 }, { - "epoch": 0.4721030042918455, - "grad_norm": 0.39743056893348694, - "learning_rate": 2.716738197424893e-05, - "loss": 0.0751, + "epoch": 0.5553905553905554, + "grad_norm": 0.41860514879226685, + "learning_rate": 2.666765666765667e-05, + "loss": 0.1031, "step": 1870 }, { - "epoch": 0.4746276192880586, - "grad_norm": 0.9152094721794128, - "learning_rate": 2.715223428427165e-05, - "loss": 0.0809, + "epoch": 0.5583605583605583, + "grad_norm": 0.5364987850189209, + "learning_rate": 2.6649836649836652e-05, + "loss": 0.0891, "step": 1880 }, { - "epoch": 0.47715223428427167, - "grad_norm": 0.5350621342658997, - "learning_rate": 2.7137086594294372e-05, - "loss": 0.0792, + "epoch": 0.5613305613305614, + "grad_norm": 0.28089386224746704, + "learning_rate": 2.663201663201663e-05, + "loss": 0.0921, "step": 1890 }, { - "epoch": 0.4796768492804847, - "grad_norm": 0.6785259246826172, - "learning_rate": 2.712193890431709e-05, - "loss": 0.0932, + "epoch": 0.5643005643005643, + "grad_norm": 0.6708937287330627, + "learning_rate": 2.6614196614196613e-05, + "loss": 0.0876, "step": 1900 }, { - "epoch": 0.4822014642766978, - "grad_norm": 0.5591861605644226, - "learning_rate": 2.7106791214339812e-05, - "loss": 0.0875, + "epoch": 0.5672705672705672, + "grad_norm": 0.49499982595443726, + "learning_rate": 2.6596376596376595e-05, + "loss": 0.0889, "step": 1910 }, { - "epoch": 0.4847260792729109, - "grad_norm": 0.4783095419406891, - "learning_rate": 2.7091643524362537e-05, - "loss": 0.0953, + "epoch": 0.5702405702405703, + "grad_norm": 0.5181038975715637, + "learning_rate": 2.6578556578556577e-05, + "loss": 0.0687, "step": 1920 }, { - "epoch": 0.487250694269124, - "grad_norm": 0.3992745578289032, - "learning_rate": 2.7076495834385255e-05, - "loss": 0.0897, + "epoch": 0.5732105732105732, + "grad_norm": 0.4590006172657013, + "learning_rate": 2.6560736560736563e-05, + "loss": 0.0843, "step": 1930 }, { - "epoch": 0.489775309265337, - "grad_norm": 0.5237228870391846, - "learning_rate": 2.7061348144407977e-05, - "loss": 0.0735, + "epoch": 0.5761805761805762, + "grad_norm": 0.542353630065918, + "learning_rate": 2.6542916542916545e-05, + "loss": 0.0807, "step": 1940 }, { - "epoch": 0.4922999242615501, - "grad_norm": 0.43609362840652466, - "learning_rate": 2.70462004544307e-05, - "loss": 0.0885, + "epoch": 0.5791505791505791, + "grad_norm": 0.4152495861053467, + "learning_rate": 2.6525096525096527e-05, + "loss": 0.0833, "step": 1950 }, { - "epoch": 0.4948245392577632, - "grad_norm": 0.9206698536872864, - "learning_rate": 2.703105276445342e-05, - "loss": 0.0861, + "epoch": 0.5821205821205822, + "grad_norm": 0.4847126603126526, + "learning_rate": 2.6507276507276506e-05, + "loss": 0.0844, "step": 1960 }, { - "epoch": 0.49734915425397624, - "grad_norm": 0.9243408441543579, - "learning_rate": 2.701590507447614e-05, - "loss": 0.0923, + "epoch": 0.5850905850905851, + "grad_norm": 0.5619663596153259, + "learning_rate": 2.648945648945649e-05, + "loss": 0.0768, "step": 1970 }, { - "epoch": 0.49987376925018934, - "grad_norm": 0.7312402725219727, - "learning_rate": 2.7000757384498866e-05, - "loss": 0.0856, + "epoch": 0.588060588060588, + "grad_norm": 0.6558105945587158, + "learning_rate": 2.647163647163647e-05, + "loss": 0.0754, "step": 1980 }, { - "epoch": 0.5023983842464024, - "grad_norm": 0.40028661489486694, - "learning_rate": 2.6985609694521585e-05, - "loss": 0.0676, + "epoch": 0.5910305910305911, + "grad_norm": 0.9754857420921326, + "learning_rate": 2.6453816453816453e-05, + "loss": 0.0868, "step": 1990 }, { - "epoch": 0.5049229992426155, - "grad_norm": 0.4305866062641144, - "learning_rate": 2.6970462004544306e-05, - "loss": 0.0744, + "epoch": 0.594000594000594, + "grad_norm": 0.4641966223716736, + "learning_rate": 2.6435996435996438e-05, + "loss": 0.0929, "step": 2000 }, { - "epoch": 0.5074476142388286, - "grad_norm": 0.30411994457244873, - "learning_rate": 2.695531431456703e-05, - "loss": 0.064, + "epoch": 0.596970596970597, + "grad_norm": 0.46997398138046265, + "learning_rate": 2.641817641817642e-05, + "loss": 0.0822, "step": 2010 }, { - "epoch": 0.5099722292350417, - "grad_norm": 0.44871994853019714, - "learning_rate": 2.694016662458975e-05, - "loss": 0.0883, + "epoch": 0.5999405999405999, + "grad_norm": 0.6096898913383484, + "learning_rate": 2.6400356400356403e-05, + "loss": 0.0871, "step": 2020 }, { - "epoch": 0.5124968442312547, - "grad_norm": 0.8206811547279358, - "learning_rate": 2.692501893461247e-05, - "loss": 0.0803, + "epoch": 0.6029106029106029, + "grad_norm": 0.4723495543003082, + "learning_rate": 2.638253638253638e-05, + "loss": 0.0767, "step": 2030 }, { - "epoch": 0.5150214592274678, - "grad_norm": 0.6455752849578857, - "learning_rate": 2.6909871244635196e-05, - "loss": 0.0951, + "epoch": 0.6058806058806059, + "grad_norm": 0.5081328749656677, + "learning_rate": 2.6364716364716364e-05, + "loss": 0.0728, "step": 2040 }, { - "epoch": 0.5175460742236809, - "grad_norm": 0.5514742732048035, - "learning_rate": 2.6894723554657914e-05, - "loss": 0.0855, + "epoch": 0.6088506088506088, + "grad_norm": 0.5929988026618958, + "learning_rate": 2.6346896346896346e-05, + "loss": 0.0777, "step": 2050 }, { - "epoch": 0.520070689219894, - "grad_norm": 0.3301815688610077, - "learning_rate": 2.6879575864680636e-05, - "loss": 0.0916, + "epoch": 0.6118206118206119, + "grad_norm": 0.5095152854919434, + "learning_rate": 2.6329076329076328e-05, + "loss": 0.084, "step": 2060 }, { - "epoch": 0.522595304216107, - "grad_norm": 0.4964681565761566, - "learning_rate": 2.686442817470336e-05, - "loss": 0.0794, + "epoch": 0.6147906147906148, + "grad_norm": 0.47717463970184326, + "learning_rate": 2.6311256311256313e-05, + "loss": 0.0724, "step": 2070 }, { - "epoch": 0.5251199192123202, - "grad_norm": 0.6347307562828064, - "learning_rate": 2.684928048472608e-05, - "loss": 0.0756, + "epoch": 0.6177606177606177, + "grad_norm": 0.3432537615299225, + "learning_rate": 2.6293436293436296e-05, + "loss": 0.0727, "step": 2080 }, { - "epoch": 0.5276445342085332, - "grad_norm": 0.7548183798789978, - "learning_rate": 2.68341327947488e-05, - "loss": 0.0781, + "epoch": 0.6207306207306207, + "grad_norm": 0.6386498212814331, + "learning_rate": 2.6275616275616278e-05, + "loss": 0.0814, "step": 2090 }, { - "epoch": 0.5301691492047462, - "grad_norm": 0.43172019720077515, - "learning_rate": 2.6818985104771525e-05, - "loss": 0.0674, + "epoch": 0.6237006237006237, + "grad_norm": 0.5590204000473022, + "learning_rate": 2.625779625779626e-05, + "loss": 0.0803, "step": 2100 }, { - "epoch": 0.5326937642009594, - "grad_norm": 0.8760651350021362, - "learning_rate": 2.6803837414794244e-05, - "loss": 0.0815, + "epoch": 0.6266706266706267, + "grad_norm": 0.3727136552333832, + "learning_rate": 2.623997623997624e-05, + "loss": 0.0784, "step": 2110 }, { - "epoch": 0.5352183791971724, - "grad_norm": 0.795153021812439, - "learning_rate": 2.6788689724816965e-05, - "loss": 0.0828, + "epoch": 0.6296406296406296, + "grad_norm": 0.9345456957817078, + "learning_rate": 2.622215622215622e-05, + "loss": 0.102, "step": 2120 }, { - "epoch": 0.5377429941933856, - "grad_norm": 0.7313557267189026, - "learning_rate": 2.677354203483969e-05, - "loss": 0.0983, + "epoch": 0.6326106326106327, + "grad_norm": 0.6383994221687317, + "learning_rate": 2.6204336204336203e-05, + "loss": 0.0799, "step": 2130 }, { - "epoch": 0.5402676091895986, - "grad_norm": 0.4594115912914276, - "learning_rate": 2.675839434486241e-05, - "loss": 0.0705, + "epoch": 0.6355806355806356, + "grad_norm": 0.6339811682701111, + "learning_rate": 2.618651618651619e-05, + "loss": 0.0697, "step": 2140 }, { - "epoch": 0.5427922241858116, - "grad_norm": 0.5128085613250732, - "learning_rate": 2.674324665488513e-05, - "loss": 0.0823, + "epoch": 0.6385506385506385, + "grad_norm": 0.6489042639732361, + "learning_rate": 2.616869616869617e-05, + "loss": 0.0762, "step": 2150 }, { - "epoch": 0.5453168391820248, - "grad_norm": 0.7316603064537048, - "learning_rate": 2.6728098964907855e-05, - "loss": 0.0736, + "epoch": 0.6415206415206415, + "grad_norm": 0.43688729405403137, + "learning_rate": 2.6150876150876153e-05, + "loss": 0.0856, "step": 2160 }, { - "epoch": 0.5478414541782378, - "grad_norm": 0.47318318486213684, - "learning_rate": 2.6712951274930573e-05, - "loss": 0.0714, + "epoch": 0.6444906444906445, + "grad_norm": 0.5854159593582153, + "learning_rate": 2.6133056133056135e-05, + "loss": 0.0923, "step": 2170 }, { - "epoch": 0.5503660691744509, - "grad_norm": 0.669712245464325, - "learning_rate": 2.6697803584953295e-05, - "loss": 0.0787, + "epoch": 0.6474606474606475, + "grad_norm": 0.4497719407081604, + "learning_rate": 2.6115236115236114e-05, + "loss": 0.0863, "step": 2180 }, { - "epoch": 0.552890684170664, - "grad_norm": 0.5920892357826233, - "learning_rate": 2.6682655894976016e-05, - "loss": 0.0787, + "epoch": 0.6504306504306504, + "grad_norm": 0.39971357583999634, + "learning_rate": 2.6097416097416096e-05, + "loss": 0.0693, "step": 2190 }, { - "epoch": 0.555415299166877, - "grad_norm": 0.3415123522281647, - "learning_rate": 2.6667508204998738e-05, - "loss": 0.0824, + "epoch": 0.6534006534006535, + "grad_norm": 0.6880261301994324, + "learning_rate": 2.6079596079596078e-05, + "loss": 0.0861, "step": 2200 }, { - "epoch": 0.5579399141630901, - "grad_norm": 0.37938541173934937, - "learning_rate": 2.665236051502146e-05, - "loss": 0.0749, + "epoch": 0.6563706563706564, + "grad_norm": 0.39452025294303894, + "learning_rate": 2.6061776061776064e-05, + "loss": 0.0666, "step": 2210 }, { - "epoch": 0.5604645291593032, - "grad_norm": 0.6105532646179199, - "learning_rate": 2.663721282504418e-05, - "loss": 0.0832, + "epoch": 0.6593406593406593, + "grad_norm": 0.4145357310771942, + "learning_rate": 2.6043956043956046e-05, + "loss": 0.07, "step": 2220 }, { - "epoch": 0.5629891441555163, - "grad_norm": 0.6222105026245117, - "learning_rate": 2.6622065135066903e-05, - "loss": 0.0662, + "epoch": 0.6623106623106623, + "grad_norm": 0.6330484747886658, + "learning_rate": 2.6026136026136028e-05, + "loss": 0.084, "step": 2230 }, { - "epoch": 0.5655137591517294, - "grad_norm": 0.7301434874534607, - "learning_rate": 2.6606917445089624e-05, - "loss": 0.082, + "epoch": 0.6652806652806653, + "grad_norm": 0.5894971489906311, + "learning_rate": 2.600831600831601e-05, + "loss": 0.0925, "step": 2240 }, { - "epoch": 0.5680383741479424, - "grad_norm": 0.36286047101020813, - "learning_rate": 2.6591769755112346e-05, - "loss": 0.0823, + "epoch": 0.6682506682506683, + "grad_norm": 0.3733588457107544, + "learning_rate": 2.599049599049599e-05, + "loss": 0.082, "step": 2250 }, { - "epoch": 0.5705629891441555, - "grad_norm": 0.483547180891037, - "learning_rate": 2.6576622065135068e-05, - "loss": 0.0882, + "epoch": 0.6712206712206712, + "grad_norm": 0.45527949929237366, + "learning_rate": 2.597267597267597e-05, + "loss": 0.0769, "step": 2260 }, { - "epoch": 0.5730876041403686, - "grad_norm": 0.3633301854133606, - "learning_rate": 2.656147437515779e-05, - "loss": 0.0847, + "epoch": 0.6741906741906742, + "grad_norm": 0.6295212507247925, + "learning_rate": 2.5954855954855953e-05, + "loss": 0.0798, "step": 2270 }, { - "epoch": 0.5756122191365817, - "grad_norm": 0.44909363985061646, - "learning_rate": 2.654632668518051e-05, - "loss": 0.0886, + "epoch": 0.6771606771606772, + "grad_norm": 0.4148741066455841, + "learning_rate": 2.593703593703594e-05, + "loss": 0.0702, "step": 2280 }, { - "epoch": 0.5781368341327947, - "grad_norm": 0.5327022671699524, - "learning_rate": 2.6531178995203232e-05, - "loss": 0.0714, + "epoch": 0.6801306801306801, + "grad_norm": 0.4446201026439667, + "learning_rate": 2.591921591921592e-05, + "loss": 0.081, "step": 2290 }, { - "epoch": 0.5806614491290079, - "grad_norm": 0.7483319044113159, - "learning_rate": 2.6516031305225954e-05, - "loss": 0.0806, + "epoch": 0.6831006831006831, + "grad_norm": 0.5348713397979736, + "learning_rate": 2.5901395901395903e-05, + "loss": 0.0804, "step": 2300 }, { - "epoch": 0.5831860641252209, - "grad_norm": 0.5000852346420288, - "learning_rate": 2.6500883615248676e-05, - "loss": 0.0725, + "epoch": 0.6860706860706861, + "grad_norm": 0.7064197659492493, + "learning_rate": 2.5883575883575886e-05, + "loss": 0.0766, "step": 2310 }, { - "epoch": 0.5857106791214339, - "grad_norm": 0.3488561809062958, - "learning_rate": 2.6485735925271397e-05, - "loss": 0.0653, + "epoch": 0.689040689040689, + "grad_norm": 0.5868175029754639, + "learning_rate": 2.5865755865755864e-05, + "loss": 0.0797, "step": 2320 }, { - "epoch": 0.5882352941176471, - "grad_norm": 0.32613256573677063, - "learning_rate": 2.647058823529412e-05, - "loss": 0.0682, + "epoch": 0.692010692010692, + "grad_norm": 0.6839095950126648, + "learning_rate": 2.5847935847935846e-05, + "loss": 0.0794, "step": 2330 }, { - "epoch": 0.5907599091138601, - "grad_norm": 0.8107202649116516, - "learning_rate": 2.645544054531684e-05, - "loss": 0.0848, + "epoch": 0.694980694980695, + "grad_norm": 0.41192343831062317, + "learning_rate": 2.583011583011583e-05, + "loss": 0.0706, "step": 2340 }, { - "epoch": 0.5932845241100733, - "grad_norm": 0.6575340628623962, - "learning_rate": 2.6440292855339562e-05, - "loss": 0.1013, + "epoch": 0.697950697950698, + "grad_norm": 0.7668315768241882, + "learning_rate": 2.5812295812295814e-05, + "loss": 0.0785, "step": 2350 }, { - "epoch": 0.5958091391062863, - "grad_norm": 0.43987488746643066, - "learning_rate": 2.6425145165362283e-05, - "loss": 0.0749, + "epoch": 0.7009207009207009, + "grad_norm": 0.43974947929382324, + "learning_rate": 2.5794475794475796e-05, + "loss": 0.0712, "step": 2360 }, { - "epoch": 0.5983337541024993, - "grad_norm": 0.9867390394210815, - "learning_rate": 2.6409997475385005e-05, - "loss": 0.0859, + "epoch": 0.7038907038907039, + "grad_norm": 0.3848420977592468, + "learning_rate": 2.577665577665578e-05, + "loss": 0.077, "step": 2370 }, { - "epoch": 0.6008583690987125, - "grad_norm": 0.5467984080314636, - "learning_rate": 2.6394849785407727e-05, - "loss": 0.0743, + "epoch": 0.7068607068607069, + "grad_norm": 0.6403735280036926, + "learning_rate": 2.575883575883576e-05, + "loss": 0.0729, "step": 2380 }, { - "epoch": 0.6033829840949255, - "grad_norm": 0.42555686831474304, - "learning_rate": 2.6379702095430448e-05, - "loss": 0.0906, + "epoch": 0.7098307098307098, + "grad_norm": 0.5417028665542603, + "learning_rate": 2.574101574101574e-05, + "loss": 0.0834, "step": 2390 }, { - "epoch": 0.6059075990911386, - "grad_norm": 0.33940818905830383, - "learning_rate": 2.636455440545317e-05, - "loss": 0.0734, + "epoch": 0.7128007128007128, + "grad_norm": 0.9361075162887573, + "learning_rate": 2.572319572319572e-05, + "loss": 0.077, "step": 2400 }, { - "epoch": 0.6084322140873517, - "grad_norm": 0.4564431309700012, - "learning_rate": 2.634940671547589e-05, - "loss": 0.0687, + "epoch": 0.7157707157707157, + "grad_norm": 0.483093798160553, + "learning_rate": 2.5705375705375707e-05, + "loss": 0.088, "step": 2410 }, { - "epoch": 0.6109568290835647, - "grad_norm": 0.591241180896759, - "learning_rate": 2.6334259025498613e-05, - "loss": 0.0701, + "epoch": 0.7187407187407188, + "grad_norm": 0.4506361782550812, + "learning_rate": 2.568755568755569e-05, + "loss": 0.0919, "step": 2420 }, { - "epoch": 0.6134814440797778, - "grad_norm": 0.49482131004333496, - "learning_rate": 2.6319111335521335e-05, - "loss": 0.0741, + "epoch": 0.7217107217107217, + "grad_norm": 0.6593904495239258, + "learning_rate": 2.566973566973567e-05, + "loss": 0.087, "step": 2430 }, { - "epoch": 0.6160060590759909, - "grad_norm": 0.32807767391204834, - "learning_rate": 2.6303963645544056e-05, - "loss": 0.0836, + "epoch": 0.7246807246807246, + "grad_norm": 0.5274522304534912, + "learning_rate": 2.5651915651915654e-05, + "loss": 0.0768, "step": 2440 }, { - "epoch": 0.618530674072204, - "grad_norm": 0.3978806734085083, - "learning_rate": 2.6288815955566778e-05, - "loss": 0.0796, + "epoch": 0.7276507276507277, + "grad_norm": 0.5065791606903076, + "learning_rate": 2.5634095634095636e-05, + "loss": 0.0828, "step": 2450 }, { - "epoch": 0.6210552890684171, - "grad_norm": 0.3510526418685913, - "learning_rate": 2.6273668265589496e-05, - "loss": 0.0735, + "epoch": 0.7306207306207306, + "grad_norm": 0.6130974888801575, + "learning_rate": 2.5616275616275615e-05, + "loss": 0.0742, "step": 2460 }, { - "epoch": 0.6235799040646302, - "grad_norm": 0.42759060859680176, - "learning_rate": 2.625852057561222e-05, - "loss": 0.0703, + "epoch": 0.7335907335907336, + "grad_norm": 0.6379355192184448, + "learning_rate": 2.5598455598455597e-05, + "loss": 0.0847, "step": 2470 }, { - "epoch": 0.6261045190608432, - "grad_norm": 0.7114939093589783, - "learning_rate": 2.624337288563494e-05, - "loss": 0.0843, + "epoch": 0.7365607365607365, + "grad_norm": 0.6738227009773254, + "learning_rate": 2.5580635580635582e-05, + "loss": 0.0793, "step": 2480 }, { - "epoch": 0.6286291340570563, - "grad_norm": 0.7506195306777954, - "learning_rate": 2.622822519565766e-05, - "loss": 0.0835, + "epoch": 0.7395307395307396, + "grad_norm": 0.6309618949890137, + "learning_rate": 2.5562815562815565e-05, + "loss": 0.0871, "step": 2490 }, { - "epoch": 0.6311537490532694, - "grad_norm": 0.41576099395751953, - "learning_rate": 2.6213077505680386e-05, - "loss": 0.0788, + "epoch": 0.7425007425007425, + "grad_norm": 0.2825660705566406, + "learning_rate": 2.5544995544995547e-05, + "loss": 0.074, "step": 2500 }, { - "epoch": 0.6336783640494824, - "grad_norm": 0.5157233476638794, - "learning_rate": 2.6197929815703104e-05, - "loss": 0.0765, + "epoch": 0.7454707454707454, + "grad_norm": 0.43583425879478455, + "learning_rate": 2.552717552717553e-05, + "loss": 0.0858, "step": 2510 }, { - "epoch": 0.6362029790456956, - "grad_norm": 0.532408595085144, - "learning_rate": 2.6182782125725826e-05, - "loss": 0.0861, + "epoch": 0.7484407484407485, + "grad_norm": 0.7557492256164551, + "learning_rate": 2.550935550935551e-05, + "loss": 0.0691, "step": 2520 }, { - "epoch": 0.6387275940419086, - "grad_norm": 0.4434366822242737, - "learning_rate": 2.616763443574855e-05, - "loss": 0.0619, + "epoch": 0.7514107514107514, + "grad_norm": 0.44126811623573303, + "learning_rate": 2.549153549153549e-05, + "loss": 0.0664, "step": 2530 }, { - "epoch": 0.6412522090381216, - "grad_norm": 0.5986734628677368, - "learning_rate": 2.615248674577127e-05, - "loss": 0.0902, + "epoch": 0.7543807543807544, + "grad_norm": 0.5966764092445374, + "learning_rate": 2.5473715473715472e-05, + "loss": 0.0766, "step": 2540 }, { - "epoch": 0.6437768240343348, - "grad_norm": 0.4010680615901947, - "learning_rate": 2.613733905579399e-05, - "loss": 0.0894, + "epoch": 0.7573507573507573, + "grad_norm": 0.4621107578277588, + "learning_rate": 2.5455895455895458e-05, + "loss": 0.0834, "step": 2550 }, { - "epoch": 0.6463014390305478, - "grad_norm": 0.9059281349182129, - "learning_rate": 2.6122191365816715e-05, - "loss": 0.0713, + "epoch": 0.7603207603207603, + "grad_norm": 0.593605637550354, + "learning_rate": 2.543807543807544e-05, + "loss": 0.0812, "step": 2560 }, { - "epoch": 0.648826054026761, - "grad_norm": 0.5254572033882141, - "learning_rate": 2.6107043675839434e-05, - "loss": 0.0818, + "epoch": 0.7632907632907633, + "grad_norm": 0.8139130473136902, + "learning_rate": 2.5420255420255422e-05, + "loss": 0.0663, "step": 2570 }, { - "epoch": 0.651350669022974, - "grad_norm": 0.3511335253715515, - "learning_rate": 2.6091895985862155e-05, - "loss": 0.0775, + "epoch": 0.7662607662607662, + "grad_norm": 0.4853007197380066, + "learning_rate": 2.5402435402435404e-05, + "loss": 0.0789, "step": 2580 }, { - "epoch": 0.653875284019187, - "grad_norm": 0.39499038457870483, - "learning_rate": 2.607674829588488e-05, - "loss": 0.0738, + "epoch": 0.7692307692307693, + "grad_norm": 0.4105505645275116, + "learning_rate": 2.5384615384615386e-05, + "loss": 0.0702, "step": 2590 }, { - "epoch": 0.6563998990154002, - "grad_norm": 0.4215822219848633, - "learning_rate": 2.60616006059076e-05, - "loss": 0.0704, + "epoch": 0.7722007722007722, + "grad_norm": 0.5971934795379639, + "learning_rate": 2.5366795366795365e-05, + "loss": 0.0847, "step": 2600 }, { - "epoch": 0.6589245140116132, - "grad_norm": 0.5732513070106506, - "learning_rate": 2.604645291593032e-05, - "loss": 0.066, + "epoch": 0.7751707751707752, + "grad_norm": 0.34833744168281555, + "learning_rate": 2.5348975348975347e-05, + "loss": 0.064, "step": 2610 }, { - "epoch": 0.6614491290078263, - "grad_norm": 0.6704108715057373, - "learning_rate": 2.6031305225953045e-05, - "loss": 0.0688, + "epoch": 0.7781407781407781, + "grad_norm": 0.35726526379585266, + "learning_rate": 2.5331155331155333e-05, + "loss": 0.0758, "step": 2620 }, { - "epoch": 0.6639737440040394, - "grad_norm": 0.47199252247810364, - "learning_rate": 2.6016157535975763e-05, - "loss": 0.0735, + "epoch": 0.7811107811107811, + "grad_norm": 0.4475048780441284, + "learning_rate": 2.5313335313335315e-05, + "loss": 0.0768, "step": 2630 }, { - "epoch": 0.6664983590002524, - "grad_norm": 0.7372543215751648, - "learning_rate": 2.6001009845998485e-05, - "loss": 0.0732, + "epoch": 0.7840807840807841, + "grad_norm": 0.48018935322761536, + "learning_rate": 2.5295515295515297e-05, + "loss": 0.0723, "step": 2640 }, { - "epoch": 0.6690229739964655, - "grad_norm": 0.6057668924331665, - "learning_rate": 2.598586215602121e-05, - "loss": 0.0812, + "epoch": 0.787050787050787, + "grad_norm": 0.47765350341796875, + "learning_rate": 2.527769527769528e-05, + "loss": 0.0765, "step": 2650 }, { - "epoch": 0.6715475889926786, - "grad_norm": 0.5473082065582275, - "learning_rate": 2.5970714466043928e-05, - "loss": 0.0792, + "epoch": 0.7900207900207901, + "grad_norm": 0.6376664638519287, + "learning_rate": 2.525987525987526e-05, + "loss": 0.0777, "step": 2660 }, { - "epoch": 0.6740722039888917, - "grad_norm": 0.5566405057907104, - "learning_rate": 2.595556677606665e-05, - "loss": 0.0784, + "epoch": 0.792990792990793, + "grad_norm": 0.7332932353019714, + "learning_rate": 2.524205524205524e-05, + "loss": 0.0963, "step": 2670 }, { - "epoch": 0.6765968189851048, - "grad_norm": 0.41001176834106445, - "learning_rate": 2.5940419086089375e-05, - "loss": 0.0836, + "epoch": 0.7959607959607959, + "grad_norm": 0.6165478825569153, + "learning_rate": 2.5224235224235222e-05, + "loss": 0.0827, "step": 2680 }, { - "epoch": 0.6791214339813179, - "grad_norm": 0.7479286789894104, - "learning_rate": 2.5925271396112093e-05, - "loss": 0.0766, + "epoch": 0.7989307989307989, + "grad_norm": 0.693350613117218, + "learning_rate": 2.5206415206415208e-05, + "loss": 0.0752, "step": 2690 }, { - "epoch": 0.6816460489775309, - "grad_norm": 0.5025691390037537, - "learning_rate": 2.5910123706134814e-05, - "loss": 0.0751, + "epoch": 0.8019008019008019, + "grad_norm": 0.5711894035339355, + "learning_rate": 2.518859518859519e-05, + "loss": 0.0699, "step": 2700 }, { - "epoch": 0.684170663973744, - "grad_norm": 0.5500156283378601, - "learning_rate": 2.589497601615754e-05, - "loss": 0.0838, + "epoch": 0.8048708048708049, + "grad_norm": 0.6042230725288391, + "learning_rate": 2.5170775170775172e-05, + "loss": 0.0681, "step": 2710 }, { - "epoch": 0.6866952789699571, - "grad_norm": 0.5829827785491943, - "learning_rate": 2.5879828326180258e-05, - "loss": 0.0836, + "epoch": 0.8078408078408078, + "grad_norm": 0.43989643454551697, + "learning_rate": 2.5152955152955155e-05, + "loss": 0.0684, "step": 2720 }, { - "epoch": 0.6892198939661701, - "grad_norm": 0.4228859543800354, - "learning_rate": 2.586468063620298e-05, - "loss": 0.0784, + "epoch": 0.8108108108108109, + "grad_norm": 0.3606058359146118, + "learning_rate": 2.5135135135135137e-05, + "loss": 0.0793, "step": 2730 }, { - "epoch": 0.6917445089623833, - "grad_norm": 0.46277573704719543, - "learning_rate": 2.5849532946225704e-05, - "loss": 0.0859, + "epoch": 0.8137808137808138, + "grad_norm": 0.578762412071228, + "learning_rate": 2.511731511731512e-05, + "loss": 0.0703, "step": 2740 }, { - "epoch": 0.6942691239585963, - "grad_norm": 0.446417897939682, - "learning_rate": 2.5834385256248422e-05, - "loss": 0.07, + "epoch": 0.8167508167508167, + "grad_norm": 0.5686031579971313, + "learning_rate": 2.5099495099495098e-05, + "loss": 0.0851, "step": 2750 }, { - "epoch": 0.6967937389548093, - "grad_norm": 0.5821120738983154, - "learning_rate": 2.5819237566271144e-05, - "loss": 0.0847, + "epoch": 0.8197208197208197, + "grad_norm": 0.5423585772514343, + "learning_rate": 2.5081675081675083e-05, + "loss": 0.0744, "step": 2760 }, { - "epoch": 0.6993183539510225, - "grad_norm": 0.4102267622947693, - "learning_rate": 2.5804089876293865e-05, - "loss": 0.0724, + "epoch": 0.8226908226908227, + "grad_norm": 0.6459795236587524, + "learning_rate": 2.5063855063855065e-05, + "loss": 0.0749, "step": 2770 }, { - "epoch": 0.7018429689472355, - "grad_norm": 0.6494462490081787, - "learning_rate": 2.5788942186316587e-05, - "loss": 0.0787, + "epoch": 0.8256608256608257, + "grad_norm": 0.5151922106742859, + "learning_rate": 2.5046035046035048e-05, + "loss": 0.0838, "step": 2780 }, { - "epoch": 0.7043675839434487, - "grad_norm": 0.45610910654067993, - "learning_rate": 2.577379449633931e-05, - "loss": 0.0789, + "epoch": 0.8286308286308286, + "grad_norm": 0.49044474959373474, + "learning_rate": 2.502821502821503e-05, + "loss": 0.081, "step": 2790 }, { - "epoch": 0.7068921989396617, - "grad_norm": 0.763862133026123, - "learning_rate": 2.575864680636203e-05, - "loss": 0.0691, + "epoch": 0.8316008316008316, + "grad_norm": 0.6159443855285645, + "learning_rate": 2.5010395010395012e-05, + "loss": 0.0814, "step": 2800 }, { - "epoch": 0.7094168139358747, - "grad_norm": 0.6916351318359375, - "learning_rate": 2.5743499116384752e-05, - "loss": 0.0714, + "epoch": 0.8345708345708346, + "grad_norm": 0.6860203146934509, + "learning_rate": 2.4992574992574994e-05, + "loss": 0.0731, "step": 2810 }, { - "epoch": 0.7119414289320879, - "grad_norm": 0.48409590125083923, - "learning_rate": 2.5728351426407473e-05, - "loss": 0.0705, + "epoch": 0.8375408375408375, + "grad_norm": 0.43102753162384033, + "learning_rate": 2.4974754974754973e-05, + "loss": 0.0867, "step": 2820 }, { - "epoch": 0.7144660439283009, - "grad_norm": 0.5418862700462341, - "learning_rate": 2.5713203736430195e-05, - "loss": 0.0785, + "epoch": 0.8405108405108405, + "grad_norm": 0.6863781809806824, + "learning_rate": 2.495693495693496e-05, + "loss": 0.0681, "step": 2830 }, { - "epoch": 0.716990658924514, - "grad_norm": 0.384924978017807, - "learning_rate": 2.5698056046452917e-05, - "loss": 0.0758, + "epoch": 0.8434808434808435, + "grad_norm": 0.6627882122993469, + "learning_rate": 2.493911493911494e-05, + "loss": 0.0692, "step": 2840 }, { - "epoch": 0.7195152739207271, - "grad_norm": 0.313711553812027, - "learning_rate": 2.5682908356475638e-05, - "loss": 0.0784, + "epoch": 0.8464508464508465, + "grad_norm": 0.556719183921814, + "learning_rate": 2.4921294921294923e-05, + "loss": 0.0942, "step": 2850 }, { - "epoch": 0.7220398889169402, - "grad_norm": 0.33260729908943176, - "learning_rate": 2.566776066649836e-05, - "loss": 0.0546, + "epoch": 0.8494208494208494, + "grad_norm": 0.6097808480262756, + "learning_rate": 2.4903474903474905e-05, + "loss": 0.0788, "step": 2860 }, { - "epoch": 0.7245645039131532, - "grad_norm": 0.8136101365089417, - "learning_rate": 2.565261297652108e-05, - "loss": 0.0762, + "epoch": 0.8523908523908524, + "grad_norm": 0.3771260976791382, + "learning_rate": 2.4885654885654887e-05, + "loss": 0.0872, "step": 2870 }, { - "epoch": 0.7270891189093663, - "grad_norm": 0.363004595041275, - "learning_rate": 2.5637465286543803e-05, - "loss": 0.0853, + "epoch": 0.8553608553608554, + "grad_norm": 0.2577713131904602, + "learning_rate": 2.486783486783487e-05, + "loss": 0.0849, "step": 2880 }, { - "epoch": 0.7296137339055794, - "grad_norm": 0.533036470413208, - "learning_rate": 2.5622317596566525e-05, - "loss": 0.0831, + "epoch": 0.8583308583308583, + "grad_norm": 0.6618907451629639, + "learning_rate": 2.4850014850014848e-05, + "loss": 0.0794, "step": 2890 }, { - "epoch": 0.7321383489017925, - "grad_norm": 0.5486681461334229, - "learning_rate": 2.5607169906589246e-05, - "loss": 0.0661, + "epoch": 0.8613008613008613, + "grad_norm": 0.33715909719467163, + "learning_rate": 2.4832194832194834e-05, + "loss": 0.0689, "step": 2900 }, { - "epoch": 0.7346629638980056, - "grad_norm": 0.4462204873561859, - "learning_rate": 2.5592022216611968e-05, - "loss": 0.0792, + "epoch": 0.8642708642708643, + "grad_norm": 0.5500791072845459, + "learning_rate": 2.4814374814374816e-05, + "loss": 0.0862, "step": 2910 }, { - "epoch": 0.7371875788942186, - "grad_norm": 0.39562398195266724, - "learning_rate": 2.557687452663469e-05, - "loss": 0.0905, + "epoch": 0.8672408672408672, + "grad_norm": 0.6228634119033813, + "learning_rate": 2.4796554796554798e-05, + "loss": 0.0769, "step": 2920 }, { - "epoch": 0.7397121938904317, - "grad_norm": 0.3897605836391449, - "learning_rate": 2.556172683665741e-05, - "loss": 0.0817, + "epoch": 0.8702108702108702, + "grad_norm": 0.8019270896911621, + "learning_rate": 2.477873477873478e-05, + "loss": 0.071, "step": 2930 }, { - "epoch": 0.7422368088866448, - "grad_norm": 0.44640547037124634, - "learning_rate": 2.5546579146680133e-05, - "loss": 0.0666, + "epoch": 0.8731808731808732, + "grad_norm": 0.47143444418907166, + "learning_rate": 2.4760914760914762e-05, + "loss": 0.0812, "step": 2940 }, { - "epoch": 0.7447614238828578, - "grad_norm": 0.3504391312599182, - "learning_rate": 2.5531431456702854e-05, - "loss": 0.0765, + "epoch": 0.8761508761508762, + "grad_norm": 0.47617822885513306, + "learning_rate": 2.4743094743094744e-05, + "loss": 0.0769, "step": 2950 }, { - "epoch": 0.747286038879071, - "grad_norm": 0.7800899147987366, - "learning_rate": 2.5516283766725576e-05, - "loss": 0.0763, + "epoch": 0.8791208791208791, + "grad_norm": 0.6791771054267883, + "learning_rate": 2.4725274725274723e-05, + "loss": 0.0693, "step": 2960 }, { - "epoch": 0.749810653875284, - "grad_norm": 0.6443584561347961, - "learning_rate": 2.5501136076748297e-05, - "loss": 0.0909, + "epoch": 0.882090882090882, + "grad_norm": 0.4986003339290619, + "learning_rate": 2.470745470745471e-05, + "loss": 0.0778, "step": 2970 }, { - "epoch": 0.752335268871497, - "grad_norm": 0.3258838951587677, - "learning_rate": 2.548598838677102e-05, - "loss": 0.0717, + "epoch": 0.8850608850608851, + "grad_norm": 0.351012647151947, + "learning_rate": 2.468963468963469e-05, + "loss": 0.073, "step": 2980 }, { - "epoch": 0.7548598838677102, - "grad_norm": 0.545174241065979, - "learning_rate": 2.547084069679374e-05, - "loss": 0.0675, + "epoch": 0.888030888030888, + "grad_norm": 0.6079609394073486, + "learning_rate": 2.4671814671814673e-05, + "loss": 0.075, "step": 2990 }, { - "epoch": 0.7573844988639232, - "grad_norm": 0.751238226890564, - "learning_rate": 2.5455693006816462e-05, - "loss": 0.0767, + "epoch": 0.891000891000891, + "grad_norm": 0.49167245626449585, + "learning_rate": 2.4653994653994655e-05, + "loss": 0.0745, "step": 3000 }, { - "epoch": 0.7599091138601364, - "grad_norm": 0.5401891469955444, - "learning_rate": 2.544054531683918e-05, - "loss": 0.0733, + "epoch": 0.893970893970894, + "grad_norm": 0.49965718388557434, + "learning_rate": 2.4636174636174637e-05, + "loss": 0.0861, "step": 3010 }, { - "epoch": 0.7624337288563494, - "grad_norm": 0.7195361256599426, - "learning_rate": 2.5425397626861905e-05, - "loss": 0.0724, + "epoch": 0.896940896940897, + "grad_norm": 0.5942029356956482, + "learning_rate": 2.461835461835462e-05, + "loss": 0.0775, "step": 3020 }, { - "epoch": 0.7649583438525625, - "grad_norm": 0.5175593495368958, - "learning_rate": 2.5410249936884627e-05, - "loss": 0.0819, + "epoch": 0.8999108999108999, + "grad_norm": 0.5431137084960938, + "learning_rate": 2.46005346005346e-05, + "loss": 0.0732, "step": 3030 }, { - "epoch": 0.7674829588487756, - "grad_norm": 0.5216336250305176, - "learning_rate": 2.5395102246907345e-05, - "loss": 0.0685, + "epoch": 0.9028809028809028, + "grad_norm": 0.4982147514820099, + "learning_rate": 2.4582714582714584e-05, + "loss": 0.0749, "step": 3040 }, { - "epoch": 0.7700075738449886, - "grad_norm": 0.618516743183136, - "learning_rate": 2.537995455693007e-05, - "loss": 0.0766, + "epoch": 0.9058509058509059, + "grad_norm": 0.6718347072601318, + "learning_rate": 2.4564894564894566e-05, + "loss": 0.0843, "step": 3050 }, { - "epoch": 0.7725321888412017, - "grad_norm": 0.6617169380187988, - "learning_rate": 2.536480686695279e-05, - "loss": 0.0909, + "epoch": 0.9088209088209088, + "grad_norm": 0.7574843168258667, + "learning_rate": 2.454707454707455e-05, + "loss": 0.0769, "step": 3060 }, { - "epoch": 0.7750568038374148, - "grad_norm": 0.43062853813171387, - "learning_rate": 2.534965917697551e-05, - "loss": 0.0671, + "epoch": 0.9117909117909118, + "grad_norm": 0.5467488169670105, + "learning_rate": 2.452925452925453e-05, + "loss": 0.0802, "step": 3070 }, { - "epoch": 0.7775814188336279, - "grad_norm": 0.3703617453575134, - "learning_rate": 2.5334511486998235e-05, - "loss": 0.0685, + "epoch": 0.9147609147609148, + "grad_norm": 0.4699064791202545, + "learning_rate": 2.4511434511434513e-05, + "loss": 0.0813, "step": 3080 }, { - "epoch": 0.7801060338298409, - "grad_norm": 0.397579550743103, - "learning_rate": 2.5319363797020953e-05, - "loss": 0.0764, + "epoch": 0.9177309177309178, + "grad_norm": 0.4939485788345337, + "learning_rate": 2.4493614493614495e-05, + "loss": 0.0763, "step": 3090 }, { - "epoch": 0.782630648826054, - "grad_norm": 0.42598873376846313, - "learning_rate": 2.5304216107043675e-05, - "loss": 0.0708, + "epoch": 0.9207009207009207, + "grad_norm": 0.4790801405906677, + "learning_rate": 2.4475794475794474e-05, + "loss": 0.0765, "step": 3100 }, { - "epoch": 0.7851552638222671, - "grad_norm": 0.49477070569992065, - "learning_rate": 2.52890684170664e-05, - "loss": 0.0755, + "epoch": 0.9236709236709236, + "grad_norm": 0.3700208365917206, + "learning_rate": 2.445797445797446e-05, + "loss": 0.0862, "step": 3110 }, { - "epoch": 0.7876798788184802, - "grad_norm": 0.3231316804885864, - "learning_rate": 2.5273920727089118e-05, - "loss": 0.0702, + "epoch": 0.9266409266409267, + "grad_norm": 0.5105488300323486, + "learning_rate": 2.444015444015444e-05, + "loss": 0.0773, "step": 3120 }, { - "epoch": 0.7902044938146933, - "grad_norm": 0.4189174175262451, - "learning_rate": 2.525877303711184e-05, - "loss": 0.0853, + "epoch": 0.9296109296109296, + "grad_norm": 0.3455560803413391, + "learning_rate": 2.4422334422334424e-05, + "loss": 0.0716, "step": 3130 }, { - "epoch": 0.7927291088109063, - "grad_norm": 0.48693427443504333, - "learning_rate": 2.5243625347134564e-05, - "loss": 0.0669, + "epoch": 0.9325809325809326, + "grad_norm": 0.5318461656570435, + "learning_rate": 2.4404514404514406e-05, + "loss": 0.079, "step": 3140 }, { - "epoch": 0.7952537238071195, - "grad_norm": 0.48928236961364746, - "learning_rate": 2.5228477657157283e-05, - "loss": 0.0855, + "epoch": 0.9355509355509356, + "grad_norm": 0.42595726251602173, + "learning_rate": 2.4386694386694388e-05, + "loss": 0.0892, "step": 3150 }, { - "epoch": 0.7977783388033325, - "grad_norm": 0.4300285279750824, - "learning_rate": 2.5213329967180004e-05, - "loss": 0.0947, + "epoch": 0.9385209385209385, + "grad_norm": 0.651802659034729, + "learning_rate": 2.436887436887437e-05, + "loss": 0.0793, "step": 3160 }, { - "epoch": 0.8003029537995455, - "grad_norm": 0.5028986930847168, - "learning_rate": 2.519818227720273e-05, - "loss": 0.0901, + "epoch": 0.9414909414909415, + "grad_norm": 0.6579793095588684, + "learning_rate": 2.435105435105435e-05, + "loss": 0.0661, "step": 3170 }, { - "epoch": 0.8028275687957587, - "grad_norm": 0.6309892535209656, - "learning_rate": 2.5183034587225447e-05, - "loss": 0.072, + "epoch": 0.9444609444609444, + "grad_norm": 0.5980479717254639, + "learning_rate": 2.4333234333234334e-05, + "loss": 0.0757, "step": 3180 }, { - "epoch": 0.8053521837919717, - "grad_norm": 0.34405893087387085, - "learning_rate": 2.516788689724817e-05, - "loss": 0.0805, + "epoch": 0.9474309474309475, + "grad_norm": 0.5788313746452332, + "learning_rate": 2.4315414315414317e-05, + "loss": 0.0921, "step": 3190 }, { - "epoch": 0.8078767987881847, - "grad_norm": 0.5418539643287659, - "learning_rate": 2.5152739207270894e-05, - "loss": 0.0638, + "epoch": 0.9504009504009504, + "grad_norm": 0.47703874111175537, + "learning_rate": 2.42975942975943e-05, + "loss": 0.0661, "step": 3200 }, { - "epoch": 0.8104014137843979, - "grad_norm": 0.47962188720703125, - "learning_rate": 2.5137591517293612e-05, - "loss": 0.0646, + "epoch": 0.9533709533709533, + "grad_norm": 0.5644926428794861, + "learning_rate": 2.427977427977428e-05, + "loss": 0.0706, "step": 3210 }, { - "epoch": 0.8129260287806109, - "grad_norm": 0.2669812738895416, - "learning_rate": 2.5122443827316334e-05, - "loss": 0.0737, + "epoch": 0.9563409563409564, + "grad_norm": 0.6008754372596741, + "learning_rate": 2.4261954261954263e-05, + "loss": 0.0757, "step": 3220 }, { - "epoch": 0.8154506437768241, - "grad_norm": 0.5387107133865356, - "learning_rate": 2.510729613733906e-05, - "loss": 0.0855, + "epoch": 0.9593109593109593, + "grad_norm": 0.5607688426971436, + "learning_rate": 2.4244134244134245e-05, + "loss": 0.0718, "step": 3230 }, { - "epoch": 0.8179752587730371, - "grad_norm": 1.0121177434921265, - "learning_rate": 2.5092148447361777e-05, - "loss": 0.0778, + "epoch": 0.9622809622809623, + "grad_norm": 0.7359547019004822, + "learning_rate": 2.4226314226314224e-05, + "loss": 0.083, "step": 3240 }, { - "epoch": 0.8204998737692502, - "grad_norm": 0.5904279351234436, - "learning_rate": 2.50770007573845e-05, - "loss": 0.0713, + "epoch": 0.9652509652509652, + "grad_norm": 0.596994161605835, + "learning_rate": 2.420849420849421e-05, + "loss": 0.0717, "step": 3250 }, { - "epoch": 0.8230244887654633, - "grad_norm": 0.49105462431907654, - "learning_rate": 2.5061853067407224e-05, - "loss": 0.0758, + "epoch": 0.9682209682209683, + "grad_norm": 0.7237496972084045, + "learning_rate": 2.4190674190674192e-05, + "loss": 0.0767, "step": 3260 }, { - "epoch": 0.8255491037616763, - "grad_norm": 0.8744997382164001, - "learning_rate": 2.5046705377429942e-05, - "loss": 0.0769, + "epoch": 0.9711909711909712, + "grad_norm": 0.6103200316429138, + "learning_rate": 2.4172854172854174e-05, + "loss": 0.0738, "step": 3270 }, { - "epoch": 0.8280737187578894, - "grad_norm": 0.7501155734062195, - "learning_rate": 2.5031557687452663e-05, - "loss": 0.0735, + "epoch": 0.9741609741609741, + "grad_norm": 0.7314611077308655, + "learning_rate": 2.4155034155034156e-05, + "loss": 0.0851, "step": 3280 }, { - "epoch": 0.8305983337541025, - "grad_norm": 0.42036619782447815, - "learning_rate": 2.501640999747539e-05, - "loss": 0.0822, + "epoch": 0.9771309771309772, + "grad_norm": 0.496187299489975, + "learning_rate": 2.4137214137214138e-05, + "loss": 0.0757, "step": 3290 }, { - "epoch": 0.8331229487503156, - "grad_norm": 0.44184601306915283, - "learning_rate": 2.5001262307498107e-05, - "loss": 0.072, + "epoch": 0.9801009801009801, + "grad_norm": 0.5102724432945251, + "learning_rate": 2.411939411939412e-05, + "loss": 0.0705, "step": 3300 }, { - "epoch": 0.8356475637465287, - "grad_norm": 0.3335505425930023, - "learning_rate": 2.4986114617520828e-05, - "loss": 0.0723, + "epoch": 0.9830709830709831, + "grad_norm": 0.43364787101745605, + "learning_rate": 2.4101574101574103e-05, + "loss": 0.0594, "step": 3310 }, { - "epoch": 0.8381721787427417, - "grad_norm": 0.4426600933074951, - "learning_rate": 2.497096692754355e-05, - "loss": 0.0789, + "epoch": 0.986040986040986, + "grad_norm": 0.5329870581626892, + "learning_rate": 2.4083754083754085e-05, + "loss": 0.0757, "step": 3320 }, { - "epoch": 0.8406967937389548, - "grad_norm": 0.45263999700546265, - "learning_rate": 2.495581923756627e-05, - "loss": 0.0744, + "epoch": 0.989010989010989, + "grad_norm": 0.5290941596031189, + "learning_rate": 2.4065934065934067e-05, + "loss": 0.0798, "step": 3330 }, { - "epoch": 0.8432214087351679, - "grad_norm": 0.2730228900909424, - "learning_rate": 2.4940671547588993e-05, - "loss": 0.0712, + "epoch": 0.991980991980992, + "grad_norm": 0.5744608044624329, + "learning_rate": 2.404811404811405e-05, + "loss": 0.072, "step": 3340 }, { - "epoch": 0.845746023731381, - "grad_norm": 0.34930211305618286, - "learning_rate": 2.4925523857611715e-05, - "loss": 0.0699, + "epoch": 0.9949509949509949, + "grad_norm": 0.5449424386024475, + "learning_rate": 2.403029403029403e-05, + "loss": 0.0827, "step": 3350 }, { - "epoch": 0.848270638727594, - "grad_norm": 0.3079373240470886, - "learning_rate": 2.4910376167634436e-05, - "loss": 0.0693, + "epoch": 0.997920997920998, + "grad_norm": 0.5638298392295837, + "learning_rate": 2.4012474012474013e-05, + "loss": 0.0796, "step": 3360 }, { - "epoch": 0.8507952537238072, - "grad_norm": 0.5083094239234924, - "learning_rate": 2.4895228477657158e-05, - "loss": 0.08, + "epoch": 1.0, + "eval_f1": 0.49727767695099817, + "eval_loss": 0.0686563029885292, + "eval_runtime": 821.5096, + "eval_samples_per_second": 46.279, + "eval_steps_per_second": 0.724, + "step": 3367 + }, + { + "epoch": 1.0008910008910008, + "grad_norm": 0.5497238039970398, + "learning_rate": 2.3994653994653996e-05, + "loss": 0.0703, "step": 3370 }, { - "epoch": 0.8533198687200202, - "grad_norm": 0.40638256072998047, - "learning_rate": 2.488008078767988e-05, - "loss": 0.0752, + "epoch": 1.0038610038610039, + "grad_norm": 0.3895362913608551, + "learning_rate": 2.3976833976833978e-05, + "loss": 0.0714, "step": 3380 }, { - "epoch": 0.8558444837162332, - "grad_norm": 0.5014817714691162, - "learning_rate": 2.48649330977026e-05, - "loss": 0.0719, + "epoch": 1.006831006831007, + "grad_norm": 0.5208247900009155, + "learning_rate": 2.395901395901396e-05, + "loss": 0.0882, "step": 3390 }, { - "epoch": 0.8583690987124464, - "grad_norm": 0.5369516015052795, - "learning_rate": 2.4849785407725323e-05, - "loss": 0.0846, + "epoch": 1.0098010098010097, + "grad_norm": 0.4272199273109436, + "learning_rate": 2.3941193941193942e-05, + "loss": 0.0735, "step": 3400 }, { - "epoch": 0.8608937137086594, - "grad_norm": 0.5256063342094421, - "learning_rate": 2.4834637717748044e-05, - "loss": 0.0902, + "epoch": 1.0127710127710128, + "grad_norm": 0.5025156140327454, + "learning_rate": 2.3923373923373924e-05, + "loss": 0.0706, "step": 3410 }, { - "epoch": 0.8634183287048726, - "grad_norm": 0.4781491756439209, - "learning_rate": 2.4819490027770766e-05, - "loss": 0.0902, + "epoch": 1.0157410157410158, + "grad_norm": 0.3242335617542267, + "learning_rate": 2.3905553905553906e-05, + "loss": 0.0678, "step": 3420 }, { - "epoch": 0.8659429437010856, - "grad_norm": 0.48606160283088684, - "learning_rate": 2.4804342337793487e-05, - "loss": 0.0868, + "epoch": 1.0187110187110187, + "grad_norm": 0.3997895121574402, + "learning_rate": 2.388773388773389e-05, + "loss": 0.0812, "step": 3430 }, { - "epoch": 0.8684675586972986, - "grad_norm": 0.6008639931678772, - "learning_rate": 2.478919464781621e-05, - "loss": 0.0726, + "epoch": 1.0216810216810217, + "grad_norm": 0.752778172492981, + "learning_rate": 2.386991386991387e-05, + "loss": 0.0884, "step": 3440 }, { - "epoch": 0.8709921736935118, - "grad_norm": 0.35723280906677246, - "learning_rate": 2.477404695783893e-05, - "loss": 0.0614, + "epoch": 1.0246510246510248, + "grad_norm": 0.8602269291877747, + "learning_rate": 2.3852093852093853e-05, + "loss": 0.0878, "step": 3450 }, { - "epoch": 0.8735167886897248, - "grad_norm": 0.8890093564987183, - "learning_rate": 2.4758899267861652e-05, - "loss": 0.0674, + "epoch": 1.0276210276210276, + "grad_norm": 0.4281240403652191, + "learning_rate": 2.3834273834273835e-05, + "loss": 0.0718, "step": 3460 }, { - "epoch": 0.8760414036859379, - "grad_norm": 0.6494946479797363, - "learning_rate": 2.4743751577884374e-05, - "loss": 0.0679, + "epoch": 1.0305910305910306, + "grad_norm": 0.5941810607910156, + "learning_rate": 2.3816453816453817e-05, + "loss": 0.0737, "step": 3470 }, { - "epoch": 0.878566018682151, - "grad_norm": 0.5928673148155212, - "learning_rate": 2.4728603887907095e-05, - "loss": 0.0638, + "epoch": 1.0335610335610335, + "grad_norm": 0.573628306388855, + "learning_rate": 2.37986337986338e-05, + "loss": 0.0659, "step": 3480 }, { - "epoch": 0.881090633678364, - "grad_norm": 0.3160926103591919, - "learning_rate": 2.4713456197929817e-05, - "loss": 0.0826, + "epoch": 1.0365310365310365, + "grad_norm": 0.6910396814346313, + "learning_rate": 2.378081378081378e-05, + "loss": 0.084, "step": 3490 }, { - "epoch": 0.8836152486745771, - "grad_norm": 0.585054874420166, - "learning_rate": 2.469830850795254e-05, - "loss": 0.0732, + "epoch": 1.0395010395010396, + "grad_norm": 0.38856300711631775, + "learning_rate": 2.3762993762993764e-05, + "loss": 0.0761, "step": 3500 }, { - "epoch": 0.8861398636707902, - "grad_norm": 0.4782266318798065, - "learning_rate": 2.468316081797526e-05, - "loss": 0.0752, + "epoch": 1.0424710424710424, + "grad_norm": 0.41457536816596985, + "learning_rate": 2.3745173745173746e-05, + "loss": 0.082, "step": 3510 }, { - "epoch": 0.8886644786670033, - "grad_norm": 0.43204379081726074, - "learning_rate": 2.466801312799798e-05, - "loss": 0.089, + "epoch": 1.0454410454410454, + "grad_norm": 0.6538494825363159, + "learning_rate": 2.3727353727353728e-05, + "loss": 0.0817, "step": 3520 }, { - "epoch": 0.8911890936632164, - "grad_norm": 0.5396738052368164, - "learning_rate": 2.46528654380207e-05, - "loss": 0.0818, + "epoch": 1.0484110484110485, + "grad_norm": 0.3478659689426422, + "learning_rate": 2.370953370953371e-05, + "loss": 0.0851, "step": 3530 }, { - "epoch": 0.8937137086594295, - "grad_norm": 0.39785364270210266, - "learning_rate": 2.4637717748043425e-05, - "loss": 0.0775, + "epoch": 1.0513810513810513, + "grad_norm": 0.546033501625061, + "learning_rate": 2.3691713691713692e-05, + "loss": 0.084, "step": 3540 }, { - "epoch": 0.8962383236556425, - "grad_norm": 0.41307616233825684, - "learning_rate": 2.4622570058066146e-05, - "loss": 0.0754, + "epoch": 1.0543510543510544, + "grad_norm": 0.4026525020599365, + "learning_rate": 2.3673893673893675e-05, + "loss": 0.0725, "step": 3550 }, { - "epoch": 0.8987629386518556, - "grad_norm": 0.5757405161857605, - "learning_rate": 2.4607422368088865e-05, - "loss": 0.0808, + "epoch": 1.0573210573210574, + "grad_norm": 0.5000739097595215, + "learning_rate": 2.3656073656073657e-05, + "loss": 0.0822, "step": 3560 }, { - "epoch": 0.9012875536480687, - "grad_norm": 0.4765954911708832, - "learning_rate": 2.459227467811159e-05, - "loss": 0.0752, + "epoch": 1.0602910602910602, + "grad_norm": 0.48692411184310913, + "learning_rate": 2.363825363825364e-05, + "loss": 0.0796, "step": 3570 }, { - "epoch": 0.9038121686442817, - "grad_norm": 0.553316593170166, - "learning_rate": 2.457712698813431e-05, - "loss": 0.0646, + "epoch": 1.0632610632610633, + "grad_norm": 0.5664608478546143, + "learning_rate": 2.362043362043362e-05, + "loss": 0.0661, "step": 3580 }, { - "epoch": 0.9063367836404949, - "grad_norm": 0.6468276977539062, - "learning_rate": 2.456197929815703e-05, - "loss": 0.0804, + "epoch": 1.0662310662310661, + "grad_norm": 0.502124547958374, + "learning_rate": 2.3602613602613603e-05, + "loss": 0.0638, "step": 3590 }, { - "epoch": 0.9088613986367079, - "grad_norm": 0.38345563411712646, - "learning_rate": 2.4546831608179754e-05, - "loss": 0.0761, + "epoch": 1.0692010692010692, + "grad_norm": 0.5469791889190674, + "learning_rate": 2.3584793584793586e-05, + "loss": 0.0719, "step": 3600 }, { - "epoch": 0.9113860136329209, - "grad_norm": 0.38474252820014954, - "learning_rate": 2.4531683918202476e-05, - "loss": 0.0543, + "epoch": 1.0721710721710722, + "grad_norm": 0.5133867859840393, + "learning_rate": 2.3566973566973568e-05, + "loss": 0.0772, "step": 3610 }, { - "epoch": 0.9139106286291341, - "grad_norm": 0.44275468587875366, - "learning_rate": 2.4516536228225194e-05, - "loss": 0.0909, + "epoch": 1.075141075141075, + "grad_norm": 0.5197412371635437, + "learning_rate": 2.354915354915355e-05, + "loss": 0.0862, "step": 3620 }, { - "epoch": 0.9164352436253471, - "grad_norm": 0.40391191840171814, - "learning_rate": 2.450138853824792e-05, - "loss": 0.0655, + "epoch": 1.078111078111078, + "grad_norm": 0.4368208050727844, + "learning_rate": 2.3531333531333532e-05, + "loss": 0.0618, "step": 3630 }, { - "epoch": 0.9189598586215603, - "grad_norm": 0.7056028246879578, - "learning_rate": 2.4486240848270637e-05, - "loss": 0.0774, + "epoch": 1.0810810810810811, + "grad_norm": 0.46737584471702576, + "learning_rate": 2.3513513513513514e-05, + "loss": 0.0618, "step": 3640 }, { - "epoch": 0.9214844736177733, - "grad_norm": 0.6490464806556702, - "learning_rate": 2.447109315829336e-05, - "loss": 0.0827, + "epoch": 1.084051084051084, + "grad_norm": 0.46774664521217346, + "learning_rate": 2.3495693495693496e-05, + "loss": 0.0844, "step": 3650 }, { - "epoch": 0.9240090886139863, - "grad_norm": 0.6265650391578674, - "learning_rate": 2.4455945468316084e-05, - "loss": 0.0795, + "epoch": 1.087021087021087, + "grad_norm": 0.5892476439476013, + "learning_rate": 2.347787347787348e-05, + "loss": 0.0823, "step": 3660 }, { - "epoch": 0.9265337036101995, - "grad_norm": 0.38877788186073303, - "learning_rate": 2.4440797778338802e-05, - "loss": 0.0741, + "epoch": 1.08999108999109, + "grad_norm": 0.32615166902542114, + "learning_rate": 2.346005346005346e-05, + "loss": 0.0579, "step": 3670 }, { - "epoch": 0.9290583186064125, - "grad_norm": 0.5186750888824463, - "learning_rate": 2.4425650088361524e-05, - "loss": 0.0725, + "epoch": 1.092961092961093, + "grad_norm": 0.4170238673686981, + "learning_rate": 2.3442233442233443e-05, + "loss": 0.0655, "step": 3680 }, { - "epoch": 0.9315829336026256, - "grad_norm": 0.6181837916374207, - "learning_rate": 2.441050239838425e-05, - "loss": 0.0807, + "epoch": 1.095931095931096, + "grad_norm": 0.4704936146736145, + "learning_rate": 2.3424413424413425e-05, + "loss": 0.0814, "step": 3690 }, { - "epoch": 0.9341075485988387, - "grad_norm": 0.4812741279602051, - "learning_rate": 2.4395354708406967e-05, - "loss": 0.0806, + "epoch": 1.098901098901099, + "grad_norm": 0.5191180109977722, + "learning_rate": 2.3406593406593407e-05, + "loss": 0.0787, "step": 3700 }, { - "epoch": 0.9366321635950517, - "grad_norm": 0.45176827907562256, - "learning_rate": 2.438020701842969e-05, - "loss": 0.0712, + "epoch": 1.1018711018711018, + "grad_norm": 0.48460114002227783, + "learning_rate": 2.338877338877339e-05, + "loss": 0.0522, "step": 3710 }, { - "epoch": 0.9391567785912648, - "grad_norm": 0.5601783990859985, - "learning_rate": 2.4365059328452414e-05, - "loss": 0.0747, + "epoch": 1.1048411048411049, + "grad_norm": 0.5503575205802917, + "learning_rate": 2.337095337095337e-05, + "loss": 0.0769, "step": 3720 }, { - "epoch": 0.9416813935874779, - "grad_norm": 0.5894845128059387, - "learning_rate": 2.4349911638475132e-05, - "loss": 0.0795, + "epoch": 1.107811107811108, + "grad_norm": 0.6398834586143494, + "learning_rate": 2.3353133353133354e-05, + "loss": 0.0664, "step": 3730 }, { - "epoch": 0.944206008583691, - "grad_norm": 0.42322757840156555, - "learning_rate": 2.4334763948497853e-05, - "loss": 0.072, + "epoch": 1.1107811107811107, + "grad_norm": 0.39908480644226074, + "learning_rate": 2.3335313335313336e-05, + "loss": 0.0759, "step": 3740 }, { - "epoch": 0.9467306235799041, - "grad_norm": 0.5656612515449524, - "learning_rate": 2.431961625852058e-05, - "loss": 0.0689, + "epoch": 1.1137511137511138, + "grad_norm": 0.4675776958465576, + "learning_rate": 2.3317493317493318e-05, + "loss": 0.0765, "step": 3750 }, { - "epoch": 0.9492552385761172, - "grad_norm": 0.4395703971385956, - "learning_rate": 2.4304468568543297e-05, - "loss": 0.0916, + "epoch": 1.1167211167211166, + "grad_norm": 0.350972443819046, + "learning_rate": 2.32996732996733e-05, + "loss": 0.0777, "step": 3760 }, { - "epoch": 0.9517798535723302, - "grad_norm": 0.3628901243209839, - "learning_rate": 2.4289320878566018e-05, - "loss": 0.0727, + "epoch": 1.1196911196911197, + "grad_norm": 0.4611550569534302, + "learning_rate": 2.3281853281853282e-05, + "loss": 0.0709, "step": 3770 }, { - "epoch": 0.9543044685685433, - "grad_norm": 0.3859265148639679, - "learning_rate": 2.4274173188588743e-05, - "loss": 0.0835, + "epoch": 1.1226611226611227, + "grad_norm": 0.5342544913291931, + "learning_rate": 2.3264033264033265e-05, + "loss": 0.0649, "step": 3780 }, { - "epoch": 0.9568290835647564, - "grad_norm": 0.2310730367898941, - "learning_rate": 2.425902549861146e-05, - "loss": 0.0687, + "epoch": 1.1256311256311256, + "grad_norm": 0.6507514119148254, + "learning_rate": 2.3246213246213247e-05, + "loss": 0.076, "step": 3790 }, { - "epoch": 0.9593536985609694, - "grad_norm": 0.30712732672691345, - "learning_rate": 2.4243877808634183e-05, - "loss": 0.0695, + "epoch": 1.1286011286011286, + "grad_norm": 0.7478254437446594, + "learning_rate": 2.322839322839323e-05, + "loss": 0.0857, "step": 3800 }, { - "epoch": 0.9618783135571826, - "grad_norm": 0.7812157273292542, - "learning_rate": 2.4228730118656908e-05, - "loss": 0.0709, + "epoch": 1.1315711315711316, + "grad_norm": 0.5067834258079529, + "learning_rate": 2.321057321057321e-05, + "loss": 0.0745, "step": 3810 }, { - "epoch": 0.9644029285533956, - "grad_norm": 0.3865745961666107, - "learning_rate": 2.4213582428679626e-05, - "loss": 0.0837, + "epoch": 1.1345411345411345, + "grad_norm": 0.6091060042381287, + "learning_rate": 2.3192753192753193e-05, + "loss": 0.0761, "step": 3820 }, { - "epoch": 0.9669275435496086, - "grad_norm": 0.5264056921005249, - "learning_rate": 2.4198434738702348e-05, - "loss": 0.0771, + "epoch": 1.1375111375111375, + "grad_norm": 0.4694317579269409, + "learning_rate": 2.3174933174933175e-05, + "loss": 0.0815, "step": 3830 }, { - "epoch": 0.9694521585458218, - "grad_norm": 0.4684106111526489, - "learning_rate": 2.4183287048725073e-05, - "loss": 0.0876, + "epoch": 1.1404811404811406, + "grad_norm": 0.5222705006599426, + "learning_rate": 2.3157113157113158e-05, + "loss": 0.0788, "step": 3840 }, { - "epoch": 0.9719767735420348, - "grad_norm": 0.37889453768730164, - "learning_rate": 2.416813935874779e-05, - "loss": 0.078, + "epoch": 1.1434511434511434, + "grad_norm": 0.5226296782493591, + "learning_rate": 2.313929313929314e-05, + "loss": 0.0773, "step": 3850 }, { - "epoch": 0.974501388538248, - "grad_norm": 0.4028097987174988, - "learning_rate": 2.4152991668770513e-05, - "loss": 0.0825, + "epoch": 1.1464211464211465, + "grad_norm": 0.5545721054077148, + "learning_rate": 2.3121473121473122e-05, + "loss": 0.0675, "step": 3860 }, { - "epoch": 0.977026003534461, - "grad_norm": 0.4436962902545929, - "learning_rate": 2.4137843978793237e-05, - "loss": 0.0627, + "epoch": 1.1493911493911493, + "grad_norm": 0.5250979065895081, + "learning_rate": 2.3103653103653104e-05, + "loss": 0.0815, "step": 3870 }, { - "epoch": 0.979550618530674, - "grad_norm": 0.35176676511764526, - "learning_rate": 2.4122696288815956e-05, - "loss": 0.0706, + "epoch": 1.1523611523611523, + "grad_norm": 0.4267248213291168, + "learning_rate": 2.3085833085833086e-05, + "loss": 0.0704, "step": 3880 }, { - "epoch": 0.9820752335268872, - "grad_norm": 0.5100188255310059, - "learning_rate": 2.4107548598838677e-05, - "loss": 0.066, + "epoch": 1.1553311553311554, + "grad_norm": 0.3308209478855133, + "learning_rate": 2.306801306801307e-05, + "loss": 0.08, "step": 3890 }, { - "epoch": 0.9845998485231002, - "grad_norm": 0.3400685489177704, - "learning_rate": 2.40924009088614e-05, - "loss": 0.0611, + "epoch": 1.1583011583011582, + "grad_norm": 0.49279993772506714, + "learning_rate": 2.305019305019305e-05, + "loss": 0.0868, "step": 3900 }, { - "epoch": 0.9871244635193133, - "grad_norm": 0.4831116497516632, - "learning_rate": 2.407725321888412e-05, - "loss": 0.0907, + "epoch": 1.1612711612711613, + "grad_norm": 0.49307748675346375, + "learning_rate": 2.3032373032373033e-05, + "loss": 0.081, "step": 3910 }, { - "epoch": 0.9896490785155264, - "grad_norm": 0.7926459312438965, - "learning_rate": 2.4062105528906842e-05, - "loss": 0.0956, + "epoch": 1.1642411642411643, + "grad_norm": 0.691349446773529, + "learning_rate": 2.3014553014553015e-05, + "loss": 0.0712, "step": 3920 }, { - "epoch": 0.9921736935117395, - "grad_norm": 0.5712438225746155, - "learning_rate": 2.4046957838929564e-05, - "loss": 0.0807, + "epoch": 1.1672111672111671, + "grad_norm": 0.4932047724723816, + "learning_rate": 2.2996732996732997e-05, + "loss": 0.0683, "step": 3930 }, { - "epoch": 0.9946983085079525, - "grad_norm": 0.4294516146183014, - "learning_rate": 2.4031810148952285e-05, - "loss": 0.075, + "epoch": 1.1701811701811702, + "grad_norm": 0.5138940811157227, + "learning_rate": 2.297891297891298e-05, + "loss": 0.0646, "step": 3940 }, { - "epoch": 0.9972229235041656, - "grad_norm": 0.37027591466903687, - "learning_rate": 2.4016662458975007e-05, - "loss": 0.071, + "epoch": 1.1731511731511732, + "grad_norm": 0.4573695659637451, + "learning_rate": 2.2961092961092965e-05, + "loss": 0.0593, "step": 3950 }, { - "epoch": 0.9997475385003787, - "grad_norm": 0.5219939947128296, - "learning_rate": 2.400151476899773e-05, - "loss": 0.0659, + "epoch": 1.176121176121176, + "grad_norm": 0.6048777103424072, + "learning_rate": 2.2943272943272944e-05, + "loss": 0.0768, "step": 3960 }, { - "epoch": 1.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.05425933748483658, - "eval_runtime": 969.6738, - "eval_samples_per_second": 212.713, - "eval_steps_per_second": 3.324, - "step": 3961 - }, - { - "epoch": 1.0022721534965917, - "grad_norm": 0.4360177218914032, - "learning_rate": 2.398636707902045e-05, - "loss": 0.0797, + "epoch": 1.179091179091179, + "grad_norm": 0.6311981678009033, + "learning_rate": 2.2925452925452926e-05, + "loss": 0.0901, "step": 3970 }, { - "epoch": 1.0047967684928047, - "grad_norm": 0.6967335343360901, - "learning_rate": 2.397121938904317e-05, - "loss": 0.0688, + "epoch": 1.1820611820611822, + "grad_norm": 0.4408791661262512, + "learning_rate": 2.2907632907632908e-05, + "loss": 0.0729, "step": 3980 }, { - "epoch": 1.007321383489018, - "grad_norm": 0.6186540722846985, - "learning_rate": 2.3956071699065893e-05, - "loss": 0.0703, + "epoch": 1.185031185031185, + "grad_norm": 0.3359534740447998, + "learning_rate": 2.288981288981289e-05, + "loss": 0.071, "step": 3990 }, { - "epoch": 1.009845998485231, - "grad_norm": 0.6125244498252869, - "learning_rate": 2.3940924009088615e-05, - "loss": 0.0759, + "epoch": 1.188001188001188, + "grad_norm": 0.3939429223537445, + "learning_rate": 2.2871992871992872e-05, + "loss": 0.0681, "step": 4000 }, { - "epoch": 1.012370613481444, - "grad_norm": 0.48123863339424133, - "learning_rate": 2.3925776319111336e-05, - "loss": 0.0646, + "epoch": 1.190971190971191, + "grad_norm": 0.46291255950927734, + "learning_rate": 2.2854172854172855e-05, + "loss": 0.0721, "step": 4010 }, { - "epoch": 1.0148952284776571, - "grad_norm": 0.32201361656188965, - "learning_rate": 2.3910628629134058e-05, - "loss": 0.0741, + "epoch": 1.193941193941194, + "grad_norm": 0.4679121971130371, + "learning_rate": 2.283635283635284e-05, + "loss": 0.0823, "step": 4020 }, { - "epoch": 1.0174198434738702, - "grad_norm": 0.7003934979438782, - "learning_rate": 2.389548093915678e-05, - "loss": 0.0704, + "epoch": 1.196911196911197, + "grad_norm": 0.6498029232025146, + "learning_rate": 2.281853281853282e-05, + "loss": 0.064, "step": 4030 }, { - "epoch": 1.0199444584700834, - "grad_norm": 0.5227739810943604, - "learning_rate": 2.38803332491795e-05, - "loss": 0.0705, + "epoch": 1.1998811998811998, + "grad_norm": 0.5375266671180725, + "learning_rate": 2.28007128007128e-05, + "loss": 0.0827, "step": 4040 }, { - "epoch": 1.0224690734662965, - "grad_norm": 0.59422367811203, - "learning_rate": 2.3865185559202223e-05, - "loss": 0.0751, + "epoch": 1.2028512028512028, + "grad_norm": 0.7022712230682373, + "learning_rate": 2.2782892782892783e-05, + "loss": 0.0723, "step": 4050 }, { - "epoch": 1.0249936884625095, - "grad_norm": 0.5077184438705444, - "learning_rate": 2.3850037869224944e-05, - "loss": 0.0818, + "epoch": 1.2058212058212059, + "grad_norm": 0.888565182685852, + "learning_rate": 2.2765072765072765e-05, + "loss": 0.0699, "step": 4060 }, { - "epoch": 1.0275183034587225, - "grad_norm": 0.3867091238498688, - "learning_rate": 2.3834890179247666e-05, - "loss": 0.0768, + "epoch": 1.2087912087912087, + "grad_norm": 0.615304172039032, + "learning_rate": 2.2747252747252748e-05, + "loss": 0.0639, "step": 4070 }, { - "epoch": 1.0300429184549356, - "grad_norm": 0.43073731660842896, - "learning_rate": 2.3819742489270384e-05, - "loss": 0.0794, + "epoch": 1.2117612117612118, + "grad_norm": 1.067995309829712, + "learning_rate": 2.272943272943273e-05, + "loss": 0.0727, "step": 4080 }, { - "epoch": 1.0325675334511486, - "grad_norm": 0.5008924007415771, - "learning_rate": 2.380459479929311e-05, - "loss": 0.0676, + "epoch": 1.2147312147312148, + "grad_norm": 0.38957396149635315, + "learning_rate": 2.2711612711612715e-05, + "loss": 0.0817, "step": 4090 }, { - "epoch": 1.0350921484473619, - "grad_norm": 0.3357384204864502, - "learning_rate": 2.378944710931583e-05, - "loss": 0.0751, + "epoch": 1.2177012177012176, + "grad_norm": 0.4814799726009369, + "learning_rate": 2.2693792693792694e-05, + "loss": 0.0676, "step": 4100 }, { - "epoch": 1.037616763443575, - "grad_norm": 0.2947876453399658, - "learning_rate": 2.377429941933855e-05, - "loss": 0.0705, + "epoch": 1.2206712206712207, + "grad_norm": 0.33193427324295044, + "learning_rate": 2.2675972675972676e-05, + "loss": 0.0717, "step": 4110 }, { - "epoch": 1.040141378439788, - "grad_norm": 0.4923837184906006, - "learning_rate": 2.3759151729361274e-05, - "loss": 0.0854, + "epoch": 1.2236412236412235, + "grad_norm": 0.5651602149009705, + "learning_rate": 2.265815265815266e-05, + "loss": 0.0669, "step": 4120 }, { - "epoch": 1.042665993436001, - "grad_norm": 0.6966848373413086, - "learning_rate": 2.3744004039383996e-05, - "loss": 0.0707, + "epoch": 1.2266112266112266, + "grad_norm": 0.6378253102302551, + "learning_rate": 2.264033264033264e-05, + "loss": 0.0897, "step": 4130 }, { - "epoch": 1.045190608432214, - "grad_norm": 0.5456628799438477, - "learning_rate": 2.3728856349406714e-05, - "loss": 0.0732, + "epoch": 1.2295812295812296, + "grad_norm": 0.6030372977256775, + "learning_rate": 2.2622512622512623e-05, + "loss": 0.0896, "step": 4140 }, { - "epoch": 1.0477152234284273, - "grad_norm": 0.5405559539794922, - "learning_rate": 2.371370865942944e-05, - "loss": 0.075, + "epoch": 1.2325512325512324, + "grad_norm": 0.8515591621398926, + "learning_rate": 2.2604692604692605e-05, + "loss": 0.0645, "step": 4150 }, { - "epoch": 1.0502398384246403, - "grad_norm": 0.3989148736000061, - "learning_rate": 2.369856096945216e-05, - "loss": 0.0717, + "epoch": 1.2355212355212355, + "grad_norm": 0.6547635197639465, + "learning_rate": 2.258687258687259e-05, + "loss": 0.0736, "step": 4160 }, { - "epoch": 1.0527644534208533, - "grad_norm": 0.4704185724258423, - "learning_rate": 2.368341327947488e-05, - "loss": 0.0652, + "epoch": 1.2384912384912385, + "grad_norm": 0.4761018753051758, + "learning_rate": 2.256905256905257e-05, + "loss": 0.0689, "step": 4170 }, { - "epoch": 1.0552890684170664, - "grad_norm": 0.42349693179130554, - "learning_rate": 2.3668265589497604e-05, - "loss": 0.0745, + "epoch": 1.2414612414612414, + "grad_norm": 0.39740657806396484, + "learning_rate": 2.255123255123255e-05, + "loss": 0.0696, "step": 4180 }, { - "epoch": 1.0578136834132794, - "grad_norm": 0.4445798993110657, - "learning_rate": 2.3653117899520325e-05, - "loss": 0.0807, + "epoch": 1.2444312444312444, + "grad_norm": 0.49501290917396545, + "learning_rate": 2.2533412533412534e-05, + "loss": 0.0779, "step": 4190 }, { - "epoch": 1.0603382984094925, - "grad_norm": 0.8739972710609436, - "learning_rate": 2.3637970209543043e-05, - "loss": 0.0841, + "epoch": 1.2474012474012475, + "grad_norm": 0.5703093409538269, + "learning_rate": 2.2515592515592516e-05, + "loss": 0.0663, "step": 4200 }, { - "epoch": 1.0628629134057057, - "grad_norm": 0.45838242769241333, - "learning_rate": 2.362282251956577e-05, - "loss": 0.0774, + "epoch": 1.2503712503712503, + "grad_norm": 0.4675036370754242, + "learning_rate": 2.2497772497772498e-05, + "loss": 0.0772, "step": 4210 }, { - "epoch": 1.0653875284019187, - "grad_norm": 0.43052664399147034, - "learning_rate": 2.3607674829588487e-05, - "loss": 0.0797, + "epoch": 1.2533412533412533, + "grad_norm": 0.6520904898643494, + "learning_rate": 2.247995247995248e-05, + "loss": 0.074, "step": 4220 }, { - "epoch": 1.0679121433981318, - "grad_norm": 0.5692147016525269, - "learning_rate": 2.3592527139611208e-05, - "loss": 0.0787, + "epoch": 1.2563112563112564, + "grad_norm": 0.4377146065235138, + "learning_rate": 2.2462132462132466e-05, + "loss": 0.0752, "step": 4230 }, { - "epoch": 1.0704367583943448, - "grad_norm": 0.48966166377067566, - "learning_rate": 2.3577379449633933e-05, - "loss": 0.0791, + "epoch": 1.2592812592812592, + "grad_norm": 0.4791605472564697, + "learning_rate": 2.2444312444312444e-05, + "loss": 0.0614, "step": 4240 }, { - "epoch": 1.0729613733905579, - "grad_norm": 0.510415256023407, - "learning_rate": 2.356223175965665e-05, - "loss": 0.0802, + "epoch": 1.2622512622512623, + "grad_norm": 0.5933295488357544, + "learning_rate": 2.2426492426492427e-05, + "loss": 0.0832, "step": 4250 }, { - "epoch": 1.0754859883867711, - "grad_norm": 0.4185769855976105, - "learning_rate": 2.3547084069679373e-05, - "loss": 0.0648, + "epoch": 1.2652212652212653, + "grad_norm": 0.4189813435077667, + "learning_rate": 2.240867240867241e-05, + "loss": 0.069, "step": 4260 }, { - "epoch": 1.0780106033829842, - "grad_norm": 0.5046850442886353, - "learning_rate": 2.3531936379702098e-05, - "loss": 0.0812, + "epoch": 1.2681912681912682, + "grad_norm": 0.651421070098877, + "learning_rate": 2.239085239085239e-05, + "loss": 0.0791, "step": 4270 }, { - "epoch": 1.0805352183791972, - "grad_norm": 0.41549035906791687, - "learning_rate": 2.3516788689724816e-05, - "loss": 0.0786, + "epoch": 1.2711612711612712, + "grad_norm": 0.40593355894088745, + "learning_rate": 2.2373032373032373e-05, + "loss": 0.0638, "step": 4280 }, { - "epoch": 1.0830598333754102, - "grad_norm": 0.4319119155406952, - "learning_rate": 2.3501640999747538e-05, - "loss": 0.0783, + "epoch": 1.2741312741312742, + "grad_norm": 0.5226801037788391, + "learning_rate": 2.2355212355212355e-05, + "loss": 0.077, "step": 4290 }, { - "epoch": 1.0855844483716233, - "grad_norm": 0.8472285270690918, - "learning_rate": 2.3486493309770263e-05, - "loss": 0.0663, + "epoch": 1.277101277101277, + "grad_norm": 0.6062614321708679, + "learning_rate": 2.233739233739234e-05, + "loss": 0.068, "step": 4300 }, { - "epoch": 1.0881090633678363, - "grad_norm": 0.2649112641811371, - "learning_rate": 2.347134561979298e-05, - "loss": 0.069, + "epoch": 1.2800712800712801, + "grad_norm": 0.48023584485054016, + "learning_rate": 2.231957231957232e-05, + "loss": 0.0622, "step": 4310 }, { - "epoch": 1.0906336783640496, - "grad_norm": 0.6875, - "learning_rate": 2.3456197929815702e-05, - "loss": 0.072, + "epoch": 1.2830412830412832, + "grad_norm": 0.4292398989200592, + "learning_rate": 2.2301752301752302e-05, + "loss": 0.0951, "step": 4320 }, { - "epoch": 1.0931582933602626, - "grad_norm": 0.6910480260848999, - "learning_rate": 2.3441050239838427e-05, - "loss": 0.0663, + "epoch": 1.286011286011286, + "grad_norm": 0.509908139705658, + "learning_rate": 2.2283932283932284e-05, + "loss": 0.0703, "step": 4330 }, { - "epoch": 1.0956829083564756, - "grad_norm": 0.413908988237381, - "learning_rate": 2.3425902549861146e-05, - "loss": 0.075, + "epoch": 1.288981288981289, + "grad_norm": 0.36277303099632263, + "learning_rate": 2.2266112266112266e-05, + "loss": 0.0752, "step": 4340 }, { - "epoch": 1.0982075233526887, - "grad_norm": 0.5234224200248718, - "learning_rate": 2.3410754859883867e-05, - "loss": 0.0653, + "epoch": 1.2919512919512919, + "grad_norm": 0.4135016202926636, + "learning_rate": 2.2248292248292248e-05, + "loss": 0.0673, "step": 4350 }, { - "epoch": 1.1007321383489017, - "grad_norm": 0.5041384100914001, - "learning_rate": 2.3395607169906592e-05, - "loss": 0.0616, + "epoch": 1.294921294921295, + "grad_norm": 0.4465673863887787, + "learning_rate": 2.223047223047223e-05, + "loss": 0.0774, "step": 4360 }, { - "epoch": 1.103256753345115, - "grad_norm": 0.34713175892829895, - "learning_rate": 2.338045947992931e-05, - "loss": 0.0835, + "epoch": 1.2978912978912978, + "grad_norm": 0.3581428825855255, + "learning_rate": 2.2212652212652216e-05, + "loss": 0.0722, "step": 4370 }, { - "epoch": 1.105781368341328, - "grad_norm": 0.5200866460800171, - "learning_rate": 2.3365311789952032e-05, - "loss": 0.0796, + "epoch": 1.3008613008613008, + "grad_norm": 0.8216081261634827, + "learning_rate": 2.2194832194832195e-05, + "loss": 0.0695, "step": 4380 }, { - "epoch": 1.108305983337541, - "grad_norm": 0.5935444831848145, - "learning_rate": 2.3350164099974757e-05, - "loss": 0.0824, + "epoch": 1.3038313038313039, + "grad_norm": 0.3974524736404419, + "learning_rate": 2.2177012177012177e-05, + "loss": 0.0548, "step": 4390 }, { - "epoch": 1.110830598333754, - "grad_norm": 0.45419350266456604, - "learning_rate": 2.3335016409997475e-05, - "loss": 0.0782, + "epoch": 1.3068013068013067, + "grad_norm": 0.40166157484054565, + "learning_rate": 2.215919215919216e-05, + "loss": 0.0821, "step": 4400 }, { - "epoch": 1.1133552133299671, - "grad_norm": 0.6200574040412903, - "learning_rate": 2.3319868720020197e-05, - "loss": 0.0721, + "epoch": 1.3097713097713097, + "grad_norm": 0.6108930706977844, + "learning_rate": 2.214137214137214e-05, + "loss": 0.0771, "step": 4410 }, { - "epoch": 1.1158798283261802, - "grad_norm": 0.3939630687236786, - "learning_rate": 2.3304721030042922e-05, - "loss": 0.0672, + "epoch": 1.3127413127413128, + "grad_norm": 0.33659735321998596, + "learning_rate": 2.2123552123552123e-05, + "loss": 0.0866, "step": 4420 }, { - "epoch": 1.1184044433223934, - "grad_norm": 0.44664672017097473, - "learning_rate": 2.328957334006564e-05, - "loss": 0.0639, + "epoch": 1.3157113157113156, + "grad_norm": 0.41419750452041626, + "learning_rate": 2.2105732105732106e-05, + "loss": 0.066, "step": 4430 }, { - "epoch": 1.1209290583186065, - "grad_norm": 0.3891284167766571, - "learning_rate": 2.327442565008836e-05, - "loss": 0.0637, + "epoch": 1.3186813186813187, + "grad_norm": 0.39843958616256714, + "learning_rate": 2.208791208791209e-05, + "loss": 0.0717, "step": 4440 }, { - "epoch": 1.1234536733148195, - "grad_norm": 0.5123685002326965, - "learning_rate": 2.3259277960111087e-05, - "loss": 0.0577, + "epoch": 1.3216513216513217, + "grad_norm": 0.4193469285964966, + "learning_rate": 2.207009207009207e-05, + "loss": 0.0608, "step": 4450 }, { - "epoch": 1.1259782883110325, - "grad_norm": 0.5410445332527161, - "learning_rate": 2.3244130270133805e-05, - "loss": 0.0777, + "epoch": 1.3246213246213245, + "grad_norm": 0.310855507850647, + "learning_rate": 2.2052272052272052e-05, + "loss": 0.0623, "step": 4460 }, { - "epoch": 1.1285029033072456, - "grad_norm": 0.39902034401893616, - "learning_rate": 2.3228982580156526e-05, - "loss": 0.0684, + "epoch": 1.3275913275913276, + "grad_norm": 0.3885134160518646, + "learning_rate": 2.2034452034452034e-05, + "loss": 0.0565, "step": 4470 }, { - "epoch": 1.1310275183034588, - "grad_norm": 0.40197306871414185, - "learning_rate": 2.3213834890179248e-05, - "loss": 0.0607, + "epoch": 1.3305613305613306, + "grad_norm": 0.31589820981025696, + "learning_rate": 2.2016632016632017e-05, + "loss": 0.0591, "step": 4480 }, { - "epoch": 1.1335521332996719, - "grad_norm": 0.44786280393600464, - "learning_rate": 2.319868720020197e-05, - "loss": 0.0665, + "epoch": 1.3335313335313335, + "grad_norm": 0.4833143651485443, + "learning_rate": 2.1998811998812e-05, + "loss": 0.0758, "step": 4490 }, { - "epoch": 1.136076748295885, - "grad_norm": 0.5214644074440002, - "learning_rate": 2.318353951022469e-05, - "loss": 0.0601, + "epoch": 1.3365013365013365, + "grad_norm": 0.47030189633369446, + "learning_rate": 2.198099198099198e-05, + "loss": 0.0644, "step": 4500 }, { - "epoch": 1.138601363292098, - "grad_norm": 0.4208206832408905, - "learning_rate": 2.3168391820247413e-05, - "loss": 0.075, + "epoch": 1.3394713394713396, + "grad_norm": 0.44581151008605957, + "learning_rate": 2.1963171963171966e-05, + "loss": 0.0675, "step": 4510 }, { - "epoch": 1.141125978288311, - "grad_norm": 0.46941113471984863, - "learning_rate": 2.3153244130270134e-05, - "loss": 0.0644, + "epoch": 1.3424413424413424, + "grad_norm": 0.5004817247390747, + "learning_rate": 2.1945351945351945e-05, + "loss": 0.0835, "step": 4520 }, { - "epoch": 1.1436505932845242, - "grad_norm": 0.4159145951271057, - "learning_rate": 2.3138096440292856e-05, - "loss": 0.0689, + "epoch": 1.3454113454113454, + "grad_norm": 0.5188937783241272, + "learning_rate": 2.1927531927531927e-05, + "loss": 0.0739, "step": 4530 }, { - "epoch": 1.1461752082807373, - "grad_norm": 0.7756341099739075, - "learning_rate": 2.3122948750315578e-05, - "loss": 0.075, + "epoch": 1.3483813483813485, + "grad_norm": 0.386055052280426, + "learning_rate": 2.190971190971191e-05, + "loss": 0.0752, "step": 4540 }, { - "epoch": 1.1486998232769503, - "grad_norm": 0.49775972962379456, - "learning_rate": 2.31078010603383e-05, - "loss": 0.0781, + "epoch": 1.3513513513513513, + "grad_norm": 0.5287050008773804, + "learning_rate": 2.1891891891891892e-05, + "loss": 0.0704, "step": 4550 }, { - "epoch": 1.1512244382731633, - "grad_norm": 0.3479367792606354, - "learning_rate": 2.309265337036102e-05, - "loss": 0.0725, + "epoch": 1.3543213543213544, + "grad_norm": 0.5197706818580627, + "learning_rate": 2.1874071874071874e-05, + "loss": 0.0722, "step": 4560 }, { - "epoch": 1.1537490532693764, - "grad_norm": 0.4266480803489685, - "learning_rate": 2.3077505680383742e-05, - "loss": 0.0791, + "epoch": 1.3572913572913574, + "grad_norm": 1.044822335243225, + "learning_rate": 2.1856251856251856e-05, + "loss": 0.0774, "step": 4570 }, { - "epoch": 1.1562736682655894, - "grad_norm": 0.46766582131385803, - "learning_rate": 2.3062357990406464e-05, - "loss": 0.0754, + "epoch": 1.3602613602613602, + "grad_norm": 0.35167747735977173, + "learning_rate": 2.183843183843184e-05, + "loss": 0.0688, "step": 4580 }, { - "epoch": 1.1587982832618025, - "grad_norm": 0.6098841428756714, - "learning_rate": 2.3047210300429186e-05, - "loss": 0.0701, + "epoch": 1.3632313632313633, + "grad_norm": 0.5518337488174438, + "learning_rate": 2.1820611820611824e-05, + "loss": 0.0899, "step": 4590 }, { - "epoch": 1.1613228982580157, - "grad_norm": 0.496367484331131, - "learning_rate": 2.3032062610451907e-05, - "loss": 0.0822, + "epoch": 1.3662013662013661, + "grad_norm": 0.5644456148147583, + "learning_rate": 2.1802791802791803e-05, + "loss": 0.0808, "step": 4600 }, { - "epoch": 1.1638475132542287, - "grad_norm": 0.4577995538711548, - "learning_rate": 2.301691492047463e-05, - "loss": 0.079, + "epoch": 1.3691713691713692, + "grad_norm": 0.45010289549827576, + "learning_rate": 2.1784971784971785e-05, + "loss": 0.0839, "step": 4610 }, { - "epoch": 1.1663721282504418, - "grad_norm": 0.33575066924095154, - "learning_rate": 2.300176723049735e-05, - "loss": 0.0712, + "epoch": 1.3721413721413722, + "grad_norm": 0.6567732095718384, + "learning_rate": 2.1767151767151767e-05, + "loss": 0.0761, "step": 4620 }, { - "epoch": 1.1688967432466548, - "grad_norm": 0.35166892409324646, - "learning_rate": 2.298661954052007e-05, - "loss": 0.0736, + "epoch": 1.375111375111375, + "grad_norm": 0.582931399345398, + "learning_rate": 2.174933174933175e-05, + "loss": 0.0669, "step": 4630 }, { - "epoch": 1.1714213582428679, - "grad_norm": 0.46313348412513733, - "learning_rate": 2.2971471850542793e-05, - "loss": 0.0676, + "epoch": 1.378081378081378, + "grad_norm": 0.39117926359176636, + "learning_rate": 2.173151173151173e-05, + "loss": 0.0763, "step": 4640 }, { - "epoch": 1.1739459732390811, - "grad_norm": 0.6426845788955688, - "learning_rate": 2.2956324160565515e-05, - "loss": 0.0621, + "epoch": 1.381051381051381, + "grad_norm": 0.44285526871681213, + "learning_rate": 2.1713691713691717e-05, + "loss": 0.071, "step": 4650 }, { - "epoch": 1.1764705882352942, - "grad_norm": 0.403133749961853, - "learning_rate": 2.2941176470588233e-05, - "loss": 0.0774, + "epoch": 1.384021384021384, + "grad_norm": 0.6497974395751953, + "learning_rate": 2.16958716958717e-05, + "loss": 0.0647, "step": 4660 }, { - "epoch": 1.1789952032315072, - "grad_norm": 0.4596996307373047, - "learning_rate": 2.2926028780610958e-05, - "loss": 0.0702, + "epoch": 1.386991386991387, + "grad_norm": 0.4394398033618927, + "learning_rate": 2.1678051678051678e-05, + "loss": 0.0666, "step": 4670 }, { - "epoch": 1.1815198182277202, - "grad_norm": 0.4984603226184845, - "learning_rate": 2.291088109063368e-05, - "loss": 0.0761, + "epoch": 1.3899613899613898, + "grad_norm": 0.6339782476425171, + "learning_rate": 2.166023166023166e-05, + "loss": 0.0693, "step": 4680 }, { - "epoch": 1.1840444332239333, - "grad_norm": 0.6605084538459778, - "learning_rate": 2.2895733400656398e-05, - "loss": 0.0718, + "epoch": 1.392931392931393, + "grad_norm": 0.24844326078891754, + "learning_rate": 2.1642411642411642e-05, + "loss": 0.0631, "step": 4690 }, { - "epoch": 1.1865690482201465, - "grad_norm": 0.6148932576179504, - "learning_rate": 2.2880585710679123e-05, - "loss": 0.0647, + "epoch": 1.395901395901396, + "grad_norm": 0.41448843479156494, + "learning_rate": 2.1624591624591624e-05, + "loss": 0.0667, "step": 4700 }, { - "epoch": 1.1890936632163596, - "grad_norm": 0.7315832376480103, - "learning_rate": 2.2865438020701845e-05, - "loss": 0.0812, + "epoch": 1.3988713988713988, + "grad_norm": 0.30131953954696655, + "learning_rate": 2.1606771606771606e-05, + "loss": 0.0625, "step": 4710 }, { - "epoch": 1.1916182782125726, - "grad_norm": 0.45521411299705505, - "learning_rate": 2.2850290330724563e-05, - "loss": 0.0745, + "epoch": 1.4018414018414018, + "grad_norm": 0.7573267817497253, + "learning_rate": 2.1588951588951592e-05, + "loss": 0.0672, "step": 4720 }, { - "epoch": 1.1941428932087856, - "grad_norm": 0.3817095160484314, - "learning_rate": 2.2835142640747288e-05, - "loss": 0.0652, + "epoch": 1.4048114048114049, + "grad_norm": 0.5527480840682983, + "learning_rate": 2.1571131571131574e-05, + "loss": 0.0597, "step": 4730 }, { - "epoch": 1.1966675082049987, - "grad_norm": 0.6603373289108276, - "learning_rate": 2.281999495077001e-05, - "loss": 0.0837, + "epoch": 1.4077814077814077, + "grad_norm": 0.5866405367851257, + "learning_rate": 2.1553311553311553e-05, + "loss": 0.0676, "step": 4740 }, { - "epoch": 1.199192123201212, - "grad_norm": 0.44251519441604614, - "learning_rate": 2.2804847260792728e-05, - "loss": 0.0652, + "epoch": 1.4107514107514108, + "grad_norm": 0.3691079318523407, + "learning_rate": 2.1535491535491535e-05, + "loss": 0.073, "step": 4750 }, { - "epoch": 1.201716738197425, - "grad_norm": 0.41316279768943787, - "learning_rate": 2.2789699570815453e-05, - "loss": 0.058, + "epoch": 1.4137214137214138, + "grad_norm": 0.46354126930236816, + "learning_rate": 2.1517671517671517e-05, + "loss": 0.062, "step": 4760 }, { - "epoch": 1.204241353193638, - "grad_norm": 0.3292355239391327, - "learning_rate": 2.2774551880838174e-05, - "loss": 0.0701, + "epoch": 1.4166914166914166, + "grad_norm": 0.4648849368095398, + "learning_rate": 2.14998514998515e-05, + "loss": 0.0854, "step": 4770 }, { - "epoch": 1.206765968189851, - "grad_norm": 0.47908949851989746, - "learning_rate": 2.2759404190860892e-05, - "loss": 0.0719, + "epoch": 1.4196614196614197, + "grad_norm": 0.4591132402420044, + "learning_rate": 2.148203148203148e-05, + "loss": 0.0571, "step": 4780 }, { - "epoch": 1.209290583186064, - "grad_norm": 0.5355591773986816, - "learning_rate": 2.2744256500883617e-05, - "loss": 0.0772, + "epoch": 1.4226314226314227, + "grad_norm": 0.6278248429298401, + "learning_rate": 2.1464211464211467e-05, + "loss": 0.0669, "step": 4790 }, { - "epoch": 1.2118151981822771, - "grad_norm": 0.3404475450515747, - "learning_rate": 2.2729108810906336e-05, - "loss": 0.0659, + "epoch": 1.4256014256014256, + "grad_norm": 0.7873584032058716, + "learning_rate": 2.144639144639145e-05, + "loss": 0.0708, "step": 4800 }, { - "epoch": 1.2143398131784902, - "grad_norm": 0.5191195607185364, - "learning_rate": 2.2713961120929057e-05, - "loss": 0.0697, + "epoch": 1.4285714285714286, + "grad_norm": 0.42913201451301575, + "learning_rate": 2.1428571428571428e-05, + "loss": 0.0666, "step": 4810 }, { - "epoch": 1.2168644281747034, - "grad_norm": 0.29288583993911743, - "learning_rate": 2.2698813430951782e-05, - "loss": 0.0628, + "epoch": 1.4315414315414317, + "grad_norm": 0.34143778681755066, + "learning_rate": 2.141075141075141e-05, + "loss": 0.0829, "step": 4820 }, { - "epoch": 1.2193890431709165, - "grad_norm": 0.3372870981693268, - "learning_rate": 2.26836657409745e-05, - "loss": 0.0724, + "epoch": 1.4345114345114345, + "grad_norm": 0.47077706456184387, + "learning_rate": 2.1392931392931392e-05, + "loss": 0.0794, "step": 4830 }, { - "epoch": 1.2219136581671295, - "grad_norm": 0.4657537639141083, - "learning_rate": 2.2668518050997222e-05, - "loss": 0.0582, + "epoch": 1.4374814374814375, + "grad_norm": 0.4886973202228546, + "learning_rate": 2.1375111375111375e-05, + "loss": 0.0646, "step": 4840 }, { - "epoch": 1.2244382731633425, - "grad_norm": 0.4123993217945099, - "learning_rate": 2.2653370361019947e-05, - "loss": 0.0669, + "epoch": 1.4404514404514406, + "grad_norm": 0.4241088628768921, + "learning_rate": 2.1357291357291357e-05, + "loss": 0.0762, "step": 4850 }, { - "epoch": 1.2269628881595556, - "grad_norm": 0.560075044631958, - "learning_rate": 2.2638222671042665e-05, - "loss": 0.0824, + "epoch": 1.4434214434214434, + "grad_norm": 0.4464230537414551, + "learning_rate": 2.1339471339471342e-05, + "loss": 0.0698, "step": 4860 }, { - "epoch": 1.2294875031557688, - "grad_norm": 0.4414028525352478, - "learning_rate": 2.2623074981065387e-05, - "loss": 0.0727, + "epoch": 1.4463914463914465, + "grad_norm": 0.36223044991493225, + "learning_rate": 2.1321651321651325e-05, + "loss": 0.0587, "step": 4870 }, { - "epoch": 1.2320121181519819, - "grad_norm": 0.450916588306427, - "learning_rate": 2.2607927291088112e-05, - "loss": 0.0718, + "epoch": 1.4493614493614493, + "grad_norm": 0.5170213580131531, + "learning_rate": 2.1303831303831303e-05, + "loss": 0.0663, "step": 4880 }, { - "epoch": 1.234536733148195, - "grad_norm": 0.6700304746627808, - "learning_rate": 2.259277960111083e-05, - "loss": 0.0705, + "epoch": 1.4523314523314523, + "grad_norm": 0.43765634298324585, + "learning_rate": 2.1286011286011286e-05, + "loss": 0.07, "step": 4890 }, { - "epoch": 1.237061348144408, - "grad_norm": 0.5820491909980774, - "learning_rate": 2.257763191113355e-05, - "loss": 0.0691, + "epoch": 1.4553014553014554, + "grad_norm": 0.30947327613830566, + "learning_rate": 2.1268191268191268e-05, + "loss": 0.0682, "step": 4900 }, { - "epoch": 1.239585963140621, - "grad_norm": 0.7849363088607788, - "learning_rate": 2.2562484221156277e-05, - "loss": 0.0746, + "epoch": 1.4582714582714582, + "grad_norm": 0.480027973651886, + "learning_rate": 2.125037125037125e-05, + "loss": 0.0652, "step": 4910 }, { - "epoch": 1.2421105781368342, - "grad_norm": 0.3631155490875244, - "learning_rate": 2.2547336531178995e-05, - "loss": 0.072, + "epoch": 1.4612414612414613, + "grad_norm": 0.7047821283340454, + "learning_rate": 2.1232551232551232e-05, + "loss": 0.0643, "step": 4920 }, { - "epoch": 1.2446351931330473, - "grad_norm": 0.35678336024284363, - "learning_rate": 2.2532188841201716e-05, - "loss": 0.0675, + "epoch": 1.464211464211464, + "grad_norm": 0.741016685962677, + "learning_rate": 2.1214731214731218e-05, + "loss": 0.0645, "step": 4930 }, { - "epoch": 1.2471598081292603, - "grad_norm": 0.4498080015182495, - "learning_rate": 2.251704115122444e-05, - "loss": 0.0692, + "epoch": 1.4671814671814671, + "grad_norm": 0.5473170280456543, + "learning_rate": 2.11969111969112e-05, + "loss": 0.0636, "step": 4940 }, { - "epoch": 1.2496844231254733, - "grad_norm": 0.5682255029678345, - "learning_rate": 2.250189346124716e-05, - "loss": 0.0832, + "epoch": 1.4701514701514702, + "grad_norm": 0.4111592471599579, + "learning_rate": 2.117909117909118e-05, + "loss": 0.0676, "step": 4950 }, { - "epoch": 1.2522090381216864, - "grad_norm": 0.5812047719955444, - "learning_rate": 2.248674577126988e-05, - "loss": 0.0848, + "epoch": 1.473121473121473, + "grad_norm": 0.7355438470840454, + "learning_rate": 2.116127116127116e-05, + "loss": 0.0666, "step": 4960 }, { - "epoch": 1.2547336531178996, - "grad_norm": 0.4898669123649597, - "learning_rate": 2.2471598081292606e-05, - "loss": 0.0699, + "epoch": 1.476091476091476, + "grad_norm": 0.2529616355895996, + "learning_rate": 2.1143451143451143e-05, + "loss": 0.0626, "step": 4970 }, { - "epoch": 1.2572582681141127, - "grad_norm": 0.45778876543045044, - "learning_rate": 2.2456450391315324e-05, - "loss": 0.078, + "epoch": 1.4790614790614791, + "grad_norm": 0.7075737714767456, + "learning_rate": 2.1125631125631125e-05, + "loss": 0.061, "step": 4980 }, { - "epoch": 1.2597828831103257, - "grad_norm": 0.32259121537208557, - "learning_rate": 2.2441302701338046e-05, - "loss": 0.0813, + "epoch": 1.482031482031482, + "grad_norm": 0.39400067925453186, + "learning_rate": 2.1107811107811107e-05, + "loss": 0.0714, "step": 4990 }, { - "epoch": 1.2623074981065388, - "grad_norm": 0.34969955682754517, - "learning_rate": 2.242615501136077e-05, - "loss": 0.064, + "epoch": 1.485001485001485, + "grad_norm": 0.4059322774410248, + "learning_rate": 2.1089991089991093e-05, + "loss": 0.0587, "step": 5000 }, { - "epoch": 1.2648321131027518, - "grad_norm": 0.4658315181732178, - "learning_rate": 2.241100732138349e-05, - "loss": 0.0641, + "epoch": 1.487971487971488, + "grad_norm": 0.3679432272911072, + "learning_rate": 2.1072171072171075e-05, + "loss": 0.0707, "step": 5010 }, { - "epoch": 1.267356728098965, - "grad_norm": 0.7253788113594055, - "learning_rate": 2.239585963140621e-05, - "loss": 0.0669, + "epoch": 1.4909414909414909, + "grad_norm": 0.45325401425361633, + "learning_rate": 2.1054351054351054e-05, + "loss": 0.0789, "step": 5020 }, { - "epoch": 1.2698813430951779, - "grad_norm": 0.4734630882740021, - "learning_rate": 2.2380711941428936e-05, - "loss": 0.0692, + "epoch": 1.493911493911494, + "grad_norm": 0.36480912566185, + "learning_rate": 2.1036531036531036e-05, + "loss": 0.0742, "step": 5030 }, { - "epoch": 1.2724059580913911, - "grad_norm": 0.496377170085907, - "learning_rate": 2.2365564251451654e-05, - "loss": 0.0701, + "epoch": 1.496881496881497, + "grad_norm": 0.4680189788341522, + "learning_rate": 2.1018711018711018e-05, + "loss": 0.0691, "step": 5040 }, { - "epoch": 1.2749305730876042, - "grad_norm": 0.427693247795105, - "learning_rate": 2.2350416561474375e-05, - "loss": 0.0724, + "epoch": 1.4998514998514998, + "grad_norm": 0.40691086649894714, + "learning_rate": 2.1000891000891e-05, + "loss": 0.0841, "step": 5050 }, { - "epoch": 1.2774551880838172, - "grad_norm": 0.3826102316379547, - "learning_rate": 2.2335268871497097e-05, - "loss": 0.0559, + "epoch": 1.5028215028215028, + "grad_norm": 0.30459049344062805, + "learning_rate": 2.0983070983070982e-05, + "loss": 0.0569, "step": 5060 }, { - "epoch": 1.2799798030800302, - "grad_norm": 0.4349898099899292, - "learning_rate": 2.232012118151982e-05, - "loss": 0.0804, + "epoch": 1.505791505791506, + "grad_norm": 0.8983843326568604, + "learning_rate": 2.0965250965250968e-05, + "loss": 0.0719, "step": 5070 }, { - "epoch": 1.2825044180762433, - "grad_norm": 0.4229235351085663, - "learning_rate": 2.230497349154254e-05, - "loss": 0.0583, + "epoch": 1.5087615087615087, + "grad_norm": 0.5937901139259338, + "learning_rate": 2.094743094743095e-05, + "loss": 0.076, "step": 5080 }, { - "epoch": 1.2850290330724565, - "grad_norm": 0.4786253869533539, - "learning_rate": 2.2289825801565262e-05, - "loss": 0.0802, + "epoch": 1.5117315117315118, + "grad_norm": 0.3914330005645752, + "learning_rate": 2.092961092961093e-05, + "loss": 0.0698, "step": 5090 }, { - "epoch": 1.2875536480686696, - "grad_norm": 0.5271996259689331, - "learning_rate": 2.2274678111587983e-05, - "loss": 0.0824, + "epoch": 1.5147015147015148, + "grad_norm": 0.38608691096305847, + "learning_rate": 2.091179091179091e-05, + "loss": 0.0777, "step": 5100 }, { - "epoch": 1.2900782630648826, - "grad_norm": 0.4490508735179901, - "learning_rate": 2.2259530421610705e-05, - "loss": 0.0748, + "epoch": 1.5176715176715176, + "grad_norm": 0.7738357186317444, + "learning_rate": 2.0893970893970893e-05, + "loss": 0.0693, "step": 5110 }, { - "epoch": 1.2926028780610956, - "grad_norm": 0.49381911754608154, - "learning_rate": 2.2244382731633427e-05, - "loss": 0.0701, + "epoch": 1.5206415206415207, + "grad_norm": 0.6664383411407471, + "learning_rate": 2.0876150876150875e-05, + "loss": 0.072, "step": 5120 }, { - "epoch": 1.2951274930573087, - "grad_norm": 0.4700550436973572, - "learning_rate": 2.2229235041656148e-05, - "loss": 0.061, + "epoch": 1.5236115236115237, + "grad_norm": 0.38639238476753235, + "learning_rate": 2.0858330858330858e-05, + "loss": 0.0724, "step": 5130 }, { - "epoch": 1.297652108053522, - "grad_norm": 0.2748670279979706, - "learning_rate": 2.221408735167887e-05, - "loss": 0.0638, + "epoch": 1.5265815265815266, + "grad_norm": 0.7060205936431885, + "learning_rate": 2.0840510840510843e-05, + "loss": 0.0665, "step": 5140 }, { - "epoch": 1.300176723049735, - "grad_norm": 0.32839298248291016, - "learning_rate": 2.219893966170159e-05, - "loss": 0.0627, + "epoch": 1.5295515295515294, + "grad_norm": 0.674248993396759, + "learning_rate": 2.0822690822690825e-05, + "loss": 0.0769, "step": 5150 }, { - "epoch": 1.302701338045948, - "grad_norm": 0.40593937039375305, - "learning_rate": 2.2183791971724313e-05, - "loss": 0.0664, + "epoch": 1.5325215325215327, + "grad_norm": 0.45502710342407227, + "learning_rate": 2.0804870804870808e-05, + "loss": 0.0808, "step": 5160 }, { - "epoch": 1.305225953042161, - "grad_norm": 0.43036961555480957, - "learning_rate": 2.2168644281747035e-05, - "loss": 0.0861, + "epoch": 1.5354915354915355, + "grad_norm": 0.4794248938560486, + "learning_rate": 2.0787050787050786e-05, + "loss": 0.0701, "step": 5170 }, { - "epoch": 1.307750568038374, - "grad_norm": 0.7976852655410767, - "learning_rate": 2.2153496591769753e-05, - "loss": 0.0869, + "epoch": 1.5384615384615383, + "grad_norm": 0.6008143424987793, + "learning_rate": 2.076923076923077e-05, + "loss": 0.0697, "step": 5180 }, { - "epoch": 1.3102751830345873, - "grad_norm": 1.0304032564163208, - "learning_rate": 2.2138348901792478e-05, - "loss": 0.0774, + "epoch": 1.5414315414315416, + "grad_norm": 0.5068689584732056, + "learning_rate": 2.075141075141075e-05, + "loss": 0.079, "step": 5190 }, { - "epoch": 1.3127997980308004, - "grad_norm": 0.38359397649765015, - "learning_rate": 2.21232012118152e-05, - "loss": 0.0821, + "epoch": 1.5444015444015444, + "grad_norm": 0.4885605275630951, + "learning_rate": 2.0733590733590733e-05, + "loss": 0.0749, "step": 5200 }, { - "epoch": 1.3153244130270134, - "grad_norm": 0.3385170102119446, - "learning_rate": 2.2108053521837918e-05, - "loss": 0.0747, + "epoch": 1.5473715473715473, + "grad_norm": 0.5522565841674805, + "learning_rate": 2.071577071577072e-05, + "loss": 0.071, "step": 5210 }, { - "epoch": 1.3178490280232265, - "grad_norm": 0.8735803365707397, - "learning_rate": 2.2092905831860643e-05, - "loss": 0.0691, + "epoch": 1.5503415503415503, + "grad_norm": 0.32774174213409424, + "learning_rate": 2.06979506979507e-05, + "loss": 0.0556, "step": 5220 }, { - "epoch": 1.3203736430194395, - "grad_norm": 0.5266577005386353, - "learning_rate": 2.2077758141883364e-05, - "loss": 0.071, + "epoch": 1.5533115533115534, + "grad_norm": 0.5104330778121948, + "learning_rate": 2.0680130680130683e-05, + "loss": 0.0738, "step": 5230 }, { - "epoch": 1.3228982580156528, - "grad_norm": 0.46573153138160706, - "learning_rate": 2.2062610451906082e-05, - "loss": 0.0876, + "epoch": 1.5562815562815562, + "grad_norm": 0.5387243628501892, + "learning_rate": 2.066231066231066e-05, + "loss": 0.0656, "step": 5240 }, { - "epoch": 1.3254228730118656, - "grad_norm": 0.6112514138221741, - "learning_rate": 2.2047462761928807e-05, - "loss": 0.0691, + "epoch": 1.5592515592515592, + "grad_norm": 0.49494025111198425, + "learning_rate": 2.0644490644490644e-05, + "loss": 0.0684, "step": 5250 }, { - "epoch": 1.3279474880080788, - "grad_norm": 0.4857766032218933, - "learning_rate": 2.203231507195153e-05, - "loss": 0.0743, + "epoch": 1.5622215622215623, + "grad_norm": 0.5351789593696594, + "learning_rate": 2.0626670626670626e-05, + "loss": 0.0629, "step": 5260 }, { - "epoch": 1.3304721030042919, - "grad_norm": 0.3289374113082886, - "learning_rate": 2.2017167381974247e-05, - "loss": 0.0679, + "epoch": 1.565191565191565, + "grad_norm": 0.5836585760116577, + "learning_rate": 2.0608850608850608e-05, + "loss": 0.0725, "step": 5270 }, { - "epoch": 1.332996718000505, - "grad_norm": 0.6034137606620789, - "learning_rate": 2.2002019691996972e-05, - "loss": 0.0664, + "epoch": 1.5681615681615682, + "grad_norm": 0.5254115462303162, + "learning_rate": 2.0591030591030594e-05, + "loss": 0.0617, "step": 5280 }, { - "epoch": 1.335521332996718, - "grad_norm": 0.6763460636138916, - "learning_rate": 2.1986872002019694e-05, - "loss": 0.0758, + "epoch": 1.5711315711315712, + "grad_norm": 0.9055864810943604, + "learning_rate": 2.0573210573210576e-05, + "loss": 0.0702, "step": 5290 }, { - "epoch": 1.338045947992931, - "grad_norm": 0.4285227060317993, - "learning_rate": 2.1971724312042412e-05, - "loss": 0.0709, + "epoch": 1.574101574101574, + "grad_norm": 0.6344959139823914, + "learning_rate": 2.0555390555390558e-05, + "loss": 0.0571, "step": 5300 }, { - "epoch": 1.3405705629891442, - "grad_norm": 0.3583575189113617, - "learning_rate": 2.1956576622065137e-05, - "loss": 0.0752, + "epoch": 1.577071577071577, + "grad_norm": 0.4069235622882843, + "learning_rate": 2.0537570537570537e-05, + "loss": 0.0562, "step": 5310 }, { - "epoch": 1.3430951779853573, - "grad_norm": 0.5029326677322388, - "learning_rate": 2.194142893208786e-05, - "loss": 0.0748, + "epoch": 1.5800415800415801, + "grad_norm": 0.4786476194858551, + "learning_rate": 2.051975051975052e-05, + "loss": 0.0661, "step": 5320 }, { - "epoch": 1.3456197929815703, - "grad_norm": 0.6658989191055298, - "learning_rate": 2.1926281242110577e-05, - "loss": 0.0655, + "epoch": 1.583011583011583, + "grad_norm": 0.45690423250198364, + "learning_rate": 2.05019305019305e-05, + "loss": 0.0754, "step": 5330 }, { - "epoch": 1.3481444079777833, - "grad_norm": 0.520709216594696, - "learning_rate": 2.1911133552133302e-05, - "loss": 0.0781, + "epoch": 1.585981585981586, + "grad_norm": 0.3506830036640167, + "learning_rate": 2.0484110484110483e-05, + "loss": 0.0671, "step": 5340 }, { - "epoch": 1.3506690229739964, - "grad_norm": 0.534546971321106, - "learning_rate": 2.189598586215602e-05, - "loss": 0.0678, + "epoch": 1.588951588951589, + "grad_norm": 0.6035703420639038, + "learning_rate": 2.046629046629047e-05, + "loss": 0.0697, "step": 5350 }, { - "epoch": 1.3531936379702096, - "grad_norm": 0.3448280096054077, - "learning_rate": 2.188083817217874e-05, - "loss": 0.046, + "epoch": 1.5919215919215919, + "grad_norm": 0.5453073382377625, + "learning_rate": 2.044847044847045e-05, + "loss": 0.0782, "step": 5360 }, { - "epoch": 1.3557182529664227, - "grad_norm": 0.47474193572998047, - "learning_rate": 2.1865690482201466e-05, - "loss": 0.0619, + "epoch": 1.594891594891595, + "grad_norm": 0.5534022450447083, + "learning_rate": 2.0430650430650433e-05, + "loss": 0.058, "step": 5370 }, { - "epoch": 1.3582428679626357, - "grad_norm": 0.3935701847076416, - "learning_rate": 2.1850542792224185e-05, - "loss": 0.0655, + "epoch": 1.597861597861598, + "grad_norm": 0.6920284032821655, + "learning_rate": 2.0412830412830412e-05, + "loss": 0.0824, "step": 5380 }, { - "epoch": 1.3607674829588488, - "grad_norm": 0.5216870903968811, - "learning_rate": 2.1835395102246906e-05, - "loss": 0.0727, + "epoch": 1.6008316008316008, + "grad_norm": 0.4295329451560974, + "learning_rate": 2.0395010395010394e-05, + "loss": 0.0708, "step": 5390 }, { - "epoch": 1.3632920979550618, - "grad_norm": 0.40178000926971436, - "learning_rate": 2.182024741226963e-05, - "loss": 0.0686, + "epoch": 1.6038016038016036, + "grad_norm": 0.6782526969909668, + "learning_rate": 2.0377190377190376e-05, + "loss": 0.0687, "step": 5400 }, { - "epoch": 1.365816712951275, - "grad_norm": 0.8555682301521301, - "learning_rate": 2.180509972229235e-05, - "loss": 0.0744, + "epoch": 1.606771606771607, + "grad_norm": 0.37526410818099976, + "learning_rate": 2.035937035937036e-05, + "loss": 0.0706, "step": 5410 }, { - "epoch": 1.368341327947488, - "grad_norm": 0.5784962773323059, - "learning_rate": 2.178995203231507e-05, - "loss": 0.0762, + "epoch": 1.6097416097416097, + "grad_norm": 0.581466555595398, + "learning_rate": 2.0341550341550344e-05, + "loss": 0.0658, "step": 5420 }, { - "epoch": 1.3708659429437011, - "grad_norm": 0.476243793964386, - "learning_rate": 2.1774804342337796e-05, - "loss": 0.0712, + "epoch": 1.6127116127116126, + "grad_norm": 0.6016833186149597, + "learning_rate": 2.0323730323730326e-05, + "loss": 0.0778, "step": 5430 }, { - "epoch": 1.3733905579399142, - "grad_norm": 0.4363716244697571, - "learning_rate": 2.1759656652360514e-05, - "loss": 0.0664, + "epoch": 1.6156816156816158, + "grad_norm": 0.46572285890579224, + "learning_rate": 2.0305910305910308e-05, + "loss": 0.0704, "step": 5440 }, { - "epoch": 1.3759151729361272, - "grad_norm": 0.533042848110199, - "learning_rate": 2.1744508962383236e-05, - "loss": 0.0881, + "epoch": 1.6186516186516187, + "grad_norm": 0.4586747884750366, + "learning_rate": 2.0288090288090287e-05, + "loss": 0.0582, "step": 5450 }, { - "epoch": 1.3784397879323405, - "grad_norm": 0.34616196155548096, - "learning_rate": 2.172936127240596e-05, - "loss": 0.0749, + "epoch": 1.6216216216216215, + "grad_norm": 0.5045327544212341, + "learning_rate": 2.027027027027027e-05, + "loss": 0.0774, "step": 5460 }, { - "epoch": 1.3809644029285533, - "grad_norm": 0.31354981660842896, - "learning_rate": 2.171421358242868e-05, - "loss": 0.0634, + "epoch": 1.6245916245916245, + "grad_norm": 0.4661884307861328, + "learning_rate": 2.025245025245025e-05, + "loss": 0.0706, "step": 5470 }, { - "epoch": 1.3834890179247665, - "grad_norm": 0.5656862854957581, - "learning_rate": 2.16990658924514e-05, - "loss": 0.0716, + "epoch": 1.6275616275616276, + "grad_norm": 0.3280268609523773, + "learning_rate": 2.0234630234630234e-05, + "loss": 0.0663, "step": 5480 }, { - "epoch": 1.3860136329209796, - "grad_norm": 0.516463041305542, - "learning_rate": 2.1683918202474126e-05, - "loss": 0.0614, + "epoch": 1.6305316305316304, + "grad_norm": 0.4486147165298462, + "learning_rate": 2.021681021681022e-05, + "loss": 0.0698, "step": 5490 }, { - "epoch": 1.3885382479171926, - "grad_norm": 0.3807201385498047, - "learning_rate": 2.1668770512496844e-05, - "loss": 0.0608, + "epoch": 1.6335016335016335, + "grad_norm": 0.5801326036453247, + "learning_rate": 2.01989901989902e-05, + "loss": 0.0808, "step": 5500 }, { - "epoch": 1.3910628629134056, - "grad_norm": 0.3986429274082184, - "learning_rate": 2.1653622822519565e-05, - "loss": 0.0701, + "epoch": 1.6364716364716365, + "grad_norm": 0.43352991342544556, + "learning_rate": 2.0181170181170183e-05, + "loss": 0.0522, "step": 5510 }, { - "epoch": 1.3935874779096187, - "grad_norm": 0.58119797706604, - "learning_rate": 2.163847513254229e-05, - "loss": 0.0692, + "epoch": 1.6394416394416393, + "grad_norm": 0.5242543816566467, + "learning_rate": 2.0163350163350162e-05, + "loss": 0.0744, "step": 5520 }, { - "epoch": 1.396112092905832, - "grad_norm": 0.5791721940040588, - "learning_rate": 2.162332744256501e-05, - "loss": 0.0713, + "epoch": 1.6424116424116424, + "grad_norm": 0.5735893249511719, + "learning_rate": 2.0145530145530144e-05, + "loss": 0.0663, "step": 5530 }, { - "epoch": 1.398636707902045, - "grad_norm": 0.39115121960639954, - "learning_rate": 2.160817975258773e-05, - "loss": 0.0678, + "epoch": 1.6453816453816454, + "grad_norm": 0.3472582697868347, + "learning_rate": 2.0127710127710127e-05, + "loss": 0.0685, "step": 5540 }, { - "epoch": 1.401161322898258, - "grad_norm": 0.37049493193626404, - "learning_rate": 2.1593032062610455e-05, - "loss": 0.0579, + "epoch": 1.6483516483516483, + "grad_norm": 0.4069629907608032, + "learning_rate": 2.010989010989011e-05, + "loss": 0.0619, "step": 5550 }, { - "epoch": 1.403685937894471, - "grad_norm": 0.7497106194496155, - "learning_rate": 2.1577884372633173e-05, - "loss": 0.0738, + "epoch": 1.6513216513216513, + "grad_norm": 0.5932218432426453, + "learning_rate": 2.0092070092070094e-05, + "loss": 0.0711, "step": 5560 }, { - "epoch": 1.406210552890684, - "grad_norm": 0.4488617777824402, - "learning_rate": 2.1562736682655895e-05, - "loss": 0.0826, + "epoch": 1.6542916542916544, + "grad_norm": 0.7638351321220398, + "learning_rate": 2.0074250074250076e-05, + "loss": 0.0837, "step": 5570 }, { - "epoch": 1.4087351678868973, - "grad_norm": 0.42779994010925293, - "learning_rate": 2.154758899267862e-05, - "loss": 0.0782, + "epoch": 1.6572616572616572, + "grad_norm": 0.7104766368865967, + "learning_rate": 2.005643005643006e-05, + "loss": 0.0659, "step": 5580 }, { - "epoch": 1.4112597828831104, - "grad_norm": 0.6836367249488831, - "learning_rate": 2.1532441302701338e-05, - "loss": 0.0673, + "epoch": 1.6602316602316602, + "grad_norm": 0.6623921394348145, + "learning_rate": 2.0038610038610037e-05, + "loss": 0.0693, "step": 5590 }, { - "epoch": 1.4137843978793234, - "grad_norm": 0.38072410225868225, - "learning_rate": 2.151729361272406e-05, - "loss": 0.0695, + "epoch": 1.6632016632016633, + "grad_norm": 0.5632063746452332, + "learning_rate": 2.002079002079002e-05, + "loss": 0.0647, "step": 5600 }, { - "epoch": 1.4163090128755365, - "grad_norm": 0.41650840640068054, - "learning_rate": 2.1502145922746785e-05, - "loss": 0.0757, + "epoch": 1.6661716661716661, + "grad_norm": 0.36101019382476807, + "learning_rate": 2.0002970002970002e-05, + "loss": 0.0724, "step": 5610 }, { - "epoch": 1.4188336278717495, - "grad_norm": 0.6885185241699219, - "learning_rate": 2.1486998232769503e-05, - "loss": 0.0662, + "epoch": 1.6691416691416692, + "grad_norm": 0.4157385230064392, + "learning_rate": 1.9985149985149984e-05, + "loss": 0.0609, "step": 5620 }, { - "epoch": 1.4213582428679628, - "grad_norm": 0.5995170474052429, - "learning_rate": 2.1471850542792225e-05, - "loss": 0.0723, + "epoch": 1.6721116721116722, + "grad_norm": 0.4751082956790924, + "learning_rate": 1.996732996732997e-05, + "loss": 0.0681, "step": 5630 }, { - "epoch": 1.4238828578641758, - "grad_norm": 0.6088976263999939, - "learning_rate": 2.1456702852814946e-05, - "loss": 0.0684, + "epoch": 1.675081675081675, + "grad_norm": 0.5545091032981873, + "learning_rate": 1.994950994950995e-05, + "loss": 0.0726, "step": 5640 }, { - "epoch": 1.4264074728603888, - "grad_norm": 0.5369215607643127, - "learning_rate": 2.1441555162837668e-05, - "loss": 0.0682, + "epoch": 1.678051678051678, + "grad_norm": 0.7477673888206482, + "learning_rate": 1.9931689931689934e-05, + "loss": 0.0707, "step": 5650 }, { - "epoch": 1.4289320878566019, - "grad_norm": 0.7823798060417175, - "learning_rate": 2.142640747286039e-05, - "loss": 0.0675, + "epoch": 1.6810216810216811, + "grad_norm": 0.8139877915382385, + "learning_rate": 1.9913869913869913e-05, + "loss": 0.0708, "step": 5660 }, { - "epoch": 1.431456702852815, - "grad_norm": 0.39496108889579773, - "learning_rate": 2.141125978288311e-05, - "loss": 0.0727, + "epoch": 1.683991683991684, + "grad_norm": 0.26891762018203735, + "learning_rate": 1.9896049896049895e-05, + "loss": 0.0703, "step": 5670 }, { - "epoch": 1.4339813178490282, - "grad_norm": 0.509624183177948, - "learning_rate": 2.1396112092905833e-05, - "loss": 0.0736, + "epoch": 1.6869616869616868, + "grad_norm": 0.47424066066741943, + "learning_rate": 1.9878229878229877e-05, + "loss": 0.0772, "step": 5680 }, { - "epoch": 1.436505932845241, - "grad_norm": 0.4820767641067505, - "learning_rate": 2.1380964402928554e-05, - "loss": 0.0683, + "epoch": 1.68993168993169, + "grad_norm": 0.41330039501190186, + "learning_rate": 1.986040986040986e-05, + "loss": 0.078, "step": 5690 }, { - "epoch": 1.4390305478414542, - "grad_norm": 0.5313420295715332, - "learning_rate": 2.1365816712951272e-05, - "loss": 0.0678, + "epoch": 1.692901692901693, + "grad_norm": 0.45802241563796997, + "learning_rate": 1.9842589842589845e-05, + "loss": 0.0588, "step": 5700 }, { - "epoch": 1.4415551628376673, - "grad_norm": 0.4537731409072876, - "learning_rate": 2.1350669022973997e-05, - "loss": 0.0765, + "epoch": 1.6958716958716957, + "grad_norm": 0.5270569920539856, + "learning_rate": 1.9824769824769827e-05, + "loss": 0.066, "step": 5710 }, { - "epoch": 1.4440797778338803, - "grad_norm": 0.6362118721008301, - "learning_rate": 2.133552133299672e-05, - "loss": 0.0552, + "epoch": 1.698841698841699, + "grad_norm": 0.5334698557853699, + "learning_rate": 1.980694980694981e-05, + "loss": 0.0795, "step": 5720 }, { - "epoch": 1.4466043928300933, - "grad_norm": 0.3806234300136566, - "learning_rate": 2.1320373643019437e-05, - "loss": 0.0748, + "epoch": 1.7018117018117018, + "grad_norm": 0.4093966484069824, + "learning_rate": 1.978912978912979e-05, + "loss": 0.0752, "step": 5730 }, { - "epoch": 1.4491290078263064, - "grad_norm": 0.4131557047367096, - "learning_rate": 2.1305225953042162e-05, - "loss": 0.0589, + "epoch": 1.7047817047817047, + "grad_norm": 0.5499134659767151, + "learning_rate": 1.977130977130977e-05, + "loss": 0.071, "step": 5740 }, { - "epoch": 1.4516536228225196, - "grad_norm": 0.5624988675117493, - "learning_rate": 2.1290078263064884e-05, - "loss": 0.0642, + "epoch": 1.7077517077517077, + "grad_norm": 0.5507758259773254, + "learning_rate": 1.9753489753489752e-05, + "loss": 0.0762, "step": 5750 }, { - "epoch": 1.4541782378187327, - "grad_norm": 0.29660847783088684, - "learning_rate": 2.1274930573087602e-05, - "loss": 0.0629, + "epoch": 1.7107217107217108, + "grad_norm": 0.726193904876709, + "learning_rate": 1.9735669735669734e-05, + "loss": 0.073, "step": 5760 }, { - "epoch": 1.4567028528149457, - "grad_norm": 0.5936652421951294, - "learning_rate": 2.1259782883110327e-05, - "loss": 0.0698, + "epoch": 1.7136917136917136, + "grad_norm": 0.499423086643219, + "learning_rate": 1.971784971784972e-05, + "loss": 0.0669, "step": 5770 }, { - "epoch": 1.4592274678111588, - "grad_norm": 0.46520429849624634, - "learning_rate": 2.124463519313305e-05, - "loss": 0.0637, + "epoch": 1.7166617166617166, + "grad_norm": 0.4177100956439972, + "learning_rate": 1.9700029700029702e-05, + "loss": 0.0643, "step": 5780 }, { - "epoch": 1.4617520828073718, - "grad_norm": 0.5674166679382324, - "learning_rate": 2.1229487503155767e-05, - "loss": 0.0744, + "epoch": 1.7196317196317197, + "grad_norm": 0.7960310578346252, + "learning_rate": 1.9682209682209684e-05, + "loss": 0.0724, "step": 5790 }, { - "epoch": 1.464276697803585, - "grad_norm": 0.894939661026001, - "learning_rate": 2.121433981317849e-05, - "loss": 0.0704, + "epoch": 1.7226017226017225, + "grad_norm": 0.4406733512878418, + "learning_rate": 1.9664389664389666e-05, + "loss": 0.0776, "step": 5800 }, { - "epoch": 1.466801312799798, - "grad_norm": 0.482416570186615, - "learning_rate": 2.1199192123201213e-05, - "loss": 0.0628, + "epoch": 1.7255717255717256, + "grad_norm": 0.530737042427063, + "learning_rate": 1.9646569646569645e-05, + "loss": 0.0693, "step": 5810 }, { - "epoch": 1.4693259277960111, - "grad_norm": 0.48222440481185913, - "learning_rate": 2.118404443322393e-05, - "loss": 0.0788, + "epoch": 1.7285417285417286, + "grad_norm": 0.29855164885520935, + "learning_rate": 1.9628749628749627e-05, + "loss": 0.0719, "step": 5820 }, { - "epoch": 1.4718505427922242, - "grad_norm": 0.527004063129425, - "learning_rate": 2.1168896743246656e-05, - "loss": 0.0658, + "epoch": 1.7315117315117314, + "grad_norm": 0.5606129765510559, + "learning_rate": 1.961092961092961e-05, + "loss": 0.0773, "step": 5830 }, { - "epoch": 1.4743751577884372, - "grad_norm": 0.5293876528739929, - "learning_rate": 2.1153749053269378e-05, - "loss": 0.063, + "epoch": 1.7344817344817345, + "grad_norm": 0.4716852307319641, + "learning_rate": 1.9593109593109595e-05, + "loss": 0.0699, "step": 5840 }, { - "epoch": 1.4768997727846505, - "grad_norm": 0.33477652072906494, - "learning_rate": 2.1138601363292096e-05, - "loss": 0.072, + "epoch": 1.7374517374517375, + "grad_norm": 0.39249199628829956, + "learning_rate": 1.9575289575289577e-05, + "loss": 0.078, "step": 5850 }, { - "epoch": 1.4794243877808635, - "grad_norm": 0.5224368572235107, - "learning_rate": 2.112345367331482e-05, - "loss": 0.0644, + "epoch": 1.7404217404217404, + "grad_norm": 0.5014438629150391, + "learning_rate": 1.955746955746956e-05, + "loss": 0.0753, "step": 5860 }, { - "epoch": 1.4819490027770765, - "grad_norm": 0.31001779437065125, - "learning_rate": 2.1108305983337543e-05, - "loss": 0.0698, + "epoch": 1.7433917433917434, + "grad_norm": 0.535271167755127, + "learning_rate": 1.953964953964954e-05, + "loss": 0.0722, "step": 5870 }, { - "epoch": 1.4844736177732896, - "grad_norm": 0.8478286862373352, - "learning_rate": 2.109315829336026e-05, - "loss": 0.0787, + "epoch": 1.7463617463617465, + "grad_norm": 0.3693440854549408, + "learning_rate": 1.952182952182952e-05, + "loss": 0.0824, "step": 5880 }, { - "epoch": 1.4869982327695026, - "grad_norm": 0.5729703903198242, - "learning_rate": 2.1078010603382986e-05, - "loss": 0.0689, + "epoch": 1.7493317493317493, + "grad_norm": 0.6997837424278259, + "learning_rate": 1.9504009504009503e-05, + "loss": 0.0719, "step": 5890 }, { - "epoch": 1.4895228477657159, - "grad_norm": 0.5856155753135681, - "learning_rate": 2.1062862913405708e-05, - "loss": 0.0642, + "epoch": 1.7523017523017523, + "grad_norm": 0.3417227864265442, + "learning_rate": 1.9486189486189485e-05, + "loss": 0.0701, "step": 5900 }, { - "epoch": 1.4920474627619287, - "grad_norm": 0.27725616097450256, - "learning_rate": 2.1047715223428426e-05, - "loss": 0.0663, + "epoch": 1.7552717552717554, + "grad_norm": 1.077194094657898, + "learning_rate": 1.946836946836947e-05, + "loss": 0.0787, "step": 5910 }, { - "epoch": 1.494572077758142, - "grad_norm": 0.6175593733787537, - "learning_rate": 2.103256753345115e-05, - "loss": 0.0742, + "epoch": 1.7582417582417582, + "grad_norm": 0.7957248687744141, + "learning_rate": 1.9450549450549452e-05, + "loss": 0.0878, "step": 5920 }, { - "epoch": 1.497096692754355, - "grad_norm": 0.6941758990287781, - "learning_rate": 2.101741984347387e-05, - "loss": 0.068, + "epoch": 1.7612117612117613, + "grad_norm": 0.7661225199699402, + "learning_rate": 1.9432729432729435e-05, + "loss": 0.0701, "step": 5930 }, { - "epoch": 1.499621307750568, - "grad_norm": 0.48814696073532104, - "learning_rate": 2.100227215349659e-05, - "loss": 0.0775, + "epoch": 1.7641817641817643, + "grad_norm": 0.4629841446876526, + "learning_rate": 1.9414909414909417e-05, + "loss": 0.0704, "step": 5940 }, { - "epoch": 1.5021459227467813, - "grad_norm": 0.5020901560783386, - "learning_rate": 2.0987124463519316e-05, - "loss": 0.0679, + "epoch": 1.7671517671517671, + "grad_norm": 0.9389346241950989, + "learning_rate": 1.9397089397089396e-05, + "loss": 0.076, "step": 5950 }, { - "epoch": 1.504670537742994, - "grad_norm": 0.4444803297519684, - "learning_rate": 2.0971976773542034e-05, - "loss": 0.0706, + "epoch": 1.77012177012177, + "grad_norm": 0.3709728717803955, + "learning_rate": 1.9379269379269378e-05, + "loss": 0.064, "step": 5960 }, { - "epoch": 1.5071951527392073, - "grad_norm": 0.34238091111183167, - "learning_rate": 2.0956829083564755e-05, - "loss": 0.0577, + "epoch": 1.7730917730917732, + "grad_norm": 0.4123302102088928, + "learning_rate": 1.936144936144936e-05, + "loss": 0.0762, "step": 5970 }, { - "epoch": 1.5097197677354204, - "grad_norm": 0.8788526654243469, - "learning_rate": 2.094168139358748e-05, - "loss": 0.066, + "epoch": 1.776061776061776, + "grad_norm": 0.5153429508209229, + "learning_rate": 1.9343629343629345e-05, + "loss": 0.0825, "step": 5980 }, { - "epoch": 1.5122443827316334, - "grad_norm": 1.2382627725601196, - "learning_rate": 2.09265337036102e-05, - "loss": 0.0768, + "epoch": 1.779031779031779, + "grad_norm": 0.2630942761898041, + "learning_rate": 1.9325809325809328e-05, + "loss": 0.0578, "step": 5990 }, { - "epoch": 1.5147689977278465, - "grad_norm": 0.47514763474464417, - "learning_rate": 2.091138601363292e-05, - "loss": 0.0797, + "epoch": 1.7820017820017822, + "grad_norm": 0.4419863522052765, + "learning_rate": 1.930798930798931e-05, + "loss": 0.0548, "step": 6000 }, { - "epoch": 1.5172936127240595, - "grad_norm": 0.8225613236427307, - "learning_rate": 2.0896238323655645e-05, - "loss": 0.0755, + "epoch": 1.784971784971785, + "grad_norm": 0.46090295910835266, + "learning_rate": 1.9290169290169292e-05, + "loss": 0.073, "step": 6010 }, { - "epoch": 1.5198182277202728, - "grad_norm": 0.31106212735176086, - "learning_rate": 2.0881090633678363e-05, - "loss": 0.0682, + "epoch": 1.7879417879417878, + "grad_norm": 1.1012392044067383, + "learning_rate": 1.927234927234927e-05, + "loss": 0.0725, "step": 6020 }, { - "epoch": 1.5223428427164858, - "grad_norm": 0.45431169867515564, - "learning_rate": 2.0865942943701085e-05, - "loss": 0.0703, + "epoch": 1.7909117909117909, + "grad_norm": 0.422880083322525, + "learning_rate": 1.9254529254529253e-05, + "loss": 0.0674, "step": 6030 }, { - "epoch": 1.5248674577126988, - "grad_norm": 0.3643419146537781, - "learning_rate": 2.085079525372381e-05, - "loss": 0.0719, + "epoch": 1.793881793881794, + "grad_norm": 0.6051161885261536, + "learning_rate": 1.9236709236709235e-05, + "loss": 0.0669, "step": 6040 }, { - "epoch": 1.5273920727089119, - "grad_norm": 0.42831987142562866, - "learning_rate": 2.0835647563746528e-05, - "loss": 0.0858, + "epoch": 1.7968517968517967, + "grad_norm": 0.351578027009964, + "learning_rate": 1.921888921888922e-05, + "loss": 0.069, "step": 6050 }, { - "epoch": 1.529916687705125, - "grad_norm": 0.5199233889579773, - "learning_rate": 2.082049987376925e-05, - "loss": 0.0674, + "epoch": 1.7998217998217998, + "grad_norm": 0.606691300868988, + "learning_rate": 1.9201069201069203e-05, + "loss": 0.0641, "step": 6060 }, { - "epoch": 1.5324413027013382, - "grad_norm": 0.3392798900604248, - "learning_rate": 2.0805352183791975e-05, - "loss": 0.0812, + "epoch": 1.8027918027918028, + "grad_norm": 0.8968992829322815, + "learning_rate": 1.9183249183249185e-05, + "loss": 0.0734, "step": 6070 }, { - "epoch": 1.534965917697551, - "grad_norm": 0.32933366298675537, - "learning_rate": 2.0790204493814693e-05, - "loss": 0.0639, + "epoch": 1.8057618057618057, + "grad_norm": 0.5204905867576599, + "learning_rate": 1.9165429165429167e-05, + "loss": 0.0741, "step": 6080 }, { - "epoch": 1.5374905326937642, - "grad_norm": 0.5892539024353027, - "learning_rate": 2.0775056803837415e-05, - "loss": 0.0734, + "epoch": 1.8087318087318087, + "grad_norm": 0.6135872602462769, + "learning_rate": 1.9147609147609146e-05, + "loss": 0.0791, "step": 6090 }, { - "epoch": 1.5400151476899773, - "grad_norm": 0.6368609070777893, - "learning_rate": 2.075990911386014e-05, - "loss": 0.0668, + "epoch": 1.8117018117018118, + "grad_norm": 0.5273720622062683, + "learning_rate": 1.9129789129789128e-05, + "loss": 0.0655, "step": 6100 }, { - "epoch": 1.5425397626861903, - "grad_norm": 0.6262642741203308, - "learning_rate": 2.0744761423882858e-05, - "loss": 0.069, + "epoch": 1.8146718146718146, + "grad_norm": 0.4117693305015564, + "learning_rate": 1.911196911196911e-05, + "loss": 0.0658, "step": 6110 }, { - "epoch": 1.5450643776824036, - "grad_norm": 0.6016247272491455, - "learning_rate": 2.072961373390558e-05, - "loss": 0.0602, + "epoch": 1.8176418176418176, + "grad_norm": 0.5177286267280579, + "learning_rate": 1.9094149094149096e-05, + "loss": 0.0773, "step": 6120 }, { - "epoch": 1.5475889926786164, - "grad_norm": 0.45948028564453125, - "learning_rate": 2.0714466043928304e-05, - "loss": 0.0799, + "epoch": 1.8206118206118207, + "grad_norm": 0.5179166793823242, + "learning_rate": 1.9076329076329078e-05, + "loss": 0.06, "step": 6130 }, { - "epoch": 1.5501136076748296, - "grad_norm": 0.30423131585121155, - "learning_rate": 2.0699318353951022e-05, - "loss": 0.0658, + "epoch": 1.8235818235818235, + "grad_norm": 0.48499223589897156, + "learning_rate": 1.905850905850906e-05, + "loss": 0.0876, "step": 6140 }, { - "epoch": 1.5526382226710427, - "grad_norm": 0.6848326921463013, - "learning_rate": 2.0684170663973744e-05, - "loss": 0.0737, + "epoch": 1.8265518265518266, + "grad_norm": 0.5573757886886597, + "learning_rate": 1.9040689040689042e-05, + "loss": 0.0684, "step": 6150 }, { - "epoch": 1.5551628376672557, - "grad_norm": 0.6539986729621887, - "learning_rate": 2.066902297399647e-05, - "loss": 0.0709, + "epoch": 1.8295218295218296, + "grad_norm": 0.481963574886322, + "learning_rate": 1.902286902286902e-05, + "loss": 0.0807, "step": 6160 }, { - "epoch": 1.557687452663469, - "grad_norm": 0.49498459696769714, - "learning_rate": 2.0653875284019187e-05, - "loss": 0.0653, + "epoch": 1.8324918324918325, + "grad_norm": 0.4293064475059509, + "learning_rate": 1.9005049005049003e-05, + "loss": 0.0779, "step": 6170 }, { - "epoch": 1.5602120676596818, - "grad_norm": 0.43628498911857605, - "learning_rate": 2.063872759404191e-05, - "loss": 0.0655, + "epoch": 1.8354618354618355, + "grad_norm": 0.4655805826187134, + "learning_rate": 1.8987228987228986e-05, + "loss": 0.0782, "step": 6180 }, { - "epoch": 1.562736682655895, - "grad_norm": 0.350460022687912, - "learning_rate": 2.0623579904064634e-05, - "loss": 0.0563, + "epoch": 1.8384318384318385, + "grad_norm": 0.5430210828781128, + "learning_rate": 1.896940896940897e-05, + "loss": 0.0601, "step": 6190 }, { - "epoch": 1.565261297652108, - "grad_norm": 0.4810716509819031, - "learning_rate": 2.0608432214087352e-05, - "loss": 0.0709, + "epoch": 1.8414018414018414, + "grad_norm": 0.9118245244026184, + "learning_rate": 1.8951588951588953e-05, + "loss": 0.076, "step": 6200 }, { - "epoch": 1.5677859126483211, - "grad_norm": 0.421172171831131, - "learning_rate": 2.0593284524110074e-05, - "loss": 0.0732, + "epoch": 1.8443718443718444, + "grad_norm": 0.3974968194961548, + "learning_rate": 1.8933768933768935e-05, + "loss": 0.0742, "step": 6210 }, { - "epoch": 1.5703105276445342, - "grad_norm": 0.5165485143661499, - "learning_rate": 2.0578136834132795e-05, - "loss": 0.0628, + "epoch": 1.8473418473418475, + "grad_norm": 0.393530935049057, + "learning_rate": 1.8915948915948918e-05, + "loss": 0.0755, "step": 6220 }, { - "epoch": 1.5728351426407472, - "grad_norm": 0.41548728942871094, - "learning_rate": 2.0562989144155517e-05, - "loss": 0.0634, + "epoch": 1.8503118503118503, + "grad_norm": 0.6730740070343018, + "learning_rate": 1.8898128898128896e-05, + "loss": 0.0675, "step": 6230 }, { - "epoch": 1.5753597576369605, - "grad_norm": 0.4678884744644165, - "learning_rate": 2.054784145417824e-05, - "loss": 0.0735, + "epoch": 1.8532818532818531, + "grad_norm": 0.5142623782157898, + "learning_rate": 1.888030888030888e-05, + "loss": 0.0694, "step": 6240 }, { - "epoch": 1.5778843726331735, - "grad_norm": 0.6086229085922241, - "learning_rate": 2.0532693764200957e-05, - "loss": 0.0705, + "epoch": 1.8562518562518564, + "grad_norm": 0.344099223613739, + "learning_rate": 1.886248886248886e-05, + "loss": 0.0591, "step": 6250 }, { - "epoch": 1.5804089876293865, - "grad_norm": 0.5168741941452026, - "learning_rate": 2.051754607422368e-05, - "loss": 0.0763, + "epoch": 1.8592218592218592, + "grad_norm": 0.5664836168289185, + "learning_rate": 1.8844668844668846e-05, + "loss": 0.0725, "step": 6260 }, { - "epoch": 1.5829336026255996, - "grad_norm": 0.3280368745326996, - "learning_rate": 2.0502398384246403e-05, - "loss": 0.07, + "epoch": 1.862191862191862, + "grad_norm": 0.2773604989051819, + "learning_rate": 1.882684882684883e-05, + "loss": 0.0653, "step": 6270 }, { - "epoch": 1.5854582176218126, - "grad_norm": 0.545002818107605, - "learning_rate": 2.048725069426912e-05, - "loss": 0.0777, + "epoch": 1.865161865161865, + "grad_norm": 0.35496875643730164, + "learning_rate": 1.880902880902881e-05, + "loss": 0.0711, "step": 6280 }, { - "epoch": 1.5879828326180259, - "grad_norm": 0.2854851186275482, - "learning_rate": 2.0472103004291846e-05, - "loss": 0.0566, + "epoch": 1.8681318681318682, + "grad_norm": 0.2887316644191742, + "learning_rate": 1.8791208791208793e-05, + "loss": 0.0661, "step": 6290 }, { - "epoch": 1.5905074476142387, - "grad_norm": 0.44336074590682983, - "learning_rate": 2.0456955314314568e-05, - "loss": 0.063, + "epoch": 1.871101871101871, + "grad_norm": 0.5518425107002258, + "learning_rate": 1.8773388773388775e-05, + "loss": 0.0663, "step": 6300 }, { - "epoch": 1.593032062610452, - "grad_norm": 0.4831758141517639, - "learning_rate": 2.0441807624337286e-05, - "loss": 0.0621, + "epoch": 1.874071874071874, + "grad_norm": 0.6332146525382996, + "learning_rate": 1.8755568755568754e-05, + "loss": 0.079, "step": 6310 }, { - "epoch": 1.595556677606665, - "grad_norm": 0.7535089254379272, - "learning_rate": 2.042665993436001e-05, - "loss": 0.0761, + "epoch": 1.877041877041877, + "grad_norm": 0.49415746331214905, + "learning_rate": 1.8737748737748736e-05, + "loss": 0.071, "step": 6320 }, { - "epoch": 1.598081292602878, - "grad_norm": 0.417810320854187, - "learning_rate": 2.0411512244382733e-05, - "loss": 0.0812, + "epoch": 1.88001188001188, + "grad_norm": 0.6736321449279785, + "learning_rate": 1.871992871992872e-05, + "loss": 0.0749, "step": 6330 }, { - "epoch": 1.6006059075990913, - "grad_norm": 0.5797942876815796, - "learning_rate": 2.039636455440545e-05, - "loss": 0.0715, + "epoch": 1.882981882981883, + "grad_norm": 0.9153728485107422, + "learning_rate": 1.8702108702108704e-05, + "loss": 0.0689, "step": 6340 }, { - "epoch": 1.603130522595304, - "grad_norm": 0.4397270083427429, - "learning_rate": 2.0381216864428176e-05, - "loss": 0.0801, + "epoch": 1.885951885951886, + "grad_norm": 0.3608382046222687, + "learning_rate": 1.8684288684288686e-05, + "loss": 0.064, "step": 6350 }, { - "epoch": 1.6056551375915173, - "grad_norm": 0.39924344420433044, - "learning_rate": 2.0366069174450898e-05, - "loss": 0.0582, + "epoch": 1.8889218889218888, + "grad_norm": 0.3779090344905853, + "learning_rate": 1.8666468666468668e-05, + "loss": 0.072, "step": 6360 }, { - "epoch": 1.6081797525877304, - "grad_norm": 0.3915840685367584, - "learning_rate": 2.0350921484473616e-05, - "loss": 0.0647, + "epoch": 1.8918918918918919, + "grad_norm": 0.5436325669288635, + "learning_rate": 1.864864864864865e-05, + "loss": 0.0738, "step": 6370 }, { - "epoch": 1.6107043675839434, - "grad_norm": 0.4041799008846283, - "learning_rate": 2.033577379449634e-05, - "loss": 0.0604, + "epoch": 1.894861894861895, + "grad_norm": 0.720585823059082, + "learning_rate": 1.863082863082863e-05, + "loss": 0.0651, "step": 6380 }, { - "epoch": 1.6132289825801567, - "grad_norm": 0.45373886823654175, - "learning_rate": 2.0320626104519062e-05, - "loss": 0.0729, + "epoch": 1.8978318978318978, + "grad_norm": 0.3137255609035492, + "learning_rate": 1.861300861300861e-05, + "loss": 0.0634, "step": 6390 }, { - "epoch": 1.6157535975763695, - "grad_norm": 0.4872368276119232, - "learning_rate": 2.030547841454178e-05, - "loss": 0.0735, + "epoch": 1.9008019008019008, + "grad_norm": 0.6744168400764465, + "learning_rate": 1.8595188595188597e-05, + "loss": 0.0652, "step": 6400 }, { - "epoch": 1.6182782125725828, - "grad_norm": 0.6927065253257751, - "learning_rate": 2.0290330724564506e-05, - "loss": 0.0598, + "epoch": 1.9037719037719039, + "grad_norm": 0.33474233746528625, + "learning_rate": 1.857736857736858e-05, + "loss": 0.0663, "step": 6410 }, { - "epoch": 1.6208028275687958, - "grad_norm": 0.6960461139678955, - "learning_rate": 2.0275183034587227e-05, - "loss": 0.069, + "epoch": 1.9067419067419067, + "grad_norm": 0.3249685764312744, + "learning_rate": 1.855954855954856e-05, + "loss": 0.0772, "step": 6420 }, { - "epoch": 1.6233274425650088, - "grad_norm": 0.8550765514373779, - "learning_rate": 2.0260035344609945e-05, - "loss": 0.0718, + "epoch": 1.9097119097119097, + "grad_norm": 0.8481233716011047, + "learning_rate": 1.8541728541728543e-05, + "loss": 0.0715, "step": 6430 }, { - "epoch": 1.6258520575612219, - "grad_norm": 0.4117189049720764, - "learning_rate": 2.024488765463267e-05, - "loss": 0.0673, + "epoch": 1.9126819126819128, + "grad_norm": 0.40865710377693176, + "learning_rate": 1.8523908523908525e-05, + "loss": 0.0672, "step": 6440 }, { - "epoch": 1.628376672557435, - "grad_norm": 0.3952585458755493, - "learning_rate": 2.0229739964655392e-05, - "loss": 0.0743, + "epoch": 1.9156519156519156, + "grad_norm": 0.40034782886505127, + "learning_rate": 1.8506088506088504e-05, + "loss": 0.0599, "step": 6450 }, { - "epoch": 1.6309012875536482, - "grad_norm": 0.30420053005218506, - "learning_rate": 2.021459227467811e-05, - "loss": 0.0682, + "epoch": 1.9186219186219187, + "grad_norm": 0.37332504987716675, + "learning_rate": 1.8488268488268486e-05, + "loss": 0.0726, "step": 6460 }, { - "epoch": 1.6334259025498612, - "grad_norm": 0.8514861464500427, - "learning_rate": 2.0199444584700835e-05, - "loss": 0.068, + "epoch": 1.9215919215919217, + "grad_norm": 0.39983534812927246, + "learning_rate": 1.8470448470448472e-05, + "loss": 0.0673, "step": 6470 }, { - "epoch": 1.6359505175460742, - "grad_norm": 0.2670237421989441, - "learning_rate": 2.0184296894723557e-05, - "loss": 0.0704, + "epoch": 1.9245619245619245, + "grad_norm": 0.3581918776035309, + "learning_rate": 1.8452628452628454e-05, + "loss": 0.0644, "step": 6480 }, { - "epoch": 1.6384751325422873, - "grad_norm": 0.5565549731254578, - "learning_rate": 2.0169149204746275e-05, - "loss": 0.0726, + "epoch": 1.9275319275319274, + "grad_norm": 0.4143809676170349, + "learning_rate": 1.8434808434808436e-05, + "loss": 0.0582, "step": 6490 }, { - "epoch": 1.6409997475385003, - "grad_norm": 0.36750999093055725, - "learning_rate": 2.0154001514769e-05, - "loss": 0.0626, + "epoch": 1.9305019305019306, + "grad_norm": 0.42415744066238403, + "learning_rate": 1.841698841698842e-05, + "loss": 0.067, "step": 6500 }, { - "epoch": 1.6435243625347136, - "grad_norm": 0.35153648257255554, - "learning_rate": 2.0138853824791718e-05, - "loss": 0.0712, + "epoch": 1.9334719334719335, + "grad_norm": 0.5267529487609863, + "learning_rate": 1.83991683991684e-05, + "loss": 0.0562, "step": 6510 }, { - "epoch": 1.6460489775309264, - "grad_norm": 0.6402739882469177, - "learning_rate": 2.012370613481444e-05, - "loss": 0.0639, + "epoch": 1.9364419364419363, + "grad_norm": 0.44437262415885925, + "learning_rate": 1.838134838134838e-05, + "loss": 0.0723, "step": 6520 }, { - "epoch": 1.6485735925271396, - "grad_norm": 0.37415197491645813, - "learning_rate": 2.0108558444837165e-05, - "loss": 0.0691, + "epoch": 1.9394119394119396, + "grad_norm": 0.5024462938308716, + "learning_rate": 1.836352836352836e-05, + "loss": 0.0788, "step": 6530 }, { - "epoch": 1.6510982075233527, - "grad_norm": 0.2915560305118561, - "learning_rate": 2.0093410754859883e-05, - "loss": 0.071, + "epoch": 1.9423819423819424, + "grad_norm": 0.3392117917537689, + "learning_rate": 1.8345708345708347e-05, + "loss": 0.0685, "step": 6540 }, { - "epoch": 1.6536228225195657, - "grad_norm": 0.47494032979011536, - "learning_rate": 2.0078263064882604e-05, - "loss": 0.0677, + "epoch": 1.9453519453519452, + "grad_norm": 0.4275413751602173, + "learning_rate": 1.832788832788833e-05, + "loss": 0.0728, "step": 6550 }, { - "epoch": 1.656147437515779, - "grad_norm": 0.4690226912498474, - "learning_rate": 2.006311537490533e-05, - "loss": 0.0656, + "epoch": 1.9483219483219483, + "grad_norm": 0.3413922190666199, + "learning_rate": 1.831006831006831e-05, + "loss": 0.0694, "step": 6560 }, { - "epoch": 1.6586720525119918, - "grad_norm": 0.38596364855766296, - "learning_rate": 2.0047967684928048e-05, - "loss": 0.0546, + "epoch": 1.9512919512919513, + "grad_norm": 0.4779782295227051, + "learning_rate": 1.8292248292248294e-05, + "loss": 0.0654, "step": 6570 }, { - "epoch": 1.661196667508205, - "grad_norm": 0.5573495626449585, - "learning_rate": 2.003281999495077e-05, - "loss": 0.0693, + "epoch": 1.9542619542619541, + "grad_norm": 0.4912964701652527, + "learning_rate": 1.8274428274428276e-05, + "loss": 0.0729, "step": 6580 }, { - "epoch": 1.663721282504418, - "grad_norm": 0.3469643294811249, - "learning_rate": 2.0017672304973494e-05, - "loss": 0.0696, + "epoch": 1.9572319572319572, + "grad_norm": 0.3358478546142578, + "learning_rate": 1.8256608256608254e-05, + "loss": 0.069, "step": 6590 }, { - "epoch": 1.6662458975006311, - "grad_norm": 0.6143271923065186, - "learning_rate": 2.0002524614996212e-05, - "loss": 0.0685, + "epoch": 1.9602019602019602, + "grad_norm": 0.5066028237342834, + "learning_rate": 1.8238788238788237e-05, + "loss": 0.0626, "step": 6600 }, { - "epoch": 1.6687705124968444, - "grad_norm": 0.44451045989990234, - "learning_rate": 1.9987376925018934e-05, - "loss": 0.0621, + "epoch": 1.963171963171963, + "grad_norm": 0.5891350507736206, + "learning_rate": 1.8220968220968222e-05, + "loss": 0.0643, "step": 6610 }, { - "epoch": 1.6712951274930572, - "grad_norm": 0.39059174060821533, - "learning_rate": 1.997222923504166e-05, - "loss": 0.0689, + "epoch": 1.9661419661419661, + "grad_norm": 0.5142768621444702, + "learning_rate": 1.8203148203148204e-05, + "loss": 0.06, "step": 6620 }, { - "epoch": 1.6738197424892705, - "grad_norm": 0.5144139528274536, - "learning_rate": 1.9957081545064377e-05, - "loss": 0.0734, + "epoch": 1.9691119691119692, + "grad_norm": 0.463016539812088, + "learning_rate": 1.8185328185328187e-05, + "loss": 0.062, "step": 6630 }, { - "epoch": 1.6763443574854835, - "grad_norm": 0.3920697867870331, - "learning_rate": 1.99419338550871e-05, - "loss": 0.0734, + "epoch": 1.972081972081972, + "grad_norm": 0.27797237038612366, + "learning_rate": 1.816750816750817e-05, + "loss": 0.0638, "step": 6640 }, { - "epoch": 1.6788689724816965, - "grad_norm": 0.5351212024688721, - "learning_rate": 1.9926786165109824e-05, - "loss": 0.0702, + "epoch": 1.975051975051975, + "grad_norm": 0.8923652768135071, + "learning_rate": 1.814968814968815e-05, + "loss": 0.0815, "step": 6650 }, { - "epoch": 1.6813935874779096, - "grad_norm": 0.6645495295524597, - "learning_rate": 1.9911638475132542e-05, - "loss": 0.0828, + "epoch": 1.978021978021978, + "grad_norm": 0.43557631969451904, + "learning_rate": 1.813186813186813e-05, + "loss": 0.066, "step": 6660 }, { - "epoch": 1.6839182024741226, - "grad_norm": 0.3733726739883423, - "learning_rate": 1.9896490785155264e-05, - "loss": 0.0651, + "epoch": 1.980991980991981, + "grad_norm": 0.40481114387512207, + "learning_rate": 1.8114048114048112e-05, + "loss": 0.0685, "step": 6670 }, { - "epoch": 1.6864428174703359, - "grad_norm": 0.3866395950317383, - "learning_rate": 1.988134309517799e-05, - "loss": 0.0677, + "epoch": 1.983961983961984, + "grad_norm": 0.5298916101455688, + "learning_rate": 1.8096228096228097e-05, + "loss": 0.0546, "step": 6680 }, { - "epoch": 1.688967432466549, - "grad_norm": 0.5303260684013367, - "learning_rate": 1.9866195405200707e-05, - "loss": 0.0717, + "epoch": 1.986931986931987, + "grad_norm": 0.687917172908783, + "learning_rate": 1.807840807840808e-05, + "loss": 0.0713, "step": 6690 }, { - "epoch": 1.691492047462762, - "grad_norm": 0.24624003469944, - "learning_rate": 1.985104771522343e-05, - "loss": 0.0655, + "epoch": 1.9899019899019899, + "grad_norm": 0.44359517097473145, + "learning_rate": 1.8060588060588062e-05, + "loss": 0.0659, "step": 6700 }, { - "epoch": 1.694016662458975, - "grad_norm": 0.5795212984085083, - "learning_rate": 1.9835900025246153e-05, - "loss": 0.0757, + "epoch": 1.992871992871993, + "grad_norm": 0.48727744817733765, + "learning_rate": 1.8042768042768044e-05, + "loss": 0.075, "step": 6710 }, { - "epoch": 1.696541277455188, - "grad_norm": 0.39747050404548645, - "learning_rate": 1.982075233526887e-05, - "loss": 0.0753, + "epoch": 1.995841995841996, + "grad_norm": 0.46480777859687805, + "learning_rate": 1.8024948024948026e-05, + "loss": 0.0758, "step": 6720 }, { - "epoch": 1.6990658924514013, - "grad_norm": 0.611092209815979, - "learning_rate": 1.9805604645291593e-05, - "loss": 0.065, + "epoch": 1.9988119988119988, + "grad_norm": 0.7983739376068115, + "learning_rate": 1.8007128007128005e-05, + "loss": 0.0611, "step": 6730 }, { - "epoch": 1.701590507447614, - "grad_norm": 0.4604199230670929, - "learning_rate": 1.9790456955314318e-05, - "loss": 0.0661, + "epoch": 2.0, + "eval_f1": 0.49727767695099817, + "eval_loss": 0.05854379013180733, + "eval_runtime": 176.456, + "eval_samples_per_second": 215.459, + "eval_steps_per_second": 3.372, + "step": 6734 + }, + { + "epoch": 2.0017820017820016, + "grad_norm": 0.7805753946304321, + "learning_rate": 1.7989307989307987e-05, + "loss": 0.0529, "step": 6740 }, { - "epoch": 1.7041151224438273, - "grad_norm": 1.0068440437316895, - "learning_rate": 1.9775309265337036e-05, - "loss": 0.0748, + "epoch": 2.004752004752005, + "grad_norm": 0.436716765165329, + "learning_rate": 1.7971487971487973e-05, + "loss": 0.0657, "step": 6750 }, { - "epoch": 1.7066397374400404, - "grad_norm": 0.2989295721054077, - "learning_rate": 1.9760161575359758e-05, - "loss": 0.0593, + "epoch": 2.0077220077220077, + "grad_norm": 0.347323477268219, + "learning_rate": 1.7953667953667955e-05, + "loss": 0.0733, "step": 6760 }, { - "epoch": 1.7091643524362534, - "grad_norm": 0.6437642574310303, - "learning_rate": 1.9745013885382483e-05, - "loss": 0.0646, + "epoch": 2.0106920106920105, + "grad_norm": 0.4401879608631134, + "learning_rate": 1.7935847935847937e-05, + "loss": 0.0709, "step": 6770 }, { - "epoch": 1.7116889674324667, - "grad_norm": 0.40779128670692444, - "learning_rate": 1.97298661954052e-05, - "loss": 0.0727, + "epoch": 2.013662013662014, + "grad_norm": 0.6687297224998474, + "learning_rate": 1.791802791802792e-05, + "loss": 0.0655, "step": 6780 }, { - "epoch": 1.7142135824286795, - "grad_norm": 0.5573729872703552, - "learning_rate": 1.9714718505427923e-05, - "loss": 0.0684, + "epoch": 2.0166320166320166, + "grad_norm": 0.6250702142715454, + "learning_rate": 1.79002079002079e-05, + "loss": 0.072, "step": 6790 }, { - "epoch": 1.7167381974248928, - "grad_norm": 0.41327282786369324, - "learning_rate": 1.9699570815450644e-05, - "loss": 0.0738, + "epoch": 2.0196020196020195, + "grad_norm": 0.7498565912246704, + "learning_rate": 1.788238788238788e-05, + "loss": 0.0724, "step": 6800 }, { - "epoch": 1.7192628124211058, - "grad_norm": 0.6371347904205322, - "learning_rate": 1.9684423125473366e-05, - "loss": 0.0729, + "epoch": 2.0225720225720227, + "grad_norm": 0.5209968090057373, + "learning_rate": 1.7864567864567862e-05, + "loss": 0.0686, "step": 6810 }, { - "epoch": 1.7217874274173188, - "grad_norm": 0.6604834198951721, - "learning_rate": 1.9669275435496088e-05, - "loss": 0.0561, + "epoch": 2.0255420255420256, + "grad_norm": 0.6656198501586914, + "learning_rate": 1.7846747846747848e-05, + "loss": 0.0636, "step": 6820 }, { - "epoch": 1.724312042413532, - "grad_norm": 0.4899774193763733, - "learning_rate": 1.9654127745518806e-05, - "loss": 0.0701, + "epoch": 2.0285120285120284, + "grad_norm": 0.4398493766784668, + "learning_rate": 1.782892782892783e-05, + "loss": 0.0626, "step": 6830 }, { - "epoch": 1.726836657409745, - "grad_norm": 0.6170947551727295, - "learning_rate": 1.963898005554153e-05, - "loss": 0.0899, + "epoch": 2.0314820314820317, + "grad_norm": 0.3464367985725403, + "learning_rate": 1.7811107811107812e-05, + "loss": 0.0636, "step": 6840 }, { - "epoch": 1.7293612724059582, - "grad_norm": 0.5308715105056763, - "learning_rate": 1.9623832365564252e-05, - "loss": 0.0713, + "epoch": 2.0344520344520345, + "grad_norm": 0.5368358492851257, + "learning_rate": 1.7793287793287794e-05, + "loss": 0.0661, "step": 6850 }, { - "epoch": 1.7318858874021712, - "grad_norm": 0.4299701750278473, - "learning_rate": 1.960868467558697e-05, - "loss": 0.0642, + "epoch": 2.0374220374220373, + "grad_norm": 0.6472384929656982, + "learning_rate": 1.7775467775467776e-05, + "loss": 0.0647, "step": 6860 }, { - "epoch": 1.7344105023983842, - "grad_norm": 0.4577876329421997, - "learning_rate": 1.9593536985609695e-05, - "loss": 0.0776, + "epoch": 2.0403920403920406, + "grad_norm": 0.49248170852661133, + "learning_rate": 1.7757647757647755e-05, + "loss": 0.0689, "step": 6870 }, { - "epoch": 1.7369351173945973, - "grad_norm": 0.220742866396904, - "learning_rate": 1.9578389295632417e-05, - "loss": 0.0643, + "epoch": 2.0433620433620434, + "grad_norm": 0.2520935535430908, + "learning_rate": 1.7739827739827737e-05, + "loss": 0.074, "step": 6880 }, { - "epoch": 1.7394597323908103, - "grad_norm": 0.6673031449317932, - "learning_rate": 1.9563241605655135e-05, - "loss": 0.0546, + "epoch": 2.0463320463320462, + "grad_norm": 0.5810942053794861, + "learning_rate": 1.7722007722007723e-05, + "loss": 0.0675, "step": 6890 }, { - "epoch": 1.7419843473870236, - "grad_norm": 0.37436506152153015, - "learning_rate": 1.954809391567786e-05, - "loss": 0.0931, + "epoch": 2.0493020493020495, + "grad_norm": 0.7744080424308777, + "learning_rate": 1.7704187704187705e-05, + "loss": 0.0636, "step": 6900 }, { - "epoch": 1.7445089623832366, - "grad_norm": 0.4209068715572357, - "learning_rate": 1.9532946225700582e-05, - "loss": 0.0625, + "epoch": 2.0522720522720523, + "grad_norm": 0.6021985411643982, + "learning_rate": 1.7686367686367687e-05, + "loss": 0.0683, "step": 6910 }, { - "epoch": 1.7470335773794496, - "grad_norm": 0.5889585018157959, - "learning_rate": 1.95177985357233e-05, - "loss": 0.0562, + "epoch": 2.055242055242055, + "grad_norm": 0.6123180985450745, + "learning_rate": 1.766854766854767e-05, + "loss": 0.0791, "step": 6920 }, { - "epoch": 1.7495581923756627, - "grad_norm": 0.6516574621200562, - "learning_rate": 1.9502650845746025e-05, - "loss": 0.0692, + "epoch": 2.0582120582120584, + "grad_norm": 0.6447744965553284, + "learning_rate": 1.765072765072765e-05, + "loss": 0.0705, "step": 6930 }, { - "epoch": 1.7520828073718757, - "grad_norm": 0.5210825800895691, - "learning_rate": 1.9487503155768747e-05, - "loss": 0.0573, + "epoch": 2.0611820611820613, + "grad_norm": 0.5168854594230652, + "learning_rate": 1.7632907632907634e-05, + "loss": 0.0611, "step": 6940 }, { - "epoch": 1.754607422368089, - "grad_norm": 0.5769010186195374, - "learning_rate": 1.9472355465791465e-05, - "loss": 0.0726, + "epoch": 2.064152064152064, + "grad_norm": 0.9751071333885193, + "learning_rate": 1.7615087615087613e-05, + "loss": 0.0662, "step": 6950 }, { - "epoch": 1.7571320373643018, - "grad_norm": 0.24784168601036072, - "learning_rate": 1.945720777581419e-05, - "loss": 0.0607, + "epoch": 2.067122067122067, + "grad_norm": 0.5001913905143738, + "learning_rate": 1.7597267597267598e-05, + "loss": 0.0654, "step": 6960 }, { - "epoch": 1.759656652360515, - "grad_norm": 0.7145076394081116, - "learning_rate": 1.944206008583691e-05, - "loss": 0.0723, + "epoch": 2.07009207009207, + "grad_norm": 0.6123823523521423, + "learning_rate": 1.757944757944758e-05, + "loss": 0.0806, "step": 6970 }, { - "epoch": 1.762181267356728, - "grad_norm": 0.5195138454437256, - "learning_rate": 1.942691239585963e-05, - "loss": 0.072, + "epoch": 2.073062073062073, + "grad_norm": 0.5449308156967163, + "learning_rate": 1.7561627561627563e-05, + "loss": 0.0643, "step": 6980 }, { - "epoch": 1.7647058823529411, - "grad_norm": 0.7395075559616089, - "learning_rate": 1.9411764705882355e-05, - "loss": 0.059, + "epoch": 2.076032076032076, + "grad_norm": 0.31201791763305664, + "learning_rate": 1.7543807543807545e-05, + "loss": 0.0739, "step": 6990 }, { - "epoch": 1.7672304973491544, - "grad_norm": 0.4518623948097229, - "learning_rate": 1.9396617015905076e-05, - "loss": 0.064, + "epoch": 2.079002079002079, + "grad_norm": 0.8544298410415649, + "learning_rate": 1.7525987525987527e-05, + "loss": 0.069, "step": 7000 }, { - "epoch": 1.7697551123453672, - "grad_norm": 0.5325851440429688, - "learning_rate": 1.9381469325927794e-05, - "loss": 0.0735, + "epoch": 2.081972081972082, + "grad_norm": 0.5308842062950134, + "learning_rate": 1.750816750816751e-05, + "loss": 0.0658, "step": 7010 }, { - "epoch": 1.7722797273415805, - "grad_norm": 0.342540442943573, - "learning_rate": 1.936632163595052e-05, - "loss": 0.0572, + "epoch": 2.0849420849420848, + "grad_norm": 0.47668206691741943, + "learning_rate": 1.7490347490347488e-05, + "loss": 0.076, "step": 7020 }, { - "epoch": 1.7748043423377935, - "grad_norm": 0.513810932636261, - "learning_rate": 1.935117394597324e-05, - "loss": 0.0651, + "epoch": 2.087912087912088, + "grad_norm": 0.47977229952812195, + "learning_rate": 1.7472527472527473e-05, + "loss": 0.0683, "step": 7030 }, { - "epoch": 1.7773289573340065, - "grad_norm": 0.4068731665611267, - "learning_rate": 1.933602625599596e-05, - "loss": 0.0642, + "epoch": 2.090882090882091, + "grad_norm": 0.9374080896377563, + "learning_rate": 1.7454707454707456e-05, + "loss": 0.0743, "step": 7040 }, { - "epoch": 1.7798535723302198, - "grad_norm": 0.4329892098903656, - "learning_rate": 1.9320878566018684e-05, - "loss": 0.0639, + "epoch": 2.0938520938520937, + "grad_norm": 0.5665603280067444, + "learning_rate": 1.7436887436887438e-05, + "loss": 0.0725, "step": 7050 }, { - "epoch": 1.7823781873264326, - "grad_norm": 0.5116180777549744, - "learning_rate": 1.9305730876041406e-05, - "loss": 0.0647, + "epoch": 2.096822096822097, + "grad_norm": 0.44158872961997986, + "learning_rate": 1.741906741906742e-05, + "loss": 0.0769, "step": 7060 }, { - "epoch": 1.7849028023226459, - "grad_norm": 0.5484455823898315, - "learning_rate": 1.9290583186064124e-05, - "loss": 0.0597, + "epoch": 2.0997920997921, + "grad_norm": 0.36570894718170166, + "learning_rate": 1.7401247401247402e-05, + "loss": 0.0686, "step": 7070 }, { - "epoch": 1.787427417318859, - "grad_norm": 0.3904184103012085, - "learning_rate": 1.927543549608685e-05, - "loss": 0.0594, + "epoch": 2.1027621027621026, + "grad_norm": 0.4633289575576782, + "learning_rate": 1.7383427383427384e-05, + "loss": 0.0717, "step": 7080 }, { - "epoch": 1.789952032315072, - "grad_norm": 0.7509499192237854, - "learning_rate": 1.9260287806109567e-05, - "loss": 0.0644, + "epoch": 2.105732105732106, + "grad_norm": 0.6178393959999084, + "learning_rate": 1.7365607365607363e-05, + "loss": 0.0719, "step": 7090 }, { - "epoch": 1.792476647311285, - "grad_norm": 0.3884585499763489, - "learning_rate": 1.924514011613229e-05, - "loss": 0.0784, + "epoch": 2.1087021087021087, + "grad_norm": 0.3964090049266815, + "learning_rate": 1.734778734778735e-05, + "loss": 0.0542, "step": 7100 }, { - "epoch": 1.795001262307498, - "grad_norm": 0.39316463470458984, - "learning_rate": 1.9229992426155014e-05, - "loss": 0.0629, + "epoch": 2.1116721116721116, + "grad_norm": 0.3831978738307953, + "learning_rate": 1.732996732996733e-05, + "loss": 0.0711, "step": 7110 }, { - "epoch": 1.7975258773037113, - "grad_norm": 0.3391354978084564, - "learning_rate": 1.9214844736177732e-05, - "loss": 0.0541, + "epoch": 2.114642114642115, + "grad_norm": 0.4152994453907013, + "learning_rate": 1.7312147312147313e-05, + "loss": 0.0617, "step": 7120 }, { - "epoch": 1.8000504922999243, - "grad_norm": 0.5223536491394043, - "learning_rate": 1.9199697046200454e-05, - "loss": 0.0687, + "epoch": 2.1176121176121177, + "grad_norm": 0.5786647796630859, + "learning_rate": 1.7294327294327295e-05, + "loss": 0.0678, "step": 7130 }, { - "epoch": 1.8025751072961373, - "grad_norm": 0.40146803855895996, - "learning_rate": 1.918454935622318e-05, - "loss": 0.0738, + "epoch": 2.1205821205821205, + "grad_norm": 0.5444033145904541, + "learning_rate": 1.7276507276507277e-05, + "loss": 0.0605, "step": 7140 }, { - "epoch": 1.8050997222923504, - "grad_norm": 0.7420483231544495, - "learning_rate": 1.9169401666245897e-05, - "loss": 0.0642, + "epoch": 2.1235521235521237, + "grad_norm": 0.18499556183815002, + "learning_rate": 1.725868725868726e-05, + "loss": 0.0568, "step": 7150 }, { - "epoch": 1.8076243372885634, - "grad_norm": 0.5140613913536072, - "learning_rate": 1.915425397626862e-05, - "loss": 0.0648, + "epoch": 2.1265221265221266, + "grad_norm": 0.3817172050476074, + "learning_rate": 1.7240867240867238e-05, + "loss": 0.0559, "step": 7160 }, { - "epoch": 1.8101489522847767, - "grad_norm": 0.3334696292877197, - "learning_rate": 1.9139106286291343e-05, - "loss": 0.0693, + "epoch": 2.1294921294921294, + "grad_norm": 0.5504813194274902, + "learning_rate": 1.7223047223047224e-05, + "loss": 0.0642, "step": 7170 }, { - "epoch": 1.8126735672809895, - "grad_norm": 0.5419024229049683, - "learning_rate": 1.912395859631406e-05, - "loss": 0.0746, + "epoch": 2.1324621324621322, + "grad_norm": 0.34808218479156494, + "learning_rate": 1.7205227205227206e-05, + "loss": 0.0483, "step": 7180 }, { - "epoch": 1.8151981822772028, - "grad_norm": 0.4140032231807709, - "learning_rate": 1.9108810906336783e-05, - "loss": 0.067, + "epoch": 2.1354321354321355, + "grad_norm": 0.45135316252708435, + "learning_rate": 1.7187407187407188e-05, + "loss": 0.0591, "step": 7190 }, { - "epoch": 1.8177227972734158, - "grad_norm": 0.7290335297584534, - "learning_rate": 1.9093663216359508e-05, - "loss": 0.0688, + "epoch": 2.1384021384021383, + "grad_norm": 0.5405902862548828, + "learning_rate": 1.716958716958717e-05, + "loss": 0.0548, "step": 7200 }, { - "epoch": 1.8202474122696288, - "grad_norm": 0.47243237495422363, - "learning_rate": 1.9078515526382226e-05, - "loss": 0.0566, + "epoch": 2.141372141372141, + "grad_norm": 0.4525381624698639, + "learning_rate": 1.7151767151767152e-05, + "loss": 0.0775, "step": 7210 }, { - "epoch": 1.822772027265842, - "grad_norm": 0.4763794541358948, - "learning_rate": 1.9063367836404948e-05, - "loss": 0.0736, + "epoch": 2.1443421443421444, + "grad_norm": 0.9278238415718079, + "learning_rate": 1.7133947133947135e-05, + "loss": 0.0748, "step": 7220 }, { - "epoch": 1.825296642262055, - "grad_norm": 0.5379777550697327, - "learning_rate": 1.9048220146427673e-05, - "loss": 0.0673, + "epoch": 2.1473121473121473, + "grad_norm": 0.34462785720825195, + "learning_rate": 1.7116127116127117e-05, + "loss": 0.0693, "step": 7230 }, { - "epoch": 1.8278212572582682, - "grad_norm": 0.6144044399261475, - "learning_rate": 1.903307245645039e-05, - "loss": 0.077, + "epoch": 2.15028215028215, + "grad_norm": 0.5502927899360657, + "learning_rate": 1.70983070983071e-05, + "loss": 0.0704, "step": 7240 }, { - "epoch": 1.8303458722544812, - "grad_norm": 0.2949241101741791, - "learning_rate": 1.9017924766473113e-05, - "loss": 0.0617, + "epoch": 2.1532521532521534, + "grad_norm": 0.5558304786682129, + "learning_rate": 1.708048708048708e-05, + "loss": 0.072, "step": 7250 }, { - "epoch": 1.8328704872506942, - "grad_norm": 0.4626986086368561, - "learning_rate": 1.9002777076495838e-05, - "loss": 0.0774, + "epoch": 2.156222156222156, + "grad_norm": 0.43772050738334656, + "learning_rate": 1.7062667062667063e-05, + "loss": 0.0609, "step": 7260 }, { - "epoch": 1.8353951022469075, - "grad_norm": 0.3882731795310974, - "learning_rate": 1.8987629386518556e-05, - "loss": 0.0695, + "epoch": 2.159192159192159, + "grad_norm": 0.85486900806427, + "learning_rate": 1.7044847044847045e-05, + "loss": 0.0768, "step": 7270 }, { - "epoch": 1.8379197172431203, - "grad_norm": 0.49298906326293945, - "learning_rate": 1.8972481696541277e-05, - "loss": 0.0836, + "epoch": 2.1621621621621623, + "grad_norm": 0.31786465644836426, + "learning_rate": 1.7027027027027028e-05, + "loss": 0.0578, "step": 7280 }, { - "epoch": 1.8404443322393336, - "grad_norm": 0.5465462803840637, - "learning_rate": 1.8957334006564002e-05, - "loss": 0.0673, + "epoch": 2.165132165132165, + "grad_norm": 0.37934377789497375, + "learning_rate": 1.700920700920701e-05, + "loss": 0.0724, "step": 7290 }, { - "epoch": 1.8429689472355466, - "grad_norm": 0.6325553059577942, - "learning_rate": 1.894218631658672e-05, - "loss": 0.0761, + "epoch": 2.168102168102168, + "grad_norm": 0.5212098360061646, + "learning_rate": 1.6991386991386992e-05, + "loss": 0.0587, "step": 7300 }, { - "epoch": 1.8454935622317596, - "grad_norm": 0.3699894845485687, - "learning_rate": 1.8927038626609442e-05, - "loss": 0.0604, + "epoch": 2.171072171072171, + "grad_norm": 0.4610010087490082, + "learning_rate": 1.6973566973566974e-05, + "loss": 0.0659, "step": 7310 }, { - "epoch": 1.8480181772279727, - "grad_norm": 0.3315562903881073, - "learning_rate": 1.8911890936632167e-05, - "loss": 0.0728, + "epoch": 2.174042174042174, + "grad_norm": 0.4683549404144287, + "learning_rate": 1.6955746955746956e-05, + "loss": 0.0559, "step": 7320 }, { - "epoch": 1.8505427922241857, - "grad_norm": 0.3164065480232239, - "learning_rate": 1.8896743246654885e-05, - "loss": 0.0667, + "epoch": 2.177012177012177, + "grad_norm": 1.537858009338379, + "learning_rate": 1.693792693792694e-05, + "loss": 0.0657, "step": 7330 }, { - "epoch": 1.853067407220399, - "grad_norm": 0.3007540702819824, - "learning_rate": 1.8881595556677607e-05, - "loss": 0.0654, + "epoch": 2.17998217998218, + "grad_norm": 0.8588612675666809, + "learning_rate": 1.692010692010692e-05, + "loss": 0.0818, "step": 7340 }, { - "epoch": 1.855592022216612, - "grad_norm": 0.30912989377975464, - "learning_rate": 1.886644786670033e-05, - "loss": 0.0647, + "epoch": 2.182952182952183, + "grad_norm": 0.5644201636314392, + "learning_rate": 1.6902286902286903e-05, + "loss": 0.0553, "step": 7350 }, { - "epoch": 1.858116637212825, - "grad_norm": 0.51572185754776, - "learning_rate": 1.885130017672305e-05, - "loss": 0.0687, + "epoch": 2.185922185922186, + "grad_norm": 0.5589690804481506, + "learning_rate": 1.6884466884466885e-05, + "loss": 0.0634, "step": 7360 }, { - "epoch": 1.860641252209038, - "grad_norm": 0.3540509343147278, - "learning_rate": 1.8836152486745772e-05, - "loss": 0.0678, + "epoch": 2.188892188892189, + "grad_norm": 0.4421149790287018, + "learning_rate": 1.6866646866646867e-05, + "loss": 0.0714, "step": 7370 }, { - "epoch": 1.8631658672052511, - "grad_norm": 0.46048682928085327, - "learning_rate": 1.882100479676849e-05, - "loss": 0.0595, + "epoch": 2.191862191862192, + "grad_norm": 0.7251006960868835, + "learning_rate": 1.684882684882685e-05, + "loss": 0.0741, "step": 7380 }, { - "epoch": 1.8656904822014644, - "grad_norm": 0.3078758418560028, - "learning_rate": 1.8805857106791215e-05, - "loss": 0.053, + "epoch": 2.1948321948321947, + "grad_norm": 0.5653437972068787, + "learning_rate": 1.683100683100683e-05, + "loss": 0.0636, "step": 7390 }, { - "epoch": 1.8682150971976772, - "grad_norm": 0.500586211681366, - "learning_rate": 1.8790709416813937e-05, - "loss": 0.0732, + "epoch": 2.197802197802198, + "grad_norm": 0.37989261746406555, + "learning_rate": 1.6813186813186814e-05, + "loss": 0.0681, "step": 7400 }, { - "epoch": 1.8707397121938905, - "grad_norm": 0.5178824663162231, - "learning_rate": 1.8775561726836655e-05, - "loss": 0.0592, + "epoch": 2.200772200772201, + "grad_norm": 0.38947612047195435, + "learning_rate": 1.6795366795366796e-05, + "loss": 0.0584, "step": 7410 }, { - "epoch": 1.8732643271901035, - "grad_norm": 0.6682724952697754, - "learning_rate": 1.876041403685938e-05, - "loss": 0.073, + "epoch": 2.2037422037422036, + "grad_norm": 0.5566168427467346, + "learning_rate": 1.6777546777546778e-05, + "loss": 0.0567, "step": 7420 }, { - "epoch": 1.8757889421863165, - "grad_norm": 0.5322150588035583, - "learning_rate": 1.87452663468821e-05, - "loss": 0.0699, + "epoch": 2.206712206712207, + "grad_norm": 0.664364755153656, + "learning_rate": 1.675972675972676e-05, + "loss": 0.0735, "step": 7430 }, { - "epoch": 1.8783135571825298, - "grad_norm": 0.6210611462593079, - "learning_rate": 1.873011865690482e-05, - "loss": 0.0613, + "epoch": 2.2096822096822097, + "grad_norm": 0.3879406154155731, + "learning_rate": 1.6741906741906742e-05, + "loss": 0.0684, "step": 7440 }, { - "epoch": 1.8808381721787426, - "grad_norm": 0.7251406311988831, - "learning_rate": 1.8714970966927545e-05, - "loss": 0.0616, + "epoch": 2.2126522126522126, + "grad_norm": 0.34745240211486816, + "learning_rate": 1.6724086724086725e-05, + "loss": 0.0727, "step": 7450 }, { - "epoch": 1.8833627871749559, - "grad_norm": 0.5758464932441711, - "learning_rate": 1.8699823276950266e-05, - "loss": 0.0643, + "epoch": 2.215622215622216, + "grad_norm": 0.48188093304634094, + "learning_rate": 1.6706266706266707e-05, + "loss": 0.0622, "step": 7460 }, { - "epoch": 1.885887402171169, - "grad_norm": 0.36390137672424316, - "learning_rate": 1.8684675586972984e-05, - "loss": 0.0612, + "epoch": 2.2185922185922187, + "grad_norm": 0.6234533786773682, + "learning_rate": 1.668844668844669e-05, + "loss": 0.0681, "step": 7470 }, { - "epoch": 1.888412017167382, - "grad_norm": 0.41576454043388367, - "learning_rate": 1.866952789699571e-05, - "loss": 0.0682, + "epoch": 2.2215622215622215, + "grad_norm": 0.6129441261291504, + "learning_rate": 1.667062667062667e-05, + "loss": 0.0674, "step": 7480 }, { - "epoch": 1.8909366321635952, - "grad_norm": 0.3395916819572449, - "learning_rate": 1.865438020701843e-05, - "loss": 0.0628, + "epoch": 2.2245322245322248, + "grad_norm": 0.36409103870391846, + "learning_rate": 1.6652806652806653e-05, + "loss": 0.0673, "step": 7490 }, { - "epoch": 1.893461247159808, - "grad_norm": 0.49120160937309265, - "learning_rate": 1.863923251704115e-05, - "loss": 0.068, + "epoch": 2.2275022275022276, + "grad_norm": 0.8309186697006226, + "learning_rate": 1.6634986634986635e-05, + "loss": 0.0543, "step": 7500 }, { - "epoch": 1.8959858621560213, - "grad_norm": 0.2704300284385681, - "learning_rate": 1.8624084827063874e-05, - "loss": 0.0494, + "epoch": 2.2304722304722304, + "grad_norm": 0.4031508266925812, + "learning_rate": 1.6617166617166618e-05, + "loss": 0.0842, "step": 7510 }, { - "epoch": 1.8985104771522343, - "grad_norm": 0.4225012958049774, - "learning_rate": 1.8608937137086596e-05, - "loss": 0.0594, + "epoch": 2.2334422334422332, + "grad_norm": 0.35200947523117065, + "learning_rate": 1.65993465993466e-05, + "loss": 0.0611, "step": 7520 }, { - "epoch": 1.9010350921484473, - "grad_norm": 0.4334900975227356, - "learning_rate": 1.8593789447109314e-05, - "loss": 0.0765, + "epoch": 2.2364122364122365, + "grad_norm": 0.5327655673027039, + "learning_rate": 1.6581526581526582e-05, + "loss": 0.0611, "step": 7530 }, { - "epoch": 1.9035597071446604, - "grad_norm": 0.6376916766166687, - "learning_rate": 1.857864175713204e-05, - "loss": 0.0779, + "epoch": 2.2393822393822393, + "grad_norm": 0.3595449924468994, + "learning_rate": 1.6563706563706564e-05, + "loss": 0.0718, "step": 7540 }, { - "epoch": 1.9060843221408734, - "grad_norm": 0.426049143075943, - "learning_rate": 1.856349406715476e-05, - "loss": 0.0688, + "epoch": 2.242352242352242, + "grad_norm": 0.6577255129814148, + "learning_rate": 1.6545886545886546e-05, + "loss": 0.0748, "step": 7550 }, { - "epoch": 1.9086089371370867, - "grad_norm": 0.840045690536499, - "learning_rate": 1.854834637717748e-05, - "loss": 0.0718, + "epoch": 2.2453222453222454, + "grad_norm": 0.46405327320098877, + "learning_rate": 1.652806652806653e-05, + "loss": 0.0735, "step": 7560 }, { - "epoch": 1.9111335521332997, - "grad_norm": 0.8169825673103333, - "learning_rate": 1.8533198687200204e-05, - "loss": 0.0772, + "epoch": 2.2482922482922483, + "grad_norm": 0.6792459487915039, + "learning_rate": 1.651024651024651e-05, + "loss": 0.0809, "step": 7570 }, { - "epoch": 1.9136581671295128, - "grad_norm": 0.4841653108596802, - "learning_rate": 1.8518050997222925e-05, - "loss": 0.0697, + "epoch": 2.251262251262251, + "grad_norm": 0.4969274401664734, + "learning_rate": 1.6492426492426496e-05, + "loss": 0.0616, "step": 7580 }, { - "epoch": 1.9161827821257258, - "grad_norm": 0.5061050057411194, - "learning_rate": 1.8502903307245644e-05, - "loss": 0.0771, + "epoch": 2.2542322542322544, + "grad_norm": 0.7882120609283447, + "learning_rate": 1.6474606474606475e-05, + "loss": 0.0756, "step": 7590 }, { - "epoch": 1.9187073971219388, - "grad_norm": 0.5344434380531311, - "learning_rate": 1.848775561726837e-05, - "loss": 0.0708, + "epoch": 2.257202257202257, + "grad_norm": 0.4611985683441162, + "learning_rate": 1.6456786456786457e-05, + "loss": 0.0658, "step": 7600 }, { - "epoch": 1.921232012118152, - "grad_norm": 0.3740493655204773, - "learning_rate": 1.847260792729109e-05, - "loss": 0.0541, + "epoch": 2.26017226017226, + "grad_norm": 0.5099868774414062, + "learning_rate": 1.643896643896644e-05, + "loss": 0.0691, "step": 7610 }, { - "epoch": 1.923756627114365, - "grad_norm": 0.33391043543815613, - "learning_rate": 1.8457460237313808e-05, - "loss": 0.067, + "epoch": 2.2631422631422633, + "grad_norm": 0.461518257856369, + "learning_rate": 1.642114642114642e-05, + "loss": 0.0604, "step": 7620 }, { - "epoch": 1.9262812421105782, - "grad_norm": 0.6397750377655029, - "learning_rate": 1.8442312547336533e-05, - "loss": 0.0568, + "epoch": 2.266112266112266, + "grad_norm": 0.3580944240093231, + "learning_rate": 1.6403326403326404e-05, + "loss": 0.0609, "step": 7630 }, { - "epoch": 1.9288058571067912, - "grad_norm": 0.32809019088745117, - "learning_rate": 1.8427164857359255e-05, - "loss": 0.0688, + "epoch": 2.269082269082269, + "grad_norm": 0.36803242564201355, + "learning_rate": 1.6385506385506386e-05, + "loss": 0.0674, "step": 7640 }, { - "epoch": 1.9313304721030042, - "grad_norm": 0.45072248578071594, - "learning_rate": 1.8412017167381973e-05, - "loss": 0.0729, + "epoch": 2.2720522720522722, + "grad_norm": 0.3887629806995392, + "learning_rate": 1.636768636768637e-05, + "loss": 0.0733, "step": 7650 }, { - "epoch": 1.9338550870992175, - "grad_norm": 0.5833106637001038, - "learning_rate": 1.8396869477404698e-05, - "loss": 0.0714, + "epoch": 2.275022275022275, + "grad_norm": 0.6474005579948425, + "learning_rate": 1.634986634986635e-05, + "loss": 0.0679, "step": 7660 }, { - "epoch": 1.9363797020954303, - "grad_norm": 0.6556414365768433, - "learning_rate": 1.8381721787427416e-05, - "loss": 0.0835, + "epoch": 2.277992277992278, + "grad_norm": 0.6048378944396973, + "learning_rate": 1.6332046332046332e-05, + "loss": 0.0643, "step": 7670 }, { - "epoch": 1.9389043170916436, - "grad_norm": 0.6509954333305359, - "learning_rate": 1.8366574097450138e-05, - "loss": 0.0648, + "epoch": 2.280962280962281, + "grad_norm": 0.45778071880340576, + "learning_rate": 1.6314226314226314e-05, + "loss": 0.0673, "step": 7680 }, { - "epoch": 1.9414289320878566, - "grad_norm": 0.35850790143013, - "learning_rate": 1.8351426407472863e-05, - "loss": 0.0742, + "epoch": 2.283932283932284, + "grad_norm": 0.6061732172966003, + "learning_rate": 1.6296406296406297e-05, + "loss": 0.0786, "step": 7690 }, { - "epoch": 1.9439535470840696, - "grad_norm": 0.6519030332565308, - "learning_rate": 1.833627871749558e-05, - "loss": 0.0599, + "epoch": 2.286902286902287, + "grad_norm": 0.4730798602104187, + "learning_rate": 1.627858627858628e-05, + "loss": 0.0572, "step": 7700 }, { - "epoch": 1.946478162080283, - "grad_norm": 0.36720070242881775, - "learning_rate": 1.8321131027518303e-05, - "loss": 0.0694, + "epoch": 2.2898722898722896, + "grad_norm": 0.43137332797050476, + "learning_rate": 1.626076626076626e-05, + "loss": 0.0572, "step": 7710 }, { - "epoch": 1.9490027770764957, - "grad_norm": 0.5095828771591187, - "learning_rate": 1.8305983337541028e-05, - "loss": 0.0717, + "epoch": 2.292842292842293, + "grad_norm": 0.36243513226509094, + "learning_rate": 1.6242946242946247e-05, + "loss": 0.0722, "step": 7720 }, { - "epoch": 1.951527392072709, - "grad_norm": 0.5592676997184753, - "learning_rate": 1.8290835647563746e-05, - "loss": 0.0728, + "epoch": 2.2958122958122957, + "grad_norm": 0.6039568781852722, + "learning_rate": 1.6225126225126225e-05, + "loss": 0.0628, "step": 7730 }, { - "epoch": 1.954052007068922, - "grad_norm": 0.5880316495895386, - "learning_rate": 1.8275687957586467e-05, - "loss": 0.0705, + "epoch": 2.2987822987822986, + "grad_norm": 0.7253831028938293, + "learning_rate": 1.6207306207306207e-05, + "loss": 0.054, "step": 7740 }, { - "epoch": 1.956576622065135, - "grad_norm": 0.26635172963142395, - "learning_rate": 1.8260540267609192e-05, - "loss": 0.0736, + "epoch": 2.301752301752302, + "grad_norm": 0.675613522529602, + "learning_rate": 1.618948618948619e-05, + "loss": 0.0689, "step": 7750 }, { - "epoch": 1.9591012370613483, - "grad_norm": 0.4297441840171814, - "learning_rate": 1.824539257763191e-05, - "loss": 0.063, + "epoch": 2.3047223047223047, + "grad_norm": 0.5215617418289185, + "learning_rate": 1.6171666171666172e-05, + "loss": 0.0733, "step": 7760 }, { - "epoch": 1.9616258520575611, - "grad_norm": 0.4150530993938446, - "learning_rate": 1.8230244887654632e-05, - "loss": 0.0733, + "epoch": 2.3076923076923075, + "grad_norm": 0.6846175193786621, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.0615, "step": 7770 }, { - "epoch": 1.9641504670537744, - "grad_norm": 0.36856597661972046, - "learning_rate": 1.8215097197677357e-05, - "loss": 0.0586, + "epoch": 2.3106623106623108, + "grad_norm": 0.397480845451355, + "learning_rate": 1.6136026136026136e-05, + "loss": 0.0641, "step": 7780 }, { - "epoch": 1.9666750820499874, - "grad_norm": 0.36772435903549194, - "learning_rate": 1.8199949507700075e-05, - "loss": 0.0646, + "epoch": 2.3136323136323136, + "grad_norm": 0.33144304156303406, + "learning_rate": 1.6118206118206122e-05, + "loss": 0.0605, "step": 7790 }, { - "epoch": 1.9691996970462005, - "grad_norm": 0.34384581446647644, - "learning_rate": 1.8184801817722797e-05, - "loss": 0.0623, + "epoch": 2.3166023166023164, + "grad_norm": 0.452396035194397, + "learning_rate": 1.61003861003861e-05, + "loss": 0.0787, "step": 7800 }, { - "epoch": 1.9717243120424135, - "grad_norm": 0.5583040118217468, - "learning_rate": 1.8169654127745522e-05, - "loss": 0.0574, + "epoch": 2.3195723195723197, + "grad_norm": 0.4840039908885956, + "learning_rate": 1.6082566082566083e-05, + "loss": 0.0756, "step": 7810 }, { - "epoch": 1.9742489270386265, - "grad_norm": 0.8212075233459473, - "learning_rate": 1.815450643776824e-05, - "loss": 0.0764, + "epoch": 2.3225423225423225, + "grad_norm": 0.3425714671611786, + "learning_rate": 1.6064746064746065e-05, + "loss": 0.0661, "step": 7820 }, { - "epoch": 1.9767735420348398, - "grad_norm": 0.529600203037262, - "learning_rate": 1.8139358747790962e-05, - "loss": 0.0738, + "epoch": 2.3255123255123253, + "grad_norm": 0.5093306303024292, + "learning_rate": 1.6046926046926047e-05, + "loss": 0.0765, "step": 7830 }, { - "epoch": 1.9792981570310526, - "grad_norm": 0.3406949043273926, - "learning_rate": 1.8124211057813687e-05, - "loss": 0.0584, + "epoch": 2.3284823284823286, + "grad_norm": 0.37477800250053406, + "learning_rate": 1.602910602910603e-05, + "loss": 0.0751, "step": 7840 }, { - "epoch": 1.9818227720272659, - "grad_norm": 0.538336455821991, - "learning_rate": 1.8109063367836405e-05, - "loss": 0.0735, + "epoch": 2.3314523314523314, + "grad_norm": 0.6057155132293701, + "learning_rate": 1.601128601128601e-05, + "loss": 0.0738, "step": 7850 }, { - "epoch": 1.984347387023479, - "grad_norm": 0.47139012813568115, - "learning_rate": 1.8093915677859127e-05, - "loss": 0.0671, + "epoch": 2.3344223344223343, + "grad_norm": 0.5092906355857849, + "learning_rate": 1.5993465993465997e-05, + "loss": 0.0684, "step": 7860 }, { - "epoch": 1.986872002019692, - "grad_norm": 0.5175936222076416, - "learning_rate": 1.8078767987881848e-05, - "loss": 0.0744, + "epoch": 2.3373923373923375, + "grad_norm": 0.464747816324234, + "learning_rate": 1.5975645975645976e-05, + "loss": 0.0614, "step": 7870 }, { - "epoch": 1.9893966170159052, - "grad_norm": 0.44927799701690674, - "learning_rate": 1.806362029790457e-05, - "loss": 0.0758, + "epoch": 2.3403623403623404, + "grad_norm": 0.4768252372741699, + "learning_rate": 1.5957825957825958e-05, + "loss": 0.0722, "step": 7880 }, { - "epoch": 1.991921232012118, - "grad_norm": 0.6851469278335571, - "learning_rate": 1.804847260792729e-05, - "loss": 0.0634, + "epoch": 2.343332343332343, + "grad_norm": 0.5153640508651733, + "learning_rate": 1.594000594000594e-05, + "loss": 0.0713, "step": 7890 }, { - "epoch": 1.9944458470083313, - "grad_norm": 0.5457276701927185, - "learning_rate": 1.8033324917950013e-05, - "loss": 0.063, + "epoch": 2.3463023463023465, + "grad_norm": 0.6367037296295166, + "learning_rate": 1.5922185922185922e-05, + "loss": 0.0681, "step": 7900 }, { - "epoch": 1.9969704620045443, - "grad_norm": 0.3204804062843323, - "learning_rate": 1.8018177227972735e-05, - "loss": 0.073, + "epoch": 2.3492723492723493, + "grad_norm": 0.36707803606987, + "learning_rate": 1.5904365904365904e-05, + "loss": 0.0625, "step": 7910 }, { - "epoch": 1.9994950770007573, - "grad_norm": 0.27762261033058167, - "learning_rate": 1.8003029537995456e-05, - "loss": 0.08, + "epoch": 2.352242352242352, + "grad_norm": 0.21663016080856323, + "learning_rate": 1.5886545886545887e-05, + "loss": 0.0602, "step": 7920 }, { - "epoch": 2.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.051420293748378754, - "eval_runtime": 913.9012, - "eval_samples_per_second": 225.694, - "eval_steps_per_second": 3.527, - "step": 7922 - }, - { - "epoch": 2.0020196919969706, - "grad_norm": 0.3685190975666046, - "learning_rate": 1.7987881848018178e-05, - "loss": 0.0506, + "epoch": 2.3552123552123554, + "grad_norm": 0.5676469206809998, + "learning_rate": 1.5868725868725872e-05, + "loss": 0.0605, "step": 7930 }, { - "epoch": 2.0045443069931834, - "grad_norm": 0.420768141746521, - "learning_rate": 1.79727341580409e-05, - "loss": 0.0702, + "epoch": 2.358182358182358, + "grad_norm": 0.4324367940425873, + "learning_rate": 1.585090585090585e-05, + "loss": 0.0661, "step": 7940 }, { - "epoch": 2.0070689219893967, - "grad_norm": 0.24624113738536835, - "learning_rate": 1.795758646806362e-05, - "loss": 0.0616, + "epoch": 2.361152361152361, + "grad_norm": 0.40506285429000854, + "learning_rate": 1.5833085833085833e-05, + "loss": 0.0555, "step": 7950 }, { - "epoch": 2.0095935369856095, - "grad_norm": 0.37558090686798096, - "learning_rate": 1.794243877808634e-05, - "loss": 0.0597, + "epoch": 2.3641223641223643, + "grad_norm": 0.30328163504600525, + "learning_rate": 1.5815265815265815e-05, + "loss": 0.0676, "step": 7960 }, { - "epoch": 2.0121181519818228, - "grad_norm": 0.27011531591415405, - "learning_rate": 1.7927291088109064e-05, - "loss": 0.0686, + "epoch": 2.367092367092367, + "grad_norm": 0.449945330619812, + "learning_rate": 1.5797445797445797e-05, + "loss": 0.0731, "step": 7970 }, { - "epoch": 2.014642766978036, - "grad_norm": 0.4313770830631256, - "learning_rate": 1.7912143398131786e-05, - "loss": 0.0632, + "epoch": 2.37006237006237, + "grad_norm": 0.5466241836547852, + "learning_rate": 1.577962577962578e-05, + "loss": 0.0587, "step": 7980 }, { - "epoch": 2.017167381974249, - "grad_norm": 0.28532424569129944, - "learning_rate": 1.7896995708154504e-05, - "loss": 0.0509, + "epoch": 2.3730323730323732, + "grad_norm": 0.2828434407711029, + "learning_rate": 1.5761805761805762e-05, + "loss": 0.0699, "step": 7990 }, { - "epoch": 2.019691996970462, - "grad_norm": 0.4658902883529663, - "learning_rate": 1.788184801817723e-05, - "loss": 0.0736, + "epoch": 2.376002376002376, + "grad_norm": 0.7119054794311523, + "learning_rate": 1.5743985743985747e-05, + "loss": 0.0615, "step": 8000 }, { - "epoch": 2.022216611966675, - "grad_norm": 0.3360334634780884, - "learning_rate": 1.786670032819995e-05, - "loss": 0.0625, + "epoch": 2.378972378972379, + "grad_norm": 0.5612084269523621, + "learning_rate": 1.5726165726165726e-05, + "loss": 0.0713, "step": 8010 }, { - "epoch": 2.024741226962888, - "grad_norm": 0.41845816373825073, - "learning_rate": 1.785155263822267e-05, - "loss": 0.0694, + "epoch": 2.381942381942382, + "grad_norm": 0.3906906545162201, + "learning_rate": 1.5708345708345708e-05, + "loss": 0.0628, "step": 8020 }, { - "epoch": 2.0272658419591014, - "grad_norm": 0.39248284697532654, - "learning_rate": 1.7836404948245394e-05, - "loss": 0.0717, + "epoch": 2.384912384912385, + "grad_norm": 0.4246062636375427, + "learning_rate": 1.569052569052569e-05, + "loss": 0.0693, "step": 8030 }, { - "epoch": 2.0297904569553142, - "grad_norm": 0.36341196298599243, - "learning_rate": 1.7821257258268115e-05, - "loss": 0.0678, + "epoch": 2.387882387882388, + "grad_norm": 0.7282578349113464, + "learning_rate": 1.5672705672705673e-05, + "loss": 0.0627, "step": 8040 }, { - "epoch": 2.0323150719515275, - "grad_norm": 0.6026912927627563, - "learning_rate": 1.7806109568290833e-05, - "loss": 0.0735, + "epoch": 2.390852390852391, + "grad_norm": 0.4457809627056122, + "learning_rate": 1.5654885654885655e-05, + "loss": 0.0756, "step": 8050 }, { - "epoch": 2.0348396869477403, - "grad_norm": 0.44328150153160095, - "learning_rate": 1.779096187831356e-05, - "loss": 0.0735, + "epoch": 2.393822393822394, + "grad_norm": 0.460112065076828, + "learning_rate": 1.5637065637065637e-05, + "loss": 0.0715, "step": 8060 }, { - "epoch": 2.0373643019439536, - "grad_norm": 0.43636953830718994, - "learning_rate": 1.777581418833628e-05, - "loss": 0.068, + "epoch": 2.3967923967923968, + "grad_norm": 0.8285795450210571, + "learning_rate": 1.5619245619245622e-05, + "loss": 0.0731, "step": 8070 }, { - "epoch": 2.039888916940167, - "grad_norm": 0.5925287008285522, - "learning_rate": 1.7760666498358998e-05, - "loss": 0.0694, + "epoch": 2.3997623997623996, + "grad_norm": 0.6187976002693176, + "learning_rate": 1.56014256014256e-05, + "loss": 0.0711, "step": 8080 }, { - "epoch": 2.0424135319363796, - "grad_norm": 0.5771965384483337, - "learning_rate": 1.7745518808381723e-05, - "loss": 0.0677, + "epoch": 2.402732402732403, + "grad_norm": 0.42191964387893677, + "learning_rate": 1.5583605583605583e-05, + "loss": 0.062, "step": 8090 }, { - "epoch": 2.044938146932593, - "grad_norm": 0.5764312744140625, - "learning_rate": 1.7730371118404445e-05, - "loss": 0.0716, + "epoch": 2.4057024057024057, + "grad_norm": 0.3665825128555298, + "learning_rate": 1.5565785565785566e-05, + "loss": 0.0726, "step": 8100 }, { - "epoch": 2.0474627619288057, - "grad_norm": 0.5081468820571899, - "learning_rate": 1.7715223428427163e-05, - "loss": 0.0604, + "epoch": 2.4086724086724085, + "grad_norm": 0.535072386264801, + "learning_rate": 1.5547965547965548e-05, + "loss": 0.0767, "step": 8110 }, { - "epoch": 2.049987376925019, - "grad_norm": 0.3322690427303314, - "learning_rate": 1.7700075738449888e-05, - "loss": 0.0612, + "epoch": 2.4116424116424118, + "grad_norm": 0.5114570260047913, + "learning_rate": 1.553014553014553e-05, + "loss": 0.0571, "step": 8120 }, { - "epoch": 2.0525119919212322, - "grad_norm": 0.4772360622882843, - "learning_rate": 1.768492804847261e-05, - "loss": 0.0672, + "epoch": 2.4146124146124146, + "grad_norm": 0.5549605488777161, + "learning_rate": 1.5512325512325512e-05, + "loss": 0.0741, "step": 8130 }, { - "epoch": 2.055036606917445, - "grad_norm": 0.4442533850669861, - "learning_rate": 1.7669780358495328e-05, - "loss": 0.0537, + "epoch": 2.4175824175824174, + "grad_norm": 0.47435063123703003, + "learning_rate": 1.5494505494505498e-05, + "loss": 0.0715, "step": 8140 }, { - "epoch": 2.0575612219136583, - "grad_norm": 0.4396134614944458, - "learning_rate": 1.7654632668518053e-05, - "loss": 0.0592, + "epoch": 2.4205524205524207, + "grad_norm": 0.45239725708961487, + "learning_rate": 1.547668547668548e-05, + "loss": 0.076, "step": 8150 }, { - "epoch": 2.060085836909871, - "grad_norm": 1.0153625011444092, - "learning_rate": 1.7639484978540774e-05, - "loss": 0.068, + "epoch": 2.4235224235224235, + "grad_norm": 0.3530510663986206, + "learning_rate": 1.545886545886546e-05, + "loss": 0.0603, "step": 8160 }, { - "epoch": 2.0626104519060844, - "grad_norm": 0.4387151896953583, - "learning_rate": 1.7624337288563493e-05, - "loss": 0.0737, + "epoch": 2.4264924264924264, + "grad_norm": 0.34246182441711426, + "learning_rate": 1.544104544104544e-05, + "loss": 0.0687, "step": 8170 }, { - "epoch": 2.065135066902297, - "grad_norm": 0.6584615111351013, - "learning_rate": 1.7609189598586218e-05, - "loss": 0.0623, + "epoch": 2.4294624294624296, + "grad_norm": 0.4563390612602234, + "learning_rate": 1.5423225423225423e-05, + "loss": 0.0592, "step": 8180 }, { - "epoch": 2.0676596818985105, - "grad_norm": 0.5479759573936462, - "learning_rate": 1.759404190860894e-05, - "loss": 0.0728, + "epoch": 2.4324324324324325, + "grad_norm": 0.6311094760894775, + "learning_rate": 1.5405405405405405e-05, + "loss": 0.0734, "step": 8190 }, { - "epoch": 2.0701842968947237, - "grad_norm": 0.7336176633834839, - "learning_rate": 1.7578894218631657e-05, - "loss": 0.0735, + "epoch": 2.4354024354024353, + "grad_norm": 0.398874431848526, + "learning_rate": 1.5387585387585387e-05, + "loss": 0.0658, "step": 8200 }, { - "epoch": 2.0727089118909365, - "grad_norm": 0.41931140422821045, - "learning_rate": 1.7563746528654382e-05, - "loss": 0.0687, + "epoch": 2.4383724383724386, + "grad_norm": 0.47651001811027527, + "learning_rate": 1.5369765369765373e-05, + "loss": 0.0653, "step": 8210 }, { - "epoch": 2.07523352688715, - "grad_norm": 0.48326900601387024, - "learning_rate": 1.7548598838677104e-05, - "loss": 0.071, + "epoch": 2.4413424413424414, + "grad_norm": 0.5543814897537231, + "learning_rate": 1.5351945351945355e-05, + "loss": 0.077, "step": 8220 }, { - "epoch": 2.0777581418833626, - "grad_norm": 0.5461397767066956, - "learning_rate": 1.7533451148699822e-05, - "loss": 0.0588, + "epoch": 2.444312444312444, + "grad_norm": 0.6091018915176392, + "learning_rate": 1.5334125334125334e-05, + "loss": 0.0673, "step": 8230 }, { - "epoch": 2.080282756879576, - "grad_norm": 0.5842398405075073, - "learning_rate": 1.7518303458722547e-05, - "loss": 0.0687, + "epoch": 2.447282447282447, + "grad_norm": 0.5908657312393188, + "learning_rate": 1.5316305316305316e-05, + "loss": 0.0592, "step": 8240 }, { - "epoch": 2.082807371875789, - "grad_norm": 0.4675542116165161, - "learning_rate": 1.7503155768745265e-05, - "loss": 0.0789, + "epoch": 2.4502524502524503, + "grad_norm": 0.707524836063385, + "learning_rate": 1.5298485298485298e-05, + "loss": 0.067, "step": 8250 }, { - "epoch": 2.085331986872002, - "grad_norm": 0.38348913192749023, - "learning_rate": 1.7488008078767987e-05, - "loss": 0.0605, + "epoch": 2.453222453222453, + "grad_norm": 0.5802726745605469, + "learning_rate": 1.528066528066528e-05, + "loss": 0.0717, "step": 8260 }, { - "epoch": 2.087856601868215, - "grad_norm": 0.608074963092804, - "learning_rate": 1.7472860388790712e-05, - "loss": 0.068, + "epoch": 2.456192456192456, + "grad_norm": 0.5758719444274902, + "learning_rate": 1.5262845262845263e-05, + "loss": 0.0654, "step": 8270 }, { - "epoch": 2.090381216864428, - "grad_norm": 0.27725252509117126, - "learning_rate": 1.745771269881343e-05, - "loss": 0.0684, + "epoch": 2.4591624591624592, + "grad_norm": 0.2951982617378235, + "learning_rate": 1.5245025245025246e-05, + "loss": 0.0646, "step": 8280 }, { - "epoch": 2.0929058318606413, - "grad_norm": 0.5501505732536316, - "learning_rate": 1.7442565008836152e-05, - "loss": 0.081, + "epoch": 2.462132462132462, + "grad_norm": 0.6033930778503418, + "learning_rate": 1.5227205227205229e-05, + "loss": 0.0616, "step": 8290 }, { - "epoch": 2.0954304468568545, - "grad_norm": 0.28557008504867554, - "learning_rate": 1.7427417318858877e-05, - "loss": 0.06, + "epoch": 2.465102465102465, + "grad_norm": 0.7335968613624573, + "learning_rate": 1.520938520938521e-05, + "loss": 0.0527, "step": 8300 }, { - "epoch": 2.0979550618530673, - "grad_norm": 0.6145514249801636, - "learning_rate": 1.7412269628881595e-05, - "loss": 0.0684, + "epoch": 2.468072468072468, + "grad_norm": 0.2696143686771393, + "learning_rate": 1.5191565191565193e-05, + "loss": 0.062, "step": 8310 }, { - "epoch": 2.1004796768492806, - "grad_norm": 0.2920602262020111, - "learning_rate": 1.7397121938904317e-05, - "loss": 0.0693, + "epoch": 2.471042471042471, + "grad_norm": 0.24488599598407745, + "learning_rate": 1.5173745173745173e-05, + "loss": 0.0661, "step": 8320 }, { - "epoch": 2.1030042918454934, - "grad_norm": 0.48144832253456116, - "learning_rate": 1.738197424892704e-05, - "loss": 0.0705, + "epoch": 2.474012474012474, + "grad_norm": 0.43688690662384033, + "learning_rate": 1.5155925155925156e-05, + "loss": 0.0563, "step": 8330 }, { - "epoch": 2.1055289068417067, - "grad_norm": 0.30737602710723877, - "learning_rate": 1.736682655894976e-05, - "loss": 0.068, + "epoch": 2.476982476982477, + "grad_norm": 0.557025134563446, + "learning_rate": 1.5138105138105138e-05, + "loss": 0.0598, "step": 8340 }, { - "epoch": 2.1080535218379195, - "grad_norm": 0.5024394989013672, - "learning_rate": 1.735167886897248e-05, - "loss": 0.0662, + "epoch": 2.47995247995248, + "grad_norm": 0.6465451717376709, + "learning_rate": 1.5120285120285122e-05, + "loss": 0.0669, "step": 8350 }, { - "epoch": 2.1105781368341328, - "grad_norm": 0.3547489643096924, - "learning_rate": 1.7336531178995206e-05, - "loss": 0.0586, + "epoch": 2.4829224829224827, + "grad_norm": 0.38359346985816956, + "learning_rate": 1.5102465102465104e-05, + "loss": 0.0715, "step": 8360 }, { - "epoch": 2.113102751830346, - "grad_norm": 0.41034796833992004, - "learning_rate": 1.7321383489017924e-05, - "loss": 0.074, + "epoch": 2.485892485892486, + "grad_norm": 0.6876797080039978, + "learning_rate": 1.5084645084645086e-05, + "loss": 0.0799, "step": 8370 }, { - "epoch": 2.115627366826559, - "grad_norm": 0.6292237639427185, - "learning_rate": 1.7306235799040646e-05, - "loss": 0.0741, + "epoch": 2.488862488862489, + "grad_norm": 0.47395193576812744, + "learning_rate": 1.5066825066825068e-05, + "loss": 0.0699, "step": 8380 }, { - "epoch": 2.118151981822772, - "grad_norm": 0.6585634350776672, - "learning_rate": 1.729108810906337e-05, - "loss": 0.0647, + "epoch": 2.4918324918324917, + "grad_norm": 0.4975154399871826, + "learning_rate": 1.5049005049005049e-05, + "loss": 0.0655, "step": 8390 }, { - "epoch": 2.120676596818985, - "grad_norm": 0.44634106755256653, - "learning_rate": 1.727594041908609e-05, - "loss": 0.0599, + "epoch": 2.494802494802495, + "grad_norm": 0.5417085886001587, + "learning_rate": 1.503118503118503e-05, + "loss": 0.0768, "step": 8400 }, { - "epoch": 2.123201211815198, - "grad_norm": 0.570796549320221, - "learning_rate": 1.726079272910881e-05, - "loss": 0.0692, + "epoch": 2.4977724977724978, + "grad_norm": 0.593275785446167, + "learning_rate": 1.5013365013365013e-05, + "loss": 0.0613, "step": 8410 }, { - "epoch": 2.1257258268114114, - "grad_norm": 0.8458355069160461, - "learning_rate": 1.7245645039131532e-05, - "loss": 0.0707, + "epoch": 2.5007425007425006, + "grad_norm": 0.5146422386169434, + "learning_rate": 1.4995544995544995e-05, + "loss": 0.0539, "step": 8420 }, { - "epoch": 2.1282504418076242, - "grad_norm": 0.41282087564468384, - "learning_rate": 1.7230497349154254e-05, - "loss": 0.0647, + "epoch": 2.503712503712504, + "grad_norm": 0.5107398629188538, + "learning_rate": 1.4977724977724977e-05, + "loss": 0.0642, "step": 8430 }, { - "epoch": 2.1307750568038375, - "grad_norm": 0.39141160249710083, - "learning_rate": 1.7215349659176976e-05, - "loss": 0.0547, + "epoch": 2.5066825066825067, + "grad_norm": 0.40161600708961487, + "learning_rate": 1.4959904959904961e-05, + "loss": 0.0598, "step": 8440 }, { - "epoch": 2.1332996718000503, - "grad_norm": 0.564751923084259, - "learning_rate": 1.7200201969199697e-05, - "loss": 0.0632, + "epoch": 2.5096525096525095, + "grad_norm": 0.3634410500526428, + "learning_rate": 1.4942084942084943e-05, + "loss": 0.064, "step": 8450 }, { - "epoch": 2.1358242867962636, - "grad_norm": 0.7247843146324158, - "learning_rate": 1.718505427922242e-05, - "loss": 0.0639, + "epoch": 2.512622512622513, + "grad_norm": 0.29765084385871887, + "learning_rate": 1.4924264924264924e-05, + "loss": 0.0596, "step": 8460 }, { - "epoch": 2.138348901792477, - "grad_norm": 0.5174043774604797, - "learning_rate": 1.716990658924514e-05, - "loss": 0.062, + "epoch": 2.5155925155925156, + "grad_norm": 0.5146110653877258, + "learning_rate": 1.4906444906444908e-05, + "loss": 0.0794, "step": 8470 }, { - "epoch": 2.1408735167886896, - "grad_norm": 0.4329341650009155, - "learning_rate": 1.7154758899267862e-05, - "loss": 0.0651, + "epoch": 2.5185625185625184, + "grad_norm": 0.49415668845176697, + "learning_rate": 1.488862488862489e-05, + "loss": 0.0612, "step": 8480 }, { - "epoch": 2.143398131784903, - "grad_norm": 0.41230252385139465, - "learning_rate": 1.7139611209290584e-05, - "loss": 0.0633, + "epoch": 2.5215325215325217, + "grad_norm": 0.3173198997974396, + "learning_rate": 1.487080487080487e-05, + "loss": 0.0576, "step": 8490 }, { - "epoch": 2.1459227467811157, - "grad_norm": 0.5256580114364624, - "learning_rate": 1.7124463519313305e-05, - "loss": 0.0631, + "epoch": 2.5245025245025245, + "grad_norm": 0.4311143159866333, + "learning_rate": 1.4852984852984852e-05, + "loss": 0.0721, "step": 8500 }, { - "epoch": 2.148447361777329, - "grad_norm": 0.4602107107639313, - "learning_rate": 1.7109315829336027e-05, - "loss": 0.0732, + "epoch": 2.5274725274725274, + "grad_norm": 0.3858831524848938, + "learning_rate": 1.4835164835164836e-05, + "loss": 0.0553, "step": 8510 }, { - "epoch": 2.1509719767735422, - "grad_norm": 0.6555882096290588, - "learning_rate": 1.709416813935875e-05, - "loss": 0.0704, + "epoch": 2.5304425304425306, + "grad_norm": 0.4288255572319031, + "learning_rate": 1.4817344817344818e-05, + "loss": 0.0598, "step": 8520 }, { - "epoch": 2.153496591769755, - "grad_norm": 0.4370688199996948, - "learning_rate": 1.707902044938147e-05, - "loss": 0.0681, + "epoch": 2.5334125334125335, + "grad_norm": 0.6533911228179932, + "learning_rate": 1.4799524799524799e-05, + "loss": 0.0776, "step": 8530 }, { - "epoch": 2.1560212067659683, - "grad_norm": 0.7623379230499268, - "learning_rate": 1.7063872759404188e-05, - "loss": 0.0683, + "epoch": 2.5363825363825363, + "grad_norm": 0.4716707468032837, + "learning_rate": 1.4781704781704783e-05, + "loss": 0.0586, "step": 8540 }, { - "epoch": 2.158545821762181, - "grad_norm": 0.437193363904953, - "learning_rate": 1.7048725069426913e-05, - "loss": 0.063, + "epoch": 2.5393525393525396, + "grad_norm": 0.40273916721343994, + "learning_rate": 1.4763884763884765e-05, + "loss": 0.0665, "step": 8550 }, { - "epoch": 2.1610704367583944, - "grad_norm": 0.32816392183303833, - "learning_rate": 1.7033577379449635e-05, - "loss": 0.0637, + "epoch": 2.5423225423225424, + "grad_norm": 0.5408886075019836, + "learning_rate": 1.4746064746064745e-05, + "loss": 0.0606, "step": 8560 }, { - "epoch": 2.1635950517546076, - "grad_norm": 0.2817254960536957, - "learning_rate": 1.7018429689472353e-05, - "loss": 0.0563, + "epoch": 2.5452925452925452, + "grad_norm": 0.4306439757347107, + "learning_rate": 1.4728244728244728e-05, + "loss": 0.0725, "step": 8570 }, { - "epoch": 2.1661196667508205, - "grad_norm": 0.5059931874275208, - "learning_rate": 1.7003281999495078e-05, - "loss": 0.0582, + "epoch": 2.5482625482625485, + "grad_norm": 0.6419402360916138, + "learning_rate": 1.4710424710424711e-05, + "loss": 0.0808, "step": 8580 }, { - "epoch": 2.1686442817470337, - "grad_norm": 0.4076451063156128, - "learning_rate": 1.69881343095178e-05, - "loss": 0.0521, + "epoch": 2.5512325512325513, + "grad_norm": 0.4105408489704132, + "learning_rate": 1.4692604692604694e-05, + "loss": 0.0618, "step": 8590 }, { - "epoch": 2.1711688967432465, - "grad_norm": 0.5454453229904175, - "learning_rate": 1.6972986619540518e-05, - "loss": 0.0615, + "epoch": 2.554202554202554, + "grad_norm": 0.6038805246353149, + "learning_rate": 1.4674784674784674e-05, + "loss": 0.0583, "step": 8600 }, { - "epoch": 2.17369351173946, - "grad_norm": 0.39881065487861633, - "learning_rate": 1.6957838929563243e-05, - "loss": 0.0607, + "epoch": 2.5571725571725574, + "grad_norm": 0.6562811732292175, + "learning_rate": 1.4656964656964658e-05, + "loss": 0.0631, "step": 8610 }, { - "epoch": 2.1762181267356726, - "grad_norm": 0.38210931420326233, - "learning_rate": 1.6942691239585964e-05, - "loss": 0.073, + "epoch": 2.5601425601425603, + "grad_norm": 0.5605425238609314, + "learning_rate": 1.463914463914464e-05, + "loss": 0.0679, "step": 8620 }, { - "epoch": 2.178742741731886, - "grad_norm": 0.4913787245750427, - "learning_rate": 1.6927543549608683e-05, - "loss": 0.069, + "epoch": 2.563112563112563, + "grad_norm": 0.4276842474937439, + "learning_rate": 1.4621324621324622e-05, + "loss": 0.0592, "step": 8630 }, { - "epoch": 2.181267356728099, - "grad_norm": 0.4121997356414795, - "learning_rate": 1.6912395859631408e-05, - "loss": 0.0643, + "epoch": 2.5660825660825664, + "grad_norm": 0.46107247471809387, + "learning_rate": 1.4603504603504603e-05, + "loss": 0.0585, "step": 8640 }, { - "epoch": 2.183791971724312, - "grad_norm": 0.6225160360336304, - "learning_rate": 1.689724816965413e-05, - "loss": 0.0742, + "epoch": 2.569052569052569, + "grad_norm": 0.5134992003440857, + "learning_rate": 1.4585684585684587e-05, + "loss": 0.0734, "step": 8650 }, { - "epoch": 2.186316586720525, - "grad_norm": 0.40772151947021484, - "learning_rate": 1.6882100479676847e-05, - "loss": 0.0767, + "epoch": 2.572022572022572, + "grad_norm": 0.4581449031829834, + "learning_rate": 1.4567864567864569e-05, + "loss": 0.0549, "step": 8660 }, { - "epoch": 2.188841201716738, - "grad_norm": 0.38705331087112427, - "learning_rate": 1.6866952789699572e-05, - "loss": 0.0692, + "epoch": 2.574992574992575, + "grad_norm": 0.5128817558288574, + "learning_rate": 1.455004455004455e-05, + "loss": 0.0648, "step": 8670 }, { - "epoch": 2.1913658167129513, - "grad_norm": 0.4707449674606323, - "learning_rate": 1.6851805099722294e-05, - "loss": 0.0754, + "epoch": 2.577962577962578, + "grad_norm": 0.5839508771896362, + "learning_rate": 1.4532224532224533e-05, + "loss": 0.0794, "step": 8680 }, { - "epoch": 2.1938904317091645, - "grad_norm": 0.35473600029945374, - "learning_rate": 1.6836657409745012e-05, - "loss": 0.0524, + "epoch": 2.580932580932581, + "grad_norm": 0.4006098210811615, + "learning_rate": 1.4514404514404515e-05, + "loss": 0.0679, "step": 8690 }, { - "epoch": 2.1964150467053773, - "grad_norm": 0.2633616626262665, - "learning_rate": 1.6821509719767737e-05, - "loss": 0.0625, + "epoch": 2.5839025839025838, + "grad_norm": 0.24022406339645386, + "learning_rate": 1.4496584496584498e-05, + "loss": 0.0671, "step": 8700 }, { - "epoch": 2.1989396617015906, - "grad_norm": 0.6730284690856934, - "learning_rate": 1.680636202979046e-05, - "loss": 0.0647, + "epoch": 2.586872586872587, + "grad_norm": 0.390082448720932, + "learning_rate": 1.4478764478764478e-05, + "loss": 0.0533, "step": 8710 }, { - "epoch": 2.2014642766978034, - "grad_norm": 0.4892052412033081, - "learning_rate": 1.6791214339813177e-05, - "loss": 0.0567, + "epoch": 2.58984258984259, + "grad_norm": 0.5063132643699646, + "learning_rate": 1.4460944460944462e-05, + "loss": 0.065, "step": 8720 }, { - "epoch": 2.2039888916940167, - "grad_norm": 0.6040000319480896, - "learning_rate": 1.6776066649835902e-05, - "loss": 0.0647, + "epoch": 2.5928125928125927, + "grad_norm": 0.4413723647594452, + "learning_rate": 1.4443124443124444e-05, + "loss": 0.0628, "step": 8730 }, { - "epoch": 2.20651350669023, - "grad_norm": 0.43167611956596375, - "learning_rate": 1.6760918959858623e-05, - "loss": 0.0664, + "epoch": 2.5957825957825955, + "grad_norm": 0.5134592056274414, + "learning_rate": 1.4425304425304425e-05, + "loss": 0.0726, "step": 8740 }, { - "epoch": 2.2090381216864428, - "grad_norm": 0.651911735534668, - "learning_rate": 1.6745771269881342e-05, - "loss": 0.0746, + "epoch": 2.598752598752599, + "grad_norm": 0.6060248613357544, + "learning_rate": 1.4407484407484408e-05, + "loss": 0.0685, "step": 8750 }, { - "epoch": 2.211562736682656, - "grad_norm": 0.6243721842765808, - "learning_rate": 1.6730623579904067e-05, - "loss": 0.0572, + "epoch": 2.6017226017226016, + "grad_norm": 0.54691481590271, + "learning_rate": 1.438966438966439e-05, + "loss": 0.0707, "step": 8760 }, { - "epoch": 2.214087351678869, - "grad_norm": 0.6300668120384216, - "learning_rate": 1.6715475889926788e-05, - "loss": 0.0592, + "epoch": 2.6046926046926044, + "grad_norm": 0.3673838675022125, + "learning_rate": 1.4371844371844373e-05, + "loss": 0.0735, "step": 8770 }, { - "epoch": 2.216611966675082, - "grad_norm": 0.529558002948761, - "learning_rate": 1.6700328199949506e-05, - "loss": 0.0665, + "epoch": 2.6076626076626077, + "grad_norm": 0.47034141421318054, + "learning_rate": 1.4354024354024353e-05, + "loss": 0.0683, "step": 8780 }, { - "epoch": 2.219136581671295, - "grad_norm": 0.3494657278060913, - "learning_rate": 1.668518050997223e-05, - "loss": 0.0748, + "epoch": 2.6106326106326105, + "grad_norm": 0.6322289109230042, + "learning_rate": 1.4336204336204337e-05, + "loss": 0.0712, "step": 8790 }, { - "epoch": 2.221661196667508, - "grad_norm": 0.4483802914619446, - "learning_rate": 1.6670032819994953e-05, - "loss": 0.0617, + "epoch": 2.6136026136026134, + "grad_norm": 0.3621010482311249, + "learning_rate": 1.431838431838432e-05, + "loss": 0.0599, "step": 8800 }, { - "epoch": 2.2241858116637214, - "grad_norm": 0.5772615075111389, - "learning_rate": 1.665488513001767e-05, - "loss": 0.0618, + "epoch": 2.6165726165726166, + "grad_norm": 0.3426471948623657, + "learning_rate": 1.43005643005643e-05, + "loss": 0.0699, "step": 8810 }, { - "epoch": 2.2267104266599342, - "grad_norm": 0.5170813202857971, - "learning_rate": 1.6639737440040396e-05, - "loss": 0.0626, + "epoch": 2.6195426195426195, + "grad_norm": 0.4808509647846222, + "learning_rate": 1.4282744282744284e-05, + "loss": 0.0563, "step": 8820 }, { - "epoch": 2.2292350416561475, - "grad_norm": 0.4822421669960022, - "learning_rate": 1.6624589750063114e-05, - "loss": 0.0715, + "epoch": 2.6225126225126223, + "grad_norm": 0.588964581489563, + "learning_rate": 1.4264924264924266e-05, + "loss": 0.0667, "step": 8830 }, { - "epoch": 2.2317596566523603, - "grad_norm": 0.5152460336685181, - "learning_rate": 1.6609442060085836e-05, - "loss": 0.0628, + "epoch": 2.6254826254826256, + "grad_norm": 0.48968905210494995, + "learning_rate": 1.4247104247104248e-05, + "loss": 0.0648, "step": 8840 }, { - "epoch": 2.2342842716485736, - "grad_norm": 0.6732852458953857, - "learning_rate": 1.659429437010856e-05, - "loss": 0.0772, + "epoch": 2.6284526284526284, + "grad_norm": 0.4962276816368103, + "learning_rate": 1.4229284229284228e-05, + "loss": 0.0719, "step": 8850 }, { - "epoch": 2.236808886644787, - "grad_norm": 0.6237836480140686, - "learning_rate": 1.657914668013128e-05, - "loss": 0.0759, + "epoch": 2.631422631422631, + "grad_norm": 0.5036596059799194, + "learning_rate": 1.4211464211464212e-05, + "loss": 0.0736, "step": 8860 }, { - "epoch": 2.2393335016409996, - "grad_norm": 0.3084138035774231, - "learning_rate": 1.6563998990154e-05, - "loss": 0.0588, + "epoch": 2.6343926343926345, + "grad_norm": 0.47525274753570557, + "learning_rate": 1.4193644193644194e-05, + "loss": 0.0606, "step": 8870 }, { - "epoch": 2.241858116637213, - "grad_norm": 0.4294467568397522, - "learning_rate": 1.6548851300176726e-05, - "loss": 0.0659, + "epoch": 2.6373626373626373, + "grad_norm": 0.6138589978218079, + "learning_rate": 1.4175824175824177e-05, + "loss": 0.064, "step": 8880 }, { - "epoch": 2.2443827316334257, - "grad_norm": 0.42405208945274353, - "learning_rate": 1.6533703610199444e-05, - "loss": 0.0611, + "epoch": 2.64033264033264, + "grad_norm": 0.2877761721611023, + "learning_rate": 1.4158004158004159e-05, + "loss": 0.0645, "step": 8890 }, { - "epoch": 2.246907346629639, - "grad_norm": 0.30669859051704407, - "learning_rate": 1.6518555920222166e-05, - "loss": 0.0729, + "epoch": 2.6433026433026434, + "grad_norm": 0.4664807617664337, + "learning_rate": 1.4140184140184141e-05, + "loss": 0.0573, "step": 8900 }, { - "epoch": 2.2494319616258522, - "grad_norm": 0.6405044794082642, - "learning_rate": 1.650340823024489e-05, - "loss": 0.0644, + "epoch": 2.6462726462726462, + "grad_norm": 0.38519200682640076, + "learning_rate": 1.4122364122364123e-05, + "loss": 0.0666, "step": 8910 }, { - "epoch": 2.251956576622065, - "grad_norm": 0.5801793336868286, - "learning_rate": 1.648826054026761e-05, - "loss": 0.0726, + "epoch": 2.649242649242649, + "grad_norm": 0.7016706466674805, + "learning_rate": 1.4104544104544104e-05, + "loss": 0.0597, "step": 8920 }, { - "epoch": 2.2544811916182783, - "grad_norm": 0.5352018475532532, - "learning_rate": 1.647311285029033e-05, - "loss": 0.0505, + "epoch": 2.6522126522126523, + "grad_norm": 0.5760989785194397, + "learning_rate": 1.4086724086724087e-05, + "loss": 0.0606, "step": 8930 }, { - "epoch": 2.257005806614491, - "grad_norm": 0.4289718568325043, - "learning_rate": 1.6457965160313055e-05, - "loss": 0.0557, + "epoch": 2.655182655182655, + "grad_norm": 0.47811734676361084, + "learning_rate": 1.406890406890407e-05, + "loss": 0.0688, "step": 8940 }, { - "epoch": 2.2595304216107044, - "grad_norm": 0.6818227767944336, - "learning_rate": 1.6442817470335774e-05, - "loss": 0.061, + "epoch": 2.658152658152658, + "grad_norm": 0.3496223986148834, + "learning_rate": 1.4051084051084052e-05, + "loss": 0.0569, "step": 8950 }, { - "epoch": 2.2620550366069176, - "grad_norm": 0.5462810397148132, - "learning_rate": 1.6427669780358495e-05, - "loss": 0.0623, + "epoch": 2.6611226611226613, + "grad_norm": 0.6245877742767334, + "learning_rate": 1.4033264033264034e-05, + "loss": 0.0694, "step": 8960 }, { - "epoch": 2.2645796516031305, - "grad_norm": 0.48088541626930237, - "learning_rate": 1.6412522090381217e-05, - "loss": 0.0768, + "epoch": 2.664092664092664, + "grad_norm": 0.38785070180892944, + "learning_rate": 1.4015444015444016e-05, + "loss": 0.0599, "step": 8970 }, { - "epoch": 2.2671042665993437, - "grad_norm": 0.43914252519607544, - "learning_rate": 1.639737440040394e-05, - "loss": 0.0611, + "epoch": 2.667062667062667, + "grad_norm": 0.3740558624267578, + "learning_rate": 1.3997623997623998e-05, + "loss": 0.0562, "step": 8980 }, { - "epoch": 2.2696288815955565, - "grad_norm": 0.3516700267791748, - "learning_rate": 1.638222671042666e-05, - "loss": 0.0766, + "epoch": 2.67003267003267, + "grad_norm": 0.4402414560317993, + "learning_rate": 1.3979803979803979e-05, + "loss": 0.0595, "step": 8990 }, { - "epoch": 2.27215349659177, - "grad_norm": 0.38341203331947327, - "learning_rate": 1.636707902044938e-05, - "loss": 0.0631, + "epoch": 2.673002673002673, + "grad_norm": 0.6891340017318726, + "learning_rate": 1.3961983961983963e-05, + "loss": 0.0712, "step": 9000 }, { - "epoch": 2.274678111587983, - "grad_norm": 0.5238111615180969, - "learning_rate": 1.6351931330472103e-05, - "loss": 0.0568, + "epoch": 2.675972675972676, + "grad_norm": 0.44056686758995056, + "learning_rate": 1.3944163944163945e-05, + "loss": 0.0712, "step": 9010 }, { - "epoch": 2.277202726584196, - "grad_norm": 0.31196850538253784, - "learning_rate": 1.6336783640494825e-05, - "loss": 0.0671, + "epoch": 2.678942678942679, + "grad_norm": 0.42997923493385315, + "learning_rate": 1.3926343926343927e-05, + "loss": 0.0509, "step": 9020 }, { - "epoch": 2.279727341580409, - "grad_norm": 0.4655681550502777, - "learning_rate": 1.6321635950517546e-05, - "loss": 0.0671, + "epoch": 2.681912681912682, + "grad_norm": 0.4868006110191345, + "learning_rate": 1.390852390852391e-05, + "loss": 0.0722, "step": 9030 }, { - "epoch": 2.282251956576622, - "grad_norm": 0.4430312514305115, - "learning_rate": 1.6306488260540268e-05, - "loss": 0.0603, + "epoch": 2.684882684882685, + "grad_norm": 0.4716143310070038, + "learning_rate": 1.3890703890703891e-05, + "loss": 0.0643, "step": 9040 }, { - "epoch": 2.284776571572835, - "grad_norm": 0.5572938323020935, - "learning_rate": 1.629134057056299e-05, - "loss": 0.0516, + "epoch": 2.687852687852688, + "grad_norm": 0.4905288815498352, + "learning_rate": 1.3872883872883874e-05, + "loss": 0.0592, "step": 9050 }, { - "epoch": 2.2873011865690485, - "grad_norm": 0.48111096024513245, - "learning_rate": 1.627619288058571e-05, - "loss": 0.0658, + "epoch": 2.690822690822691, + "grad_norm": 0.4081631302833557, + "learning_rate": 1.3855063855063854e-05, + "loss": 0.0736, "step": 9060 }, { - "epoch": 2.2898258015652613, - "grad_norm": 0.40908750891685486, - "learning_rate": 1.6261045190608433e-05, - "loss": 0.0599, + "epoch": 2.6937926937926937, + "grad_norm": 0.447644978761673, + "learning_rate": 1.3837243837243838e-05, + "loss": 0.0654, "step": 9070 }, { - "epoch": 2.2923504165614745, - "grad_norm": 0.45796236395835876, - "learning_rate": 1.6245897500631154e-05, - "loss": 0.0617, + "epoch": 2.696762696762697, + "grad_norm": 0.22904683649539948, + "learning_rate": 1.381942381942382e-05, + "loss": 0.0562, "step": 9080 }, { - "epoch": 2.2948750315576874, - "grad_norm": 0.2645781934261322, - "learning_rate": 1.6230749810653876e-05, - "loss": 0.0573, + "epoch": 2.6997326997327, + "grad_norm": 0.5609009861946106, + "learning_rate": 1.3801603801603802e-05, + "loss": 0.0558, "step": 9090 }, { - "epoch": 2.2973996465539006, - "grad_norm": 0.45168253779411316, - "learning_rate": 1.6215602120676598e-05, - "loss": 0.0613, + "epoch": 2.7027027027027026, + "grad_norm": 0.6101239919662476, + "learning_rate": 1.3783783783783784e-05, + "loss": 0.0665, "step": 9100 }, { - "epoch": 2.2999242615501134, - "grad_norm": 0.3695938289165497, - "learning_rate": 1.620045443069932e-05, - "loss": 0.0652, + "epoch": 2.705672705672706, + "grad_norm": 0.49575671553611755, + "learning_rate": 1.3765963765963767e-05, + "loss": 0.0589, "step": 9110 }, { - "epoch": 2.3024488765463267, - "grad_norm": 0.36523544788360596, - "learning_rate": 1.6185306740722037e-05, - "loss": 0.0634, + "epoch": 2.7086427086427087, + "grad_norm": 0.5980531573295593, + "learning_rate": 1.3748143748143749e-05, + "loss": 0.0715, "step": 9120 }, { - "epoch": 2.30497349154254, - "grad_norm": 0.5313533544540405, - "learning_rate": 1.6170159050744762e-05, - "loss": 0.0646, + "epoch": 2.7116127116127116, + "grad_norm": 0.3581327497959137, + "learning_rate": 1.373032373032373e-05, + "loss": 0.0641, "step": 9130 }, { - "epoch": 2.3074981065387528, - "grad_norm": 0.47745630145072937, - "learning_rate": 1.6155011360767484e-05, - "loss": 0.0643, + "epoch": 2.714582714582715, + "grad_norm": 0.5521288514137268, + "learning_rate": 1.3712503712503713e-05, + "loss": 0.0611, "step": 9140 }, { - "epoch": 2.310022721534966, - "grad_norm": 0.4751470685005188, - "learning_rate": 1.6139863670790202e-05, - "loss": 0.0549, + "epoch": 2.7175527175527177, + "grad_norm": 0.617689847946167, + "learning_rate": 1.3694683694683695e-05, + "loss": 0.0556, "step": 9150 }, { - "epoch": 2.312547336531179, - "grad_norm": 0.5816293358802795, - "learning_rate": 1.6124715980812927e-05, - "loss": 0.0616, + "epoch": 2.7205227205227205, + "grad_norm": 0.32165491580963135, + "learning_rate": 1.3676863676863677e-05, + "loss": 0.0714, "step": 9160 }, { - "epoch": 2.315071951527392, - "grad_norm": 0.44545724987983704, - "learning_rate": 1.610956829083565e-05, - "loss": 0.0659, + "epoch": 2.7234927234927238, + "grad_norm": 0.3842147886753082, + "learning_rate": 1.365904365904366e-05, + "loss": 0.0599, "step": 9170 }, { - "epoch": 2.317596566523605, - "grad_norm": 0.3668888509273529, - "learning_rate": 1.6094420600858367e-05, - "loss": 0.0632, + "epoch": 2.7264627264627266, + "grad_norm": 0.41680991649627686, + "learning_rate": 1.3641223641223642e-05, + "loss": 0.0628, "step": 9180 }, { - "epoch": 2.320121181519818, - "grad_norm": 0.40825653076171875, - "learning_rate": 1.6079272910881092e-05, - "loss": 0.0794, + "epoch": 2.7294327294327294, + "grad_norm": 0.6326974630355835, + "learning_rate": 1.3623403623403624e-05, + "loss": 0.0571, "step": 9190 }, { - "epoch": 2.3226457965160314, - "grad_norm": 0.4783599376678467, - "learning_rate": 1.6064125220903813e-05, - "loss": 0.0709, + "epoch": 2.7324027324027322, + "grad_norm": 0.4563412070274353, + "learning_rate": 1.3605583605583606e-05, + "loss": 0.0647, "step": 9200 }, { - "epoch": 2.3251704115122442, - "grad_norm": 0.5379131436347961, - "learning_rate": 1.604897753092653e-05, - "loss": 0.0587, + "epoch": 2.7353727353727355, + "grad_norm": 0.5637513995170593, + "learning_rate": 1.3587763587763588e-05, + "loss": 0.0661, "step": 9210 }, { - "epoch": 2.3276950265084575, - "grad_norm": 0.5364894866943359, - "learning_rate": 1.6033829840949257e-05, - "loss": 0.0846, + "epoch": 2.7383427383427383, + "grad_norm": 0.373116135597229, + "learning_rate": 1.356994356994357e-05, + "loss": 0.0552, "step": 9220 }, { - "epoch": 2.3302196415046703, - "grad_norm": 0.4032694399356842, - "learning_rate": 1.6018682150971978e-05, - "loss": 0.0683, + "epoch": 2.741312741312741, + "grad_norm": 0.6611971259117126, + "learning_rate": 1.3552123552123553e-05, + "loss": 0.0694, "step": 9230 }, { - "epoch": 2.3327442565008836, - "grad_norm": 0.44544950127601624, - "learning_rate": 1.6003534460994696e-05, - "loss": 0.0609, + "epoch": 2.7442827442827444, + "grad_norm": 0.5132849812507629, + "learning_rate": 1.3534303534303535e-05, + "loss": 0.0607, "step": 9240 }, { - "epoch": 2.335268871497097, - "grad_norm": 0.39711642265319824, - "learning_rate": 1.598838677101742e-05, - "loss": 0.0531, + "epoch": 2.7472527472527473, + "grad_norm": 0.40150707960128784, + "learning_rate": 1.3516483516483517e-05, + "loss": 0.0682, "step": 9250 }, { - "epoch": 2.3377934864933096, - "grad_norm": 0.7067275643348694, - "learning_rate": 1.5973239081040143e-05, - "loss": 0.0645, + "epoch": 2.75022275022275, + "grad_norm": 0.8982331156730652, + "learning_rate": 1.3498663498663499e-05, + "loss": 0.058, "step": 9260 }, { - "epoch": 2.340318101489523, - "grad_norm": 0.404411256313324, - "learning_rate": 1.595809139106286e-05, - "loss": 0.0843, + "epoch": 2.753192753192753, + "grad_norm": 0.42595192790031433, + "learning_rate": 1.3480843480843481e-05, + "loss": 0.0673, "step": 9270 }, { - "epoch": 2.3428427164857357, - "grad_norm": 0.3514604866504669, - "learning_rate": 1.5942943701085586e-05, - "loss": 0.0607, + "epoch": 2.756162756162756, + "grad_norm": 0.5409243106842041, + "learning_rate": 1.3463023463023463e-05, + "loss": 0.0553, "step": 9280 }, { - "epoch": 2.345367331481949, - "grad_norm": 0.5159276723861694, - "learning_rate": 1.5927796011108308e-05, - "loss": 0.081, + "epoch": 2.759132759132759, + "grad_norm": 0.5729924440383911, + "learning_rate": 1.3445203445203446e-05, + "loss": 0.0677, "step": 9290 }, { - "epoch": 2.3478919464781622, - "grad_norm": 0.3323509395122528, - "learning_rate": 1.5912648321131026e-05, - "loss": 0.0694, + "epoch": 2.762102762102762, + "grad_norm": 0.4854719638824463, + "learning_rate": 1.3427383427383428e-05, + "loss": 0.054, "step": 9300 }, { - "epoch": 2.350416561474375, - "grad_norm": 0.32046014070510864, - "learning_rate": 1.589750063115375e-05, - "loss": 0.0596, + "epoch": 2.765072765072765, + "grad_norm": 0.7021495699882507, + "learning_rate": 1.340956340956341e-05, + "loss": 0.0618, "step": 9310 }, { - "epoch": 2.3529411764705883, - "grad_norm": 0.7215676307678223, - "learning_rate": 1.5882352941176473e-05, - "loss": 0.0699, + "epoch": 2.768042768042768, + "grad_norm": 0.5088809132575989, + "learning_rate": 1.3391743391743392e-05, + "loss": 0.0652, "step": 9320 }, { - "epoch": 2.355465791466801, - "grad_norm": 0.2936984896659851, - "learning_rate": 1.586720525119919e-05, - "loss": 0.0669, + "epoch": 2.7710127710127708, + "grad_norm": 0.3599695861339569, + "learning_rate": 1.3373923373923374e-05, + "loss": 0.0758, "step": 9330 }, { - "epoch": 2.3579904064630144, - "grad_norm": 0.4268344044685364, - "learning_rate": 1.5852057561221916e-05, - "loss": 0.0814, + "epoch": 2.773982773982774, + "grad_norm": 0.2429090142250061, + "learning_rate": 1.3356103356103356e-05, + "loss": 0.0721, "step": 9340 }, { - "epoch": 2.3605150214592276, - "grad_norm": 0.61527419090271, - "learning_rate": 1.5836909871244637e-05, - "loss": 0.0766, + "epoch": 2.776952776952777, + "grad_norm": 0.42269906401634216, + "learning_rate": 1.3338283338283339e-05, + "loss": 0.0642, "step": 9350 }, { - "epoch": 2.3630396364554405, - "grad_norm": 0.6297392249107361, - "learning_rate": 1.5821762181267356e-05, - "loss": 0.0697, + "epoch": 2.7799227799227797, + "grad_norm": 0.5263569951057434, + "learning_rate": 1.332046332046332e-05, + "loss": 0.0635, "step": 9360 }, { - "epoch": 2.3655642514516537, - "grad_norm": 0.5995168685913086, - "learning_rate": 1.580661449129008e-05, - "loss": 0.0556, + "epoch": 2.782892782892783, + "grad_norm": 0.3662327527999878, + "learning_rate": 1.3302643302643303e-05, + "loss": 0.0628, "step": 9370 }, { - "epoch": 2.3680888664478665, - "grad_norm": 0.246127650141716, - "learning_rate": 1.5791466801312802e-05, - "loss": 0.0651, + "epoch": 2.785862785862786, + "grad_norm": 0.43335428833961487, + "learning_rate": 1.3284823284823285e-05, + "loss": 0.0713, "step": 9380 }, { - "epoch": 2.37061348144408, - "grad_norm": 0.5713209509849548, - "learning_rate": 1.577631911133552e-05, - "loss": 0.0544, + "epoch": 2.7888327888327886, + "grad_norm": 0.5907623767852783, + "learning_rate": 1.3267003267003267e-05, + "loss": 0.0534, "step": 9390 }, { - "epoch": 2.373138096440293, - "grad_norm": 0.6119291186332703, - "learning_rate": 1.5761171421358245e-05, - "loss": 0.0598, + "epoch": 2.791802791802792, + "grad_norm": 0.340541809797287, + "learning_rate": 1.324918324918325e-05, + "loss": 0.052, "step": 9400 }, { - "epoch": 2.375662711436506, - "grad_norm": 0.612404465675354, - "learning_rate": 1.5746023731380964e-05, - "loss": 0.0605, + "epoch": 2.7947727947727947, + "grad_norm": 0.4090157151222229, + "learning_rate": 1.3231363231363232e-05, + "loss": 0.0686, "step": 9410 }, { - "epoch": 2.378187326432719, - "grad_norm": 0.5621523857116699, - "learning_rate": 1.5730876041403685e-05, - "loss": 0.0637, + "epoch": 2.7977427977427975, + "grad_norm": 0.3752903640270233, + "learning_rate": 1.3213543213543214e-05, + "loss": 0.0612, "step": 9420 }, { - "epoch": 2.380711941428932, - "grad_norm": 0.24175520241260529, - "learning_rate": 1.571572835142641e-05, - "loss": 0.0576, + "epoch": 2.800712800712801, + "grad_norm": 0.48351070284843445, + "learning_rate": 1.3195723195723196e-05, + "loss": 0.058, "step": 9430 }, { - "epoch": 2.383236556425145, - "grad_norm": 0.5484057664871216, - "learning_rate": 1.570058066144913e-05, - "loss": 0.0633, + "epoch": 2.8036828036828036, + "grad_norm": 0.7287899851799011, + "learning_rate": 1.3177903177903178e-05, + "loss": 0.0787, "step": 9440 }, { - "epoch": 2.3857611714213585, - "grad_norm": 0.4739590287208557, - "learning_rate": 1.568543297147185e-05, - "loss": 0.0768, + "epoch": 2.8066528066528065, + "grad_norm": 0.4591059684753418, + "learning_rate": 1.316008316008316e-05, + "loss": 0.0507, "step": 9450 }, { - "epoch": 2.3882857864175713, - "grad_norm": 0.3890620172023773, - "learning_rate": 1.5670285281494575e-05, - "loss": 0.07, + "epoch": 2.8096228096228097, + "grad_norm": 0.6308128833770752, + "learning_rate": 1.3142263142263142e-05, + "loss": 0.0783, "step": 9460 }, { - "epoch": 2.3908104014137845, - "grad_norm": 0.47377142310142517, - "learning_rate": 1.5655137591517293e-05, - "loss": 0.0611, + "epoch": 2.8125928125928126, + "grad_norm": 0.5566859841346741, + "learning_rate": 1.3124443124443125e-05, + "loss": 0.067, "step": 9470 }, { - "epoch": 2.3933350164099974, - "grad_norm": 0.6585646867752075, - "learning_rate": 1.5639989901540015e-05, - "loss": 0.0543, + "epoch": 2.8155628155628154, + "grad_norm": 0.42038193345069885, + "learning_rate": 1.3106623106623107e-05, + "loss": 0.0554, "step": 9480 }, { - "epoch": 2.3958596314062106, - "grad_norm": 0.28793618083000183, - "learning_rate": 1.562484221156274e-05, - "loss": 0.0696, + "epoch": 2.8185328185328187, + "grad_norm": 0.34577420353889465, + "learning_rate": 1.3088803088803089e-05, + "loss": 0.0742, "step": 9490 }, { - "epoch": 2.398384246402424, - "grad_norm": 0.3433607816696167, - "learning_rate": 1.5609694521585458e-05, - "loss": 0.0618, + "epoch": 2.8215028215028215, + "grad_norm": 0.5111622214317322, + "learning_rate": 1.3070983070983071e-05, + "loss": 0.068, "step": 9500 }, { - "epoch": 2.4009088613986367, - "grad_norm": 0.3919399678707123, - "learning_rate": 1.559454683160818e-05, - "loss": 0.0628, + "epoch": 2.8244728244728243, + "grad_norm": 0.24577349424362183, + "learning_rate": 1.3053163053163053e-05, + "loss": 0.0617, "step": 9510 }, { - "epoch": 2.40343347639485, - "grad_norm": 0.6731687784194946, - "learning_rate": 1.55793991416309e-05, - "loss": 0.0657, + "epoch": 2.8274428274428276, + "grad_norm": 0.3329918682575226, + "learning_rate": 1.3035343035343037e-05, + "loss": 0.072, "step": 9520 }, { - "epoch": 2.4059580913910628, - "grad_norm": 0.25971853733062744, - "learning_rate": 1.5564251451653623e-05, - "loss": 0.0754, + "epoch": 2.8304128304128304, + "grad_norm": 0.5380098819732666, + "learning_rate": 1.3017523017523018e-05, + "loss": 0.0618, "step": 9530 }, { - "epoch": 2.408482706387276, - "grad_norm": 0.4266602396965027, - "learning_rate": 1.5549103761676344e-05, - "loss": 0.0527, + "epoch": 2.8333828333828333, + "grad_norm": 0.539607584476471, + "learning_rate": 1.2999702999703e-05, + "loss": 0.0529, "step": 9540 }, { - "epoch": 2.411007321383489, - "grad_norm": 0.29480141401290894, - "learning_rate": 1.5533956071699066e-05, - "loss": 0.0815, + "epoch": 2.8363528363528365, + "grad_norm": 0.6192976236343384, + "learning_rate": 1.2981882981882982e-05, + "loss": 0.0695, "step": 9550 }, { - "epoch": 2.413531936379702, - "grad_norm": 0.6023832559585571, - "learning_rate": 1.5518808381721787e-05, - "loss": 0.0796, + "epoch": 2.8393228393228394, + "grad_norm": 0.44225507974624634, + "learning_rate": 1.2964062964062964e-05, + "loss": 0.0537, "step": 9560 }, { - "epoch": 2.4160565513759154, - "grad_norm": 0.4575349688529968, - "learning_rate": 1.550366069174451e-05, - "loss": 0.0694, + "epoch": 2.842292842292842, + "grad_norm": 0.5681447386741638, + "learning_rate": 1.2946242946242946e-05, + "loss": 0.0737, "step": 9570 }, { - "epoch": 2.418581166372128, - "grad_norm": 0.7292725443840027, - "learning_rate": 1.548851300176723e-05, - "loss": 0.0545, + "epoch": 2.8452628452628455, + "grad_norm": 0.5931240320205688, + "learning_rate": 1.2928422928422929e-05, + "loss": 0.0661, "step": 9580 }, { - "epoch": 2.4211057813683414, - "grad_norm": 1.021568775177002, - "learning_rate": 1.5473365311789952e-05, - "loss": 0.0528, + "epoch": 2.8482328482328483, + "grad_norm": 0.4011771082878113, + "learning_rate": 1.2910602910602912e-05, + "loss": 0.0661, "step": 9590 }, { - "epoch": 2.4236303963645542, - "grad_norm": 0.434799462556839, - "learning_rate": 1.5458217621812674e-05, - "loss": 0.0642, + "epoch": 2.851202851202851, + "grad_norm": 0.574195921421051, + "learning_rate": 1.2892782892782893e-05, + "loss": 0.0677, "step": 9600 }, { - "epoch": 2.4261550113607675, - "grad_norm": 0.5191155672073364, - "learning_rate": 1.5443069931835395e-05, - "loss": 0.0694, + "epoch": 2.8541728541728544, + "grad_norm": 0.5977892875671387, + "learning_rate": 1.2874962874962875e-05, + "loss": 0.075, "step": 9610 }, { - "epoch": 2.4286796263569803, - "grad_norm": 0.5991169214248657, - "learning_rate": 1.5427922241858117e-05, - "loss": 0.062, + "epoch": 2.857142857142857, + "grad_norm": 0.3630739152431488, + "learning_rate": 1.2857142857142857e-05, + "loss": 0.0555, "step": 9620 }, { - "epoch": 2.4312042413531936, - "grad_norm": 0.3980846107006073, - "learning_rate": 1.541277455188084e-05, - "loss": 0.0576, + "epoch": 2.86011286011286, + "grad_norm": 0.39152857661247253, + "learning_rate": 1.283932283932284e-05, + "loss": 0.0692, "step": 9630 }, { - "epoch": 2.433728856349407, - "grad_norm": 0.7166936993598938, - "learning_rate": 1.539762686190356e-05, - "loss": 0.0624, + "epoch": 2.8630828630828633, + "grad_norm": 0.2847200036048889, + "learning_rate": 1.2821502821502822e-05, + "loss": 0.0504, "step": 9640 }, { - "epoch": 2.4362534713456196, - "grad_norm": 0.5147587656974792, - "learning_rate": 1.5382479171926282e-05, - "loss": 0.0656, + "epoch": 2.866052866052866, + "grad_norm": 0.46334296464920044, + "learning_rate": 1.2803682803682804e-05, + "loss": 0.067, "step": 9650 }, { - "epoch": 2.438778086341833, - "grad_norm": 0.6524437665939331, - "learning_rate": 1.5367331481949003e-05, - "loss": 0.0603, + "epoch": 2.869022869022869, + "grad_norm": 0.6711926460266113, + "learning_rate": 1.2785862785862788e-05, + "loss": 0.0739, "step": 9660 }, { - "epoch": 2.4413027013380457, - "grad_norm": 0.2982568144798279, - "learning_rate": 1.5352183791971725e-05, - "loss": 0.0597, + "epoch": 2.8719928719928722, + "grad_norm": 0.5789605975151062, + "learning_rate": 1.2768042768042768e-05, + "loss": 0.0649, "step": 9670 }, { - "epoch": 2.443827316334259, - "grad_norm": 0.3818541169166565, - "learning_rate": 1.5337036101994447e-05, - "loss": 0.0646, + "epoch": 2.874962874962875, + "grad_norm": 0.5450757741928101, + "learning_rate": 1.275022275022275e-05, + "loss": 0.0659, "step": 9680 }, { - "epoch": 2.4463519313304722, - "grad_norm": 0.5956404209136963, - "learning_rate": 1.5321888412017168e-05, - "loss": 0.0632, + "epoch": 2.877932877932878, + "grad_norm": 0.4336056709289551, + "learning_rate": 1.2732402732402732e-05, + "loss": 0.064, "step": 9690 }, { - "epoch": 2.448876546326685, - "grad_norm": 0.6520951986312866, - "learning_rate": 1.5306740722039886e-05, - "loss": 0.0646, + "epoch": 2.880902880902881, + "grad_norm": 0.43332991003990173, + "learning_rate": 1.2714582714582715e-05, + "loss": 0.0703, "step": 9700 }, { - "epoch": 2.4514011613228983, - "grad_norm": 0.4105677008628845, - "learning_rate": 1.529159303206261e-05, - "loss": 0.0562, + "epoch": 2.883872883872884, + "grad_norm": 0.26582634449005127, + "learning_rate": 1.2696762696762697e-05, + "loss": 0.0573, "step": 9710 }, { - "epoch": 2.453925776319111, - "grad_norm": 0.4286990165710449, - "learning_rate": 1.5276445342085333e-05, - "loss": 0.0624, + "epoch": 2.886842886842887, + "grad_norm": 0.39930054545402527, + "learning_rate": 1.2678942678942679e-05, + "loss": 0.0608, "step": 9720 }, { - "epoch": 2.4564503913153244, - "grad_norm": 0.4525969922542572, - "learning_rate": 1.526129765210805e-05, - "loss": 0.0661, + "epoch": 2.88981288981289, + "grad_norm": 0.6703673601150513, + "learning_rate": 1.2661122661122663e-05, + "loss": 0.0706, "step": 9730 }, { - "epoch": 2.4589750063115376, - "grad_norm": 0.39278095960617065, - "learning_rate": 1.5246149962130776e-05, - "loss": 0.0597, + "epoch": 2.892782892782893, + "grad_norm": 0.3226848542690277, + "learning_rate": 1.2643302643302643e-05, + "loss": 0.0565, "step": 9740 }, { - "epoch": 2.4614996213077505, - "grad_norm": 0.3804191052913666, - "learning_rate": 1.5231002272153498e-05, - "loss": 0.0581, + "epoch": 2.8957528957528957, + "grad_norm": 0.4727514386177063, + "learning_rate": 1.2625482625482625e-05, + "loss": 0.0482, "step": 9750 }, { - "epoch": 2.4640242363039637, - "grad_norm": 0.6127219796180725, - "learning_rate": 1.5215854582176218e-05, - "loss": 0.0525, + "epoch": 2.8987228987228986, + "grad_norm": 0.744326651096344, + "learning_rate": 1.2607662607662608e-05, + "loss": 0.0679, "step": 9760 }, { - "epoch": 2.4665488513001765, - "grad_norm": 0.6907774209976196, - "learning_rate": 1.5200706892198941e-05, - "loss": 0.065, + "epoch": 2.901692901692902, + "grad_norm": 0.46024101972579956, + "learning_rate": 1.258984258984259e-05, + "loss": 0.0648, "step": 9770 }, { - "epoch": 2.46907346629639, - "grad_norm": 0.5706852674484253, - "learning_rate": 1.5185559202221661e-05, - "loss": 0.0556, + "epoch": 2.9046629046629047, + "grad_norm": 0.5013512969017029, + "learning_rate": 1.2572022572022572e-05, + "loss": 0.0563, "step": 9780 }, { - "epoch": 2.471598081292603, - "grad_norm": 0.5961136221885681, - "learning_rate": 1.5170411512244382e-05, - "loss": 0.0598, + "epoch": 2.9076329076329075, + "grad_norm": 0.7148948907852173, + "learning_rate": 1.2554202554202554e-05, + "loss": 0.0735, "step": 9790 }, { - "epoch": 2.474122696288816, - "grad_norm": 0.4143712818622589, - "learning_rate": 1.5155263822267106e-05, - "loss": 0.0605, + "epoch": 2.9106029106029108, + "grad_norm": 0.4620581865310669, + "learning_rate": 1.2536382536382538e-05, + "loss": 0.0678, "step": 9800 }, { - "epoch": 2.476647311285029, - "grad_norm": 0.4803559184074402, - "learning_rate": 1.5140116132289826e-05, - "loss": 0.0568, + "epoch": 2.9135729135729136, + "grad_norm": 0.5615851879119873, + "learning_rate": 1.2518562518562518e-05, + "loss": 0.0599, "step": 9810 }, { - "epoch": 2.479171926281242, - "grad_norm": 0.45310842990875244, - "learning_rate": 1.5124968442312547e-05, - "loss": 0.0728, + "epoch": 2.9165429165429164, + "grad_norm": 0.5745916366577148, + "learning_rate": 1.25007425007425e-05, + "loss": 0.0663, "step": 9820 }, { - "epoch": 2.481696541277455, - "grad_norm": 0.45321986079216003, - "learning_rate": 1.510982075233527e-05, - "loss": 0.0597, + "epoch": 2.9195129195129192, + "grad_norm": 0.34011173248291016, + "learning_rate": 1.2482922482922483e-05, + "loss": 0.0524, "step": 9830 }, { - "epoch": 2.4842211562736685, - "grad_norm": 0.5330350995063782, - "learning_rate": 1.509467306235799e-05, - "loss": 0.0598, + "epoch": 2.9224829224829225, + "grad_norm": 0.5845355987548828, + "learning_rate": 1.2465102465102467e-05, + "loss": 0.0625, "step": 9840 }, { - "epoch": 2.4867457712698813, - "grad_norm": 0.5140849947929382, - "learning_rate": 1.5079525372380712e-05, - "loss": 0.0658, + "epoch": 2.9254529254529253, + "grad_norm": 0.5317063331604004, + "learning_rate": 1.2447282447282447e-05, + "loss": 0.0589, "step": 9850 }, { - "epoch": 2.4892703862660945, - "grad_norm": 0.4241473972797394, - "learning_rate": 1.5064377682403435e-05, - "loss": 0.0586, + "epoch": 2.928422928422928, + "grad_norm": 0.3282083570957184, + "learning_rate": 1.242946242946243e-05, + "loss": 0.059, "step": 9860 }, { - "epoch": 2.4917950012623074, - "grad_norm": 0.28790709376335144, - "learning_rate": 1.5049229992426155e-05, - "loss": 0.0634, + "epoch": 2.9313929313929314, + "grad_norm": 0.3801690638065338, + "learning_rate": 1.2411642411642413e-05, + "loss": 0.0628, "step": 9870 }, { - "epoch": 2.4943196162585206, - "grad_norm": 0.5911887288093567, - "learning_rate": 1.5034082302448877e-05, - "loss": 0.069, + "epoch": 2.9343629343629343, + "grad_norm": 0.5469937324523926, + "learning_rate": 1.2393822393822394e-05, + "loss": 0.0681, "step": 9880 }, { - "epoch": 2.496844231254734, - "grad_norm": 0.5213605761528015, - "learning_rate": 1.50189346124716e-05, - "loss": 0.0699, + "epoch": 2.937332937332937, + "grad_norm": 0.7467171549797058, + "learning_rate": 1.2376002376002376e-05, + "loss": 0.0555, "step": 9890 }, { - "epoch": 2.4993688462509467, - "grad_norm": 0.5791930556297302, - "learning_rate": 1.500378692249432e-05, - "loss": 0.0546, + "epoch": 2.9403029403029404, + "grad_norm": 0.5576099157333374, + "learning_rate": 1.2358182358182358e-05, + "loss": 0.0722, "step": 9900 }, { - "epoch": 2.50189346124716, - "grad_norm": 0.5724365711212158, - "learning_rate": 1.498863923251704e-05, - "loss": 0.0554, + "epoch": 2.943272943272943, + "grad_norm": 0.5140604972839355, + "learning_rate": 1.2340362340362342e-05, + "loss": 0.0628, "step": 9910 }, { - "epoch": 2.5044180762433728, - "grad_norm": 0.5653268098831177, - "learning_rate": 1.4973491542539763e-05, - "loss": 0.0745, + "epoch": 2.946242946242946, + "grad_norm": 0.6918432116508484, + "learning_rate": 1.2322542322542322e-05, + "loss": 0.0709, "step": 9920 }, { - "epoch": 2.506942691239586, - "grad_norm": 0.6643403768539429, - "learning_rate": 1.4958343852562485e-05, - "loss": 0.0595, + "epoch": 2.9492129492129493, + "grad_norm": 0.4932166635990143, + "learning_rate": 1.2304722304722305e-05, + "loss": 0.0685, "step": 9930 }, { - "epoch": 2.5094673062357993, - "grad_norm": 0.5853692293167114, - "learning_rate": 1.4943196162585205e-05, - "loss": 0.066, + "epoch": 2.952182952182952, + "grad_norm": 0.42789584398269653, + "learning_rate": 1.2286902286902288e-05, + "loss": 0.0654, "step": 9940 }, { - "epoch": 2.511991921232012, - "grad_norm": 0.3577601909637451, - "learning_rate": 1.4928048472607928e-05, - "loss": 0.0661, + "epoch": 2.955152955152955, + "grad_norm": 0.6951575875282288, + "learning_rate": 1.2269082269082269e-05, + "loss": 0.0677, "step": 9950 }, { - "epoch": 2.5145165362282254, - "grad_norm": 0.44483545422554016, - "learning_rate": 1.491290078263065e-05, - "loss": 0.0679, + "epoch": 2.9581229581229582, + "grad_norm": 0.5306366682052612, + "learning_rate": 1.2251262251262251e-05, + "loss": 0.0561, "step": 9960 }, { - "epoch": 2.517041151224438, - "grad_norm": 0.7027893662452698, - "learning_rate": 1.489775309265337e-05, - "loss": 0.065, + "epoch": 2.961092961092961, + "grad_norm": 0.45280131697654724, + "learning_rate": 1.2233442233442233e-05, + "loss": 0.0594, "step": 9970 }, { - "epoch": 2.5195657662206514, - "grad_norm": 0.8064749240875244, - "learning_rate": 1.4882605402676093e-05, - "loss": 0.0532, + "epoch": 2.964062964062964, + "grad_norm": 0.5789720416069031, + "learning_rate": 1.2215622215622217e-05, + "loss": 0.0691, "step": 9980 }, { - "epoch": 2.5220903812168647, - "grad_norm": 0.3491119146347046, - "learning_rate": 1.4867457712698814e-05, - "loss": 0.0586, + "epoch": 2.967032967032967, + "grad_norm": 0.4295837879180908, + "learning_rate": 1.2197802197802198e-05, + "loss": 0.0492, "step": 9990 }, { - "epoch": 2.5246149962130775, - "grad_norm": 0.3388938307762146, - "learning_rate": 1.4852310022721534e-05, - "loss": 0.0663, + "epoch": 2.97000297000297, + "grad_norm": 0.3032509684562683, + "learning_rate": 1.217998217998218e-05, + "loss": 0.0634, "step": 10000 }, { - "epoch": 2.5271396112092903, - "grad_norm": 0.4186761975288391, - "learning_rate": 1.4837162332744258e-05, - "loss": 0.0611, + "epoch": 2.972972972972973, + "grad_norm": 0.6462733149528503, + "learning_rate": 1.2162162162162164e-05, + "loss": 0.0667, "step": 10010 }, { - "epoch": 2.5296642262055036, - "grad_norm": 0.4181320071220398, - "learning_rate": 1.4822014642766979e-05, - "loss": 0.0554, + "epoch": 2.975942975942976, + "grad_norm": 0.5056395530700684, + "learning_rate": 1.2144342144342144e-05, + "loss": 0.0637, "step": 10020 }, { - "epoch": 2.532188841201717, - "grad_norm": 0.555503785610199, - "learning_rate": 1.4806866952789699e-05, - "loss": 0.0749, + "epoch": 2.978912978912979, + "grad_norm": 0.3662366569042206, + "learning_rate": 1.2126522126522126e-05, + "loss": 0.0709, "step": 10030 }, { - "epoch": 2.53471345619793, - "grad_norm": 0.4351447522640228, - "learning_rate": 1.4791719262812422e-05, - "loss": 0.0635, + "epoch": 2.9818829818829817, + "grad_norm": 0.49650683999061584, + "learning_rate": 1.2108702108702108e-05, + "loss": 0.0711, "step": 10040 }, { - "epoch": 2.537238071194143, - "grad_norm": 0.3531211018562317, - "learning_rate": 1.4776571572835144e-05, - "loss": 0.065, + "epoch": 2.984852984852985, + "grad_norm": 0.44112861156463623, + "learning_rate": 1.2090882090882092e-05, + "loss": 0.066, "step": 10050 }, { - "epoch": 2.5397626861903557, - "grad_norm": 0.3265394866466522, - "learning_rate": 1.4761423882857864e-05, - "loss": 0.0569, + "epoch": 2.987822987822988, + "grad_norm": 0.5365132689476013, + "learning_rate": 1.2073062073062073e-05, + "loss": 0.0589, "step": 10060 }, { - "epoch": 2.542287301186569, - "grad_norm": 0.30541831254959106, - "learning_rate": 1.4746276192880585e-05, - "loss": 0.0704, + "epoch": 2.9907929907929907, + "grad_norm": 0.4564819931983948, + "learning_rate": 1.2055242055242055e-05, + "loss": 0.0686, "step": 10070 }, { - "epoch": 2.5448119161827822, - "grad_norm": 0.5428284406661987, - "learning_rate": 1.4731128502903309e-05, - "loss": 0.0591, + "epoch": 2.993762993762994, + "grad_norm": 0.6063446402549744, + "learning_rate": 1.2037422037422039e-05, + "loss": 0.0673, "step": 10080 }, { - "epoch": 2.547336531178995, - "grad_norm": 0.6441836357116699, - "learning_rate": 1.4715980812926029e-05, - "loss": 0.0717, + "epoch": 2.9967329967329968, + "grad_norm": 0.516140878200531, + "learning_rate": 1.2019602019602021e-05, + "loss": 0.0454, "step": 10090 }, { - "epoch": 2.5498611461752083, - "grad_norm": 0.35294800996780396, - "learning_rate": 1.470083312294875e-05, - "loss": 0.0502, + "epoch": 2.9997029997029996, + "grad_norm": 0.36144575476646423, + "learning_rate": 1.2001782001782001e-05, + "loss": 0.0581, "step": 10100 }, { - "epoch": 2.552385761171421, - "grad_norm": 0.36238688230514526, - "learning_rate": 1.4685685432971473e-05, - "loss": 0.073, + "epoch": 3.0, + "eval_f1": 0.49727767695099817, + "eval_loss": 0.059147998690605164, + "eval_runtime": 179.7759, + "eval_samples_per_second": 211.48, + "eval_steps_per_second": 3.31, + "step": 10101 + }, + { + "epoch": 3.002673002673003, + "grad_norm": 0.46553778648376465, + "learning_rate": 1.1983961983961984e-05, + "loss": 0.0572, "step": 10110 }, { - "epoch": 2.5549103761676344, - "grad_norm": 0.42970699071884155, - "learning_rate": 1.4670537742994193e-05, - "loss": 0.0505, + "epoch": 3.0056430056430057, + "grad_norm": 0.38310161232948303, + "learning_rate": 1.1966141966141967e-05, + "loss": 0.0653, "step": 10120 }, { - "epoch": 2.5574349911638476, - "grad_norm": 0.6428592205047607, - "learning_rate": 1.4655390053016915e-05, - "loss": 0.0779, + "epoch": 3.0086130086130085, + "grad_norm": 0.7176486253738403, + "learning_rate": 1.1948321948321948e-05, + "loss": 0.0696, "step": 10130 }, { - "epoch": 2.5599596061600605, - "grad_norm": 0.6469938158988953, - "learning_rate": 1.4640242363039638e-05, - "loss": 0.0654, + "epoch": 3.011583011583012, + "grad_norm": 0.3964185118675232, + "learning_rate": 1.193050193050193e-05, + "loss": 0.0559, "step": 10140 }, { - "epoch": 2.5624842211562737, - "grad_norm": 0.5174428224563599, - "learning_rate": 1.4625094673062358e-05, - "loss": 0.0723, + "epoch": 3.0145530145530146, + "grad_norm": 0.480051189661026, + "learning_rate": 1.1912681912681914e-05, + "loss": 0.0688, "step": 10150 }, { - "epoch": 2.5650088361524865, - "grad_norm": 0.4240271747112274, - "learning_rate": 1.460994698308508e-05, - "loss": 0.0593, + "epoch": 3.0175230175230174, + "grad_norm": 0.4801310896873474, + "learning_rate": 1.1894861894861896e-05, + "loss": 0.0653, "step": 10160 }, { - "epoch": 2.5675334511487, - "grad_norm": 0.763982892036438, - "learning_rate": 1.4594799293107803e-05, - "loss": 0.0576, + "epoch": 3.0204930204930207, + "grad_norm": 0.7674263119697571, + "learning_rate": 1.1877041877041877e-05, + "loss": 0.0639, "step": 10170 }, { - "epoch": 2.570058066144913, - "grad_norm": 0.6305162906646729, - "learning_rate": 1.4579651603130523e-05, - "loss": 0.0742, + "epoch": 3.0234630234630235, + "grad_norm": 0.35185834765434265, + "learning_rate": 1.1859221859221859e-05, + "loss": 0.0446, "step": 10180 }, { - "epoch": 2.572582681141126, - "grad_norm": 0.4666813910007477, - "learning_rate": 1.4564503913153245e-05, - "loss": 0.0645, + "epoch": 3.0264330264330264, + "grad_norm": 0.6630620956420898, + "learning_rate": 1.1841401841401843e-05, + "loss": 0.0705, "step": 10190 }, { - "epoch": 2.575107296137339, - "grad_norm": 0.5371018052101135, - "learning_rate": 1.4549356223175964e-05, - "loss": 0.0706, + "epoch": 3.029403029403029, + "grad_norm": 0.5050874352455139, + "learning_rate": 1.1823581823581823e-05, + "loss": 0.0563, "step": 10200 }, { - "epoch": 2.577631911133552, - "grad_norm": 0.38728779554367065, - "learning_rate": 1.4534208533198688e-05, - "loss": 0.0695, + "epoch": 3.0323730323730325, + "grad_norm": 0.29523542523384094, + "learning_rate": 1.1805761805761805e-05, + "loss": 0.0598, "step": 10210 }, { - "epoch": 2.580156526129765, - "grad_norm": 0.537994921207428, - "learning_rate": 1.451906084322141e-05, - "loss": 0.0714, + "epoch": 3.0353430353430353, + "grad_norm": 0.5692099928855896, + "learning_rate": 1.1787941787941789e-05, + "loss": 0.0733, "step": 10220 }, { - "epoch": 2.5826811411259785, - "grad_norm": 0.41289886832237244, - "learning_rate": 1.450391315324413e-05, - "loss": 0.0703, + "epoch": 3.038313038313038, + "grad_norm": 0.714964747428894, + "learning_rate": 1.1770121770121771e-05, + "loss": 0.0547, "step": 10230 }, { - "epoch": 2.5852057561221913, - "grad_norm": 0.5759279727935791, - "learning_rate": 1.4488765463266852e-05, - "loss": 0.066, + "epoch": 3.0412830412830414, + "grad_norm": 0.4890214502811432, + "learning_rate": 1.1752301752301752e-05, + "loss": 0.0679, "step": 10240 }, { - "epoch": 2.5877303711184045, - "grad_norm": 0.5359590649604797, - "learning_rate": 1.4473617773289574e-05, - "loss": 0.0704, + "epoch": 3.044253044253044, + "grad_norm": 0.5631494522094727, + "learning_rate": 1.1734481734481734e-05, + "loss": 0.06, "step": 10250 }, { - "epoch": 2.5902549861146174, - "grad_norm": 0.3472817540168762, - "learning_rate": 1.4458470083312294e-05, - "loss": 0.0545, + "epoch": 3.047223047223047, + "grad_norm": 0.6472118496894836, + "learning_rate": 1.1716661716661718e-05, + "loss": 0.0533, "step": 10260 }, { - "epoch": 2.5927796011108306, - "grad_norm": 0.5667416453361511, - "learning_rate": 1.4443322393335017e-05, - "loss": 0.0577, + "epoch": 3.0501930501930503, + "grad_norm": 0.6611066460609436, + "learning_rate": 1.1698841698841698e-05, + "loss": 0.0591, "step": 10270 }, { - "epoch": 2.595304216107044, - "grad_norm": 0.5809875726699829, - "learning_rate": 1.4428174703357739e-05, - "loss": 0.0551, + "epoch": 3.053163053163053, + "grad_norm": 0.4274856448173523, + "learning_rate": 1.168102168102168e-05, + "loss": 0.0652, "step": 10280 }, { - "epoch": 2.5978288311032567, - "grad_norm": 0.4938434362411499, - "learning_rate": 1.4413027013380459e-05, - "loss": 0.056, + "epoch": 3.056133056133056, + "grad_norm": 0.32548412680625916, + "learning_rate": 1.1663201663201664e-05, + "loss": 0.0678, "step": 10290 }, { - "epoch": 2.60035344609947, - "grad_norm": 0.33972424268722534, - "learning_rate": 1.4397879323403182e-05, - "loss": 0.0595, + "epoch": 3.0591030591030592, + "grad_norm": 0.36015450954437256, + "learning_rate": 1.1645381645381647e-05, + "loss": 0.0691, "step": 10300 }, { - "epoch": 2.6028780610956828, - "grad_norm": 0.42544251680374146, - "learning_rate": 1.4382731633425904e-05, - "loss": 0.0608, + "epoch": 3.062073062073062, + "grad_norm": 0.5831524133682251, + "learning_rate": 1.1627561627561627e-05, + "loss": 0.0735, "step": 10310 }, { - "epoch": 2.605402676091896, - "grad_norm": 0.3733600378036499, - "learning_rate": 1.4367583943448624e-05, - "loss": 0.0496, + "epoch": 3.065043065043065, + "grad_norm": 0.7021368741989136, + "learning_rate": 1.160974160974161e-05, + "loss": 0.0728, "step": 10320 }, { - "epoch": 2.6079272910881093, - "grad_norm": 0.313872754573822, - "learning_rate": 1.4352436253471347e-05, - "loss": 0.0641, + "epoch": 3.068013068013068, + "grad_norm": 0.5424765944480896, + "learning_rate": 1.1591921591921593e-05, + "loss": 0.0617, "step": 10330 }, { - "epoch": 2.610451906084322, - "grad_norm": 0.43706610798835754, - "learning_rate": 1.4337288563494068e-05, - "loss": 0.0546, + "epoch": 3.070983070983071, + "grad_norm": 0.7176571488380432, + "learning_rate": 1.1574101574101574e-05, + "loss": 0.0677, "step": 10340 }, { - "epoch": 2.6129765210805354, - "grad_norm": 0.5997895002365112, - "learning_rate": 1.4322140873516788e-05, - "loss": 0.0645, + "epoch": 3.073953073953074, + "grad_norm": 0.33526375889778137, + "learning_rate": 1.1556281556281556e-05, + "loss": 0.0626, "step": 10350 }, { - "epoch": 2.615501136076748, - "grad_norm": 0.4480789601802826, - "learning_rate": 1.430699318353951e-05, - "loss": 0.0563, + "epoch": 3.076923076923077, + "grad_norm": 0.4724681079387665, + "learning_rate": 1.153846153846154e-05, + "loss": 0.0575, "step": 10360 }, { - "epoch": 2.6180257510729614, - "grad_norm": 0.41993817687034607, - "learning_rate": 1.4291845493562233e-05, - "loss": 0.0578, + "epoch": 3.07989307989308, + "grad_norm": 0.6367087364196777, + "learning_rate": 1.1520641520641522e-05, + "loss": 0.0538, "step": 10370 }, { - "epoch": 2.6205503660691747, - "grad_norm": 4.695851802825928, - "learning_rate": 1.4276697803584953e-05, - "loss": 0.0712, + "epoch": 3.0828630828630827, + "grad_norm": 0.31437206268310547, + "learning_rate": 1.1502821502821502e-05, + "loss": 0.0643, "step": 10380 }, { - "epoch": 2.6230749810653875, - "grad_norm": 0.5636560320854187, - "learning_rate": 1.4261550113607675e-05, - "loss": 0.0627, + "epoch": 3.085833085833086, + "grad_norm": 0.4423040449619293, + "learning_rate": 1.1485001485001484e-05, + "loss": 0.0684, "step": 10390 }, { - "epoch": 2.6255995960616008, - "grad_norm": 0.2798742949962616, - "learning_rate": 1.4246402423630398e-05, - "loss": 0.0707, + "epoch": 3.088803088803089, + "grad_norm": 0.4041610360145569, + "learning_rate": 1.1467181467181468e-05, + "loss": 0.0552, "step": 10400 }, { - "epoch": 2.6281242110578136, - "grad_norm": 0.5203710794448853, - "learning_rate": 1.4231254733653118e-05, - "loss": 0.0637, + "epoch": 3.0917730917730917, + "grad_norm": 0.4148096442222595, + "learning_rate": 1.144936144936145e-05, + "loss": 0.0693, "step": 10410 }, { - "epoch": 2.630648826054027, - "grad_norm": 0.6120650768280029, - "learning_rate": 1.421610704367584e-05, - "loss": 0.0615, + "epoch": 3.094743094743095, + "grad_norm": 0.30476808547973633, + "learning_rate": 1.1431541431541431e-05, + "loss": 0.0684, "step": 10420 }, { - "epoch": 2.63317344105024, - "grad_norm": 0.3410748541355133, - "learning_rate": 1.4200959353698563e-05, - "loss": 0.068, + "epoch": 3.0977130977130978, + "grad_norm": 0.7706785798072815, + "learning_rate": 1.1413721413721415e-05, + "loss": 0.0707, "step": 10430 }, { - "epoch": 2.635698056046453, - "grad_norm": 0.4275479018688202, - "learning_rate": 1.4185811663721283e-05, - "loss": 0.0638, + "epoch": 3.1006831006831006, + "grad_norm": 0.3732987940311432, + "learning_rate": 1.1395901395901397e-05, + "loss": 0.0619, "step": 10440 }, { - "epoch": 2.6382226710426657, - "grad_norm": 0.5067179799079895, - "learning_rate": 1.4170663973744004e-05, - "loss": 0.062, + "epoch": 3.1036531036531034, + "grad_norm": 0.4054795503616333, + "learning_rate": 1.1378081378081377e-05, + "loss": 0.0666, "step": 10450 }, { - "epoch": 2.640747286038879, - "grad_norm": 0.2900165617465973, - "learning_rate": 1.4155516283766726e-05, - "loss": 0.0603, + "epoch": 3.1066231066231067, + "grad_norm": 0.660860538482666, + "learning_rate": 1.136026136026136e-05, + "loss": 0.0639, "step": 10460 }, { - "epoch": 2.6432719010350922, - "grad_norm": 0.4279478192329407, - "learning_rate": 1.4140368593789447e-05, - "loss": 0.085, + "epoch": 3.1095931095931095, + "grad_norm": 0.5180338025093079, + "learning_rate": 1.1342441342441343e-05, + "loss": 0.068, "step": 10470 }, { - "epoch": 2.6457965160313055, - "grad_norm": 0.5949720144271851, - "learning_rate": 1.4125220903812169e-05, - "loss": 0.0642, + "epoch": 3.1125631125631124, + "grad_norm": 0.44153326749801636, + "learning_rate": 1.1324621324621326e-05, + "loss": 0.0621, "step": 10480 }, { - "epoch": 2.6483211310275183, - "grad_norm": 0.4991612732410431, - "learning_rate": 1.4110073213834889e-05, - "loss": 0.0563, + "epoch": 3.1155331155331156, + "grad_norm": 0.6957278251647949, + "learning_rate": 1.1306801306801306e-05, + "loss": 0.0553, "step": 10490 }, { - "epoch": 2.650845746023731, - "grad_norm": 1.0148282051086426, - "learning_rate": 1.4094925523857612e-05, - "loss": 0.0675, + "epoch": 3.1185031185031185, + "grad_norm": 0.29442107677459717, + "learning_rate": 1.128898128898129e-05, + "loss": 0.0646, "step": 10500 }, { - "epoch": 2.6533703610199444, - "grad_norm": 0.5165835022926331, - "learning_rate": 1.4079777833880334e-05, - "loss": 0.0713, + "epoch": 3.1214731214731213, + "grad_norm": 0.4631500244140625, + "learning_rate": 1.1271161271161272e-05, + "loss": 0.0699, "step": 10510 }, { - "epoch": 2.6558949760161576, - "grad_norm": 0.7783892154693604, - "learning_rate": 1.4064630143903054e-05, - "loss": 0.0625, + "epoch": 3.1244431244431246, + "grad_norm": 0.4856095314025879, + "learning_rate": 1.1253341253341253e-05, + "loss": 0.0685, "step": 10520 }, { - "epoch": 2.6584195910123705, - "grad_norm": 0.24568358063697815, - "learning_rate": 1.4049482453925777e-05, - "loss": 0.0578, + "epoch": 3.1274131274131274, + "grad_norm": 0.7424579858779907, + "learning_rate": 1.1235521235521235e-05, + "loss": 0.0639, "step": 10530 }, { - "epoch": 2.6609442060085837, - "grad_norm": 0.46469801664352417, - "learning_rate": 1.4034334763948499e-05, - "loss": 0.0734, + "epoch": 3.13038313038313, + "grad_norm": 0.5345817804336548, + "learning_rate": 1.1217701217701219e-05, + "loss": 0.0641, "step": 10540 }, { - "epoch": 2.6634688210047965, - "grad_norm": 0.5760740637779236, - "learning_rate": 1.4019187073971219e-05, - "loss": 0.0677, + "epoch": 3.1333531333531335, + "grad_norm": 0.5012867450714111, + "learning_rate": 1.11998811998812e-05, + "loss": 0.0803, "step": 10550 }, { - "epoch": 2.66599343600101, - "grad_norm": 0.6759510040283203, - "learning_rate": 1.4004039383993942e-05, - "loss": 0.064, + "epoch": 3.1363231363231363, + "grad_norm": 0.5213742852210999, + "learning_rate": 1.1182061182061181e-05, + "loss": 0.0469, "step": 10560 }, { - "epoch": 2.668518050997223, - "grad_norm": 0.6074597239494324, - "learning_rate": 1.3988891694016663e-05, - "loss": 0.0609, + "epoch": 3.139293139293139, + "grad_norm": 0.39430922269821167, + "learning_rate": 1.1164241164241165e-05, + "loss": 0.0502, "step": 10570 }, { - "epoch": 2.671042665993436, - "grad_norm": 0.3964291214942932, - "learning_rate": 1.3973744004039383e-05, - "loss": 0.0589, + "epoch": 3.1422631422631424, + "grad_norm": 0.6875708699226379, + "learning_rate": 1.1146421146421147e-05, + "loss": 0.0617, "step": 10580 }, { - "epoch": 2.673567280989649, - "grad_norm": 0.4285549521446228, - "learning_rate": 1.3958596314062107e-05, - "loss": 0.0607, + "epoch": 3.1452331452331452, + "grad_norm": 0.4213047921657562, + "learning_rate": 1.1128601128601128e-05, + "loss": 0.067, "step": 10590 }, { - "epoch": 2.676091895985862, - "grad_norm": 0.4889032244682312, - "learning_rate": 1.3943448624084828e-05, - "loss": 0.0664, + "epoch": 3.148203148203148, + "grad_norm": 0.9495222568511963, + "learning_rate": 1.111078111078111e-05, + "loss": 0.0446, "step": 10600 }, { - "epoch": 2.678616510982075, - "grad_norm": 0.5509054660797119, - "learning_rate": 1.3928300934107548e-05, - "loss": 0.0657, + "epoch": 3.1511731511731513, + "grad_norm": 0.37120023369789124, + "learning_rate": 1.1092961092961094e-05, + "loss": 0.0727, "step": 10610 }, { - "epoch": 2.6811411259782885, - "grad_norm": 0.5647663474082947, - "learning_rate": 1.3913153244130271e-05, - "loss": 0.0712, + "epoch": 3.154143154143154, + "grad_norm": 0.44335830211639404, + "learning_rate": 1.1075141075141076e-05, + "loss": 0.0574, "step": 10620 }, { - "epoch": 2.6836657409745013, - "grad_norm": 0.558576762676239, - "learning_rate": 1.3898005554152993e-05, - "loss": 0.0713, + "epoch": 3.157113157113157, + "grad_norm": 0.6420602798461914, + "learning_rate": 1.1057321057321056e-05, + "loss": 0.0708, "step": 10630 }, { - "epoch": 2.6861903559707145, - "grad_norm": 0.4516654908657074, - "learning_rate": 1.3882857864175713e-05, - "loss": 0.0631, + "epoch": 3.1600831600831603, + "grad_norm": 0.4319610297679901, + "learning_rate": 1.103950103950104e-05, + "loss": 0.0646, "step": 10640 }, { - "epoch": 2.6887149709669274, - "grad_norm": 0.4486042559146881, - "learning_rate": 1.3867710174198434e-05, - "loss": 0.0653, + "epoch": 3.163053163053163, + "grad_norm": 0.34275874495506287, + "learning_rate": 1.1021681021681022e-05, + "loss": 0.0511, "step": 10650 }, { - "epoch": 2.6912395859631406, - "grad_norm": 0.6781389117240906, - "learning_rate": 1.3852562484221158e-05, - "loss": 0.0632, + "epoch": 3.166023166023166, + "grad_norm": 0.32853662967681885, + "learning_rate": 1.1003861003861003e-05, + "loss": 0.0476, "step": 10660 }, { - "epoch": 2.693764200959354, - "grad_norm": 0.49867871403694153, - "learning_rate": 1.3837414794243878e-05, - "loss": 0.069, + "epoch": 3.168993168993169, + "grad_norm": 0.7371835708618164, + "learning_rate": 1.0986040986040985e-05, + "loss": 0.067, "step": 10670 }, { - "epoch": 2.6962888159555667, - "grad_norm": 0.35217198729515076, - "learning_rate": 1.38222671042666e-05, - "loss": 0.0658, + "epoch": 3.171963171963172, + "grad_norm": 0.23537606000900269, + "learning_rate": 1.0968220968220969e-05, + "loss": 0.0636, "step": 10680 }, { - "epoch": 2.69881343095178, - "grad_norm": 0.6483206152915955, - "learning_rate": 1.3807119414289323e-05, - "loss": 0.0632, + "epoch": 3.174933174933175, + "grad_norm": 0.638041615486145, + "learning_rate": 1.0950400950400951e-05, + "loss": 0.0658, "step": 10690 }, { - "epoch": 2.7013380459479928, - "grad_norm": 0.3511044979095459, - "learning_rate": 1.3791971724312042e-05, - "loss": 0.0785, + "epoch": 3.177903177903178, + "grad_norm": 0.7828889489173889, + "learning_rate": 1.0932580932580932e-05, + "loss": 0.0603, "step": 10700 }, { - "epoch": 2.703862660944206, - "grad_norm": 0.38118109107017517, - "learning_rate": 1.3776824034334764e-05, - "loss": 0.0622, + "epoch": 3.180873180873181, + "grad_norm": 0.41569939255714417, + "learning_rate": 1.0914760914760916e-05, + "loss": 0.0528, "step": 10710 }, { - "epoch": 2.7063872759404193, - "grad_norm": 0.6023518443107605, - "learning_rate": 1.3761676344357486e-05, - "loss": 0.0665, + "epoch": 3.1838431838431838, + "grad_norm": 0.4870140552520752, + "learning_rate": 1.0896940896940898e-05, + "loss": 0.0565, "step": 10720 }, { - "epoch": 2.708911890936632, - "grad_norm": 0.87665855884552, - "learning_rate": 1.3746528654380207e-05, - "loss": 0.0527, + "epoch": 3.186813186813187, + "grad_norm": 0.3599897623062134, + "learning_rate": 1.087912087912088e-05, + "loss": 0.0469, "step": 10730 }, { - "epoch": 2.7114365059328454, - "grad_norm": 0.24186670780181885, - "learning_rate": 1.3731380964402929e-05, - "loss": 0.0605, + "epoch": 3.18978318978319, + "grad_norm": 0.26678797602653503, + "learning_rate": 1.086130086130086e-05, + "loss": 0.0708, "step": 10740 }, { - "epoch": 2.713961120929058, - "grad_norm": 0.40543287992477417, - "learning_rate": 1.371623327442565e-05, - "loss": 0.0714, + "epoch": 3.1927531927531927, + "grad_norm": 0.6243604421615601, + "learning_rate": 1.0843480843480844e-05, + "loss": 0.0486, "step": 10750 }, { - "epoch": 2.7164857359252714, - "grad_norm": 0.5654604434967041, - "learning_rate": 1.3701085584448372e-05, - "loss": 0.0649, + "epoch": 3.1957231957231955, + "grad_norm": 0.5825532674789429, + "learning_rate": 1.0825660825660826e-05, + "loss": 0.0663, "step": 10760 }, { - "epoch": 2.7190103509214847, - "grad_norm": 0.4096258580684662, - "learning_rate": 1.3685937894471094e-05, - "loss": 0.0575, + "epoch": 3.198693198693199, + "grad_norm": 0.4092167913913727, + "learning_rate": 1.0807840807840807e-05, + "loss": 0.0704, "step": 10770 }, { - "epoch": 2.7215349659176975, - "grad_norm": 0.28115513920783997, - "learning_rate": 1.3670790204493814e-05, - "loss": 0.0506, + "epoch": 3.2016632016632016, + "grad_norm": 0.5701293349266052, + "learning_rate": 1.079002079002079e-05, + "loss": 0.0635, "step": 10780 }, { - "epoch": 2.7240595809139108, - "grad_norm": 0.4821475148200989, - "learning_rate": 1.3655642514516537e-05, - "loss": 0.0572, + "epoch": 3.2046332046332044, + "grad_norm": 0.25863227248191833, + "learning_rate": 1.0772200772200773e-05, + "loss": 0.0641, "step": 10790 }, { - "epoch": 2.7265841959101236, - "grad_norm": 0.6362419128417969, - "learning_rate": 1.3640494824539258e-05, - "loss": 0.07, + "epoch": 3.2076032076032077, + "grad_norm": 0.3742627203464508, + "learning_rate": 1.0754380754380755e-05, + "loss": 0.0698, "step": 10800 }, { - "epoch": 2.729108810906337, - "grad_norm": 0.33636218309402466, - "learning_rate": 1.3625347134561978e-05, - "loss": 0.0565, + "epoch": 3.2105732105732105, + "grad_norm": 0.3190728724002838, + "learning_rate": 1.0736560736560736e-05, + "loss": 0.0585, "step": 10810 }, { - "epoch": 2.73163342590255, - "grad_norm": 0.5278832912445068, - "learning_rate": 1.3610199444584702e-05, - "loss": 0.0742, + "epoch": 3.2135432135432134, + "grad_norm": 0.49537599086761475, + "learning_rate": 1.071874071874072e-05, + "loss": 0.0635, "step": 10820 }, { - "epoch": 2.734158040898763, - "grad_norm": 0.6074432134628296, - "learning_rate": 1.3595051754607423e-05, - "loss": 0.0684, + "epoch": 3.2165132165132166, + "grad_norm": 0.3896566927433014, + "learning_rate": 1.0700920700920702e-05, + "loss": 0.0753, "step": 10830 }, { - "epoch": 2.736682655894976, - "grad_norm": 0.37134847044944763, - "learning_rate": 1.3579904064630143e-05, - "loss": 0.0757, + "epoch": 3.2194832194832195, + "grad_norm": 0.6234869956970215, + "learning_rate": 1.0683100683100682e-05, + "loss": 0.0545, "step": 10840 }, { - "epoch": 2.739207270891189, - "grad_norm": 0.3969337046146393, - "learning_rate": 1.3564756374652866e-05, - "loss": 0.0566, + "epoch": 3.2224532224532223, + "grad_norm": 0.421795129776001, + "learning_rate": 1.0665280665280666e-05, + "loss": 0.0714, "step": 10850 }, { - "epoch": 2.7417318858874022, - "grad_norm": 0.8758741617202759, - "learning_rate": 1.3549608684675588e-05, - "loss": 0.0537, + "epoch": 3.2254232254232256, + "grad_norm": 0.6576681733131409, + "learning_rate": 1.0647460647460648e-05, + "loss": 0.0661, "step": 10860 }, { - "epoch": 2.7442565008836155, - "grad_norm": 0.619987428188324, - "learning_rate": 1.3534460994698308e-05, - "loss": 0.0597, + "epoch": 3.2283932283932284, + "grad_norm": 0.5803960561752319, + "learning_rate": 1.062964062964063e-05, + "loss": 0.0732, "step": 10870 }, { - "epoch": 2.7467811158798283, - "grad_norm": 0.4209834635257721, - "learning_rate": 1.3519313304721031e-05, - "loss": 0.0666, + "epoch": 3.2313632313632312, + "grad_norm": 0.39635559916496277, + "learning_rate": 1.0611820611820612e-05, + "loss": 0.0473, "step": 10880 }, { - "epoch": 2.749305730876041, - "grad_norm": 0.44348636269569397, - "learning_rate": 1.3504165614743753e-05, - "loss": 0.0577, + "epoch": 3.2343332343332345, + "grad_norm": 0.5573329329490662, + "learning_rate": 1.0594000594000595e-05, + "loss": 0.059, "step": 10890 }, { - "epoch": 2.7518303458722544, - "grad_norm": 0.3939560651779175, - "learning_rate": 1.3489017924766473e-05, - "loss": 0.0748, + "epoch": 3.2373032373032373, + "grad_norm": 0.6418017148971558, + "learning_rate": 1.0576180576180577e-05, + "loss": 0.0633, "step": 10900 }, { - "epoch": 2.7543549608684676, - "grad_norm": 0.44665518403053284, - "learning_rate": 1.3473870234789196e-05, - "loss": 0.05, + "epoch": 3.24027324027324, + "grad_norm": 0.6030585169792175, + "learning_rate": 1.0558360558360557e-05, + "loss": 0.0596, "step": 10910 }, { - "epoch": 2.756879575864681, - "grad_norm": 0.42193907499313354, - "learning_rate": 1.3458722544811918e-05, - "loss": 0.0683, + "epoch": 3.2432432432432434, + "grad_norm": 0.41735953092575073, + "learning_rate": 1.0540540540540541e-05, + "loss": 0.0585, "step": 10920 }, { - "epoch": 2.7594041908608937, - "grad_norm": 0.17752446234226227, - "learning_rate": 1.3443574854834637e-05, - "loss": 0.0561, + "epoch": 3.2462132462132463, + "grad_norm": 0.7560169696807861, + "learning_rate": 1.0522720522720523e-05, + "loss": 0.0566, "step": 10930 }, { - "epoch": 2.7619288058571065, - "grad_norm": 0.3520565927028656, - "learning_rate": 1.3428427164857359e-05, - "loss": 0.0618, + "epoch": 3.249183249183249, + "grad_norm": 0.2606422007083893, + "learning_rate": 1.0504900504900505e-05, + "loss": 0.0498, "step": 10940 }, { - "epoch": 2.76445342085332, - "grad_norm": 0.4644409418106079, - "learning_rate": 1.3413279474880082e-05, - "loss": 0.0565, + "epoch": 3.252153252153252, + "grad_norm": 0.5863521695137024, + "learning_rate": 1.0487080487080488e-05, + "loss": 0.0603, "step": 10950 }, { - "epoch": 2.766978035849533, - "grad_norm": 0.5722434520721436, - "learning_rate": 1.3398131784902802e-05, - "loss": 0.0669, + "epoch": 3.255123255123255, + "grad_norm": 0.4618661403656006, + "learning_rate": 1.046926046926047e-05, + "loss": 0.0709, "step": 10960 }, { - "epoch": 2.769502650845746, - "grad_norm": 0.42250752449035645, - "learning_rate": 1.3382984094925524e-05, - "loss": 0.0536, + "epoch": 3.258093258093258, + "grad_norm": 0.3728097975254059, + "learning_rate": 1.0451440451440452e-05, + "loss": 0.0605, "step": 10970 }, { - "epoch": 2.772027265841959, - "grad_norm": 0.35409343242645264, - "learning_rate": 1.3367836404948247e-05, - "loss": 0.0632, + "epoch": 3.261063261063261, + "grad_norm": 0.4798294007778168, + "learning_rate": 1.0433620433620434e-05, + "loss": 0.0559, "step": 10980 }, { - "epoch": 2.774551880838172, - "grad_norm": 0.32060950994491577, - "learning_rate": 1.3352688714970967e-05, - "loss": 0.0533, + "epoch": 3.264033264033264, + "grad_norm": 0.6178519129753113, + "learning_rate": 1.0415800415800416e-05, + "loss": 0.0696, "step": 10990 }, { - "epoch": 2.777076495834385, - "grad_norm": 0.5148407220840454, - "learning_rate": 1.3337541024993689e-05, - "loss": 0.063, + "epoch": 3.267003267003267, + "grad_norm": 0.4793247580528259, + "learning_rate": 1.0397980397980398e-05, + "loss": 0.059, "step": 11000 }, { - "epoch": 2.7796011108305985, - "grad_norm": 0.33185675740242004, - "learning_rate": 1.332239333501641e-05, - "loss": 0.0622, + "epoch": 3.2699732699732698, + "grad_norm": 0.33142969012260437, + "learning_rate": 1.038016038016038e-05, + "loss": 0.0439, "step": 11010 }, { - "epoch": 2.7821257258268113, - "grad_norm": 0.6653043031692505, - "learning_rate": 1.3307245645039132e-05, - "loss": 0.0626, + "epoch": 3.272943272943273, + "grad_norm": 0.261089950799942, + "learning_rate": 1.0362340362340363e-05, + "loss": 0.0586, "step": 11020 }, { - "epoch": 2.7846503408230245, - "grad_norm": 0.8676443099975586, - "learning_rate": 1.3292097955061853e-05, - "loss": 0.0692, + "epoch": 3.275913275913276, + "grad_norm": 0.34269529581069946, + "learning_rate": 1.0344520344520345e-05, + "loss": 0.0679, "step": 11030 }, { - "epoch": 2.7871749558192374, - "grad_norm": 0.6054368615150452, - "learning_rate": 1.3276950265084575e-05, - "loss": 0.0617, + "epoch": 3.2788832788832787, + "grad_norm": 0.4112348258495331, + "learning_rate": 1.0326700326700327e-05, + "loss": 0.0599, "step": 11040 }, { - "epoch": 2.7896995708154506, - "grad_norm": 0.416385680437088, - "learning_rate": 1.3261802575107297e-05, - "loss": 0.0576, + "epoch": 3.281853281853282, + "grad_norm": 0.5969886183738708, + "learning_rate": 1.030888030888031e-05, + "loss": 0.0719, "step": 11050 }, { - "epoch": 2.792224185811664, - "grad_norm": 0.37582552433013916, - "learning_rate": 1.3246654885130018e-05, - "loss": 0.0624, + "epoch": 3.284823284823285, + "grad_norm": 0.5105575323104858, + "learning_rate": 1.0291060291060291e-05, + "loss": 0.053, "step": 11060 }, { - "epoch": 2.7947488008078767, - "grad_norm": 0.44736284017562866, - "learning_rate": 1.3231507195152738e-05, - "loss": 0.0737, + "epoch": 3.2877932877932876, + "grad_norm": 0.4884382486343384, + "learning_rate": 1.0273240273240274e-05, + "loss": 0.0553, "step": 11070 }, { - "epoch": 2.79727341580409, - "grad_norm": 0.5286217927932739, - "learning_rate": 1.3216359505175461e-05, - "loss": 0.0644, + "epoch": 3.290763290763291, + "grad_norm": 0.4914264678955078, + "learning_rate": 1.0255420255420256e-05, + "loss": 0.0592, "step": 11080 }, { - "epoch": 2.7997980308003028, - "grad_norm": 0.3506781756877899, - "learning_rate": 1.3201211815198183e-05, - "loss": 0.0661, + "epoch": 3.2937332937332937, + "grad_norm": 0.44552749395370483, + "learning_rate": 1.0237600237600238e-05, + "loss": 0.0527, "step": 11090 }, { - "epoch": 2.802322645796516, - "grad_norm": 0.3819282054901123, - "learning_rate": 1.3186064125220903e-05, - "loss": 0.0506, + "epoch": 3.2967032967032965, + "grad_norm": 0.43704137206077576, + "learning_rate": 1.021978021978022e-05, + "loss": 0.062, "step": 11100 }, { - "epoch": 2.8048472607927293, - "grad_norm": 0.4702123999595642, - "learning_rate": 1.3170916435243626e-05, - "loss": 0.0613, + "epoch": 3.2996732996733, + "grad_norm": 0.45537468791007996, + "learning_rate": 1.0201960201960202e-05, + "loss": 0.0685, "step": 11110 }, { - "epoch": 2.807371875788942, - "grad_norm": 0.5164020657539368, - "learning_rate": 1.3155768745266348e-05, - "loss": 0.0569, + "epoch": 3.3026433026433026, + "grad_norm": 0.45990774035453796, + "learning_rate": 1.0184140184140184e-05, + "loss": 0.0675, "step": 11120 }, { - "epoch": 2.8098964907851554, - "grad_norm": 0.6037926077842712, - "learning_rate": 1.3140621055289068e-05, - "loss": 0.0697, + "epoch": 3.3056133056133055, + "grad_norm": 0.375456303358078, + "learning_rate": 1.0166320166320167e-05, + "loss": 0.0585, "step": 11130 }, { - "epoch": 2.812421105781368, - "grad_norm": 0.5212513208389282, - "learning_rate": 1.3125473365311791e-05, - "loss": 0.0482, + "epoch": 3.3085833085833087, + "grad_norm": 0.42089733481407166, + "learning_rate": 1.0148500148500149e-05, + "loss": 0.0636, "step": 11140 }, { - "epoch": 2.8149457207775814, - "grad_norm": 0.46721410751342773, - "learning_rate": 1.3110325675334513e-05, - "loss": 0.0607, + "epoch": 3.3115533115533116, + "grad_norm": 0.4135701060295105, + "learning_rate": 1.0130680130680131e-05, + "loss": 0.0742, "step": 11150 }, { - "epoch": 2.8174703357737947, - "grad_norm": 0.48438936471939087, - "learning_rate": 1.3095177985357232e-05, - "loss": 0.0685, + "epoch": 3.3145233145233144, + "grad_norm": 0.47297653555870056, + "learning_rate": 1.0112860112860113e-05, + "loss": 0.0723, "step": 11160 }, { - "epoch": 2.8199949507700075, - "grad_norm": 0.40378254652023315, - "learning_rate": 1.3080030295379956e-05, - "loss": 0.0684, + "epoch": 3.3174933174933177, + "grad_norm": 0.5323516726493835, + "learning_rate": 1.0095040095040095e-05, + "loss": 0.0577, "step": 11170 }, { - "epoch": 2.8225195657662208, - "grad_norm": 0.4804742634296417, - "learning_rate": 1.3064882605402677e-05, - "loss": 0.074, + "epoch": 3.3204633204633205, + "grad_norm": 0.37327128648757935, + "learning_rate": 1.0077220077220078e-05, + "loss": 0.0606, "step": 11180 }, { - "epoch": 2.8250441807624336, - "grad_norm": 0.4946554899215698, - "learning_rate": 1.3049734915425397e-05, - "loss": 0.0694, + "epoch": 3.3234333234333233, + "grad_norm": 0.9338498711585999, + "learning_rate": 1.005940005940006e-05, + "loss": 0.0679, "step": 11190 }, { - "epoch": 2.827568795758647, - "grad_norm": 0.3673482835292816, - "learning_rate": 1.303458722544812e-05, - "loss": 0.0694, + "epoch": 3.3264033264033266, + "grad_norm": 0.5313315987586975, + "learning_rate": 1.0041580041580042e-05, + "loss": 0.0592, "step": 11200 }, { - "epoch": 2.83009341075486, - "grad_norm": 0.4023125171661377, - "learning_rate": 1.3019439535470842e-05, - "loss": 0.0622, + "epoch": 3.3293733293733294, + "grad_norm": 0.27918875217437744, + "learning_rate": 1.0023760023760024e-05, + "loss": 0.0661, "step": 11210 }, { - "epoch": 2.832618025751073, - "grad_norm": 0.7260475158691406, - "learning_rate": 1.3004291845493562e-05, - "loss": 0.0761, + "epoch": 3.3323433323433322, + "grad_norm": 0.3626916706562042, + "learning_rate": 1.0005940005940006e-05, + "loss": 0.0493, "step": 11220 }, { - "epoch": 2.835142640747286, - "grad_norm": 0.5414501428604126, - "learning_rate": 1.2989144155516284e-05, - "loss": 0.0548, + "epoch": 3.3353133353133355, + "grad_norm": 0.43011564016342163, + "learning_rate": 9.988119988119988e-06, + "loss": 0.0681, "step": 11230 }, { - "epoch": 2.837667255743499, - "grad_norm": 0.48935428261756897, - "learning_rate": 1.2973996465539007e-05, - "loss": 0.0551, + "epoch": 3.3382833382833383, + "grad_norm": 0.5412601232528687, + "learning_rate": 9.97029997029997e-06, + "loss": 0.0639, "step": 11240 }, { - "epoch": 2.8401918707397122, - "grad_norm": 0.3654029071331024, - "learning_rate": 1.2958848775561727e-05, - "loss": 0.0526, + "epoch": 3.341253341253341, + "grad_norm": 0.6399582028388977, + "learning_rate": 9.952479952479953e-06, + "loss": 0.0497, "step": 11250 }, { - "epoch": 2.8427164857359255, - "grad_norm": 0.42648911476135254, - "learning_rate": 1.2943701085584448e-05, - "loss": 0.0666, + "epoch": 3.3442233442233444, + "grad_norm": 0.44264036417007446, + "learning_rate": 9.934659934659935e-06, + "loss": 0.0582, "step": 11260 }, { - "epoch": 2.8452411007321383, - "grad_norm": 0.49149778485298157, - "learning_rate": 1.292855339560717e-05, - "loss": 0.0688, + "epoch": 3.3471933471933473, + "grad_norm": 0.3276296854019165, + "learning_rate": 9.916839916839917e-06, + "loss": 0.0463, "step": 11270 }, { - "epoch": 2.8477657157283516, - "grad_norm": 0.5606464147567749, - "learning_rate": 1.2913405705629892e-05, - "loss": 0.0693, + "epoch": 3.35016335016335, + "grad_norm": 0.3752717077732086, + "learning_rate": 9.8990198990199e-06, + "loss": 0.0725, "step": 11280 }, { - "epoch": 2.8502903307245644, - "grad_norm": 0.7137352228164673, - "learning_rate": 1.2898258015652613e-05, - "loss": 0.0458, + "epoch": 3.3531333531333534, + "grad_norm": 0.5361660718917847, + "learning_rate": 9.881199881199881e-06, + "loss": 0.0674, "step": 11290 }, { - "epoch": 2.8528149457207777, - "grad_norm": 0.5822766423225403, - "learning_rate": 1.2883110325675335e-05, - "loss": 0.065, + "epoch": 3.356103356103356, + "grad_norm": 0.5567395687103271, + "learning_rate": 9.863379863379865e-06, + "loss": 0.0708, "step": 11300 }, { - "epoch": 2.855339560716991, - "grad_norm": 0.5194171071052551, - "learning_rate": 1.2867962635698056e-05, - "loss": 0.0605, + "epoch": 3.359073359073359, + "grad_norm": 0.4132004678249359, + "learning_rate": 9.845559845559846e-06, + "loss": 0.0667, "step": 11310 }, { - "epoch": 2.8578641757132037, - "grad_norm": 0.35691502690315247, - "learning_rate": 1.2852814945720778e-05, - "loss": 0.0463, + "epoch": 3.362043362043362, + "grad_norm": 0.5917862057685852, + "learning_rate": 9.827739827739828e-06, + "loss": 0.0494, "step": 11320 }, { - "epoch": 2.8603887907094165, - "grad_norm": 0.4269840121269226, - "learning_rate": 1.28376672557435e-05, - "loss": 0.0554, + "epoch": 3.365013365013365, + "grad_norm": 0.41967251896858215, + "learning_rate": 9.80991980991981e-06, + "loss": 0.0526, "step": 11330 }, { - "epoch": 2.86291340570563, - "grad_norm": 0.45535925030708313, - "learning_rate": 1.2822519565766221e-05, - "loss": 0.0542, + "epoch": 3.367983367983368, + "grad_norm": 0.2610296308994293, + "learning_rate": 9.792099792099792e-06, + "loss": 0.0597, "step": 11340 }, { - "epoch": 2.865438020701843, - "grad_norm": 0.4456949532032013, - "learning_rate": 1.2807371875788943e-05, - "loss": 0.0555, + "epoch": 3.3709533709533708, + "grad_norm": 0.6055606603622437, + "learning_rate": 9.774279774279774e-06, + "loss": 0.0615, "step": 11350 }, { - "epoch": 2.8679626356980563, - "grad_norm": 0.373177707195282, - "learning_rate": 1.2792224185811663e-05, - "loss": 0.0548, + "epoch": 3.373923373923374, + "grad_norm": 0.6907655596733093, + "learning_rate": 9.756459756459757e-06, + "loss": 0.0592, "step": 11360 }, { - "epoch": 2.870487250694269, - "grad_norm": 0.33291095495224, - "learning_rate": 1.2777076495834386e-05, - "loss": 0.0567, + "epoch": 3.376893376893377, + "grad_norm": 0.5287322402000427, + "learning_rate": 9.73863973863974e-06, + "loss": 0.0501, "step": 11370 }, { - "epoch": 2.873011865690482, - "grad_norm": 0.34440621733665466, - "learning_rate": 1.2761928805857107e-05, - "loss": 0.0664, + "epoch": 3.3798633798633797, + "grad_norm": 0.3826773762702942, + "learning_rate": 9.720819720819721e-06, + "loss": 0.0654, "step": 11380 }, { - "epoch": 2.875536480686695, - "grad_norm": 0.7178020477294922, - "learning_rate": 1.2746781115879827e-05, - "loss": 0.0661, + "epoch": 3.382833382833383, + "grad_norm": 0.4057276248931885, + "learning_rate": 9.702999702999703e-06, + "loss": 0.0686, "step": 11390 }, { - "epoch": 2.8780610956829085, - "grad_norm": 0.6484971642494202, - "learning_rate": 1.273163342590255e-05, - "loss": 0.0661, + "epoch": 3.385803385803386, + "grad_norm": 0.3789379596710205, + "learning_rate": 9.685179685179685e-06, + "loss": 0.0705, "step": 11400 }, { - "epoch": 2.8805857106791213, - "grad_norm": 0.44024258852005005, - "learning_rate": 1.2716485735925272e-05, - "loss": 0.0525, + "epoch": 3.3887733887733886, + "grad_norm": 0.6244688630104065, + "learning_rate": 9.667359667359667e-06, + "loss": 0.0529, "step": 11410 }, { - "epoch": 2.8831103256753345, - "grad_norm": 0.42825838923454285, - "learning_rate": 1.2701338045947992e-05, - "loss": 0.058, + "epoch": 3.391743391743392, + "grad_norm": 0.4109695255756378, + "learning_rate": 9.64953964953965e-06, + "loss": 0.0606, "step": 11420 }, { - "epoch": 2.8856349406715474, - "grad_norm": 0.2894943952560425, - "learning_rate": 1.2686190355970715e-05, - "loss": 0.0613, + "epoch": 3.3947133947133947, + "grad_norm": 0.5615403652191162, + "learning_rate": 9.631719631719632e-06, + "loss": 0.0619, "step": 11430 }, { - "epoch": 2.8881595556677606, - "grad_norm": 0.5295472741127014, - "learning_rate": 1.2671042665993437e-05, - "loss": 0.0623, + "epoch": 3.3976833976833976, + "grad_norm": 0.5328549742698669, + "learning_rate": 9.613899613899616e-06, + "loss": 0.0577, "step": 11440 }, { - "epoch": 2.890684170663974, - "grad_norm": 0.25687211751937866, - "learning_rate": 1.2655894976016157e-05, - "loss": 0.0495, + "epoch": 3.400653400653401, + "grad_norm": 0.6064325571060181, + "learning_rate": 9.596079596079596e-06, + "loss": 0.0627, "step": 11450 }, { - "epoch": 2.8932087856601867, - "grad_norm": 0.6008526682853699, - "learning_rate": 1.264074728603888e-05, - "loss": 0.0604, + "epoch": 3.4036234036234037, + "grad_norm": 0.3764317035675049, + "learning_rate": 9.578259578259578e-06, + "loss": 0.0492, "step": 11460 }, { - "epoch": 2.8957334006564, - "grad_norm": 0.3679234981536865, - "learning_rate": 1.2625599596061602e-05, - "loss": 0.0489, + "epoch": 3.4065934065934065, + "grad_norm": 0.40372684597969055, + "learning_rate": 9.56043956043956e-06, + "loss": 0.0625, "step": 11470 }, { - "epoch": 2.8982580156526128, - "grad_norm": 0.34721651673316956, - "learning_rate": 1.2610451906084322e-05, - "loss": 0.0619, + "epoch": 3.4095634095634098, + "grad_norm": 0.5874956250190735, + "learning_rate": 9.542619542619543e-06, + "loss": 0.0554, "step": 11480 }, { - "epoch": 2.900782630648826, - "grad_norm": 0.2352529764175415, - "learning_rate": 1.2595304216107045e-05, - "loss": 0.0641, + "epoch": 3.4125334125334126, + "grad_norm": 0.6757147908210754, + "learning_rate": 9.524799524799525e-06, + "loss": 0.0503, "step": 11490 }, { - "epoch": 2.9033072456450393, - "grad_norm": 0.621487557888031, - "learning_rate": 1.2580156526129767e-05, - "loss": 0.0636, + "epoch": 3.4155034155034154, + "grad_norm": 0.33406156301498413, + "learning_rate": 9.506979506979507e-06, + "loss": 0.0751, "step": 11500 }, { - "epoch": 2.905831860641252, - "grad_norm": 0.5426394939422607, - "learning_rate": 1.2565008836152487e-05, - "loss": 0.0694, + "epoch": 3.4184734184734182, + "grad_norm": 0.22471563518047333, + "learning_rate": 9.48915948915949e-06, + "loss": 0.0608, "step": 11510 }, { - "epoch": 2.9083564756374654, - "grad_norm": 0.5092729926109314, - "learning_rate": 1.2549861146175208e-05, - "loss": 0.0536, + "epoch": 3.4214434214434215, + "grad_norm": 0.29463276267051697, + "learning_rate": 9.471339471339471e-06, + "loss": 0.0541, "step": 11520 }, { - "epoch": 2.910881090633678, - "grad_norm": 0.4754561185836792, - "learning_rate": 1.2534713456197931e-05, - "loss": 0.0605, + "epoch": 3.4244134244134243, + "grad_norm": 0.5052198171615601, + "learning_rate": 9.453519453519453e-06, + "loss": 0.0705, "step": 11530 }, { - "epoch": 2.9134057056298914, - "grad_norm": 0.4031394124031067, - "learning_rate": 1.2519565766220651e-05, - "loss": 0.0765, + "epoch": 3.427383427383427, + "grad_norm": 0.4901563823223114, + "learning_rate": 9.435699435699436e-06, + "loss": 0.0676, "step": 11540 }, { - "epoch": 2.9159303206261047, - "grad_norm": 0.46325090527534485, - "learning_rate": 1.2504418076243373e-05, - "loss": 0.0691, + "epoch": 3.4303534303534304, + "grad_norm": 0.8629029989242554, + "learning_rate": 9.417879417879418e-06, + "loss": 0.0601, "step": 11550 }, { - "epoch": 2.9184549356223175, - "grad_norm": 0.5663125514984131, - "learning_rate": 1.2489270386266094e-05, - "loss": 0.0651, + "epoch": 3.4333234333234333, + "grad_norm": 0.4550071656703949, + "learning_rate": 9.4000594000594e-06, + "loss": 0.056, "step": 11560 }, { - "epoch": 2.9209795506185308, - "grad_norm": 0.4145601987838745, - "learning_rate": 1.2474122696288816e-05, - "loss": 0.0546, + "epoch": 3.436293436293436, + "grad_norm": 0.4358525574207306, + "learning_rate": 9.382239382239382e-06, + "loss": 0.0581, "step": 11570 }, { - "epoch": 2.9235041656147436, - "grad_norm": 0.42057228088378906, - "learning_rate": 1.2458975006311538e-05, - "loss": 0.056, + "epoch": 3.4392634392634394, + "grad_norm": 0.5264983773231506, + "learning_rate": 9.364419364419366e-06, + "loss": 0.0545, "step": 11580 }, { - "epoch": 2.926028780610957, - "grad_norm": 0.4702089726924896, - "learning_rate": 1.244382731633426e-05, - "loss": 0.0526, + "epoch": 3.442233442233442, + "grad_norm": 0.6651470065116882, + "learning_rate": 9.346599346599347e-06, + "loss": 0.0502, "step": 11590 }, { - "epoch": 2.92855339560717, - "grad_norm": 0.6012133359909058, - "learning_rate": 1.2428679626356981e-05, - "loss": 0.0588, + "epoch": 3.445203445203445, + "grad_norm": 0.8447353839874268, + "learning_rate": 9.328779328779329e-06, + "loss": 0.0635, "step": 11600 }, { - "epoch": 2.931078010603383, - "grad_norm": 0.7933056950569153, - "learning_rate": 1.2413531936379702e-05, - "loss": 0.0706, + "epoch": 3.4481734481734483, + "grad_norm": 0.39546453952789307, + "learning_rate": 9.31095931095931e-06, + "loss": 0.0569, "step": 11610 }, { - "epoch": 2.933602625599596, - "grad_norm": 0.49585798382759094, - "learning_rate": 1.2398384246402424e-05, - "loss": 0.0561, + "epoch": 3.451143451143451, + "grad_norm": 0.5016785860061646, + "learning_rate": 9.293139293139295e-06, + "loss": 0.0566, "step": 11620 }, { - "epoch": 2.936127240595809, - "grad_norm": 0.460102379322052, - "learning_rate": 1.2383236556425146e-05, - "loss": 0.0726, + "epoch": 3.454113454113454, + "grad_norm": 0.4469011723995209, + "learning_rate": 9.275319275319275e-06, + "loss": 0.0561, "step": 11630 }, { - "epoch": 2.9386518555920222, - "grad_norm": 0.5320731401443481, - "learning_rate": 1.2368088866447867e-05, - "loss": 0.0572, + "epoch": 3.457083457083457, + "grad_norm": 0.7788525819778442, + "learning_rate": 9.257499257499257e-06, + "loss": 0.0609, "step": 11640 }, { - "epoch": 2.9411764705882355, - "grad_norm": 0.5159802436828613, - "learning_rate": 1.2352941176470587e-05, - "loss": 0.0555, + "epoch": 3.46005346005346, + "grad_norm": 0.38840779662132263, + "learning_rate": 9.239679239679241e-06, + "loss": 0.0563, "step": 11650 }, { - "epoch": 2.9437010855844483, - "grad_norm": 0.5879008173942566, - "learning_rate": 1.233779348649331e-05, - "loss": 0.0504, + "epoch": 3.463023463023463, + "grad_norm": 0.4505913257598877, + "learning_rate": 9.221859221859222e-06, + "loss": 0.0551, "step": 11660 }, { - "epoch": 2.9462257005806616, - "grad_norm": 0.8441261053085327, - "learning_rate": 1.2322645796516032e-05, - "loss": 0.0631, + "epoch": 3.465993465993466, + "grad_norm": 0.4120921492576599, + "learning_rate": 9.204039204039204e-06, + "loss": 0.0626, "step": 11670 }, { - "epoch": 2.9487503155768744, - "grad_norm": 0.4483579695224762, - "learning_rate": 1.2307498106538752e-05, - "loss": 0.0639, + "epoch": 3.468963468963469, + "grad_norm": 0.32375073432922363, + "learning_rate": 9.186219186219186e-06, + "loss": 0.0702, "step": 11680 }, { - "epoch": 2.9512749305730877, - "grad_norm": 0.41955748200416565, - "learning_rate": 1.2292350416561475e-05, - "loss": 0.0641, + "epoch": 3.471933471933472, + "grad_norm": 0.7593043446540833, + "learning_rate": 9.16839916839917e-06, + "loss": 0.0585, "step": 11690 }, { - "epoch": 2.953799545569301, - "grad_norm": 0.5988761782646179, - "learning_rate": 1.2277202726584197e-05, - "loss": 0.0631, + "epoch": 3.474903474903475, + "grad_norm": 0.8873201608657837, + "learning_rate": 9.15057915057915e-06, + "loss": 0.0556, "step": 11700 }, { - "epoch": 2.9563241605655137, - "grad_norm": 0.9389402270317078, - "learning_rate": 1.2262055036606917e-05, - "loss": 0.0659, + "epoch": 3.477873477873478, + "grad_norm": 0.23573075234889984, + "learning_rate": 9.132759132759133e-06, + "loss": 0.0691, "step": 11710 }, { - "epoch": 2.958848775561727, - "grad_norm": 0.5522141456604004, - "learning_rate": 1.224690734662964e-05, - "loss": 0.062, + "epoch": 3.4808434808434807, + "grad_norm": 0.6220734119415283, + "learning_rate": 9.114939114939116e-06, + "loss": 0.0666, "step": 11720 }, { - "epoch": 2.96137339055794, - "grad_norm": 0.3770173490047455, - "learning_rate": 1.2231759656652362e-05, - "loss": 0.0573, + "epoch": 3.483813483813484, + "grad_norm": 0.34220409393310547, + "learning_rate": 9.097119097119097e-06, + "loss": 0.064, "step": 11730 }, { - "epoch": 2.963898005554153, - "grad_norm": 0.4838135540485382, - "learning_rate": 1.2216611966675082e-05, - "loss": 0.0605, + "epoch": 3.486783486783487, + "grad_norm": 0.46370092034339905, + "learning_rate": 9.079299079299079e-06, + "loss": 0.0723, "step": 11740 }, { - "epoch": 2.9664226205503663, - "grad_norm": 0.4662052094936371, - "learning_rate": 1.2201464276697805e-05, - "loss": 0.0716, + "epoch": 3.4897534897534896, + "grad_norm": 0.3304176926612854, + "learning_rate": 9.061479061479061e-06, + "loss": 0.0536, "step": 11750 }, { - "epoch": 2.968947235546579, - "grad_norm": 0.42140257358551025, - "learning_rate": 1.2186316586720526e-05, - "loss": 0.0711, + "epoch": 3.492723492723493, + "grad_norm": 0.31319358944892883, + "learning_rate": 9.043659043659045e-06, + "loss": 0.0643, "step": 11760 }, { - "epoch": 2.971471850542792, - "grad_norm": 0.41893675923347473, - "learning_rate": 1.2171168896743246e-05, - "loss": 0.0743, + "epoch": 3.4956934956934957, + "grad_norm": 0.7758733034133911, + "learning_rate": 9.025839025839026e-06, + "loss": 0.056, "step": 11770 }, { - "epoch": 2.973996465539005, - "grad_norm": 0.36561962962150574, - "learning_rate": 1.215602120676597e-05, - "loss": 0.0715, + "epoch": 3.4986634986634986, + "grad_norm": 0.48000404238700867, + "learning_rate": 9.008019008019008e-06, + "loss": 0.0533, "step": 11780 }, { - "epoch": 2.9765210805352185, - "grad_norm": 0.338679701089859, - "learning_rate": 1.2140873516788691e-05, - "loss": 0.0669, + "epoch": 3.501633501633502, + "grad_norm": 0.6242011189460754, + "learning_rate": 8.990198990198992e-06, + "loss": 0.06, "step": 11790 }, { - "epoch": 2.9790456955314317, - "grad_norm": 0.18762636184692383, - "learning_rate": 1.2125725826811411e-05, - "loss": 0.0695, + "epoch": 3.5046035046035047, + "grad_norm": 0.6063371896743774, + "learning_rate": 8.972378972378972e-06, + "loss": 0.0478, "step": 11800 }, { - "epoch": 2.9815703105276445, - "grad_norm": 0.32193130254745483, - "learning_rate": 1.2110578136834133e-05, - "loss": 0.0641, + "epoch": 3.5075735075735075, + "grad_norm": 0.4791490137577057, + "learning_rate": 8.954558954558954e-06, + "loss": 0.0486, "step": 11810 }, { - "epoch": 2.9840949255238574, - "grad_norm": 0.5772337317466736, - "learning_rate": 1.2095430446856854e-05, - "loss": 0.0551, + "epoch": 3.5105435105435108, + "grad_norm": 0.3279794752597809, + "learning_rate": 8.936738936738936e-06, + "loss": 0.0655, "step": 11820 }, { - "epoch": 2.9866195405200706, - "grad_norm": 0.58876633644104, - "learning_rate": 1.2080282756879576e-05, - "loss": 0.0702, + "epoch": 3.5135135135135136, + "grad_norm": 0.42596834897994995, + "learning_rate": 8.91891891891892e-06, + "loss": 0.0733, "step": 11830 }, { - "epoch": 2.989144155516284, - "grad_norm": 0.3960408568382263, - "learning_rate": 1.2065135066902297e-05, - "loss": 0.0608, + "epoch": 3.5164835164835164, + "grad_norm": 0.4257424771785736, + "learning_rate": 8.9010989010989e-06, + "loss": 0.0624, "step": 11840 }, { - "epoch": 2.9916687705124967, - "grad_norm": 0.4262355864048004, - "learning_rate": 1.2049987376925019e-05, - "loss": 0.0552, + "epoch": 3.5194535194535197, + "grad_norm": 0.46473416686058044, + "learning_rate": 8.883278883278883e-06, + "loss": 0.0562, "step": 11850 }, { - "epoch": 2.99419338550871, - "grad_norm": 0.5799320340156555, - "learning_rate": 1.203483968694774e-05, - "loss": 0.0492, + "epoch": 3.5224235224235225, + "grad_norm": 0.6375032663345337, + "learning_rate": 8.865458865458867e-06, + "loss": 0.0589, "step": 11860 }, { - "epoch": 2.9967180005049228, - "grad_norm": 0.3781181573867798, - "learning_rate": 1.2019691996970462e-05, - "loss": 0.0693, + "epoch": 3.5253935253935254, + "grad_norm": 0.35437679290771484, + "learning_rate": 8.847638847638847e-06, + "loss": 0.0731, "step": 11870 }, { - "epoch": 2.999242615501136, - "grad_norm": 0.5994321703910828, - "learning_rate": 1.2004544306993184e-05, - "loss": 0.0598, + "epoch": 3.5283635283635286, + "grad_norm": 0.5066477060317993, + "learning_rate": 8.82981882981883e-06, + "loss": 0.0614, "step": 11880 }, { - "epoch": 3.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.05238291248679161, - "eval_runtime": 905.9676, - "eval_samples_per_second": 227.67, - "eval_steps_per_second": 3.558, - "step": 11883 - }, - { - "epoch": 3.0017672304973493, - "grad_norm": 0.819588840007782, - "learning_rate": 1.1989396617015905e-05, - "loss": 0.0492, + "epoch": 3.5313335313335315, + "grad_norm": 0.5429478883743286, + "learning_rate": 8.811998811998812e-06, + "loss": 0.0571, "step": 11890 }, { - "epoch": 3.004291845493562, - "grad_norm": 0.41840866208076477, - "learning_rate": 1.1974248927038627e-05, - "loss": 0.0439, + "epoch": 3.5343035343035343, + "grad_norm": 0.6550652980804443, + "learning_rate": 8.794178794178795e-06, + "loss": 0.0569, "step": 11900 }, { - "epoch": 3.0068164604897754, - "grad_norm": 0.4053829312324524, - "learning_rate": 1.1959101237061349e-05, - "loss": 0.0586, + "epoch": 3.5372735372735375, + "grad_norm": 0.7283503413200378, + "learning_rate": 8.776358776358776e-06, + "loss": 0.0543, "step": 11910 }, { - "epoch": 3.009341075485988, - "grad_norm": 0.4883507788181305, - "learning_rate": 1.194395354708407e-05, - "loss": 0.0559, + "epoch": 3.5402435402435404, + "grad_norm": 0.3151548504829407, + "learning_rate": 8.758538758538758e-06, + "loss": 0.0566, "step": 11920 }, { - "epoch": 3.0118656904822014, - "grad_norm": 0.18890917301177979, - "learning_rate": 1.1928805857106792e-05, - "loss": 0.0629, + "epoch": 3.543213543213543, + "grad_norm": 0.507347583770752, + "learning_rate": 8.740718740718742e-06, + "loss": 0.053, "step": 11930 }, { - "epoch": 3.0143903054784147, - "grad_norm": 0.46093079447746277, - "learning_rate": 1.1913658167129512e-05, - "loss": 0.0534, + "epoch": 3.546183546183546, + "grad_norm": 0.7897441387176514, + "learning_rate": 8.722898722898724e-06, + "loss": 0.0688, "step": 11940 }, { - "epoch": 3.0169149204746275, - "grad_norm": 0.6295234560966492, - "learning_rate": 1.1898510477152235e-05, - "loss": 0.0696, + "epoch": 3.5491535491535493, + "grad_norm": 0.37791678309440613, + "learning_rate": 8.705078705078705e-06, + "loss": 0.0757, "step": 11950 }, { - "epoch": 3.0194395354708408, - "grad_norm": 0.6419150233268738, - "learning_rate": 1.1883362787174957e-05, - "loss": 0.0664, + "epoch": 3.552123552123552, + "grad_norm": 0.5913348197937012, + "learning_rate": 8.687258687258687e-06, + "loss": 0.0689, "step": 11960 }, { - "epoch": 3.0219641504670536, - "grad_norm": 0.7133572101593018, - "learning_rate": 1.1868215097197676e-05, - "loss": 0.0761, + "epoch": 3.555093555093555, + "grad_norm": 0.7024880647659302, + "learning_rate": 8.66943866943867e-06, + "loss": 0.0648, "step": 11970 }, { - "epoch": 3.024488765463267, - "grad_norm": 0.7136935591697693, - "learning_rate": 1.18530674072204e-05, - "loss": 0.0688, + "epoch": 3.5580635580635582, + "grad_norm": 0.37222567200660706, + "learning_rate": 8.651618651618651e-06, + "loss": 0.0595, "step": 11980 }, { - "epoch": 3.02701338045948, - "grad_norm": 0.33323633670806885, - "learning_rate": 1.1837919717243121e-05, - "loss": 0.0556, + "epoch": 3.561033561033561, + "grad_norm": 0.45320606231689453, + "learning_rate": 8.633798633798633e-06, + "loss": 0.0638, "step": 11990 }, { - "epoch": 3.029537995455693, - "grad_norm": 0.52031409740448, - "learning_rate": 1.1822772027265841e-05, - "loss": 0.0606, + "epoch": 3.564003564003564, + "grad_norm": 0.46902260184288025, + "learning_rate": 8.615978615978617e-06, + "loss": 0.0776, "step": 12000 }, { - "epoch": 3.032062610451906, - "grad_norm": 0.5969738364219666, - "learning_rate": 1.1807624337288565e-05, - "loss": 0.0625, + "epoch": 3.5669735669735667, + "grad_norm": 0.5722383260726929, + "learning_rate": 8.5981585981586e-06, + "loss": 0.0531, "step": 12010 }, { - "epoch": 3.034587225448119, - "grad_norm": 0.38949260115623474, - "learning_rate": 1.1792476647311286e-05, - "loss": 0.0525, + "epoch": 3.56994356994357, + "grad_norm": 0.5090997815132141, + "learning_rate": 8.58033858033858e-06, + "loss": 0.0534, "step": 12020 }, { - "epoch": 3.0371118404443322, - "grad_norm": 0.3799903988838196, - "learning_rate": 1.1777328957334006e-05, - "loss": 0.0561, + "epoch": 3.572913572913573, + "grad_norm": 0.4689802825450897, + "learning_rate": 8.562518562518562e-06, + "loss": 0.0717, "step": 12030 }, { - "epoch": 3.0396364554405455, - "grad_norm": 0.44943466782569885, - "learning_rate": 1.176218126735673e-05, - "loss": 0.0513, + "epoch": 3.5758835758835756, + "grad_norm": 0.4180223345756531, + "learning_rate": 8.544698544698546e-06, + "loss": 0.0456, "step": 12040 }, { - "epoch": 3.0421610704367583, - "grad_norm": 0.5962013602256775, - "learning_rate": 1.1747033577379451e-05, - "loss": 0.0551, + "epoch": 3.578853578853579, + "grad_norm": 0.30135074257850647, + "learning_rate": 8.526878526878526e-06, + "loss": 0.0548, "step": 12050 }, { - "epoch": 3.0446856854329716, - "grad_norm": 0.500968873500824, - "learning_rate": 1.173188588740217e-05, + "epoch": 3.5818235818235817, + "grad_norm": 0.5609501600265503, + "learning_rate": 8.509058509058509e-06, "loss": 0.0569, "step": 12060 }, { - "epoch": 3.0472103004291844, - "grad_norm": 0.46429353952407837, - "learning_rate": 1.1716738197424892e-05, - "loss": 0.0655, + "epoch": 3.5847935847935846, + "grad_norm": 0.30133068561553955, + "learning_rate": 8.491238491238492e-06, + "loss": 0.0499, "step": 12070 }, { - "epoch": 3.0497349154253977, - "grad_norm": 0.49735480546951294, - "learning_rate": 1.1701590507447614e-05, - "loss": 0.057, + "epoch": 3.587763587763588, + "grad_norm": 0.4278302490711212, + "learning_rate": 8.473418473418475e-06, + "loss": 0.0556, "step": 12080 }, { - "epoch": 3.052259530421611, - "grad_norm": 0.6153069138526917, - "learning_rate": 1.1686442817470336e-05, - "loss": 0.0585, + "epoch": 3.5907335907335907, + "grad_norm": 0.570552408695221, + "learning_rate": 8.455598455598455e-06, + "loss": 0.0667, "step": 12090 }, { - "epoch": 3.0547841454178237, - "grad_norm": 0.5006738305091858, - "learning_rate": 1.1671295127493057e-05, - "loss": 0.0589, + "epoch": 3.5937035937035935, + "grad_norm": 0.3945624828338623, + "learning_rate": 8.437778437778437e-06, + "loss": 0.0588, "step": 12100 }, { - "epoch": 3.057308760414037, - "grad_norm": 0.4022675156593323, - "learning_rate": 1.1656147437515779e-05, - "loss": 0.0704, + "epoch": 3.5966735966735968, + "grad_norm": 0.5016827583312988, + "learning_rate": 8.419958419958421e-06, + "loss": 0.0454, "step": 12110 }, { - "epoch": 3.05983337541025, - "grad_norm": 0.4161219000816345, - "learning_rate": 1.16409997475385e-05, - "loss": 0.0582, + "epoch": 3.5996435996435996, + "grad_norm": 0.4540993869304657, + "learning_rate": 8.402138402138402e-06, + "loss": 0.0692, "step": 12120 }, { - "epoch": 3.062357990406463, - "grad_norm": 0.33736857771873474, - "learning_rate": 1.1625852057561222e-05, - "loss": 0.0578, + "epoch": 3.6026136026136024, + "grad_norm": 0.4178672730922699, + "learning_rate": 8.384318384318384e-06, + "loss": 0.054, "step": 12130 }, { - "epoch": 3.0648826054026763, - "grad_norm": 0.5995674729347229, - "learning_rate": 1.1610704367583944e-05, - "loss": 0.0556, + "epoch": 3.6055836055836057, + "grad_norm": 0.6967900395393372, + "learning_rate": 8.366498366498368e-06, + "loss": 0.0629, "step": 12140 }, { - "epoch": 3.067407220398889, - "grad_norm": 0.5381020903587341, - "learning_rate": 1.1595556677606665e-05, - "loss": 0.0616, + "epoch": 3.6085536085536085, + "grad_norm": 0.5746413469314575, + "learning_rate": 8.34867834867835e-06, + "loss": 0.0635, "step": 12150 }, { - "epoch": 3.0699318353951024, - "grad_norm": 0.23393908143043518, - "learning_rate": 1.1580408987629387e-05, - "loss": 0.0784, + "epoch": 3.6115236115236113, + "grad_norm": 0.45530760288238525, + "learning_rate": 8.33085833085833e-06, + "loss": 0.0632, "step": 12160 }, { - "epoch": 3.072456450391315, - "grad_norm": 0.4823366105556488, - "learning_rate": 1.1565261297652108e-05, - "loss": 0.0698, + "epoch": 3.6144936144936146, + "grad_norm": 0.5083603858947754, + "learning_rate": 8.313038313038312e-06, + "loss": 0.0522, "step": 12170 }, { - "epoch": 3.0749810653875285, - "grad_norm": 0.5678715109825134, - "learning_rate": 1.155011360767483e-05, - "loss": 0.0576, + "epoch": 3.6174636174636174, + "grad_norm": 0.4417908489704132, + "learning_rate": 8.295218295218296e-06, + "loss": 0.0537, "step": 12180 }, { - "epoch": 3.0775056803837413, - "grad_norm": 0.42829954624176025, - "learning_rate": 1.1534965917697552e-05, - "loss": 0.0657, + "epoch": 3.6204336204336203, + "grad_norm": 0.5929802656173706, + "learning_rate": 8.277398277398278e-06, + "loss": 0.0628, "step": 12190 }, { - "epoch": 3.0800302953799545, - "grad_norm": 0.31741514801979065, - "learning_rate": 1.1519818227720273e-05, - "loss": 0.0513, + "epoch": 3.6234036234036235, + "grad_norm": 0.35801371932029724, + "learning_rate": 8.259578259578259e-06, + "loss": 0.0568, "step": 12200 }, { - "epoch": 3.082554910376168, - "grad_norm": 0.4757698178291321, - "learning_rate": 1.1504670537742995e-05, - "loss": 0.0458, + "epoch": 3.6263736263736264, + "grad_norm": 0.42152899503707886, + "learning_rate": 8.241758241758243e-06, + "loss": 0.0575, "step": 12210 }, { - "epoch": 3.0850795253723806, - "grad_norm": 0.49062609672546387, - "learning_rate": 1.1489522847765716e-05, - "loss": 0.0626, + "epoch": 3.629343629343629, + "grad_norm": 0.5134592652320862, + "learning_rate": 8.223938223938225e-06, + "loss": 0.0589, "step": 12220 }, { - "epoch": 3.087604140368594, - "grad_norm": 0.40323606133461, - "learning_rate": 1.1474375157788436e-05, - "loss": 0.0466, + "epoch": 3.6323136323136325, + "grad_norm": 0.5800890922546387, + "learning_rate": 8.206118206118205e-06, + "loss": 0.0711, "step": 12230 }, { - "epoch": 3.0901287553648067, - "grad_norm": 0.5706951022148132, - "learning_rate": 1.145922746781116e-05, - "loss": 0.0706, + "epoch": 3.6352836352836353, + "grad_norm": 0.6621565222740173, + "learning_rate": 8.188298188298188e-06, + "loss": 0.0626, "step": 12240 }, { - "epoch": 3.09265337036102, - "grad_norm": 0.5950903296470642, - "learning_rate": 1.1444079777833881e-05, - "loss": 0.0506, + "epoch": 3.638253638253638, + "grad_norm": 0.19206875562667847, + "learning_rate": 8.170478170478171e-06, + "loss": 0.054, "step": 12250 }, { - "epoch": 3.095177985357233, - "grad_norm": 0.21615612506866455, - "learning_rate": 1.1428932087856601e-05, - "loss": 0.0474, + "epoch": 3.6412236412236414, + "grad_norm": 0.3461471199989319, + "learning_rate": 8.152658152658154e-06, + "loss": 0.063, "step": 12260 }, { - "epoch": 3.097702600353446, - "grad_norm": 0.3570312261581421, - "learning_rate": 1.1413784397879324e-05, - "loss": 0.0654, + "epoch": 3.644193644193644, + "grad_norm": 0.5503948926925659, + "learning_rate": 8.134838134838134e-06, + "loss": 0.0674, "step": 12270 }, { - "epoch": 3.1002272153496593, - "grad_norm": 0.5387312769889832, - "learning_rate": 1.1398636707902046e-05, - "loss": 0.0669, + "epoch": 3.647163647163647, + "grad_norm": 0.3993360698223114, + "learning_rate": 8.117018117018118e-06, + "loss": 0.0523, "step": 12280 }, { - "epoch": 3.102751830345872, - "grad_norm": 0.36091628670692444, - "learning_rate": 1.1383489017924766e-05, - "loss": 0.0603, + "epoch": 3.6501336501336503, + "grad_norm": 0.5561977624893188, + "learning_rate": 8.0991980991981e-06, + "loss": 0.0601, "step": 12290 }, { - "epoch": 3.1052764453420854, - "grad_norm": 0.49484896659851074, - "learning_rate": 1.1368341327947489e-05, - "loss": 0.0503, + "epoch": 3.653103653103653, + "grad_norm": 0.4218428134918213, + "learning_rate": 8.08137808137808e-06, + "loss": 0.0597, "step": 12300 }, { - "epoch": 3.1078010603382986, - "grad_norm": 0.6939244270324707, - "learning_rate": 1.135319363797021e-05, - "loss": 0.0578, + "epoch": 3.656073656073656, + "grad_norm": 0.6830678582191467, + "learning_rate": 8.063558063558063e-06, + "loss": 0.0624, "step": 12310 }, { - "epoch": 3.1103256753345114, - "grad_norm": 0.6148266196250916, - "learning_rate": 1.133804594799293e-05, - "loss": 0.055, + "epoch": 3.6590436590436592, + "grad_norm": 0.5021694302558899, + "learning_rate": 8.045738045738047e-06, + "loss": 0.0725, "step": 12320 }, { - "epoch": 3.1128502903307247, - "grad_norm": 0.5472010374069214, - "learning_rate": 1.1322898258015654e-05, - "loss": 0.0527, + "epoch": 3.662013662013662, + "grad_norm": 0.6278291344642639, + "learning_rate": 8.027918027918029e-06, + "loss": 0.0715, "step": 12330 }, { - "epoch": 3.1153749053269375, - "grad_norm": 0.40635547041893005, - "learning_rate": 1.1307750568038375e-05, - "loss": 0.0734, + "epoch": 3.664983664983665, + "grad_norm": 0.7712084650993347, + "learning_rate": 8.01009801009801e-06, + "loss": 0.0609, "step": 12340 }, { - "epoch": 3.1178995203231508, - "grad_norm": 0.49441322684288025, - "learning_rate": 1.1292602878061095e-05, - "loss": 0.0686, + "epoch": 3.667953667953668, + "grad_norm": 0.47669193148612976, + "learning_rate": 7.992277992277993e-06, + "loss": 0.0645, "step": 12350 }, { - "epoch": 3.1204241353193636, - "grad_norm": 0.41052699089050293, - "learning_rate": 1.1277455188083817e-05, - "loss": 0.0636, + "epoch": 3.670923670923671, + "grad_norm": 0.5000527501106262, + "learning_rate": 7.974457974457975e-06, + "loss": 0.0463, "step": 12360 }, { - "epoch": 3.122948750315577, - "grad_norm": 0.5868958234786987, - "learning_rate": 1.1262307498106539e-05, - "loss": 0.0546, + "epoch": 3.673893673893674, + "grad_norm": 0.409820020198822, + "learning_rate": 7.956637956637956e-06, + "loss": 0.0552, "step": 12370 }, { - "epoch": 3.12547336531179, - "grad_norm": 0.3751147389411926, - "learning_rate": 1.124715980812926e-05, - "loss": 0.064, + "epoch": 3.676863676863677, + "grad_norm": 0.48183321952819824, + "learning_rate": 7.938817938817938e-06, + "loss": 0.0605, "step": 12380 }, { - "epoch": 3.127997980308003, - "grad_norm": 0.5202364325523376, - "learning_rate": 1.1232012118151982e-05, - "loss": 0.0536, + "epoch": 3.67983367983368, + "grad_norm": 0.5534571409225464, + "learning_rate": 7.920997920997922e-06, + "loss": 0.0638, "step": 12390 }, { - "epoch": 3.130522595304216, - "grad_norm": 0.5483293533325195, - "learning_rate": 1.1216864428174703e-05, - "loss": 0.0656, + "epoch": 3.6828036828036828, + "grad_norm": 0.4206744432449341, + "learning_rate": 7.903177903177904e-06, + "loss": 0.0634, "step": 12400 }, { - "epoch": 3.133047210300429, - "grad_norm": 0.6402902603149414, - "learning_rate": 1.1201716738197425e-05, - "loss": 0.0643, + "epoch": 3.685773685773686, + "grad_norm": 0.5539330244064331, + "learning_rate": 7.885357885357884e-06, + "loss": 0.0583, "step": 12410 }, { - "epoch": 3.1355718252966422, - "grad_norm": 0.609380841255188, - "learning_rate": 1.1186569048220147e-05, - "loss": 0.0763, + "epoch": 3.688743688743689, + "grad_norm": 0.32335200905799866, + "learning_rate": 7.867537867537868e-06, + "loss": 0.0604, "step": 12420 }, { - "epoch": 3.1380964402928555, - "grad_norm": 0.49389705061912537, - "learning_rate": 1.1171421358242868e-05, - "loss": 0.0727, + "epoch": 3.6917136917136917, + "grad_norm": 0.6858915686607361, + "learning_rate": 7.84971784971785e-06, + "loss": 0.0711, "step": 12430 }, { - "epoch": 3.1406210552890683, - "grad_norm": 0.40313732624053955, - "learning_rate": 1.115627366826559e-05, - "loss": 0.0514, + "epoch": 3.694683694683695, + "grad_norm": 0.4419819116592407, + "learning_rate": 7.831897831897831e-06, + "loss": 0.0535, "step": 12440 }, { - "epoch": 3.1431456702852816, - "grad_norm": 0.5067439675331116, - "learning_rate": 1.1141125978288311e-05, - "loss": 0.0592, + "epoch": 3.697653697653698, + "grad_norm": 0.5330691933631897, + "learning_rate": 7.814077814077813e-06, + "loss": 0.0604, "step": 12450 }, { - "epoch": 3.1456702852814944, - "grad_norm": 0.39174848794937134, - "learning_rate": 1.1125978288311033e-05, - "loss": 0.0668, + "epoch": 3.7006237006237006, + "grad_norm": 0.5260715484619141, + "learning_rate": 7.796257796257797e-06, + "loss": 0.0607, "step": 12460 }, { - "epoch": 3.1481949002777077, - "grad_norm": 0.6642642617225647, - "learning_rate": 1.1110830598333755e-05, - "loss": 0.0566, + "epoch": 3.7035937035937034, + "grad_norm": 0.7059239149093628, + "learning_rate": 7.77843777843778e-06, + "loss": 0.0683, "step": 12470 }, { - "epoch": 3.150719515273921, - "grad_norm": 0.5557689070701599, - "learning_rate": 1.1095682908356476e-05, - "loss": 0.0706, + "epoch": 3.7065637065637067, + "grad_norm": 0.31892430782318115, + "learning_rate": 7.76061776061776e-06, + "loss": 0.0701, "step": 12480 }, { - "epoch": 3.1532441302701337, - "grad_norm": 0.3642650246620178, - "learning_rate": 1.1080535218379198e-05, - "loss": 0.0457, + "epoch": 3.7095337095337095, + "grad_norm": 0.4127281606197357, + "learning_rate": 7.742797742797744e-06, + "loss": 0.0682, "step": 12490 }, { - "epoch": 3.155768745266347, - "grad_norm": 0.43238964676856995, - "learning_rate": 1.106538752840192e-05, - "loss": 0.0629, + "epoch": 3.7125037125037124, + "grad_norm": 0.23683589696884155, + "learning_rate": 7.724977724977726e-06, + "loss": 0.0612, "step": 12500 }, { - "epoch": 3.15829336026256, - "grad_norm": 0.4752700626850128, - "learning_rate": 1.1050239838424641e-05, - "loss": 0.0666, + "epoch": 3.7154737154737156, + "grad_norm": 0.47517532110214233, + "learning_rate": 7.707157707157708e-06, + "loss": 0.0462, "step": 12510 }, { - "epoch": 3.160817975258773, - "grad_norm": 0.6083767414093018, - "learning_rate": 1.103509214844736e-05, - "loss": 0.0621, + "epoch": 3.7184437184437185, + "grad_norm": 0.6467389464378357, + "learning_rate": 7.689337689337688e-06, + "loss": 0.0664, "step": 12520 }, { - "epoch": 3.1633425902549863, - "grad_norm": 0.5357356071472168, - "learning_rate": 1.1019944458470084e-05, - "loss": 0.0581, + "epoch": 3.7214137214137213, + "grad_norm": 0.6246938705444336, + "learning_rate": 7.671517671517672e-06, + "loss": 0.0638, "step": 12530 }, { - "epoch": 3.165867205251199, - "grad_norm": 0.3257083594799042, - "learning_rate": 1.1004796768492806e-05, - "loss": 0.063, + "epoch": 3.724383724383724, + "grad_norm": 0.4938197433948517, + "learning_rate": 7.653697653697654e-06, + "loss": 0.0629, "step": 12540 }, { - "epoch": 3.1683918202474124, - "grad_norm": 0.357324481010437, - "learning_rate": 1.0989649078515526e-05, - "loss": 0.0612, + "epoch": 3.7273537273537274, + "grad_norm": 0.5381590127944946, + "learning_rate": 7.635877635877635e-06, + "loss": 0.0621, "step": 12550 }, { - "epoch": 3.170916435243625, - "grad_norm": 0.4535214304924011, - "learning_rate": 1.0974501388538249e-05, - "loss": 0.064, + "epoch": 3.73032373032373, + "grad_norm": 0.2848157286643982, + "learning_rate": 7.618057618057619e-06, + "loss": 0.0641, "step": 12560 }, { - "epoch": 3.1734410502398385, - "grad_norm": 0.5646650791168213, - "learning_rate": 1.095935369856097e-05, - "loss": 0.0572, + "epoch": 3.733293733293733, + "grad_norm": 0.4204511046409607, + "learning_rate": 7.600237600237601e-06, + "loss": 0.0606, "step": 12570 }, { - "epoch": 3.1759656652360517, - "grad_norm": 0.8542249202728271, - "learning_rate": 1.094420600858369e-05, - "loss": 0.0692, + "epoch": 3.7362637362637363, + "grad_norm": 0.5741158723831177, + "learning_rate": 7.582417582417582e-06, + "loss": 0.0478, "step": 12580 }, { - "epoch": 3.1784902802322645, - "grad_norm": 0.5563963651657104, - "learning_rate": 1.0929058318606414e-05, - "loss": 0.0705, + "epoch": 3.739233739233739, + "grad_norm": 0.3851994574069977, + "learning_rate": 7.564597564597564e-06, + "loss": 0.067, "step": 12590 }, { - "epoch": 3.181014895228478, - "grad_norm": 0.5933377742767334, - "learning_rate": 1.0913910628629135e-05, - "loss": 0.0489, + "epoch": 3.742203742203742, + "grad_norm": 0.35587117075920105, + "learning_rate": 7.546777546777547e-06, + "loss": 0.0665, "step": 12600 }, { - "epoch": 3.1835395102246906, - "grad_norm": 0.6337215900421143, - "learning_rate": 1.0898762938651855e-05, - "loss": 0.0757, + "epoch": 3.7451737451737452, + "grad_norm": 0.30616384744644165, + "learning_rate": 7.528957528957529e-06, + "loss": 0.064, "step": 12610 }, { - "epoch": 3.186064125220904, - "grad_norm": 0.5588740110397339, - "learning_rate": 1.0883615248674578e-05, - "loss": 0.0617, + "epoch": 3.748143748143748, + "grad_norm": 0.5584198832511902, + "learning_rate": 7.511137511137511e-06, + "loss": 0.0546, "step": 12620 }, { - "epoch": 3.1885887402171167, - "grad_norm": 0.5218625068664551, - "learning_rate": 1.0868467558697298e-05, - "loss": 0.0604, + "epoch": 3.751113751113751, + "grad_norm": 0.3456946015357971, + "learning_rate": 7.493317493317493e-06, + "loss": 0.0571, "step": 12630 }, { - "epoch": 3.19111335521333, - "grad_norm": 0.3570559620857239, - "learning_rate": 1.085331986872002e-05, - "loss": 0.0571, + "epoch": 3.754083754083754, + "grad_norm": 0.5522321462631226, + "learning_rate": 7.475497475497476e-06, + "loss": 0.0809, "step": 12640 }, { - "epoch": 3.193637970209543, - "grad_norm": 0.48153752088546753, - "learning_rate": 1.0838172178742742e-05, - "loss": 0.0601, + "epoch": 3.757053757053757, + "grad_norm": 0.42469412088394165, + "learning_rate": 7.457677457677457e-06, + "loss": 0.0629, "step": 12650 }, { - "epoch": 3.196162585205756, - "grad_norm": 0.9248821139335632, - "learning_rate": 1.0823024488765463e-05, - "loss": 0.0651, + "epoch": 3.76002376002376, + "grad_norm": 0.5727609395980835, + "learning_rate": 7.4398574398574404e-06, + "loss": 0.0635, "step": 12660 }, { - "epoch": 3.1986872002019693, - "grad_norm": 0.5911086797714233, - "learning_rate": 1.0807876798788185e-05, - "loss": 0.0566, + "epoch": 3.762993762993763, + "grad_norm": 0.3833814859390259, + "learning_rate": 7.422037422037423e-06, + "loss": 0.0667, "step": 12670 }, { - "epoch": 3.201211815198182, - "grad_norm": 0.5010080337524414, - "learning_rate": 1.0792729108810906e-05, - "loss": 0.0568, + "epoch": 3.765963765963766, + "grad_norm": 0.7218992114067078, + "learning_rate": 7.404217404217404e-06, + "loss": 0.0589, "step": 12680 }, { - "epoch": 3.2037364301943954, - "grad_norm": 0.3672632873058319, - "learning_rate": 1.0777581418833628e-05, - "loss": 0.0699, + "epoch": 3.7689337689337687, + "grad_norm": 0.5727225542068481, + "learning_rate": 7.386397386397387e-06, + "loss": 0.0528, "step": 12690 }, { - "epoch": 3.2062610451906086, - "grad_norm": 0.38973551988601685, - "learning_rate": 1.076243372885635e-05, - "loss": 0.0593, + "epoch": 3.771903771903772, + "grad_norm": 0.38015714287757874, + "learning_rate": 7.368577368577368e-06, + "loss": 0.0538, "step": 12700 }, { - "epoch": 3.2087856601868214, - "grad_norm": 0.32008224725723267, - "learning_rate": 1.0747286038879071e-05, - "loss": 0.0752, + "epoch": 3.774873774873775, + "grad_norm": 0.32746824622154236, + "learning_rate": 7.350757350757351e-06, + "loss": 0.0511, "step": 12710 }, { - "epoch": 3.2113102751830347, - "grad_norm": 0.2462305873632431, - "learning_rate": 1.0732138348901793e-05, - "loss": 0.0585, + "epoch": 3.7778437778437777, + "grad_norm": 0.3238430321216583, + "learning_rate": 7.332937332937333e-06, + "loss": 0.0572, "step": 12720 }, { - "epoch": 3.2138348901792475, - "grad_norm": 0.34616610407829285, - "learning_rate": 1.0716990658924514e-05, - "loss": 0.054, + "epoch": 3.780813780813781, + "grad_norm": 0.3043205142021179, + "learning_rate": 7.315117315117316e-06, + "loss": 0.0543, "step": 12730 }, { - "epoch": 3.2163595051754608, - "grad_norm": 0.5276474952697754, - "learning_rate": 1.0701842968947236e-05, - "loss": 0.0641, + "epoch": 3.7837837837837838, + "grad_norm": 0.23511236906051636, + "learning_rate": 7.297297297297298e-06, + "loss": 0.0567, "step": 12740 }, { - "epoch": 3.218884120171674, - "grad_norm": 0.36549025774002075, - "learning_rate": 1.0686695278969957e-05, - "loss": 0.0632, + "epoch": 3.7867537867537866, + "grad_norm": 0.44706740975379944, + "learning_rate": 7.27947727947728e-06, + "loss": 0.0598, "step": 12750 }, { - "epoch": 3.221408735167887, - "grad_norm": 0.5248700380325317, - "learning_rate": 1.0671547588992679e-05, - "loss": 0.0706, + "epoch": 3.78972378972379, + "grad_norm": 0.700774610042572, + "learning_rate": 7.261657261657262e-06, + "loss": 0.0671, "step": 12760 }, { - "epoch": 3.2239333501641, - "grad_norm": 0.3740836977958679, - "learning_rate": 1.06563998990154e-05, - "loss": 0.0594, + "epoch": 3.7926937926937927, + "grad_norm": 0.35849860310554504, + "learning_rate": 7.2438372438372435e-06, + "loss": 0.0586, "step": 12770 }, { - "epoch": 3.226457965160313, - "grad_norm": 0.31425297260284424, - "learning_rate": 1.0641252209038122e-05, - "loss": 0.0608, + "epoch": 3.7956637956637955, + "grad_norm": 0.4785964787006378, + "learning_rate": 7.2260172260172265e-06, + "loss": 0.0616, "step": 12780 }, { - "epoch": 3.228982580156526, - "grad_norm": 0.5715880393981934, - "learning_rate": 1.0626104519060844e-05, - "loss": 0.0604, + "epoch": 3.798633798633799, + "grad_norm": 0.6433180570602417, + "learning_rate": 7.208197208197208e-06, + "loss": 0.0582, "step": 12790 }, { - "epoch": 3.231507195152739, - "grad_norm": 0.3549630343914032, - "learning_rate": 1.0610956829083565e-05, - "loss": 0.0471, + "epoch": 3.8016038016038016, + "grad_norm": 0.37284335494041443, + "learning_rate": 7.190377190377191e-06, + "loss": 0.0491, "step": 12800 }, { - "epoch": 3.2340318101489522, - "grad_norm": 0.6027510166168213, - "learning_rate": 1.0595809139106285e-05, - "loss": 0.0672, + "epoch": 3.8045738045738045, + "grad_norm": 0.576884388923645, + "learning_rate": 7.172557172557173e-06, + "loss": 0.0644, "step": 12810 }, { - "epoch": 3.2365564251451655, - "grad_norm": 0.6292756795883179, - "learning_rate": 1.0580661449129009e-05, - "loss": 0.0522, + "epoch": 3.8075438075438077, + "grad_norm": 0.5406507253646851, + "learning_rate": 7.154737154737155e-06, + "loss": 0.0621, "step": 12820 }, { - "epoch": 3.2390810401413783, - "grad_norm": 0.4945664405822754, - "learning_rate": 1.056551375915173e-05, + "epoch": 3.8105138105138106, + "grad_norm": 0.5398954749107361, + "learning_rate": 7.136917136917137e-06, "loss": 0.0606, "step": 12830 }, { - "epoch": 3.2416056551375916, - "grad_norm": 0.3837689757347107, - "learning_rate": 1.055036606917445e-05, - "loss": 0.058, + "epoch": 3.8134838134838134, + "grad_norm": 0.3366580605506897, + "learning_rate": 7.119097119097119e-06, + "loss": 0.0543, "step": 12840 }, { - "epoch": 3.2441302701338044, - "grad_norm": 0.41095155477523804, - "learning_rate": 1.0535218379197173e-05, - "loss": 0.0632, + "epoch": 3.8164538164538166, + "grad_norm": 0.4284731149673462, + "learning_rate": 7.101277101277102e-06, + "loss": 0.0608, "step": 12850 }, { - "epoch": 3.2466548851300177, - "grad_norm": 0.48984506726264954, - "learning_rate": 1.0520070689219895e-05, - "loss": 0.0628, + "epoch": 3.8194238194238195, + "grad_norm": 0.6001728773117065, + "learning_rate": 7.083457083457083e-06, + "loss": 0.052, "step": 12860 }, { - "epoch": 3.249179500126231, - "grad_norm": 0.6692824959754944, - "learning_rate": 1.0504922999242615e-05, - "loss": 0.0586, + "epoch": 3.8223938223938223, + "grad_norm": 0.6029428243637085, + "learning_rate": 7.065637065637066e-06, + "loss": 0.0516, "step": 12870 }, { - "epoch": 3.2517041151224437, - "grad_norm": 0.5213368535041809, - "learning_rate": 1.0489775309265338e-05, - "loss": 0.0497, + "epoch": 3.8253638253638256, + "grad_norm": 0.39351001381874084, + "learning_rate": 7.047817047817048e-06, + "loss": 0.0705, "step": 12880 }, { - "epoch": 3.254228730118657, - "grad_norm": 0.696983277797699, - "learning_rate": 1.0474627619288058e-05, - "loss": 0.0635, + "epoch": 3.8283338283338284, + "grad_norm": 0.5574610829353333, + "learning_rate": 7.02999702999703e-06, + "loss": 0.0749, "step": 12890 }, { - "epoch": 3.25675334511487, - "grad_norm": 0.29254981875419617, - "learning_rate": 1.045947992931078e-05, - "loss": 0.0527, + "epoch": 3.8313038313038312, + "grad_norm": 0.35019442439079285, + "learning_rate": 7.0121770121770125e-06, + "loss": 0.0612, "step": 12900 }, { - "epoch": 3.259277960111083, - "grad_norm": 0.3891927897930145, - "learning_rate": 1.0444332239333503e-05, - "loss": 0.067, + "epoch": 3.8342738342738345, + "grad_norm": 0.4754871129989624, + "learning_rate": 6.994356994356995e-06, + "loss": 0.0553, "step": 12910 }, { - "epoch": 3.2618025751072963, - "grad_norm": 0.49604347348213196, - "learning_rate": 1.0429184549356223e-05, - "loss": 0.0578, + "epoch": 3.8372438372438373, + "grad_norm": 0.42024219036102295, + "learning_rate": 6.976536976536977e-06, + "loss": 0.0655, "step": 12920 }, { - "epoch": 3.264327190103509, - "grad_norm": 0.5001896619796753, - "learning_rate": 1.0414036859378944e-05, - "loss": 0.0575, + "epoch": 3.84021384021384, + "grad_norm": 0.5387091636657715, + "learning_rate": 6.958716958716958e-06, + "loss": 0.0599, "step": 12930 }, { - "epoch": 3.2668518050997224, - "grad_norm": 0.24430322647094727, - "learning_rate": 1.0398889169401666e-05, - "loss": 0.0682, + "epoch": 3.8431838431838434, + "grad_norm": 0.570585310459137, + "learning_rate": 6.940896940896941e-06, + "loss": 0.0614, "step": 12940 }, { - "epoch": 3.269376420095935, - "grad_norm": 0.4671231508255005, - "learning_rate": 1.0383741479424388e-05, - "loss": 0.0628, + "epoch": 3.8461538461538463, + "grad_norm": 0.456065833568573, + "learning_rate": 6.923076923076923e-06, + "loss": 0.0654, "step": 12950 }, { - "epoch": 3.2719010350921485, - "grad_norm": 0.32760515809059143, - "learning_rate": 1.036859378944711e-05, - "loss": 0.0594, + "epoch": 3.849123849123849, + "grad_norm": 0.7067524790763855, + "learning_rate": 6.9052569052569056e-06, + "loss": 0.0756, "step": 12960 }, { - "epoch": 3.2744256500883617, - "grad_norm": 0.46056973934173584, - "learning_rate": 1.035344609946983e-05, - "loss": 0.0562, + "epoch": 3.8520938520938524, + "grad_norm": 0.46172401309013367, + "learning_rate": 6.887436887436888e-06, + "loss": 0.053, "step": 12970 }, { - "epoch": 3.2769502650845745, - "grad_norm": 0.3852224349975586, - "learning_rate": 1.0338298409492552e-05, - "loss": 0.064, + "epoch": 3.855063855063855, + "grad_norm": 0.561151921749115, + "learning_rate": 6.86961686961687e-06, + "loss": 0.0503, "step": 12980 }, { - "epoch": 3.279474880080788, - "grad_norm": 0.6717817187309265, - "learning_rate": 1.0323150719515274e-05, - "loss": 0.0585, + "epoch": 3.858033858033858, + "grad_norm": 0.3379230201244354, + "learning_rate": 6.851796851796852e-06, + "loss": 0.0626, "step": 12990 }, { - "epoch": 3.2819994950770006, - "grad_norm": 0.6648727655410767, - "learning_rate": 1.0308003029537996e-05, - "loss": 0.0667, + "epoch": 3.861003861003861, + "grad_norm": 0.6056146621704102, + "learning_rate": 6.833976833976834e-06, + "loss": 0.0628, "step": 13000 }, { - "epoch": 3.284524110073214, - "grad_norm": 0.4613960385322571, - "learning_rate": 1.0292855339560717e-05, - "loss": 0.0537, + "epoch": 3.863973863973864, + "grad_norm": 0.48145750164985657, + "learning_rate": 6.816156816156816e-06, + "loss": 0.061, "step": 13010 }, { - "epoch": 3.287048725069427, - "grad_norm": 0.3950752913951874, - "learning_rate": 1.0277707649583439e-05, - "loss": 0.0574, + "epoch": 3.866943866943867, + "grad_norm": 0.4073619246482849, + "learning_rate": 6.7983367983367986e-06, + "loss": 0.068, "step": 13020 }, { - "epoch": 3.28957334006564, - "grad_norm": 0.4458863437175751, - "learning_rate": 1.026255995960616e-05, - "loss": 0.0576, + "epoch": 3.8699138699138698, + "grad_norm": 0.4736767113208771, + "learning_rate": 6.780516780516781e-06, + "loss": 0.0595, "step": 13030 }, { - "epoch": 3.292097955061853, - "grad_norm": 0.5602406859397888, - "learning_rate": 1.0247412269628882e-05, - "loss": 0.0635, + "epoch": 3.872883872883873, + "grad_norm": 0.4397349953651428, + "learning_rate": 6.762696762696763e-06, + "loss": 0.0676, "step": 13040 }, { - "epoch": 3.294622570058066, - "grad_norm": 0.49716469645500183, - "learning_rate": 1.0232264579651604e-05, - "loss": 0.0537, + "epoch": 3.875853875853876, + "grad_norm": 0.4046313166618347, + "learning_rate": 6.744876744876745e-06, + "loss": 0.0552, "step": 13050 }, { - "epoch": 3.2971471850542793, - "grad_norm": 0.9081646800041199, - "learning_rate": 1.0217116889674325e-05, - "loss": 0.0572, + "epoch": 3.8788238788238787, + "grad_norm": 0.3561536371707916, + "learning_rate": 6.727056727056727e-06, + "loss": 0.0595, "step": 13060 }, { - "epoch": 3.2996718000504925, - "grad_norm": 0.36501345038414, - "learning_rate": 1.0201969199697047e-05, - "loss": 0.063, + "epoch": 3.8817938817938815, + "grad_norm": 0.5443368554115295, + "learning_rate": 6.7092367092367094e-06, + "loss": 0.0557, "step": 13070 }, { - "epoch": 3.3021964150467054, - "grad_norm": 0.419605553150177, - "learning_rate": 1.0186821509719768e-05, - "loss": 0.0564, + "epoch": 3.884763884763885, + "grad_norm": 0.515012264251709, + "learning_rate": 6.691416691416692e-06, + "loss": 0.0488, "step": 13080 }, { - "epoch": 3.3047210300429186, - "grad_norm": 0.4859483242034912, - "learning_rate": 1.017167381974249e-05, - "loss": 0.0581, + "epoch": 3.8877338877338876, + "grad_norm": 0.37932658195495605, + "learning_rate": 6.673596673596674e-06, + "loss": 0.0629, "step": 13090 }, { - "epoch": 3.3072456450391314, - "grad_norm": 0.6135731339454651, - "learning_rate": 1.015652612976521e-05, - "loss": 0.0562, + "epoch": 3.8907038907038904, + "grad_norm": 0.4500630795955658, + "learning_rate": 6.655776655776656e-06, + "loss": 0.0589, "step": 13100 }, { - "epoch": 3.3097702600353447, - "grad_norm": 0.5303016304969788, - "learning_rate": 1.0141378439787933e-05, - "loss": 0.0573, + "epoch": 3.8936738936738937, + "grad_norm": 0.759432852268219, + "learning_rate": 6.637956637956638e-06, + "loss": 0.0547, "step": 13110 }, { - "epoch": 3.3122948750315575, - "grad_norm": 0.34575608372688293, - "learning_rate": 1.0126230749810655e-05, - "loss": 0.0652, + "epoch": 3.8966438966438965, + "grad_norm": 0.6136061549186707, + "learning_rate": 6.62013662013662e-06, + "loss": 0.0572, "step": 13120 }, { - "epoch": 3.3148194900277708, - "grad_norm": 0.3637102544307709, - "learning_rate": 1.0111083059833375e-05, - "loss": 0.0558, + "epoch": 3.8996138996138994, + "grad_norm": 0.5580719709396362, + "learning_rate": 6.6023166023166025e-06, + "loss": 0.0573, "step": 13130 }, { - "epoch": 3.317344105023984, - "grad_norm": 0.3905535936355591, - "learning_rate": 1.0095935369856098e-05, - "loss": 0.0553, + "epoch": 3.9025839025839026, + "grad_norm": 0.5806880593299866, + "learning_rate": 6.584496584496585e-06, + "loss": 0.0624, "step": 13140 }, { - "epoch": 3.319868720020197, - "grad_norm": 0.3310402035713196, - "learning_rate": 1.008078767987882e-05, - "loss": 0.0748, + "epoch": 3.9055539055539055, + "grad_norm": 0.7236099243164062, + "learning_rate": 6.566676566676567e-06, + "loss": 0.069, "step": 13150 }, { - "epoch": 3.32239333501641, - "grad_norm": 0.6091330647468567, - "learning_rate": 1.006563998990154e-05, - "loss": 0.0561, + "epoch": 3.9085239085239083, + "grad_norm": 0.4525713622570038, + "learning_rate": 6.548856548856549e-06, + "loss": 0.065, "step": 13160 }, { - "epoch": 3.324917950012623, - "grad_norm": 0.4984453320503235, - "learning_rate": 1.0050492299924263e-05, - "loss": 0.0666, + "epoch": 3.9114939114939116, + "grad_norm": 0.5687326788902283, + "learning_rate": 6.531036531036531e-06, + "loss": 0.0592, "step": 13170 }, { - "epoch": 3.327442565008836, - "grad_norm": 0.5523614883422852, - "learning_rate": 1.0035344609946983e-05, - "loss": 0.0521, + "epoch": 3.9144639144639144, + "grad_norm": 0.4574839770793915, + "learning_rate": 6.513216513216513e-06, + "loss": 0.0691, "step": 13180 }, { - "epoch": 3.329967180005049, - "grad_norm": 0.28940534591674805, - "learning_rate": 1.0020196919969704e-05, - "loss": 0.0661, + "epoch": 3.9174339174339172, + "grad_norm": 0.4538971483707428, + "learning_rate": 6.4953964953964955e-06, + "loss": 0.0562, "step": 13190 }, { - "epoch": 3.3324917950012622, - "grad_norm": 0.4787939488887787, - "learning_rate": 1.0005049229992428e-05, - "loss": 0.0585, + "epoch": 3.9204039204039205, + "grad_norm": 0.32448068261146545, + "learning_rate": 6.477576477576478e-06, + "loss": 0.0555, "step": 13200 }, { - "epoch": 3.3350164099974755, - "grad_norm": 0.8407096266746521, - "learning_rate": 9.989901540015147e-06, - "loss": 0.0562, + "epoch": 3.9233739233739233, + "grad_norm": 0.5266978144645691, + "learning_rate": 6.45975645975646e-06, + "loss": 0.0532, "step": 13210 }, { - "epoch": 3.3375410249936883, - "grad_norm": 0.4084523618221283, - "learning_rate": 9.974753850037869e-06, - "loss": 0.0528, + "epoch": 3.926343926343926, + "grad_norm": 0.48830193281173706, + "learning_rate": 6.441936441936442e-06, + "loss": 0.0609, "step": 13220 }, { - "epoch": 3.3400656399899016, - "grad_norm": 0.5200213193893433, - "learning_rate": 9.95960616006059e-06, - "loss": 0.0648, + "epoch": 3.9293139293139294, + "grad_norm": 0.48386427760124207, + "learning_rate": 6.424116424116425e-06, + "loss": 0.0595, "step": 13230 }, { - "epoch": 3.3425902549861144, - "grad_norm": 0.34778347611427307, - "learning_rate": 9.944458470083312e-06, - "loss": 0.0495, + "epoch": 3.9322839322839322, + "grad_norm": 0.33438950777053833, + "learning_rate": 6.406296406296406e-06, + "loss": 0.0638, "step": 13240 }, { - "epoch": 3.3451148699823277, - "grad_norm": 0.5605157017707825, - "learning_rate": 9.929310780106034e-06, - "loss": 0.0706, + "epoch": 3.935253935253935, + "grad_norm": 0.5018361806869507, + "learning_rate": 6.3884763884763885e-06, + "loss": 0.0589, "step": 13250 }, { - "epoch": 3.347639484978541, - "grad_norm": 0.5080039501190186, - "learning_rate": 9.914163090128755e-06, - "loss": 0.0583, + "epoch": 3.9382239382239383, + "grad_norm": 0.5178138613700867, + "learning_rate": 6.370656370656371e-06, + "loss": 0.0619, "step": 13260 }, { - "epoch": 3.3501640999747537, - "grad_norm": 0.35784703493118286, - "learning_rate": 9.899015400151477e-06, - "loss": 0.0599, + "epoch": 3.941193941193941, + "grad_norm": 0.4681033194065094, + "learning_rate": 6.352836352836353e-06, + "loss": 0.0577, "step": 13270 }, { - "epoch": 3.352688714970967, - "grad_norm": 0.44402334094047546, - "learning_rate": 9.883867710174199e-06, - "loss": 0.0429, + "epoch": 3.944163944163944, + "grad_norm": 0.6118980050086975, + "learning_rate": 6.335016335016335e-06, + "loss": 0.0563, "step": 13280 }, { - "epoch": 3.35521332996718, - "grad_norm": 0.26014208793640137, - "learning_rate": 9.86872002019692e-06, - "loss": 0.0654, + "epoch": 3.9471339471339473, + "grad_norm": 0.4309462010860443, + "learning_rate": 6.317196317196317e-06, + "loss": 0.0465, "step": 13290 }, { - "epoch": 3.357737944963393, - "grad_norm": 0.5210455060005188, - "learning_rate": 9.853572330219642e-06, - "loss": 0.0524, + "epoch": 3.95010395010395, + "grad_norm": 0.5277587175369263, + "learning_rate": 6.2993762993763e-06, + "loss": 0.0706, "step": 13300 }, { - "epoch": 3.3602625599596063, - "grad_norm": 0.5096918344497681, - "learning_rate": 9.838424640242363e-06, - "loss": 0.0627, + "epoch": 3.953073953073953, + "grad_norm": 0.5027768611907959, + "learning_rate": 6.2815562815562815e-06, + "loss": 0.0491, "step": 13310 }, { - "epoch": 3.362787174955819, - "grad_norm": 0.46019914746284485, - "learning_rate": 9.823276950265085e-06, - "loss": 0.0534, + "epoch": 3.956043956043956, + "grad_norm": 0.560326874256134, + "learning_rate": 6.2637362637362645e-06, + "loss": 0.0487, "step": 13320 }, { - "epoch": 3.3653117899520324, - "grad_norm": 0.6467626690864563, - "learning_rate": 9.808129260287807e-06, - "loss": 0.0728, + "epoch": 3.959013959013959, + "grad_norm": 0.5669682621955872, + "learning_rate": 6.245916245916246e-06, + "loss": 0.0491, "step": 13330 }, { - "epoch": 3.367836404948245, - "grad_norm": 0.6337939500808716, - "learning_rate": 9.792981570310528e-06, - "loss": 0.0601, + "epoch": 3.961983961983962, + "grad_norm": 0.49655118584632874, + "learning_rate": 6.228096228096228e-06, + "loss": 0.0497, "step": 13340 }, { - "epoch": 3.3703610199444585, - "grad_norm": 0.36074209213256836, - "learning_rate": 9.77783388033325e-06, - "loss": 0.0602, + "epoch": 3.964953964953965, + "grad_norm": 0.8173234462738037, + "learning_rate": 6.21027621027621e-06, + "loss": 0.071, "step": 13350 }, { - "epoch": 3.3728856349406717, - "grad_norm": 0.5853038430213928, - "learning_rate": 9.762686190355971e-06, - "loss": 0.0488, + "epoch": 3.967923967923968, + "grad_norm": 0.5078877210617065, + "learning_rate": 6.192456192456192e-06, + "loss": 0.0482, "step": 13360 }, { - "epoch": 3.3754102499368845, - "grad_norm": 0.46384280920028687, - "learning_rate": 9.747538500378693e-06, - "loss": 0.0609, + "epoch": 3.970893970893971, + "grad_norm": 0.4183073937892914, + "learning_rate": 6.174636174636175e-06, + "loss": 0.0537, "step": 13370 }, { - "epoch": 3.377934864933098, - "grad_norm": 0.38020265102386475, - "learning_rate": 9.732390810401415e-06, - "loss": 0.0515, + "epoch": 3.973863973863974, + "grad_norm": 0.5460306406021118, + "learning_rate": 6.156816156816157e-06, + "loss": 0.0707, "step": 13380 }, { - "epoch": 3.3804594799293106, - "grad_norm": 0.5325652956962585, - "learning_rate": 9.717243120424134e-06, - "loss": 0.0609, + "epoch": 3.976833976833977, + "grad_norm": 0.8355798125267029, + "learning_rate": 6.13899613899614e-06, + "loss": 0.0663, "step": 13390 }, { - "epoch": 3.382984094925524, - "grad_norm": 0.3673112392425537, - "learning_rate": 9.702095430446858e-06, - "loss": 0.0729, + "epoch": 3.9798039798039797, + "grad_norm": 0.5097036361694336, + "learning_rate": 6.121176121176121e-06, + "loss": 0.0652, "step": 13400 }, { - "epoch": 3.385508709921737, - "grad_norm": 0.5106756091117859, - "learning_rate": 9.68694774046958e-06, - "loss": 0.0545, + "epoch": 3.982773982773983, + "grad_norm": 0.5116889476776123, + "learning_rate": 6.103356103356103e-06, + "loss": 0.0584, "step": 13410 }, { - "epoch": 3.38803332491795, - "grad_norm": 0.39151692390441895, - "learning_rate": 9.6718000504923e-06, - "loss": 0.0588, + "epoch": 3.985743985743986, + "grad_norm": 0.4749346971511841, + "learning_rate": 6.085536085536085e-06, + "loss": 0.0598, "step": 13420 }, { - "epoch": 3.390557939914163, - "grad_norm": 0.3701721131801605, - "learning_rate": 9.656652360515022e-06, - "loss": 0.0576, + "epoch": 3.9887139887139886, + "grad_norm": 0.3732450306415558, + "learning_rate": 6.0677160677160676e-06, + "loss": 0.0623, "step": 13430 }, { - "epoch": 3.393082554910376, - "grad_norm": 0.5336679816246033, - "learning_rate": 9.641504670537742e-06, - "loss": 0.0597, + "epoch": 3.991683991683992, + "grad_norm": 0.37360072135925293, + "learning_rate": 6.049896049896051e-06, + "loss": 0.0577, "step": 13440 }, { - "epoch": 3.3956071699065893, - "grad_norm": 0.4533694088459015, - "learning_rate": 9.626356980560464e-06, - "loss": 0.0708, + "epoch": 3.9946539946539947, + "grad_norm": 0.5997447967529297, + "learning_rate": 6.032076032076032e-06, + "loss": 0.0634, "step": 13450 }, { - "epoch": 3.3981317849028025, - "grad_norm": 0.5501024127006531, - "learning_rate": 9.611209290583187e-06, + "epoch": 3.9976239976239976, + "grad_norm": 0.4489789605140686, + "learning_rate": 6.014256014256015e-06, "loss": 0.059, "step": 13460 }, { - "epoch": 3.4006563998990154, - "grad_norm": 0.38329634070396423, - "learning_rate": 9.596061600605907e-06, - "loss": 0.0563, + "epoch": 4.0, + "eval_f1": 0.49727767695099817, + "eval_loss": 0.054960619658231735, + "eval_runtime": 178.7066, + "eval_samples_per_second": 212.745, + "eval_steps_per_second": 3.329, + "step": 13468 + }, + { + "epoch": 4.000594000594001, + "grad_norm": 0.8586022257804871, + "learning_rate": 5.996435996435996e-06, + "loss": 0.0538, "step": 13470 }, { - "epoch": 3.4031810148952286, - "grad_norm": 0.4871230721473694, - "learning_rate": 9.580913910628629e-06, - "loss": 0.0657, + "epoch": 4.003564003564003, + "grad_norm": 0.5165749192237854, + "learning_rate": 5.978615978615979e-06, + "loss": 0.0637, "step": 13480 }, { - "epoch": 3.4057056298914414, - "grad_norm": 0.6076348423957825, - "learning_rate": 9.565766220651352e-06, - "loss": 0.0664, + "epoch": 4.0065340065340065, + "grad_norm": 0.6126316785812378, + "learning_rate": 5.960795960795961e-06, + "loss": 0.0514, "step": 13490 }, { - "epoch": 3.4082302448876547, - "grad_norm": 0.44044622778892517, - "learning_rate": 9.550618530674072e-06, - "loss": 0.0657, + "epoch": 4.00950400950401, + "grad_norm": 0.572414219379425, + "learning_rate": 5.942975942975943e-06, + "loss": 0.0592, "step": 13500 }, { - "epoch": 3.410754859883868, - "grad_norm": 0.7991306781768799, - "learning_rate": 9.535470840696794e-06, - "loss": 0.0455, + "epoch": 4.012474012474012, + "grad_norm": 0.615561842918396, + "learning_rate": 5.925155925155926e-06, + "loss": 0.0551, "step": 13510 }, { - "epoch": 3.4132794748800808, - "grad_norm": 0.41269394755363464, - "learning_rate": 9.520323150719515e-06, + "epoch": 4.015444015444015, + "grad_norm": 0.5001277923583984, + "learning_rate": 5.907335907335907e-06, "loss": 0.0586, "step": 13520 }, { - "epoch": 3.415804089876294, - "grad_norm": 0.4661392867565155, - "learning_rate": 9.505175460742237e-06, - "loss": 0.066, + "epoch": 4.018414018414019, + "grad_norm": 0.4393932521343231, + "learning_rate": 5.88951588951589e-06, + "loss": 0.0567, "step": 13530 }, { - "epoch": 3.418328704872507, - "grad_norm": 0.7089284062385559, - "learning_rate": 9.490027770764958e-06, - "loss": 0.0605, + "epoch": 4.021384021384021, + "grad_norm": 0.3287888467311859, + "learning_rate": 5.8716958716958714e-06, + "loss": 0.0525, "step": 13540 }, { - "epoch": 3.42085331986872, - "grad_norm": 0.3510993421077728, - "learning_rate": 9.47488008078768e-06, - "loss": 0.0541, + "epoch": 4.024354024354024, + "grad_norm": 0.6189230680465698, + "learning_rate": 5.8538758538758545e-06, + "loss": 0.061, "step": 13550 }, { - "epoch": 3.423377934864933, - "grad_norm": 0.3425975739955902, - "learning_rate": 9.459732390810402e-06, - "loss": 0.0623, + "epoch": 4.027324027324028, + "grad_norm": 0.8372416496276855, + "learning_rate": 5.836055836055836e-06, + "loss": 0.0665, "step": 13560 }, { - "epoch": 3.425902549861146, - "grad_norm": 0.3647609353065491, - "learning_rate": 9.444584700833123e-06, - "loss": 0.0571, + "epoch": 4.03029403029403, + "grad_norm": 0.4526776373386383, + "learning_rate": 5.818235818235818e-06, + "loss": 0.0617, "step": 13570 }, { - "epoch": 3.4284271648573594, - "grad_norm": 0.46789881587028503, - "learning_rate": 9.429437010855845e-06, - "loss": 0.0537, + "epoch": 4.033264033264033, + "grad_norm": 0.6300592422485352, + "learning_rate": 5.800415800415801e-06, + "loss": 0.0684, "step": 13580 }, { - "epoch": 3.4309517798535722, - "grad_norm": 0.6661184430122375, - "learning_rate": 9.414289320878566e-06, - "loss": 0.0544, + "epoch": 4.0362340362340365, + "grad_norm": 0.7106212973594666, + "learning_rate": 5.782595782595782e-06, + "loss": 0.0635, "step": 13590 }, { - "epoch": 3.4334763948497855, - "grad_norm": 0.6467635035514832, - "learning_rate": 9.399141630901288e-06, - "loss": 0.0633, + "epoch": 4.039204039204039, + "grad_norm": 0.47979801893234253, + "learning_rate": 5.764775764775765e-06, + "loss": 0.0543, "step": 13600 }, { - "epoch": 3.4360010098459983, - "grad_norm": 0.5681377649307251, - "learning_rate": 9.38399394092401e-06, - "loss": 0.0613, + "epoch": 4.042174042174042, + "grad_norm": 0.704913854598999, + "learning_rate": 5.746955746955747e-06, + "loss": 0.0495, "step": 13610 }, { - "epoch": 3.4385256248422116, - "grad_norm": 0.49820029735565186, - "learning_rate": 9.368846250946731e-06, - "loss": 0.0477, + "epoch": 4.0451440451440455, + "grad_norm": 0.5323979258537292, + "learning_rate": 5.72913572913573e-06, + "loss": 0.0549, "step": 13620 }, { - "epoch": 3.4410502398384244, - "grad_norm": 0.5915389060974121, - "learning_rate": 9.353698560969453e-06, - "loss": 0.0599, + "epoch": 4.048114048114048, + "grad_norm": 0.6989266276359558, + "learning_rate": 5.711315711315711e-06, + "loss": 0.07, "step": 13630 }, { - "epoch": 3.4435748548346377, - "grad_norm": 0.540043830871582, - "learning_rate": 9.338550870992174e-06, - "loss": 0.0658, + "epoch": 4.051084051084051, + "grad_norm": 0.6013164520263672, + "learning_rate": 5.693495693495694e-06, + "loss": 0.0565, "step": 13640 }, { - "epoch": 3.446099469830851, - "grad_norm": 0.3997081220149994, - "learning_rate": 9.323403181014896e-06, - "loss": 0.054, + "epoch": 4.054054054054054, + "grad_norm": 0.43801942467689514, + "learning_rate": 5.675675675675676e-06, + "loss": 0.0574, "step": 13650 }, { - "epoch": 3.4486240848270637, - "grad_norm": 0.45656365156173706, - "learning_rate": 9.308255491037617e-06, - "loss": 0.058, + "epoch": 4.057024057024057, + "grad_norm": 0.6650937795639038, + "learning_rate": 5.6578556578556575e-06, + "loss": 0.0637, "step": 13660 }, { - "epoch": 3.451148699823277, - "grad_norm": 0.417140394449234, - "learning_rate": 9.293107801060339e-06, - "loss": 0.0652, + "epoch": 4.05999405999406, + "grad_norm": 0.4909881055355072, + "learning_rate": 5.6400356400356405e-06, + "loss": 0.0612, "step": 13670 }, { - "epoch": 3.45367331481949, - "grad_norm": 0.5246079564094543, - "learning_rate": 9.277960111083059e-06, - "loss": 0.0512, + "epoch": 4.062964062964063, + "grad_norm": 0.3323568105697632, + "learning_rate": 5.622215622215622e-06, + "loss": 0.0584, "step": 13680 }, { - "epoch": 3.456197929815703, - "grad_norm": 0.3504631519317627, - "learning_rate": 9.262812421105782e-06, - "loss": 0.0721, + "epoch": 4.065934065934066, + "grad_norm": 0.6184719800949097, + "learning_rate": 5.604395604395605e-06, + "loss": 0.0504, "step": 13690 }, { - "epoch": 3.4587225448119163, - "grad_norm": 0.29381871223449707, - "learning_rate": 9.247664731128504e-06, - "loss": 0.0553, + "epoch": 4.068904068904069, + "grad_norm": 0.5047394037246704, + "learning_rate": 5.586575586575586e-06, + "loss": 0.0531, "step": 13700 }, { - "epoch": 3.461247159808129, - "grad_norm": 0.41762417554855347, - "learning_rate": 9.232517041151224e-06, - "loss": 0.0734, + "epoch": 4.071874071874072, + "grad_norm": 0.6481796503067017, + "learning_rate": 5.568755568755569e-06, + "loss": 0.0613, "step": 13710 }, { - "epoch": 3.4637717748043424, - "grad_norm": 0.6137571930885315, - "learning_rate": 9.217369351173947e-06, - "loss": 0.0588, + "epoch": 4.074844074844075, + "grad_norm": 0.7215176224708557, + "learning_rate": 5.550935550935551e-06, + "loss": 0.0633, "step": 13720 }, { - "epoch": 3.466296389800555, - "grad_norm": 0.6194366812705994, - "learning_rate": 9.202221661196667e-06, - "loss": 0.0568, + "epoch": 4.077814077814078, + "grad_norm": 0.16339148581027985, + "learning_rate": 5.533115533115533e-06, + "loss": 0.058, "step": 13730 }, { - "epoch": 3.4688210047967685, - "grad_norm": 0.7250993251800537, - "learning_rate": 9.187073971219389e-06, - "loss": 0.056, + "epoch": 4.080784080784081, + "grad_norm": 0.5942262411117554, + "learning_rate": 5.515295515295516e-06, + "loss": 0.0733, "step": 13740 }, { - "epoch": 3.4713456197929817, - "grad_norm": 0.5507886409759521, - "learning_rate": 9.171926281242112e-06, - "loss": 0.0707, + "epoch": 4.0837540837540836, + "grad_norm": 0.4894910454750061, + "learning_rate": 5.497475497475497e-06, + "loss": 0.0645, "step": 13750 }, { - "epoch": 3.4738702347891945, - "grad_norm": 0.39389169216156006, - "learning_rate": 9.156778591264832e-06, - "loss": 0.064, + "epoch": 4.086724086724087, + "grad_norm": 0.44156116247177124, + "learning_rate": 5.47965547965548e-06, + "loss": 0.0555, "step": 13760 }, { - "epoch": 3.476394849785408, - "grad_norm": 0.47429707646369934, - "learning_rate": 9.141630901287553e-06, - "loss": 0.0618, + "epoch": 4.08969408969409, + "grad_norm": 0.45034366846084595, + "learning_rate": 5.461835461835461e-06, + "loss": 0.0648, "step": 13770 }, { - "epoch": 3.4789194647816206, - "grad_norm": 0.3083738088607788, - "learning_rate": 9.126483211310277e-06, - "loss": 0.0538, + "epoch": 4.0926640926640925, + "grad_norm": 0.43881091475486755, + "learning_rate": 5.444015444015444e-06, + "loss": 0.059, "step": 13780 }, { - "epoch": 3.481444079777834, - "grad_norm": 0.6611940860748291, - "learning_rate": 9.111335521332997e-06, - "loss": 0.0754, + "epoch": 4.095634095634096, + "grad_norm": 0.6181434988975525, + "learning_rate": 5.4261954261954265e-06, + "loss": 0.0698, "step": 13790 }, { - "epoch": 3.483968694774047, - "grad_norm": 0.3021230697631836, - "learning_rate": 9.096187831355718e-06, - "loss": 0.0522, + "epoch": 4.098604098604099, + "grad_norm": 0.39531105756759644, + "learning_rate": 5.408375408375409e-06, + "loss": 0.0563, "step": 13800 }, { - "epoch": 3.48649330977026, - "grad_norm": 0.34516650438308716, - "learning_rate": 9.08104014137844e-06, - "loss": 0.0532, + "epoch": 4.101574101574101, + "grad_norm": 0.44663333892822266, + "learning_rate": 5.390555390555391e-06, + "loss": 0.0529, "step": 13810 }, { - "epoch": 3.489017924766473, - "grad_norm": 0.2838568687438965, - "learning_rate": 9.065892451401161e-06, - "loss": 0.0555, + "epoch": 4.104544104544105, + "grad_norm": 0.591187059879303, + "learning_rate": 5.372735372735372e-06, + "loss": 0.0524, "step": 13820 }, { - "epoch": 3.491542539762686, - "grad_norm": 0.4129784405231476, - "learning_rate": 9.050744761423883e-06, - "loss": 0.0552, + "epoch": 4.107514107514108, + "grad_norm": 0.5794005990028381, + "learning_rate": 5.354915354915355e-06, + "loss": 0.0529, "step": 13830 }, { - "epoch": 3.4940671547588993, - "grad_norm": 0.539181113243103, - "learning_rate": 9.035597071446604e-06, - "loss": 0.0594, + "epoch": 4.11048411048411, + "grad_norm": 0.312919944524765, + "learning_rate": 5.337095337095337e-06, + "loss": 0.0651, "step": 13840 }, { - "epoch": 3.4965917697551125, - "grad_norm": 0.2918561100959778, - "learning_rate": 9.020449381469326e-06, - "loss": 0.0496, + "epoch": 4.113454113454114, + "grad_norm": 0.5957525968551636, + "learning_rate": 5.3192753192753196e-06, + "loss": 0.0523, "step": 13850 }, { - "epoch": 3.4991163847513254, - "grad_norm": 0.39629873633384705, - "learning_rate": 9.005301691492048e-06, - "loss": 0.0451, + "epoch": 4.116424116424117, + "grad_norm": 0.6151428818702698, + "learning_rate": 5.301455301455302e-06, + "loss": 0.0581, "step": 13860 }, { - "epoch": 3.5016409997475386, - "grad_norm": 0.842644214630127, - "learning_rate": 8.99015400151477e-06, - "loss": 0.0595, + "epoch": 4.119394119394119, + "grad_norm": 0.4753796458244324, + "learning_rate": 5.283635283635284e-06, + "loss": 0.0514, "step": 13870 }, { - "epoch": 3.5041656147437514, - "grad_norm": 0.3173305094242096, - "learning_rate": 8.975006311537491e-06, - "loss": 0.0517, + "epoch": 4.1223641223641225, + "grad_norm": 0.45062291622161865, + "learning_rate": 5.265815265815266e-06, + "loss": 0.0467, "step": 13880 }, { - "epoch": 3.5066902297399647, - "grad_norm": 0.19913332164287567, - "learning_rate": 8.959858621560212e-06, - "loss": 0.0617, + "epoch": 4.125334125334125, + "grad_norm": 0.4602527320384979, + "learning_rate": 5.247995247995247e-06, + "loss": 0.0583, "step": 13890 }, { - "epoch": 3.509214844736178, - "grad_norm": 0.42569276690483093, - "learning_rate": 8.944710931582934e-06, - "loss": 0.0543, + "epoch": 4.128304128304128, + "grad_norm": 0.543065071105957, + "learning_rate": 5.2301752301752304e-06, + "loss": 0.0493, "step": 13900 }, { - "epoch": 3.5117394597323908, - "grad_norm": 0.3855074346065521, - "learning_rate": 8.929563241605656e-06, - "loss": 0.0479, + "epoch": 4.1312741312741315, + "grad_norm": 0.40139061212539673, + "learning_rate": 5.212355212355213e-06, + "loss": 0.0543, "step": 13910 }, { - "epoch": 3.514264074728604, - "grad_norm": 0.7564893960952759, - "learning_rate": 8.914415551628377e-06, - "loss": 0.07, + "epoch": 4.134244134244134, + "grad_norm": 0.40932586789131165, + "learning_rate": 5.194535194535195e-06, + "loss": 0.0491, "step": 13920 }, { - "epoch": 3.516788689724817, - "grad_norm": 0.4746794104576111, - "learning_rate": 8.899267861651099e-06, - "loss": 0.0615, + "epoch": 4.137214137214137, + "grad_norm": 0.7136752605438232, + "learning_rate": 5.176715176715177e-06, + "loss": 0.0665, "step": 13930 }, { - "epoch": 3.51931330472103, - "grad_norm": 0.5218245387077332, - "learning_rate": 8.88412017167382e-06, - "loss": 0.0603, + "epoch": 4.14018414018414, + "grad_norm": 0.81020188331604, + "learning_rate": 5.158895158895159e-06, + "loss": 0.0516, "step": 13940 }, { - "epoch": 3.5218379197172434, - "grad_norm": 0.3240072429180145, - "learning_rate": 8.868972481696542e-06, - "loss": 0.0728, + "epoch": 4.143154143154143, + "grad_norm": 0.3689301311969757, + "learning_rate": 5.141075141075141e-06, + "loss": 0.0617, "step": 13950 }, { - "epoch": 3.524362534713456, - "grad_norm": 0.4840814471244812, - "learning_rate": 8.853824791719264e-06, - "loss": 0.0586, + "epoch": 4.146124146124146, + "grad_norm": 0.296916663646698, + "learning_rate": 5.1232551232551234e-06, + "loss": 0.0507, "step": 13960 }, { - "epoch": 3.5268871497096694, - "grad_norm": 0.4556601643562317, - "learning_rate": 8.838677101741984e-06, - "loss": 0.0612, + "epoch": 4.149094149094149, + "grad_norm": 0.45669737458229065, + "learning_rate": 5.105435105435106e-06, + "loss": 0.0589, "step": 13970 }, { - "epoch": 3.5294117647058822, - "grad_norm": 0.6337325572967529, - "learning_rate": 8.823529411764707e-06, - "loss": 0.0695, + "epoch": 4.152064152064152, + "grad_norm": 0.7257834076881409, + "learning_rate": 5.087615087615088e-06, + "loss": 0.0762, "step": 13980 }, { - "epoch": 3.5319363797020955, - "grad_norm": 0.5485546588897705, - "learning_rate": 8.808381721787427e-06, - "loss": 0.0599, + "epoch": 4.155034155034155, + "grad_norm": 0.4654732942581177, + "learning_rate": 5.06979506979507e-06, + "loss": 0.0503, "step": 13990 }, { - "epoch": 3.5344609946983088, - "grad_norm": 0.33449289202690125, - "learning_rate": 8.793234031810148e-06, - "loss": 0.058, + "epoch": 4.158004158004158, + "grad_norm": 0.4994029700756073, + "learning_rate": 5.051975051975052e-06, + "loss": 0.0535, "step": 14000 }, { - "epoch": 3.5369856096945216, - "grad_norm": 0.40033623576164246, - "learning_rate": 8.778086341832872e-06, - "loss": 0.0595, + "epoch": 4.160974160974161, + "grad_norm": 0.47293511033058167, + "learning_rate": 5.034155034155034e-06, + "loss": 0.0564, "step": 14010 }, { - "epoch": 3.5395102246907344, - "grad_norm": 0.31661325693130493, - "learning_rate": 8.762938651855591e-06, - "loss": 0.0552, + "epoch": 4.163944163944164, + "grad_norm": 0.3141496777534485, + "learning_rate": 5.0163350163350165e-06, + "loss": 0.0597, "step": 14020 }, { - "epoch": 3.5420348396869477, - "grad_norm": 0.32345449924468994, - "learning_rate": 8.747790961878313e-06, - "loss": 0.057, + "epoch": 4.166914166914167, + "grad_norm": 0.2851223051548004, + "learning_rate": 4.998514998514999e-06, + "loss": 0.0549, "step": 14030 }, { - "epoch": 3.544559454683161, - "grad_norm": 0.455110639333725, - "learning_rate": 8.732643271901036e-06, - "loss": 0.0639, + "epoch": 4.1698841698841695, + "grad_norm": 0.9652001261711121, + "learning_rate": 4.980694980694981e-06, + "loss": 0.0605, "step": 14040 }, { - "epoch": 3.5470840696793737, - "grad_norm": 0.47227227687835693, - "learning_rate": 8.717495581923756e-06, - "loss": 0.0646, + "epoch": 4.172854172854173, + "grad_norm": 0.6175165772438049, + "learning_rate": 4.962874962874963e-06, + "loss": 0.0574, "step": 14050 }, { - "epoch": 3.549608684675587, - "grad_norm": 0.7046650648117065, - "learning_rate": 8.702347891946478e-06, - "loss": 0.0452, + "epoch": 4.175824175824176, + "grad_norm": 0.39955687522888184, + "learning_rate": 4.945054945054945e-06, + "loss": 0.0733, "step": 14060 }, { - "epoch": 3.5521332996718, - "grad_norm": 0.28317102789878845, - "learning_rate": 8.687200201969201e-06, - "loss": 0.0657, + "epoch": 4.1787941787941785, + "grad_norm": 0.5539454817771912, + "learning_rate": 4.927234927234927e-06, + "loss": 0.0587, "step": 14070 }, { - "epoch": 3.554657914668013, - "grad_norm": 0.5077089071273804, - "learning_rate": 8.672052511991921e-06, - "loss": 0.048, + "epoch": 4.181764181764182, + "grad_norm": 0.574409008026123, + "learning_rate": 4.9094149094149095e-06, + "loss": 0.0642, "step": 14080 }, { - "epoch": 3.5571825296642263, - "grad_norm": 0.657687783241272, - "learning_rate": 8.656904822014643e-06, - "loss": 0.0745, + "epoch": 4.184734184734185, + "grad_norm": 0.4297143816947937, + "learning_rate": 4.891594891594892e-06, + "loss": 0.0567, "step": 14090 }, { - "epoch": 3.559707144660439, - "grad_norm": 0.4756108820438385, - "learning_rate": 8.641757132037364e-06, - "loss": 0.0565, + "epoch": 4.187704187704187, + "grad_norm": 0.49302181601524353, + "learning_rate": 4.873774873774874e-06, + "loss": 0.0562, "step": 14100 }, { - "epoch": 3.5622317596566524, - "grad_norm": 0.36463871598243713, - "learning_rate": 8.626609442060086e-06, - "loss": 0.0632, + "epoch": 4.190674190674191, + "grad_norm": 0.8171068429946899, + "learning_rate": 4.855954855954856e-06, + "loss": 0.0656, "step": 14110 }, { - "epoch": 3.564756374652865, - "grad_norm": 0.514712393283844, - "learning_rate": 8.611461752082807e-06, - "loss": 0.0609, + "epoch": 4.193644193644194, + "grad_norm": 0.6117607951164246, + "learning_rate": 4.838134838134839e-06, + "loss": 0.0596, "step": 14120 }, { - "epoch": 3.5672809896490785, - "grad_norm": 0.4450944662094116, - "learning_rate": 8.596314062105529e-06, - "loss": 0.0515, + "epoch": 4.196614196614196, + "grad_norm": 0.33238255977630615, + "learning_rate": 4.82031482031482e-06, + "loss": 0.0493, "step": 14130 }, { - "epoch": 3.5698056046452917, - "grad_norm": 0.4317336976528168, - "learning_rate": 8.58116637212825e-06, - "loss": 0.0641, + "epoch": 4.1995841995842, + "grad_norm": 0.3627205789089203, + "learning_rate": 4.8024948024948025e-06, + "loss": 0.0579, "step": 14140 }, { - "epoch": 3.5723302196415045, - "grad_norm": 0.46943795680999756, - "learning_rate": 8.566018682150972e-06, - "loss": 0.0495, + "epoch": 4.202554202554203, + "grad_norm": 0.6033427119255066, + "learning_rate": 4.784674784674785e-06, + "loss": 0.0568, "step": 14150 }, { - "epoch": 3.574854834637718, - "grad_norm": 0.42267486453056335, - "learning_rate": 8.550870992173694e-06, - "loss": 0.0525, + "epoch": 4.205524205524205, + "grad_norm": 0.5274185538291931, + "learning_rate": 4.766854766854767e-06, + "loss": 0.062, "step": 14160 }, { - "epoch": 3.5773794496339306, - "grad_norm": 0.4673076272010803, - "learning_rate": 8.535723302196415e-06, - "loss": 0.0621, + "epoch": 4.2084942084942085, + "grad_norm": 0.4550093114376068, + "learning_rate": 4.749034749034749e-06, + "loss": 0.0747, "step": 14170 }, { - "epoch": 3.579904064630144, - "grad_norm": 0.549739420413971, - "learning_rate": 8.520575612219137e-06, - "loss": 0.0711, + "epoch": 4.211464211464212, + "grad_norm": 0.382213294506073, + "learning_rate": 4.731214731214731e-06, + "loss": 0.0457, "step": 14180 }, { - "epoch": 3.582428679626357, - "grad_norm": 0.5667104125022888, - "learning_rate": 8.505427922241859e-06, - "loss": 0.0607, + "epoch": 4.214434214434214, + "grad_norm": 0.5736550092697144, + "learning_rate": 4.713394713394714e-06, + "loss": 0.0614, "step": 14190 }, { - "epoch": 3.58495329462257, - "grad_norm": 0.5759524703025818, - "learning_rate": 8.49028023226458e-06, - "loss": 0.054, + "epoch": 4.2174042174042174, + "grad_norm": 0.5673187971115112, + "learning_rate": 4.6955746955746955e-06, + "loss": 0.0582, "step": 14200 }, { - "epoch": 3.587477909618783, - "grad_norm": 0.5231726765632629, - "learning_rate": 8.475132542287302e-06, - "loss": 0.0651, + "epoch": 4.220374220374221, + "grad_norm": 0.6587729454040527, + "learning_rate": 4.677754677754678e-06, + "loss": 0.0681, "step": 14210 }, { - "epoch": 3.590002524614996, - "grad_norm": 0.5827665328979492, - "learning_rate": 8.459984852310023e-06, - "loss": 0.069, + "epoch": 4.223344223344223, + "grad_norm": 0.6249194741249084, + "learning_rate": 4.65993465993466e-06, + "loss": 0.0662, "step": 14220 }, { - "epoch": 3.5925271396112093, - "grad_norm": 0.35911238193511963, - "learning_rate": 8.444837162332745e-06, - "loss": 0.0602, + "epoch": 4.226314226314226, + "grad_norm": 0.6569053530693054, + "learning_rate": 4.642114642114642e-06, + "loss": 0.0634, "step": 14230 }, { - "epoch": 3.5950517546074225, - "grad_norm": 0.5111239552497864, - "learning_rate": 8.429689472355467e-06, - "loss": 0.0549, + "epoch": 4.22928422928423, + "grad_norm": 0.6076725125312805, + "learning_rate": 4.624294624294624e-06, + "loss": 0.0641, "step": 14240 }, { - "epoch": 3.5975763696036354, - "grad_norm": 0.39395228028297424, - "learning_rate": 8.414541782378186e-06, - "loss": 0.0704, + "epoch": 4.232254232254232, + "grad_norm": 0.4433649182319641, + "learning_rate": 4.606474606474606e-06, + "loss": 0.0471, "step": 14250 }, { - "epoch": 3.6001009845998486, - "grad_norm": 0.30438244342803955, - "learning_rate": 8.399394092400908e-06, - "loss": 0.0545, + "epoch": 4.235224235224235, + "grad_norm": 0.34535735845565796, + "learning_rate": 4.588654588654589e-06, + "loss": 0.0619, "step": 14260 }, { - "epoch": 3.6026255995960614, - "grad_norm": 0.6330828666687012, - "learning_rate": 8.384246402423631e-06, - "loss": 0.0691, + "epoch": 4.238194238194239, + "grad_norm": 0.3933964967727661, + "learning_rate": 4.570834570834571e-06, + "loss": 0.0588, "step": 14270 }, { - "epoch": 3.6051502145922747, - "grad_norm": 0.6127652525901794, - "learning_rate": 8.369098712446351e-06, - "loss": 0.061, + "epoch": 4.241164241164241, + "grad_norm": 0.577758252620697, + "learning_rate": 4.553014553014554e-06, + "loss": 0.0625, "step": 14280 }, { - "epoch": 3.607674829588488, - "grad_norm": 0.5500686764717102, - "learning_rate": 8.353951022469073e-06, - "loss": 0.0565, + "epoch": 4.244134244134244, + "grad_norm": 0.4267483353614807, + "learning_rate": 4.535194535194535e-06, + "loss": 0.0456, "step": 14290 }, { - "epoch": 3.6101994445847008, - "grad_norm": 0.3351685404777527, - "learning_rate": 8.338803332491796e-06, - "loss": 0.0463, + "epoch": 4.2471042471042475, + "grad_norm": 0.42397600412368774, + "learning_rate": 4.517374517374517e-06, + "loss": 0.056, "step": 14300 }, { - "epoch": 3.612724059580914, - "grad_norm": 0.6388446092605591, - "learning_rate": 8.323655642514516e-06, - "loss": 0.0567, + "epoch": 4.25007425007425, + "grad_norm": 0.3087056279182434, + "learning_rate": 4.499554499554499e-06, + "loss": 0.0564, "step": 14310 }, { - "epoch": 3.615248674577127, - "grad_norm": 0.311712384223938, - "learning_rate": 8.308507952537238e-06, - "loss": 0.0499, + "epoch": 4.253044253044253, + "grad_norm": 0.3736560046672821, + "learning_rate": 4.481734481734482e-06, + "loss": 0.0647, "step": 14320 }, { - "epoch": 3.61777328957334, - "grad_norm": 0.5051207542419434, - "learning_rate": 8.293360262559961e-06, - "loss": 0.0612, + "epoch": 4.256014256014256, + "grad_norm": 0.37401074171066284, + "learning_rate": 4.463914463914465e-06, + "loss": 0.0526, "step": 14330 }, { - "epoch": 3.6202979045695534, - "grad_norm": 0.6908370852470398, - "learning_rate": 8.27821257258268e-06, - "loss": 0.0677, + "epoch": 4.258984258984259, + "grad_norm": 0.6431254744529724, + "learning_rate": 4.446094446094446e-06, + "loss": 0.072, "step": 14340 }, { - "epoch": 3.622822519565766, - "grad_norm": 0.5216169953346252, - "learning_rate": 8.263064882605402e-06, - "loss": 0.0646, + "epoch": 4.261954261954262, + "grad_norm": 0.3994961380958557, + "learning_rate": 4.428274428274429e-06, + "loss": 0.0539, "step": 14350 }, { - "epoch": 3.6253471345619794, - "grad_norm": 0.35106194019317627, - "learning_rate": 8.247917192628126e-06, - "loss": 0.0689, + "epoch": 4.2649242649242645, + "grad_norm": 0.5059460997581482, + "learning_rate": 4.41045441045441e-06, + "loss": 0.0557, "step": 14360 }, { - "epoch": 3.6278717495581922, - "grad_norm": 0.5394479036331177, - "learning_rate": 8.232769502650846e-06, - "loss": 0.0534, + "epoch": 4.267894267894268, + "grad_norm": 0.201277494430542, + "learning_rate": 4.392634392634393e-06, + "loss": 0.0557, "step": 14370 }, { - "epoch": 3.6303963645544055, - "grad_norm": 0.5065087676048279, - "learning_rate": 8.217621812673567e-06, - "loss": 0.0588, + "epoch": 4.270864270864271, + "grad_norm": 0.22198070585727692, + "learning_rate": 4.374814374814375e-06, + "loss": 0.0526, "step": 14380 }, { - "epoch": 3.6329209795506188, - "grad_norm": 0.6077365875244141, - "learning_rate": 8.202474122696289e-06, - "loss": 0.062, + "epoch": 4.273834273834273, + "grad_norm": 0.2608140707015991, + "learning_rate": 4.356994356994357e-06, + "loss": 0.0518, "step": 14390 }, { - "epoch": 3.6354455945468316, - "grad_norm": 0.5733447074890137, - "learning_rate": 8.18732643271901e-06, - "loss": 0.0664, + "epoch": 4.276804276804277, + "grad_norm": 0.3986319601535797, + "learning_rate": 4.33917433917434e-06, + "loss": 0.0596, "step": 14400 }, { - "epoch": 3.637970209543045, - "grad_norm": 0.3470951020717621, - "learning_rate": 8.172178742741732e-06, - "loss": 0.0619, + "epoch": 4.27977427977428, + "grad_norm": 0.5883366465568542, + "learning_rate": 4.321354321354321e-06, + "loss": 0.058, "step": 14410 }, { - "epoch": 3.6404948245392577, - "grad_norm": 0.7638422250747681, - "learning_rate": 8.157031052764454e-06, - "loss": 0.0622, + "epoch": 4.282744282744282, + "grad_norm": 0.7146860361099243, + "learning_rate": 4.303534303534304e-06, + "loss": 0.0638, "step": 14420 }, { - "epoch": 3.643019439535471, - "grad_norm": 0.5586839914321899, - "learning_rate": 8.141883362787175e-06, - "loss": 0.0616, + "epoch": 4.285714285714286, + "grad_norm": 0.630452036857605, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.0519, "step": 14430 }, { - "epoch": 3.645544054531684, - "grad_norm": 0.28789180517196655, - "learning_rate": 8.126735672809897e-06, - "loss": 0.0548, + "epoch": 4.288684288684289, + "grad_norm": 0.7049713730812073, + "learning_rate": 4.2678942678942685e-06, + "loss": 0.0559, "step": 14440 }, { - "epoch": 3.648068669527897, - "grad_norm": 0.351252943277359, - "learning_rate": 8.111587982832618e-06, - "loss": 0.0554, + "epoch": 4.291654291654291, + "grad_norm": 0.31321823596954346, + "learning_rate": 4.25007425007425e-06, + "loss": 0.054, "step": 14450 }, { - "epoch": 3.65059328452411, - "grad_norm": 0.5091794729232788, - "learning_rate": 8.09644029285534e-06, - "loss": 0.0624, + "epoch": 4.2946242946242945, + "grad_norm": 0.8444371223449707, + "learning_rate": 4.232254232254232e-06, + "loss": 0.0589, "step": 14460 }, { - "epoch": 3.653117899520323, - "grad_norm": 0.5550070405006409, - "learning_rate": 8.081292602878062e-06, - "loss": 0.0479, + "epoch": 4.297594297594298, + "grad_norm": 0.5905739665031433, + "learning_rate": 4.214434214434215e-06, + "loss": 0.0633, "step": 14470 }, { - "epoch": 3.6556425145165363, - "grad_norm": 0.5677917003631592, - "learning_rate": 8.066144912900783e-06, - "loss": 0.0508, + "epoch": 4.3005643005643, + "grad_norm": 0.4641624093055725, + "learning_rate": 4.196614196614196e-06, + "loss": 0.0611, "step": 14480 }, { - "epoch": 3.6581671295127496, - "grad_norm": 0.38157495856285095, - "learning_rate": 8.050997222923505e-06, - "loss": 0.06, + "epoch": 4.303534303534303, + "grad_norm": 0.5575865507125854, + "learning_rate": 4.178794178794179e-06, + "loss": 0.0576, "step": 14490 }, { - "epoch": 3.6606917445089624, - "grad_norm": 0.4893856942653656, - "learning_rate": 8.035849532946226e-06, - "loss": 0.0583, + "epoch": 4.306504306504307, + "grad_norm": 0.7232492566108704, + "learning_rate": 4.160974160974161e-06, + "loss": 0.0575, "step": 14500 }, { - "epoch": 3.663216359505175, - "grad_norm": 0.5423852801322937, - "learning_rate": 8.020701842968948e-06, - "loss": 0.0569, + "epoch": 4.309474309474309, + "grad_norm": 0.5242018103599548, + "learning_rate": 4.143154143154144e-06, + "loss": 0.0692, "step": 14510 }, { - "epoch": 3.6657409745013885, - "grad_norm": 0.2972621023654938, - "learning_rate": 8.00555415299167e-06, - "loss": 0.0471, + "epoch": 4.312444312444312, + "grad_norm": 0.622914731502533, + "learning_rate": 4.125334125334125e-06, + "loss": 0.0688, "step": 14520 }, { - "epoch": 3.6682655894976017, - "grad_norm": 0.324066162109375, - "learning_rate": 7.990406463014391e-06, - "loss": 0.0552, + "epoch": 4.315414315414316, + "grad_norm": 0.5062875151634216, + "learning_rate": 4.107514107514108e-06, + "loss": 0.0542, "step": 14530 }, { - "epoch": 3.6707902044938145, - "grad_norm": 0.5666695237159729, - "learning_rate": 7.975258773037111e-06, - "loss": 0.0493, + "epoch": 4.318384318384318, + "grad_norm": 0.5135970711708069, + "learning_rate": 4.08969408969409e-06, + "loss": 0.0525, "step": 14540 }, { - "epoch": 3.673314819490028, - "grad_norm": 0.4640502333641052, - "learning_rate": 7.960111083059833e-06, - "loss": 0.0564, + "epoch": 4.321354321354321, + "grad_norm": 0.2701030969619751, + "learning_rate": 4.0718740718740715e-06, + "loss": 0.0525, "step": 14550 }, { - "epoch": 3.6758394344862406, - "grad_norm": 0.2752133905887604, - "learning_rate": 7.944963393082556e-06, - "loss": 0.0634, + "epoch": 4.324324324324325, + "grad_norm": 0.7602173089981079, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.053, "step": 14560 }, { - "epoch": 3.678364049482454, - "grad_norm": 0.6435825824737549, - "learning_rate": 7.929815703105276e-06, - "loss": 0.0499, + "epoch": 4.327294327294327, + "grad_norm": 0.7320886254310608, + "learning_rate": 4.036234036234036e-06, + "loss": 0.0576, "step": 14570 }, { - "epoch": 3.680888664478667, - "grad_norm": 0.3424232006072998, - "learning_rate": 7.914668013127997e-06, - "loss": 0.0558, + "epoch": 4.33026433026433, + "grad_norm": 0.422878623008728, + "learning_rate": 4.018414018414019e-06, + "loss": 0.0578, "step": 14580 }, { - "epoch": 3.68341327947488, - "grad_norm": 0.4150161147117615, - "learning_rate": 7.89952032315072e-06, - "loss": 0.0573, + "epoch": 4.3332343332343335, + "grad_norm": 0.449724018573761, + "learning_rate": 4.000594000594e-06, + "loss": 0.0664, "step": 14590 }, { - "epoch": 3.685937894471093, - "grad_norm": 0.47552725672721863, - "learning_rate": 7.88437263317344e-06, - "loss": 0.0609, + "epoch": 4.336204336204336, + "grad_norm": 0.22872653603553772, + "learning_rate": 3.982773982773983e-06, + "loss": 0.0529, "step": 14600 }, { - "epoch": 3.688462509467306, - "grad_norm": 0.2726339101791382, - "learning_rate": 7.869224943196162e-06, - "loss": 0.0565, + "epoch": 4.339174339174339, + "grad_norm": 0.4547821581363678, + "learning_rate": 3.964953964953965e-06, + "loss": 0.0541, "step": 14610 }, { - "epoch": 3.6909871244635193, - "grad_norm": 0.41144663095474243, - "learning_rate": 7.854077253218885e-06, - "loss": 0.0659, + "epoch": 4.342144342144342, + "grad_norm": 0.5161837339401245, + "learning_rate": 3.947133947133947e-06, + "loss": 0.0439, "step": 14620 }, { - "epoch": 3.6935117394597325, - "grad_norm": 0.4347810447216034, - "learning_rate": 7.838929563241605e-06, - "loss": 0.0634, + "epoch": 4.345114345114345, + "grad_norm": 0.6731418371200562, + "learning_rate": 3.92931392931393e-06, + "loss": 0.0554, "step": 14630 }, { - "epoch": 3.6960363544559454, - "grad_norm": 0.6239407658576965, - "learning_rate": 7.823781873264327e-06, - "loss": 0.0612, + "epoch": 4.348084348084348, + "grad_norm": 0.46018585562705994, + "learning_rate": 3.911493911493911e-06, + "loss": 0.0632, "step": 14640 }, { - "epoch": 3.6985609694521586, - "grad_norm": 0.43966999650001526, - "learning_rate": 7.80863418328705e-06, - "loss": 0.0541, + "epoch": 4.351054351054351, + "grad_norm": 0.3375426232814789, + "learning_rate": 3.893673893673894e-06, + "loss": 0.065, "step": 14650 }, { - "epoch": 3.7010855844483714, - "grad_norm": 0.3874231278896332, - "learning_rate": 7.79348649330977e-06, - "loss": 0.0514, + "epoch": 4.354024354024354, + "grad_norm": 0.5720539093017578, + "learning_rate": 3.875853875853875e-06, + "loss": 0.0595, "step": 14660 }, { - "epoch": 3.7036101994445847, - "grad_norm": 0.5817126035690308, - "learning_rate": 7.778338803332492e-06, - "loss": 0.0631, + "epoch": 4.356994356994357, + "grad_norm": 0.542365312576294, + "learning_rate": 3.858033858033858e-06, + "loss": 0.0493, "step": 14670 }, { - "epoch": 3.706134814440798, - "grad_norm": 0.5537297129631042, - "learning_rate": 7.763191113355213e-06, - "loss": 0.0709, + "epoch": 4.35996435996436, + "grad_norm": 0.6491771340370178, + "learning_rate": 3.8402138402138406e-06, + "loss": 0.0588, "step": 14680 }, { - "epoch": 3.7086594294370108, - "grad_norm": 0.5339685678482056, - "learning_rate": 7.748043423377935e-06, - "loss": 0.0665, + "epoch": 4.362934362934363, + "grad_norm": 0.7092576622962952, + "learning_rate": 3.822393822393823e-06, + "loss": 0.0637, "step": 14690 }, { - "epoch": 3.711184044433224, - "grad_norm": 0.33265355229377747, - "learning_rate": 7.732895733400657e-06, - "loss": 0.0544, + "epoch": 4.365904365904366, + "grad_norm": 0.5155068635940552, + "learning_rate": 3.804573804573805e-06, + "loss": 0.0502, "step": 14700 }, { - "epoch": 3.713708659429437, - "grad_norm": 0.6044656038284302, - "learning_rate": 7.717748043423378e-06, - "loss": 0.051, + "epoch": 4.368874368874369, + "grad_norm": 0.31838563084602356, + "learning_rate": 3.7867537867537867e-06, + "loss": 0.0575, "step": 14710 }, { - "epoch": 3.71623327442565, - "grad_norm": 0.6232115626335144, - "learning_rate": 7.7026003534461e-06, - "loss": 0.0707, + "epoch": 4.371844371844372, + "grad_norm": 0.7911087274551392, + "learning_rate": 3.7689337689337693e-06, + "loss": 0.0628, "step": 14720 }, { - "epoch": 3.7187578894218634, - "grad_norm": 0.43664541840553284, - "learning_rate": 7.687452663468821e-06, - "loss": 0.0607, + "epoch": 4.374814374814375, + "grad_norm": 0.26239511370658875, + "learning_rate": 3.751113751113751e-06, + "loss": 0.0452, "step": 14730 }, { - "epoch": 3.721282504418076, - "grad_norm": 0.36150795221328735, - "learning_rate": 7.672304973491543e-06, - "loss": 0.0556, + "epoch": 4.377784377784378, + "grad_norm": 0.5743318796157837, + "learning_rate": 3.733293733293733e-06, + "loss": 0.0618, "step": 14740 }, { - "epoch": 3.7238071194142894, - "grad_norm": 0.49992161989212036, - "learning_rate": 7.657157283514264e-06, - "loss": 0.0624, + "epoch": 4.3807543807543805, + "grad_norm": 0.520468533039093, + "learning_rate": 3.7154737154737153e-06, + "loss": 0.0578, "step": 14750 }, { - "epoch": 3.7263317344105022, - "grad_norm": 0.7528823614120483, - "learning_rate": 7.642009593536986e-06, - "loss": 0.0724, + "epoch": 4.383724383724384, + "grad_norm": 0.30406662821769714, + "learning_rate": 3.6976536976536975e-06, + "loss": 0.0479, "step": 14760 }, { - "epoch": 3.7288563494067155, - "grad_norm": 0.5596957206726074, - "learning_rate": 7.6268619035597085e-06, - "loss": 0.0539, + "epoch": 4.386694386694387, + "grad_norm": 0.363372266292572, + "learning_rate": 3.67983367983368e-06, + "loss": 0.0523, "step": 14770 }, { - "epoch": 3.7313809644029288, - "grad_norm": 0.532844603061676, - "learning_rate": 7.6117142135824284e-06, - "loss": 0.0656, + "epoch": 4.389664389664389, + "grad_norm": 0.6119177341461182, + "learning_rate": 3.6620136620136623e-06, + "loss": 0.0593, "step": 14780 }, { - "epoch": 3.7339055793991416, - "grad_norm": 0.568172812461853, - "learning_rate": 7.596566523605151e-06, - "loss": 0.0753, + "epoch": 4.392634392634393, + "grad_norm": 0.7049959897994995, + "learning_rate": 3.6441936441936444e-06, + "loss": 0.0554, "step": 14790 }, { - "epoch": 3.736430194395355, - "grad_norm": 0.7042198181152344, - "learning_rate": 7.581418833627871e-06, - "loss": 0.072, + "epoch": 4.395604395604396, + "grad_norm": 0.6827198266983032, + "learning_rate": 3.6263736263736266e-06, + "loss": 0.0536, "step": 14800 }, { - "epoch": 3.7389548093915677, - "grad_norm": 0.46920961141586304, - "learning_rate": 7.566271143650593e-06, - "loss": 0.0563, + "epoch": 4.398574398574398, + "grad_norm": 0.4505496025085449, + "learning_rate": 3.6085536085536088e-06, + "loss": 0.044, "step": 14810 }, { - "epoch": 3.741479424387781, - "grad_norm": 0.655121922492981, - "learning_rate": 7.551123453673316e-06, - "loss": 0.0644, + "epoch": 4.401544401544402, + "grad_norm": 0.36443957686424255, + "learning_rate": 3.5907335907335905e-06, + "loss": 0.065, "step": 14820 }, { - "epoch": 3.744004039383994, - "grad_norm": 0.4194955825805664, - "learning_rate": 7.5359757636960356e-06, - "loss": 0.0552, + "epoch": 4.404514404514405, + "grad_norm": 0.4884301424026489, + "learning_rate": 3.5729135729135727e-06, + "loss": 0.0476, "step": 14830 }, { - "epoch": 3.746528654380207, - "grad_norm": 0.44179287552833557, - "learning_rate": 7.520828073718758e-06, - "loss": 0.0618, + "epoch": 4.407484407484407, + "grad_norm": 0.504188597202301, + "learning_rate": 3.5550935550935553e-06, + "loss": 0.0563, "step": 14840 }, { - "epoch": 3.7490532693764202, - "grad_norm": 0.576785147190094, - "learning_rate": 7.5056803837414804e-06, - "loss": 0.0603, + "epoch": 4.410454410454411, + "grad_norm": 0.19332559406757355, + "learning_rate": 3.5372735372735375e-06, + "loss": 0.0727, "step": 14850 }, { - "epoch": 3.751577884372633, - "grad_norm": 0.5226455330848694, - "learning_rate": 7.490532693764201e-06, - "loss": 0.0549, + "epoch": 4.413424413424414, + "grad_norm": 0.33928439021110535, + "learning_rate": 3.5194535194535196e-06, + "loss": 0.0538, "step": 14860 }, { - "epoch": 3.7541024993688463, - "grad_norm": 0.5605834722518921, - "learning_rate": 7.475385003786923e-06, - "loss": 0.0614, + "epoch": 4.416394416394416, + "grad_norm": 0.6077583432197571, + "learning_rate": 3.501633501633502e-06, + "loss": 0.0583, "step": 14870 }, { - "epoch": 3.7566271143650596, - "grad_norm": 0.4599710702896118, - "learning_rate": 7.4602373138096435e-06, - "loss": 0.0579, + "epoch": 4.4193644193644195, + "grad_norm": 0.5217536091804504, + "learning_rate": 3.483813483813484e-06, + "loss": 0.0515, "step": 14880 }, { - "epoch": 3.7591517293612724, - "grad_norm": 0.3665611445903778, - "learning_rate": 7.445089623832366e-06, - "loss": 0.0503, + "epoch": 4.422334422334423, + "grad_norm": 0.7069948315620422, + "learning_rate": 3.465993465993466e-06, + "loss": 0.0787, "step": 14890 }, { - "epoch": 3.761676344357485, - "grad_norm": 0.5790762901306152, - "learning_rate": 7.4299419338550876e-06, - "loss": 0.0584, + "epoch": 4.425304425304425, + "grad_norm": 0.5601736307144165, + "learning_rate": 3.448173448173448e-06, + "loss": 0.0603, "step": 14900 }, { - "epoch": 3.7642009593536985, - "grad_norm": 0.4075251817703247, - "learning_rate": 7.414794243877808e-06, - "loss": 0.0451, + "epoch": 4.428274428274428, + "grad_norm": 0.687710702419281, + "learning_rate": 3.4303534303534305e-06, + "loss": 0.0743, "step": 14910 }, { - "epoch": 3.7667255743499117, - "grad_norm": 0.3984999656677246, - "learning_rate": 7.399646553900531e-06, - "loss": 0.0604, + "epoch": 4.431244431244432, + "grad_norm": 0.4097294807434082, + "learning_rate": 3.4125334125334127e-06, + "loss": 0.053, "step": 14920 }, { - "epoch": 3.769250189346125, - "grad_norm": 0.5887860059738159, - "learning_rate": 7.3844988639232515e-06, - "loss": 0.0585, + "epoch": 4.434214434214434, + "grad_norm": 0.42233291268348694, + "learning_rate": 3.394713394713395e-06, + "loss": 0.0547, "step": 14930 }, { - "epoch": 3.771774804342338, - "grad_norm": 0.516468346118927, - "learning_rate": 7.369351173945973e-06, - "loss": 0.0646, + "epoch": 4.437184437184437, + "grad_norm": 0.44154420495033264, + "learning_rate": 3.376893376893377e-06, + "loss": 0.0619, "step": 14940 }, { - "epoch": 3.7742994193385506, - "grad_norm": 0.455007404088974, - "learning_rate": 7.3542034839686955e-06, - "loss": 0.0615, + "epoch": 4.440154440154441, + "grad_norm": 0.8544827699661255, + "learning_rate": 3.359073359073359e-06, + "loss": 0.0567, "step": 14950 }, { - "epoch": 3.776824034334764, - "grad_norm": 0.5999513864517212, - "learning_rate": 7.339055793991416e-06, - "loss": 0.0566, + "epoch": 4.443124443124443, + "grad_norm": 0.5352084636688232, + "learning_rate": 3.3412533412533413e-06, + "loss": 0.0544, "step": 14960 }, { - "epoch": 3.779348649330977, - "grad_norm": 0.5931031703948975, - "learning_rate": 7.323908104014138e-06, - "loss": 0.0552, + "epoch": 4.446094446094446, + "grad_norm": 0.3393558859825134, + "learning_rate": 3.3234333234333235e-06, + "loss": 0.048, "step": 14970 }, { - "epoch": 3.78187326432719, - "grad_norm": 0.3761138319969177, - "learning_rate": 7.30876041403686e-06, - "loss": 0.0728, + "epoch": 4.4490644490644495, + "grad_norm": 0.4206056296825409, + "learning_rate": 3.3056133056133057e-06, + "loss": 0.0599, "step": 14980 }, { - "epoch": 3.784397879323403, - "grad_norm": 0.5019908547401428, - "learning_rate": 7.293612724059581e-06, - "loss": 0.0564, + "epoch": 4.452034452034452, + "grad_norm": 0.4742394983768463, + "learning_rate": 3.287793287793288e-06, + "loss": 0.067, "step": 14990 }, { - "epoch": 3.786922494319616, - "grad_norm": 0.4987894296646118, - "learning_rate": 7.278465034082303e-06, - "loss": 0.0403, + "epoch": 4.455004455004455, + "grad_norm": 0.5058844685554504, + "learning_rate": 3.26997326997327e-06, + "loss": 0.0625, "step": 15000 }, { - "epoch": 3.7894471093158293, - "grad_norm": 0.6358135342597961, - "learning_rate": 7.263317344105023e-06, - "loss": 0.0575, + "epoch": 4.457974457974458, + "grad_norm": 0.39022761583328247, + "learning_rate": 3.252153252153252e-06, + "loss": 0.0634, "step": 15010 }, { - "epoch": 3.7919717243120425, - "grad_norm": 0.3665125072002411, - "learning_rate": 7.248169654127746e-06, - "loss": 0.0626, + "epoch": 4.460944460944461, + "grad_norm": 0.46778586506843567, + "learning_rate": 3.2343332343332344e-06, + "loss": 0.0609, "step": 15020 }, { - "epoch": 3.7944963393082554, - "grad_norm": 0.7710779309272766, - "learning_rate": 7.2330219641504674e-06, - "loss": 0.0558, + "epoch": 4.463914463914464, + "grad_norm": 0.7826917767524719, + "learning_rate": 3.2165132165132165e-06, + "loss": 0.0652, "step": 15030 }, { - "epoch": 3.7970209543044686, - "grad_norm": 0.7254796624183655, - "learning_rate": 7.217874274173188e-06, - "loss": 0.059, + "epoch": 4.4668844668844665, + "grad_norm": 0.3851190507411957, + "learning_rate": 3.1986931986931987e-06, + "loss": 0.0706, "step": 15040 }, { - "epoch": 3.7995455693006814, - "grad_norm": 0.6218250393867493, - "learning_rate": 7.202726584195911e-06, - "loss": 0.0594, + "epoch": 4.46985446985447, + "grad_norm": 0.5744338631629944, + "learning_rate": 3.1808731808731813e-06, + "loss": 0.0616, "step": 15050 }, { - "epoch": 3.8020701842968947, - "grad_norm": 0.4998759627342224, - "learning_rate": 7.187578894218632e-06, - "loss": 0.0708, + "epoch": 4.472824472824473, + "grad_norm": 0.2627275288105011, + "learning_rate": 3.163053163053163e-06, + "loss": 0.0545, "step": 15060 }, { - "epoch": 3.804594799293108, - "grad_norm": 0.31853801012039185, - "learning_rate": 7.172431204241353e-06, - "loss": 0.0613, + "epoch": 4.475794475794475, + "grad_norm": 0.6903046369552612, + "learning_rate": 3.1452331452331452e-06, + "loss": 0.0538, "step": 15070 }, { - "epoch": 3.8071194142893208, - "grad_norm": 0.5865182876586914, - "learning_rate": 7.157283514264075e-06, - "loss": 0.0539, + "epoch": 4.478764478764479, + "grad_norm": 0.49576228857040405, + "learning_rate": 3.1274131274131274e-06, + "loss": 0.0555, "step": 15080 }, { - "epoch": 3.809644029285534, - "grad_norm": 0.46650758385658264, - "learning_rate": 7.142135824286796e-06, - "loss": 0.052, + "epoch": 4.481734481734482, + "grad_norm": 0.5750555396080017, + "learning_rate": 3.1095931095931096e-06, + "loss": 0.0526, "step": 15090 }, { - "epoch": 3.812168644281747, - "grad_norm": 0.49982815980911255, - "learning_rate": 7.126988134309518e-06, - "loss": 0.053, + "epoch": 4.484704484704484, + "grad_norm": 0.5842902660369873, + "learning_rate": 3.0917730917730917e-06, + "loss": 0.0654, "step": 15100 }, { - "epoch": 3.81469325927796, - "grad_norm": 0.3356648087501526, - "learning_rate": 7.11184044433224e-06, - "loss": 0.0554, + "epoch": 4.487674487674488, + "grad_norm": 0.6240746974945068, + "learning_rate": 3.073953073953074e-06, + "loss": 0.0542, "step": 15110 }, { - "epoch": 3.8172178742741734, - "grad_norm": 0.42580679059028625, - "learning_rate": 7.096692754354961e-06, - "loss": 0.0588, + "epoch": 4.490644490644491, + "grad_norm": 0.5041930079460144, + "learning_rate": 3.0561330561330565e-06, + "loss": 0.0539, "step": 15120 }, { - "epoch": 3.819742489270386, - "grad_norm": 0.5860910415649414, - "learning_rate": 7.0815450643776825e-06, - "loss": 0.0571, + "epoch": 4.493614493614493, + "grad_norm": 0.7403512597084045, + "learning_rate": 3.0383130383130387e-06, + "loss": 0.0514, "step": 15130 }, { - "epoch": 3.8222671042665994, - "grad_norm": 0.7389113306999207, - "learning_rate": 7.066397374400404e-06, - "loss": 0.0635, + "epoch": 4.4965844965844965, + "grad_norm": 0.39922061562538147, + "learning_rate": 3.0204930204930204e-06, + "loss": 0.0552, "step": 15140 }, { - "epoch": 3.8247917192628123, - "grad_norm": 0.8048897385597229, - "learning_rate": 7.051249684423126e-06, - "loss": 0.0639, + "epoch": 4.4995544995545, + "grad_norm": 0.2986512780189514, + "learning_rate": 3.0026730026730026e-06, + "loss": 0.0417, "step": 15150 }, { - "epoch": 3.8273163342590255, - "grad_norm": 0.43790996074676514, - "learning_rate": 7.036101994445847e-06, - "loss": 0.0469, + "epoch": 4.502524502524502, + "grad_norm": 0.5681390166282654, + "learning_rate": 2.9848529848529848e-06, + "loss": 0.0576, "step": 15160 }, { - "epoch": 3.8298409492552388, - "grad_norm": 0.633665919303894, - "learning_rate": 7.020954304468568e-06, - "loss": 0.0525, + "epoch": 4.5054945054945055, + "grad_norm": 0.3006349802017212, + "learning_rate": 2.967032967032967e-06, + "loss": 0.0556, "step": 15170 }, { - "epoch": 3.8323655642514516, - "grad_norm": 0.48216986656188965, - "learning_rate": 7.0058066144912905e-06, - "loss": 0.0604, + "epoch": 4.508464508464509, + "grad_norm": 0.35743093490600586, + "learning_rate": 2.949212949212949e-06, + "loss": 0.0598, "step": 15180 }, { - "epoch": 3.834890179247665, - "grad_norm": 0.4421907663345337, - "learning_rate": 6.990658924514012e-06, - "loss": 0.0632, + "epoch": 4.511434511434511, + "grad_norm": 0.7890453934669495, + "learning_rate": 2.9313929313929317e-06, + "loss": 0.0611, "step": 15190 }, { - "epoch": 3.8374147942438777, - "grad_norm": 0.41442814469337463, - "learning_rate": 6.975511234536733e-06, - "loss": 0.051, + "epoch": 4.514404514404514, + "grad_norm": 0.5027909874916077, + "learning_rate": 2.913572913572914e-06, + "loss": 0.0599, "step": 15200 }, { - "epoch": 3.839939409240091, - "grad_norm": 0.35029637813568115, - "learning_rate": 6.960363544559455e-06, - "loss": 0.0415, + "epoch": 4.517374517374518, + "grad_norm": 0.41626325249671936, + "learning_rate": 2.895752895752896e-06, + "loss": 0.0689, "step": 15210 }, { - "epoch": 3.842464024236304, - "grad_norm": 0.5078822374343872, - "learning_rate": 6.945215854582176e-06, - "loss": 0.0536, + "epoch": 4.52034452034452, + "grad_norm": 0.48036375641822815, + "learning_rate": 2.877932877932878e-06, + "loss": 0.0591, "step": 15220 }, { - "epoch": 3.844988639232517, - "grad_norm": 0.5782067179679871, - "learning_rate": 6.930068164604898e-06, - "loss": 0.0663, + "epoch": 4.523314523314523, + "grad_norm": 0.3339380919933319, + "learning_rate": 2.86011286011286e-06, + "loss": 0.0522, "step": 15230 }, { - "epoch": 3.8475132542287303, - "grad_norm": 0.38581833243370056, - "learning_rate": 6.91492047462762e-06, - "loss": 0.0623, + "epoch": 4.526284526284527, + "grad_norm": 0.5192808508872986, + "learning_rate": 2.842292842292842e-06, + "loss": 0.0495, "step": 15240 }, { - "epoch": 3.850037869224943, - "grad_norm": 0.4432540535926819, - "learning_rate": 6.899772784650341e-06, - "loss": 0.0608, + "epoch": 4.529254529254529, + "grad_norm": 0.39185869693756104, + "learning_rate": 2.8244728244728243e-06, + "loss": 0.0523, "step": 15250 }, { - "epoch": 3.8525624842211563, - "grad_norm": 0.2796371877193451, - "learning_rate": 6.8846250946730624e-06, - "loss": 0.0515, + "epoch": 4.532224532224532, + "grad_norm": 0.5810967683792114, + "learning_rate": 2.806652806652807e-06, + "loss": 0.0596, "step": 15260 }, { - "epoch": 3.8550870992173696, - "grad_norm": 0.38160884380340576, - "learning_rate": 6.869477404695784e-06, - "loss": 0.0619, + "epoch": 4.5351945351945355, + "grad_norm": 0.48891574144363403, + "learning_rate": 2.788832788832789e-06, + "loss": 0.0684, "step": 15270 }, { - "epoch": 3.8576117142135824, - "grad_norm": 0.4360808730125427, - "learning_rate": 6.854329714718506e-06, - "loss": 0.0686, + "epoch": 4.538164538164538, + "grad_norm": 0.6249604821205139, + "learning_rate": 2.7710127710127712e-06, + "loss": 0.0605, "step": 15280 }, { - "epoch": 3.8601363292097957, - "grad_norm": 0.5405408143997192, - "learning_rate": 6.839182024741227e-06, - "loss": 0.0493, + "epoch": 4.541134541134541, + "grad_norm": 0.5719090700149536, + "learning_rate": 2.7531927531927534e-06, + "loss": 0.0513, "step": 15290 }, { - "epoch": 3.8626609442060085, - "grad_norm": 0.6223629713058472, - "learning_rate": 6.824034334763948e-06, - "loss": 0.0592, + "epoch": 4.5441045441045445, + "grad_norm": 0.5488110780715942, + "learning_rate": 2.7353727353727356e-06, + "loss": 0.0685, "step": 15300 }, { - "epoch": 3.8651855592022217, - "grad_norm": 0.6864993572235107, - "learning_rate": 6.80888664478667e-06, - "loss": 0.0597, + "epoch": 4.547074547074547, + "grad_norm": 0.38646382093429565, + "learning_rate": 2.7175527175527173e-06, + "loss": 0.0557, "step": 15310 }, { - "epoch": 3.867710174198435, - "grad_norm": 0.5163440108299255, - "learning_rate": 6.793738954809392e-06, - "loss": 0.052, + "epoch": 4.55004455004455, + "grad_norm": 0.4562876224517822, + "learning_rate": 2.6997326997326995e-06, + "loss": 0.0538, "step": 15320 }, { - "epoch": 3.870234789194648, - "grad_norm": 0.3290441334247589, - "learning_rate": 6.778591264832113e-06, - "loss": 0.0454, + "epoch": 4.553014553014553, + "grad_norm": 0.3206016719341278, + "learning_rate": 2.681912681912682e-06, + "loss": 0.0469, "step": 15330 }, { - "epoch": 3.8727594041908606, - "grad_norm": 0.5202774405479431, - "learning_rate": 6.763443574854835e-06, - "loss": 0.0494, + "epoch": 4.555984555984556, + "grad_norm": 0.4230201542377472, + "learning_rate": 2.6640926640926642e-06, + "loss": 0.064, "step": 15340 }, { - "epoch": 3.875284019187074, - "grad_norm": 0.8797051906585693, - "learning_rate": 6.748295884877557e-06, - "loss": 0.0664, + "epoch": 4.558954558954559, + "grad_norm": 0.6635040640830994, + "learning_rate": 2.6462726462726464e-06, + "loss": 0.0572, "step": 15350 }, { - "epoch": 3.877808634183287, - "grad_norm": 0.5602105855941772, - "learning_rate": 6.7331481949002775e-06, - "loss": 0.0639, + "epoch": 4.561924561924562, + "grad_norm": 0.6302227973937988, + "learning_rate": 2.6284526284526286e-06, + "loss": 0.0706, "step": 15360 }, { - "epoch": 3.8803332491795004, - "grad_norm": 0.33669281005859375, - "learning_rate": 6.718000504923e-06, - "loss": 0.0542, + "epoch": 4.564894564894565, + "grad_norm": 0.6194272637367249, + "learning_rate": 2.6106326106326108e-06, + "loss": 0.0787, "step": 15370 }, { - "epoch": 3.882857864175713, - "grad_norm": 0.41936057806015015, - "learning_rate": 6.702852814945721e-06, - "loss": 0.0545, + "epoch": 4.567864567864568, + "grad_norm": 0.7719616293907166, + "learning_rate": 2.592812592812593e-06, + "loss": 0.0657, "step": 15380 }, { - "epoch": 3.885382479171926, - "grad_norm": 0.3497774302959442, - "learning_rate": 6.687705124968442e-06, - "loss": 0.0515, + "epoch": 4.57083457083457, + "grad_norm": 0.541800856590271, + "learning_rate": 2.574992574992575e-06, + "loss": 0.0691, "step": 15390 }, { - "epoch": 3.8879070941681393, - "grad_norm": 0.8730018734931946, - "learning_rate": 6.672557434991164e-06, - "loss": 0.0539, + "epoch": 4.573804573804574, + "grad_norm": 0.4170493483543396, + "learning_rate": 2.5571725571725573e-06, + "loss": 0.0623, "step": 15400 }, { - "epoch": 3.8904317091643525, - "grad_norm": 0.3379184305667877, - "learning_rate": 6.6574097450138855e-06, - "loss": 0.0553, + "epoch": 4.576774576774577, + "grad_norm": 0.6817463636398315, + "learning_rate": 2.5393525393525394e-06, + "loss": 0.0475, "step": 15410 }, { - "epoch": 3.8929563241605654, - "grad_norm": 0.7440978288650513, - "learning_rate": 6.642262055036607e-06, - "loss": 0.0585, + "epoch": 4.579744579744579, + "grad_norm": 0.5324716567993164, + "learning_rate": 2.5215325215325216e-06, + "loss": 0.0609, "step": 15420 }, { - "epoch": 3.8954809391567786, - "grad_norm": 0.3775716722011566, - "learning_rate": 6.627114365059329e-06, - "loss": 0.0607, + "epoch": 4.5827145827145825, + "grad_norm": 0.3345739245414734, + "learning_rate": 2.5037125037125038e-06, + "loss": 0.0547, "step": 15430 }, { - "epoch": 3.8980055541529914, - "grad_norm": 0.5544441342353821, - "learning_rate": 6.61196667508205e-06, - "loss": 0.0638, + "epoch": 4.585684585684586, + "grad_norm": 0.5235359072685242, + "learning_rate": 2.485892485892486e-06, + "loss": 0.0439, "step": 15440 }, { - "epoch": 3.9005301691492047, - "grad_norm": 0.5136420726776123, - "learning_rate": 6.596818985104772e-06, - "loss": 0.0642, + "epoch": 4.588654588654588, + "grad_norm": 0.5767530202865601, + "learning_rate": 2.468072468072468e-06, + "loss": 0.0509, "step": 15450 }, { - "epoch": 3.903054784145418, - "grad_norm": 0.3824068307876587, - "learning_rate": 6.581671295127493e-06, - "loss": 0.0616, + "epoch": 4.5916245916245915, + "grad_norm": 0.37491995096206665, + "learning_rate": 2.4502524502524507e-06, + "loss": 0.0537, "step": 15460 }, { - "epoch": 3.9055793991416308, - "grad_norm": 0.6130052208900452, - "learning_rate": 6.566523605150215e-06, - "loss": 0.0558, + "epoch": 4.594594594594595, + "grad_norm": 0.49927496910095215, + "learning_rate": 2.4324324324324325e-06, + "loss": 0.0612, "step": 15470 }, { - "epoch": 3.908104014137844, - "grad_norm": 0.48635825514793396, - "learning_rate": 6.551375915172937e-06, - "loss": 0.0539, + "epoch": 4.597564597564597, + "grad_norm": 0.8037787079811096, + "learning_rate": 2.4146124146124146e-06, + "loss": 0.059, "step": 15480 }, { - "epoch": 3.910628629134057, - "grad_norm": 0.4900931417942047, - "learning_rate": 6.536228225195657e-06, - "loss": 0.0488, + "epoch": 4.6005346005346, + "grad_norm": 0.6241805553436279, + "learning_rate": 2.396792396792397e-06, + "loss": 0.0667, "step": 15490 }, { - "epoch": 3.91315324413027, - "grad_norm": 0.3312920331954956, - "learning_rate": 6.52108053521838e-06, - "loss": 0.0577, + "epoch": 4.603504603504604, + "grad_norm": 0.4899803698062897, + "learning_rate": 2.378972378972379e-06, + "loss": 0.0547, "step": 15500 }, { - "epoch": 3.9156778591264834, - "grad_norm": 0.38757237792015076, - "learning_rate": 6.505932845241101e-06, - "loss": 0.052, + "epoch": 4.606474606474606, + "grad_norm": 0.7477651834487915, + "learning_rate": 2.361152361152361e-06, + "loss": 0.055, "step": 15510 }, { - "epoch": 3.918202474122696, - "grad_norm": 0.5382483601570129, - "learning_rate": 6.490785155263822e-06, - "loss": 0.0617, + "epoch": 4.609444609444609, + "grad_norm": 0.35865089297294617, + "learning_rate": 2.3433323433323433e-06, + "loss": 0.0518, "step": 15520 }, { - "epoch": 3.9207270891189094, - "grad_norm": 0.7959476113319397, - "learning_rate": 6.475637465286545e-06, - "loss": 0.0619, + "epoch": 4.612414612414613, + "grad_norm": 0.6939175128936768, + "learning_rate": 2.325512325512326e-06, + "loss": 0.0556, "step": 15530 }, { - "epoch": 3.9232517041151223, - "grad_norm": 0.5656881332397461, - "learning_rate": 6.460489775309265e-06, - "loss": 0.057, + "epoch": 4.615384615384615, + "grad_norm": 0.6515450477600098, + "learning_rate": 2.307692307692308e-06, + "loss": 0.066, "step": 15540 }, { - "epoch": 3.9257763191113355, - "grad_norm": 0.32945945858955383, - "learning_rate": 6.445342085331987e-06, - "loss": 0.0467, + "epoch": 4.618354618354618, + "grad_norm": 0.5336460471153259, + "learning_rate": 2.28987228987229e-06, + "loss": 0.0601, "step": 15550 }, { - "epoch": 3.9283009341075488, - "grad_norm": 0.5367522835731506, - "learning_rate": 6.4301943953547086e-06, - "loss": 0.0495, + "epoch": 4.6213246213246215, + "grad_norm": 0.43573182821273804, + "learning_rate": 2.272052272052272e-06, + "loss": 0.062, "step": 15560 }, { - "epoch": 3.9308255491037616, - "grad_norm": 0.6505579948425293, - "learning_rate": 6.41504670537743e-06, - "loss": 0.0573, + "epoch": 4.624294624294624, + "grad_norm": 0.5898021459579468, + "learning_rate": 2.254232254232254e-06, + "loss": 0.0508, "step": 15570 }, { - "epoch": 3.933350164099975, - "grad_norm": 0.39535531401634216, - "learning_rate": 6.399899015400152e-06, - "loss": 0.0668, + "epoch": 4.627264627264627, + "grad_norm": 0.4121955931186676, + "learning_rate": 2.2364122364122363e-06, + "loss": 0.0572, "step": 15580 }, { - "epoch": 3.9358747790961877, - "grad_norm": 0.47247302532196045, - "learning_rate": 6.3847513254228725e-06, - "loss": 0.0602, + "epoch": 4.63023463023463, + "grad_norm": 0.4609485864639282, + "learning_rate": 2.2185922185922185e-06, + "loss": 0.0569, "step": 15590 }, { - "epoch": 3.938399394092401, - "grad_norm": 0.45602869987487793, - "learning_rate": 6.369603635445595e-06, - "loss": 0.0623, + "epoch": 4.633204633204633, + "grad_norm": 0.5508905053138733, + "learning_rate": 2.200772200772201e-06, + "loss": 0.0514, "step": 15600 }, { - "epoch": 3.940924009088614, - "grad_norm": 0.5000142455101013, - "learning_rate": 6.3544559454683165e-06, - "loss": 0.0492, + "epoch": 4.636174636174636, + "grad_norm": 0.2802213430404663, + "learning_rate": 2.1829521829521833e-06, + "loss": 0.0655, "step": 15610 }, { - "epoch": 3.943448624084827, - "grad_norm": 0.22823497653007507, - "learning_rate": 6.339308255491037e-06, - "loss": 0.0425, + "epoch": 4.639144639144639, + "grad_norm": 0.4371926784515381, + "learning_rate": 2.1651321651321654e-06, + "loss": 0.0651, "step": 15620 }, { - "epoch": 3.9459732390810403, - "grad_norm": 0.7233946323394775, - "learning_rate": 6.32416056551376e-06, - "loss": 0.0517, + "epoch": 4.642114642114642, + "grad_norm": 0.42453938722610474, + "learning_rate": 2.147312147312147e-06, + "loss": 0.0511, "step": 15630 }, { - "epoch": 3.948497854077253, - "grad_norm": 0.4734395444393158, - "learning_rate": 6.309012875536481e-06, - "loss": 0.0548, + "epoch": 4.645084645084645, + "grad_norm": 0.5437641143798828, + "learning_rate": 2.1294921294921294e-06, + "loss": 0.0668, "step": 15640 }, { - "epoch": 3.9510224690734663, - "grad_norm": 0.5519753098487854, - "learning_rate": 6.293865185559202e-06, - "loss": 0.054, + "epoch": 4.648054648054648, + "grad_norm": 0.3894469141960144, + "learning_rate": 2.1116721116721115e-06, + "loss": 0.062, "step": 15650 }, { - "epoch": 3.9535470840696796, - "grad_norm": 0.8592543601989746, - "learning_rate": 6.2787174955819245e-06, - "loss": 0.0622, + "epoch": 4.651024651024651, + "grad_norm": 0.47583234310150146, + "learning_rate": 2.0938520938520937e-06, + "loss": 0.0566, "step": 15660 }, { - "epoch": 3.9560716990658924, - "grad_norm": 0.716918408870697, - "learning_rate": 6.263569805604645e-06, - "loss": 0.0651, + "epoch": 4.653994653994654, + "grad_norm": 0.4548485279083252, + "learning_rate": 2.0760320760320763e-06, + "loss": 0.056, "step": 15670 }, { - "epoch": 3.9585963140621057, - "grad_norm": 0.5099024176597595, - "learning_rate": 6.248422115627367e-06, - "loss": 0.0668, + "epoch": 4.656964656964657, + "grad_norm": 0.802291989326477, + "learning_rate": 2.0582120582120585e-06, + "loss": 0.0655, "step": 15680 }, { - "epoch": 3.9611209290583185, - "grad_norm": 0.4536076784133911, - "learning_rate": 6.2332744256500884e-06, - "loss": 0.0477, + "epoch": 4.65993465993466, + "grad_norm": 0.7516531944274902, + "learning_rate": 2.0403920403920406e-06, + "loss": 0.0639, "step": 15690 }, { - "epoch": 3.9636455440545317, - "grad_norm": 0.6459512114524841, - "learning_rate": 6.21812673567281e-06, - "loss": 0.057, + "epoch": 4.662904662904663, + "grad_norm": 0.32423585653305054, + "learning_rate": 2.022572022572023e-06, + "loss": 0.0569, "step": 15700 }, { - "epoch": 3.966170159050745, - "grad_norm": 0.6501273512840271, - "learning_rate": 6.202979045695532e-06, - "loss": 0.0526, + "epoch": 4.665874665874666, + "grad_norm": 0.6043174266815186, + "learning_rate": 2.0047520047520046e-06, + "loss": 0.0579, "step": 15710 }, { - "epoch": 3.968694774046958, - "grad_norm": 0.40765637159347534, - "learning_rate": 6.187831355718253e-06, - "loss": 0.0639, + "epoch": 4.6688446688446685, + "grad_norm": 0.6407160758972168, + "learning_rate": 1.9869319869319867e-06, + "loss": 0.0567, "step": 15720 }, { - "epoch": 3.971219389043171, - "grad_norm": 0.564132571220398, - "learning_rate": 6.172683665740975e-06, - "loss": 0.0631, + "epoch": 4.671814671814672, + "grad_norm": 0.4470699727535248, + "learning_rate": 1.969111969111969e-06, + "loss": 0.0588, "step": 15730 }, { - "epoch": 3.973744004039384, - "grad_norm": 0.5302206873893738, - "learning_rate": 6.157535975763696e-06, - "loss": 0.0569, + "epoch": 4.674784674784675, + "grad_norm": 0.582695484161377, + "learning_rate": 1.9512919512919515e-06, + "loss": 0.0617, "step": 15740 }, { - "epoch": 3.976268619035597, - "grad_norm": 0.5838475227355957, - "learning_rate": 6.142388285786417e-06, - "loss": 0.0633, + "epoch": 4.6777546777546775, + "grad_norm": 0.5506203770637512, + "learning_rate": 1.9334719334719337e-06, + "loss": 0.0561, "step": 15750 }, { - "epoch": 3.9787932340318104, - "grad_norm": 0.46987318992614746, - "learning_rate": 6.12724059580914e-06, - "loss": 0.0603, + "epoch": 4.680724680724681, + "grad_norm": 0.47693562507629395, + "learning_rate": 1.915651915651916e-06, + "loss": 0.0681, "step": 15760 }, { - "epoch": 3.981317849028023, - "grad_norm": 0.5803635716438293, - "learning_rate": 6.112092905831861e-06, - "loss": 0.0631, + "epoch": 4.683694683694684, + "grad_norm": 0.5993645191192627, + "learning_rate": 1.8978318978318978e-06, + "loss": 0.0568, "step": 15770 }, { - "epoch": 3.983842464024236, - "grad_norm": 0.34555870294570923, - "learning_rate": 6.096945215854582e-06, - "loss": 0.059, + "epoch": 4.686664686664686, + "grad_norm": 0.4940324127674103, + "learning_rate": 1.88001188001188e-06, + "loss": 0.0598, "step": 15780 }, { - "epoch": 3.9863670790204493, - "grad_norm": 0.3343939781188965, - "learning_rate": 6.081797525877304e-06, - "loss": 0.0625, + "epoch": 4.68963468963469, + "grad_norm": 0.2504422664642334, + "learning_rate": 1.8621918621918623e-06, + "loss": 0.0529, "step": 15790 }, { - "epoch": 3.9888916940166625, - "grad_norm": 0.6097005605697632, - "learning_rate": 6.066649835900025e-06, - "loss": 0.0617, + "epoch": 4.692604692604693, + "grad_norm": 0.46793416142463684, + "learning_rate": 1.8443718443718445e-06, + "loss": 0.0512, "step": 15800 }, { - "epoch": 3.991416309012876, - "grad_norm": 0.495924174785614, - "learning_rate": 6.051502145922747e-06, - "loss": 0.0441, + "epoch": 4.695574695574695, + "grad_norm": 0.5812580585479736, + "learning_rate": 1.8265518265518265e-06, + "loss": 0.0651, "step": 15810 }, { - "epoch": 3.9939409240090886, - "grad_norm": 0.19013169407844543, - "learning_rate": 6.036354455945468e-06, - "loss": 0.0661, + "epoch": 4.698544698544699, + "grad_norm": 0.6013128757476807, + "learning_rate": 1.8087318087318088e-06, + "loss": 0.0468, "step": 15820 }, { - "epoch": 3.9964655390053014, - "grad_norm": 0.4961543083190918, - "learning_rate": 6.02120676596819e-06, - "loss": 0.0661, + "epoch": 4.701514701514702, + "grad_norm": 0.7705390453338623, + "learning_rate": 1.790911790911791e-06, + "loss": 0.068, "step": 15830 }, { - "epoch": 3.9989901540015147, - "grad_norm": 0.4737859070301056, - "learning_rate": 6.0060590759909115e-06, - "loss": 0.0627, + "epoch": 4.704484704484704, + "grad_norm": 0.3266391158103943, + "learning_rate": 1.7730917730917732e-06, + "loss": 0.0599, "step": 15840 }, { - "epoch": 4.0, - "eval_f1": 0.9705180789481339, - "eval_loss": 0.04224640876054764, - "eval_runtime": 902.072, - "eval_samples_per_second": 228.654, - "eval_steps_per_second": 3.573, - "step": 15844 - }, - { - "epoch": 4.001514768997728, - "grad_norm": 0.5551168322563171, - "learning_rate": 5.990911386013633e-06, - "loss": 0.0627, + "epoch": 4.7074547074547075, + "grad_norm": 0.5801645517349243, + "learning_rate": 1.7552717552717551e-06, + "loss": 0.05, "step": 15850 }, { - "epoch": 4.004039383993941, - "grad_norm": 0.5756297707557678, - "learning_rate": 5.975763696036355e-06, - "loss": 0.0502, + "epoch": 4.710424710424711, + "grad_norm": 0.4369991719722748, + "learning_rate": 1.7374517374517375e-06, + "loss": 0.0555, "step": 15860 }, { - "epoch": 4.006563998990154, - "grad_norm": 0.3061296045780182, - "learning_rate": 5.960616006059076e-06, - "loss": 0.0582, + "epoch": 4.713394713394713, + "grad_norm": 0.411531925201416, + "learning_rate": 1.7196317196317197e-06, + "loss": 0.0548, "step": 15870 }, { - "epoch": 4.009088613986367, - "grad_norm": 0.3693884313106537, - "learning_rate": 5.945468316081797e-06, - "loss": 0.055, + "epoch": 4.716364716364716, + "grad_norm": 0.3529096841812134, + "learning_rate": 1.7018117018117019e-06, + "loss": 0.0603, "step": 15880 }, { - "epoch": 4.01161322898258, - "grad_norm": 0.4693322777748108, - "learning_rate": 5.9303206261045195e-06, - "loss": 0.0596, + "epoch": 4.71933471933472, + "grad_norm": 0.4712686240673065, + "learning_rate": 1.683991683991684e-06, + "loss": 0.044, "step": 15890 }, { - "epoch": 4.014137843978793, - "grad_norm": 0.36858975887298584, - "learning_rate": 5.915172936127241e-06, - "loss": 0.048, + "epoch": 4.722304722304722, + "grad_norm": 0.5159938335418701, + "learning_rate": 1.6661716661716662e-06, + "loss": 0.0581, "step": 15900 }, { - "epoch": 4.016662458975007, - "grad_norm": 0.4288581907749176, - "learning_rate": 5.900025246149962e-06, - "loss": 0.0523, + "epoch": 4.725274725274725, + "grad_norm": 0.7334055304527283, + "learning_rate": 1.6483516483516484e-06, + "loss": 0.0515, "step": 15910 }, { - "epoch": 4.019187073971219, - "grad_norm": 0.5312148928642273, - "learning_rate": 5.884877556172684e-06, - "loss": 0.0549, + "epoch": 4.728244728244729, + "grad_norm": 0.6379159688949585, + "learning_rate": 1.6305316305316306e-06, + "loss": 0.0504, "step": 15920 }, { - "epoch": 4.021711688967432, - "grad_norm": 0.3998830020427704, - "learning_rate": 5.869729866195406e-06, - "loss": 0.0671, + "epoch": 4.731214731214731, + "grad_norm": 0.45981767773628235, + "learning_rate": 1.6127116127116127e-06, + "loss": 0.0656, "step": 15930 }, { - "epoch": 4.0242363039636455, - "grad_norm": 0.40000516176223755, - "learning_rate": 5.854582176218127e-06, - "loss": 0.0635, + "epoch": 4.734184734184734, + "grad_norm": 0.39534708857536316, + "learning_rate": 1.594891594891595e-06, + "loss": 0.0563, "step": 15940 }, { - "epoch": 4.026760918959859, - "grad_norm": 0.273234486579895, - "learning_rate": 5.839434486240848e-06, - "loss": 0.0569, + "epoch": 4.737154737154738, + "grad_norm": 0.40459519624710083, + "learning_rate": 1.577071577071577e-06, + "loss": 0.0539, "step": 15950 }, { - "epoch": 4.029285533956072, - "grad_norm": 0.6919571161270142, - "learning_rate": 5.82428679626357e-06, - "loss": 0.0547, + "epoch": 4.74012474012474, + "grad_norm": 0.353635311126709, + "learning_rate": 1.5592515592515594e-06, + "loss": 0.0473, "step": 15960 }, { - "epoch": 4.031810148952284, - "grad_norm": 0.508310079574585, - "learning_rate": 5.809139106286291e-06, - "loss": 0.0623, + "epoch": 4.743094743094743, + "grad_norm": 0.45498237013816833, + "learning_rate": 1.5414315414315414e-06, + "loss": 0.0547, "step": 15970 }, { - "epoch": 4.034334763948498, - "grad_norm": 0.5494049787521362, - "learning_rate": 5.793991416309013e-06, - "loss": 0.0612, + "epoch": 4.7460647460647465, + "grad_norm": 0.47604072093963623, + "learning_rate": 1.5236115236115236e-06, + "loss": 0.0664, "step": 15980 }, { - "epoch": 4.036859378944711, - "grad_norm": 0.4908718168735504, - "learning_rate": 5.778843726331735e-06, - "loss": 0.051, + "epoch": 4.749034749034749, + "grad_norm": 0.5030866265296936, + "learning_rate": 1.5057915057915057e-06, + "loss": 0.0542, "step": 15990 }, { - "epoch": 4.039383993940924, - "grad_norm": 0.40417003631591797, - "learning_rate": 5.763696036354456e-06, - "loss": 0.0651, + "epoch": 4.752004752004752, + "grad_norm": 0.6311854124069214, + "learning_rate": 1.4879714879714881e-06, + "loss": 0.0574, "step": 16000 }, { - "epoch": 4.041908608937137, - "grad_norm": 0.3193455934524536, - "learning_rate": 5.748548346377178e-06, - "loss": 0.0623, + "epoch": 4.754974754974755, + "grad_norm": 0.44515228271484375, + "learning_rate": 1.47015147015147e-06, + "loss": 0.0396, "step": 16010 }, { - "epoch": 4.04443322393335, - "grad_norm": 0.629706859588623, - "learning_rate": 5.733400656399899e-06, - "loss": 0.0538, + "epoch": 4.757944757944758, + "grad_norm": 0.31190499663352966, + "learning_rate": 1.4523314523314523e-06, + "loss": 0.048, "step": 16020 }, { - "epoch": 4.046957838929563, - "grad_norm": 0.5846245884895325, - "learning_rate": 5.718252966422621e-06, - "loss": 0.0633, + "epoch": 4.760914760914761, + "grad_norm": 0.3565562069416046, + "learning_rate": 1.4345114345114346e-06, + "loss": 0.0548, "step": 16030 }, { - "epoch": 4.049482453925776, - "grad_norm": 0.4376593828201294, - "learning_rate": 5.703105276445342e-06, - "loss": 0.0537, + "epoch": 4.763884763884764, + "grad_norm": 0.4140501320362091, + "learning_rate": 1.4166914166914168e-06, + "loss": 0.0577, "step": 16040 }, { - "epoch": 4.05200706892199, - "grad_norm": 0.44141125679016113, - "learning_rate": 5.687957586468064e-06, - "loss": 0.0563, + "epoch": 4.766854766854767, + "grad_norm": 0.5318161845207214, + "learning_rate": 1.3988713988713988e-06, + "loss": 0.052, "step": 16050 }, { - "epoch": 4.054531683918203, - "grad_norm": 0.466325581073761, - "learning_rate": 5.672809896490786e-06, - "loss": 0.0666, + "epoch": 4.76982476982477, + "grad_norm": 0.48852646350860596, + "learning_rate": 1.381051381051381e-06, + "loss": 0.0738, "step": 16060 }, { - "epoch": 4.057056298914415, - "grad_norm": 0.4581875205039978, - "learning_rate": 5.6576622065135065e-06, - "loss": 0.0635, + "epoch": 4.772794772794773, + "grad_norm": 0.501015305519104, + "learning_rate": 1.3632313632313633e-06, + "loss": 0.0584, "step": 16070 }, { - "epoch": 4.0595809139106285, - "grad_norm": 0.31961241364479065, - "learning_rate": 5.642514516536228e-06, - "loss": 0.045, + "epoch": 4.775764775764776, + "grad_norm": 0.46425512433052063, + "learning_rate": 1.3454113454113455e-06, + "loss": 0.0639, "step": 16080 }, { - "epoch": 4.062105528906842, - "grad_norm": 0.31504639983177185, - "learning_rate": 5.62736682655895e-06, - "loss": 0.0663, + "epoch": 4.778734778734779, + "grad_norm": 0.4860481321811676, + "learning_rate": 1.3275913275913275e-06, + "loss": 0.0624, "step": 16090 }, { - "epoch": 4.064630143903055, - "grad_norm": 0.19217759370803833, - "learning_rate": 5.612219136581671e-06, - "loss": 0.0684, + "epoch": 4.781704781704782, + "grad_norm": 0.7363678812980652, + "learning_rate": 1.3097713097713098e-06, + "loss": 0.0735, "step": 16100 }, { - "epoch": 4.067154758899268, - "grad_norm": 0.3876609802246094, - "learning_rate": 5.597071446604393e-06, - "loss": 0.0473, + "epoch": 4.784674784674785, + "grad_norm": 0.6220631003379822, + "learning_rate": 1.291951291951292e-06, + "loss": 0.0591, "step": 16110 }, { - "epoch": 4.069679373895481, - "grad_norm": 0.8723250031471252, - "learning_rate": 5.5819237566271145e-06, - "loss": 0.0634, + "epoch": 4.787644787644788, + "grad_norm": 0.3801935613155365, + "learning_rate": 1.2741312741312742e-06, + "loss": 0.0486, "step": 16120 }, { - "epoch": 4.072203988891694, - "grad_norm": 0.41938892006874084, - "learning_rate": 5.566776066649836e-06, - "loss": 0.0576, + "epoch": 4.79061479061479, + "grad_norm": 0.39542245864868164, + "learning_rate": 1.2563112563112563e-06, + "loss": 0.0567, "step": 16130 }, { - "epoch": 4.074728603887907, - "grad_norm": 0.39180928468704224, - "learning_rate": 5.551628376672558e-06, - "loss": 0.0562, + "epoch": 4.7935847935847935, + "grad_norm": 0.6074013113975525, + "learning_rate": 1.2384912384912385e-06, + "loss": 0.0597, "step": 16140 }, { - "epoch": 4.07725321888412, - "grad_norm": 0.5083798766136169, - "learning_rate": 5.536480686695279e-06, - "loss": 0.0487, + "epoch": 4.796554796554797, + "grad_norm": 0.3309972584247589, + "learning_rate": 1.2206712206712207e-06, + "loss": 0.0726, "step": 16150 }, { - "epoch": 4.079777833880334, - "grad_norm": 0.7230374217033386, - "learning_rate": 5.521332996718001e-06, - "loss": 0.0606, + "epoch": 4.799524799524799, + "grad_norm": 0.5621445775032043, + "learning_rate": 1.2028512028512029e-06, + "loss": 0.0561, "step": 16160 }, { - "epoch": 4.082302448876546, - "grad_norm": 0.8608932495117188, - "learning_rate": 5.506185306740722e-06, - "loss": 0.0479, + "epoch": 4.802494802494802, + "grad_norm": 0.571205198764801, + "learning_rate": 1.185031185031185e-06, + "loss": 0.0476, "step": 16170 }, { - "epoch": 4.084827063872759, - "grad_norm": 0.3399522006511688, - "learning_rate": 5.491037616763444e-06, - "loss": 0.062, + "epoch": 4.805464805464806, + "grad_norm": 0.4768125116825104, + "learning_rate": 1.1672111672111672e-06, + "loss": 0.0513, "step": 16180 }, { - "epoch": 4.0873516788689725, - "grad_norm": 0.6174184083938599, - "learning_rate": 5.475889926786166e-06, - "loss": 0.0567, + "epoch": 4.808434808434808, + "grad_norm": 0.5495672821998596, + "learning_rate": 1.1493911493911494e-06, + "loss": 0.0585, "step": 16190 }, { - "epoch": 4.089876293865186, - "grad_norm": 0.5198509693145752, - "learning_rate": 5.460742236808886e-06, - "loss": 0.0572, + "epoch": 4.811404811404811, + "grad_norm": 0.4319486916065216, + "learning_rate": 1.1315711315711318e-06, + "loss": 0.0559, "step": 16200 }, { - "epoch": 4.092400908861399, - "grad_norm": 0.807162344455719, - "learning_rate": 5.445594546831609e-06, - "loss": 0.0707, + "epoch": 4.814374814374815, + "grad_norm": 0.5664613246917725, + "learning_rate": 1.1137511137511137e-06, + "loss": 0.0524, "step": 16210 }, { - "epoch": 4.094925523857611, - "grad_norm": 0.4432941973209381, - "learning_rate": 5.43044685685433e-06, - "loss": 0.0505, + "epoch": 4.817344817344817, + "grad_norm": 0.4833865463733673, + "learning_rate": 1.0959310959310959e-06, + "loss": 0.0592, "step": 16220 }, { - "epoch": 4.097450138853825, - "grad_norm": 0.5982592701911926, - "learning_rate": 5.415299166877051e-06, - "loss": 0.0521, + "epoch": 4.82031482031482, + "grad_norm": 0.49978017807006836, + "learning_rate": 1.078111078111078e-06, + "loss": 0.0616, "step": 16230 }, { - "epoch": 4.099974753850038, - "grad_norm": 0.5292986631393433, - "learning_rate": 5.400151476899773e-06, - "loss": 0.0481, + "epoch": 4.8232848232848236, + "grad_norm": 0.434505432844162, + "learning_rate": 1.0602910602910604e-06, + "loss": 0.0699, "step": 16240 }, { - "epoch": 4.102499368846251, - "grad_norm": 0.42435184121131897, - "learning_rate": 5.385003786922494e-06, - "loss": 0.0549, + "epoch": 4.826254826254826, + "grad_norm": 0.4497815668582916, + "learning_rate": 1.0424710424710424e-06, + "loss": 0.0573, "step": 16250 }, { - "epoch": 4.1050239838424645, - "grad_norm": 0.5759508013725281, - "learning_rate": 5.369856096945216e-06, - "loss": 0.0574, + "epoch": 4.829224829224829, + "grad_norm": 0.5861119031906128, + "learning_rate": 1.0246510246510246e-06, + "loss": 0.0623, "step": 16260 }, { - "epoch": 4.107548598838677, - "grad_norm": 0.7036067843437195, - "learning_rate": 5.3547084069679375e-06, - "loss": 0.0633, + "epoch": 4.8321948321948325, + "grad_norm": 0.3796347677707672, + "learning_rate": 1.006831006831007e-06, + "loss": 0.0628, "step": 16270 }, { - "epoch": 4.11007321383489, - "grad_norm": 0.45638132095336914, - "learning_rate": 5.339560716990659e-06, - "loss": 0.0472, + "epoch": 4.835164835164835, + "grad_norm": 0.5198697447776794, + "learning_rate": 9.890109890109891e-07, + "loss": 0.0502, "step": 16280 }, { - "epoch": 4.112597828831103, - "grad_norm": 0.5197755098342896, - "learning_rate": 5.324413027013381e-06, - "loss": 0.0598, + "epoch": 4.838134838134838, + "grad_norm": 0.8420373797416687, + "learning_rate": 9.711909711909713e-07, + "loss": 0.0627, "step": 16290 }, { - "epoch": 4.115122443827317, - "grad_norm": 0.5755794048309326, - "learning_rate": 5.309265337036102e-06, - "loss": 0.0513, + "epoch": 4.841104841104841, + "grad_norm": 0.5385600328445435, + "learning_rate": 9.533709533709534e-07, + "loss": 0.054, "step": 16300 }, { - "epoch": 4.117647058823529, - "grad_norm": 0.470956414937973, - "learning_rate": 5.294117647058824e-06, - "loss": 0.0685, + "epoch": 4.844074844074844, + "grad_norm": 0.675041913986206, + "learning_rate": 9.355509355509356e-07, + "loss": 0.056, "step": 16310 }, { - "epoch": 4.120171673819742, - "grad_norm": 0.4178047478199005, - "learning_rate": 5.2789699570815455e-06, - "loss": 0.0638, + "epoch": 4.847044847044847, + "grad_norm": 0.6432201862335205, + "learning_rate": 9.177309177309178e-07, + "loss": 0.046, "step": 16320 }, { - "epoch": 4.1226962888159555, - "grad_norm": 0.3784389793872833, - "learning_rate": 5.263822267104266e-06, - "loss": 0.0605, + "epoch": 4.85001485001485, + "grad_norm": 0.26743176579475403, + "learning_rate": 8.999108999109e-07, + "loss": 0.0611, "step": 16330 }, { - "epoch": 4.125220903812169, - "grad_norm": 0.5465134382247925, - "learning_rate": 5.248674577126989e-06, - "loss": 0.0627, + "epoch": 4.852984852984853, + "grad_norm": 0.4432642459869385, + "learning_rate": 8.820908820908821e-07, + "loss": 0.0514, "step": 16340 }, { - "epoch": 4.127745518808382, - "grad_norm": 0.7110384106636047, - "learning_rate": 5.23352688714971e-06, - "loss": 0.0623, + "epoch": 4.855954855954856, + "grad_norm": 0.6377805471420288, + "learning_rate": 8.642708642708643e-07, + "loss": 0.0523, "step": 16350 }, { - "epoch": 4.130270133804594, - "grad_norm": 0.38853955268859863, - "learning_rate": 5.218379197172431e-06, - "loss": 0.0595, + "epoch": 4.858924858924859, + "grad_norm": 0.34250327944755554, + "learning_rate": 8.464508464508465e-07, + "loss": 0.053, "step": 16360 }, { - "epoch": 4.132794748800808, - "grad_norm": 0.44513699412345886, - "learning_rate": 5.203231507195153e-06, - "loss": 0.0487, + "epoch": 4.861894861894862, + "grad_norm": 0.32881438732147217, + "learning_rate": 8.286308286308286e-07, + "loss": 0.0568, "step": 16370 }, { - "epoch": 4.135319363797021, - "grad_norm": 0.4682462513446808, - "learning_rate": 5.188083817217874e-06, - "loss": 0.0694, + "epoch": 4.864864864864865, + "grad_norm": 0.38570886850357056, + "learning_rate": 8.108108108108109e-07, + "loss": 0.0611, "step": 16380 }, { - "epoch": 4.137843978793234, - "grad_norm": 0.599108099937439, - "learning_rate": 5.172936127240596e-06, - "loss": 0.047, + "epoch": 4.867834867834868, + "grad_norm": 0.34529945254325867, + "learning_rate": 7.92990792990793e-07, + "loss": 0.0528, "step": 16390 }, { - "epoch": 4.140368593789447, - "grad_norm": 0.49504604935646057, - "learning_rate": 5.157788437263317e-06, - "loss": 0.0502, + "epoch": 4.870804870804871, + "grad_norm": 0.401865690946579, + "learning_rate": 7.751707751707753e-07, + "loss": 0.0592, "step": 16400 }, { - "epoch": 4.14289320878566, - "grad_norm": 0.42351964116096497, - "learning_rate": 5.142640747286039e-06, - "loss": 0.0599, + "epoch": 4.873774873774874, + "grad_norm": 0.4996122419834137, + "learning_rate": 7.573507573507573e-07, + "loss": 0.0717, "step": 16410 }, { - "epoch": 4.145417823781873, - "grad_norm": 0.1978461593389511, - "learning_rate": 5.127493057308761e-06, - "loss": 0.0521, + "epoch": 4.876744876744877, + "grad_norm": 0.9507100582122803, + "learning_rate": 7.395307395307396e-07, + "loss": 0.0518, "step": 16420 }, { - "epoch": 4.147942438778086, - "grad_norm": 0.5165350437164307, - "learning_rate": 5.112345367331482e-06, - "loss": 0.0536, + "epoch": 4.8797148797148795, + "grad_norm": 0.5046108961105347, + "learning_rate": 7.217107217107217e-07, + "loss": 0.0649, "step": 16430 }, { - "epoch": 4.1504670537743, - "grad_norm": 0.4052433669567108, - "learning_rate": 5.097197677354204e-06, - "loss": 0.0575, + "epoch": 4.882684882684883, + "grad_norm": 0.6091616153717041, + "learning_rate": 7.03890703890704e-07, + "loss": 0.0544, "step": 16440 }, { - "epoch": 4.152991668770513, - "grad_norm": 0.6475027203559875, - "learning_rate": 5.082049987376925e-06, - "loss": 0.0541, + "epoch": 4.885654885654886, + "grad_norm": 0.526391863822937, + "learning_rate": 6.860706860706861e-07, + "loss": 0.0581, "step": 16450 }, { - "epoch": 4.155516283766725, - "grad_norm": 0.4289510250091553, - "learning_rate": 5.066902297399646e-06, - "loss": 0.0513, + "epoch": 4.888624888624888, + "grad_norm": 0.6778053641319275, + "learning_rate": 6.682506682506683e-07, + "loss": 0.0534, "step": 16460 }, { - "epoch": 4.1580408987629385, - "grad_norm": 0.38766923546791077, - "learning_rate": 5.051754607422369e-06, - "loss": 0.0487, + "epoch": 4.891594891594892, + "grad_norm": 0.5965909957885742, + "learning_rate": 6.504306504306505e-07, + "loss": 0.0584, "step": 16470 }, { - "epoch": 4.160565513759152, - "grad_norm": 0.7618467211723328, - "learning_rate": 5.03660691744509e-06, - "loss": 0.0519, + "epoch": 4.894564894564894, + "grad_norm": 0.4215756058692932, + "learning_rate": 6.326106326106326e-07, + "loss": 0.0603, "step": 16480 }, { - "epoch": 4.163090128755365, - "grad_norm": 0.3821544945240021, - "learning_rate": 5.021459227467811e-06, - "loss": 0.0628, + "epoch": 4.897534897534897, + "grad_norm": 0.3755158483982086, + "learning_rate": 6.147906147906148e-07, + "loss": 0.0571, "step": 16490 }, { - "epoch": 4.165614743751578, - "grad_norm": 0.436176061630249, - "learning_rate": 5.0063115374905325e-06, - "loss": 0.0524, + "epoch": 4.900504900504901, + "grad_norm": 0.6438432335853577, + "learning_rate": 5.969705969705971e-07, + "loss": 0.0544, "step": 16500 }, { - "epoch": 4.168139358747791, - "grad_norm": 0.6463722586631775, - "learning_rate": 4.991163847513255e-06, - "loss": 0.0535, + "epoch": 4.903474903474903, + "grad_norm": 0.4326302111148834, + "learning_rate": 5.791505791505791e-07, + "loss": 0.064, "step": 16510 }, { - "epoch": 4.170663973744004, - "grad_norm": 0.35036715865135193, - "learning_rate": 4.976016157535976e-06, - "loss": 0.0631, + "epoch": 4.906444906444906, + "grad_norm": 0.39046719670295715, + "learning_rate": 5.613305613305614e-07, + "loss": 0.0541, "step": 16520 }, { - "epoch": 4.173188588740217, - "grad_norm": 0.6320012211799622, - "learning_rate": 4.960868467558697e-06, - "loss": 0.0593, + "epoch": 4.9094149094149095, + "grad_norm": 0.35211583971977234, + "learning_rate": 5.435105435105435e-07, + "loss": 0.0629, "step": 16530 }, { - "epoch": 4.17571320373643, - "grad_norm": 0.29677146673202515, - "learning_rate": 4.945720777581419e-06, - "loss": 0.0512, + "epoch": 4.912384912384912, + "grad_norm": 0.6884411573410034, + "learning_rate": 5.256905256905258e-07, + "loss": 0.0538, "step": 16540 }, { - "epoch": 4.178237818732644, - "grad_norm": 0.4986904263496399, - "learning_rate": 4.9305730876041405e-06, - "loss": 0.0526, + "epoch": 4.915354915354915, + "grad_norm": 0.322244793176651, + "learning_rate": 5.078705078705078e-07, + "loss": 0.0696, "step": 16550 }, { - "epoch": 4.180762433728856, - "grad_norm": 0.5562833547592163, - "learning_rate": 4.915425397626862e-06, - "loss": 0.0522, + "epoch": 4.9183249183249185, + "grad_norm": 0.5389405488967896, + "learning_rate": 4.900504900504901e-07, + "loss": 0.0568, "step": 16560 }, { - "epoch": 4.183287048725069, - "grad_norm": 0.6436367630958557, - "learning_rate": 4.900277707649584e-06, - "loss": 0.0688, + "epoch": 4.921294921294921, + "grad_norm": 0.5457450747489929, + "learning_rate": 4.7223047223047227e-07, + "loss": 0.0529, "step": 16570 }, { - "epoch": 4.1858116637212825, - "grad_norm": 0.5076253414154053, - "learning_rate": 4.885130017672305e-06, - "loss": 0.0685, + "epoch": 4.924264924264924, + "grad_norm": 0.4502221643924713, + "learning_rate": 4.544104544104544e-07, + "loss": 0.0438, "step": 16580 }, { - "epoch": 4.188336278717496, - "grad_norm": 0.4076981544494629, - "learning_rate": 4.869982327695026e-06, - "loss": 0.0537, + "epoch": 4.927234927234927, + "grad_norm": 0.486514151096344, + "learning_rate": 4.365904365904366e-07, + "loss": 0.0625, "step": 16590 }, { - "epoch": 4.190860893713709, - "grad_norm": 0.44264063239097595, - "learning_rate": 4.8548346377177485e-06, - "loss": 0.0579, + "epoch": 4.93020493020493, + "grad_norm": 0.5433149337768555, + "learning_rate": 4.187704187704188e-07, + "loss": 0.0605, "step": 16600 }, { - "epoch": 4.193385508709921, - "grad_norm": 0.3889271020889282, - "learning_rate": 4.83968694774047e-06, - "loss": 0.0555, + "epoch": 4.933174933174933, + "grad_norm": 0.7155877351760864, + "learning_rate": 4.0095040095040095e-07, + "loss": 0.0538, "step": 16610 }, { - "epoch": 4.195910123706135, - "grad_norm": 0.5801649689674377, - "learning_rate": 4.824539257763191e-06, - "loss": 0.0371, + "epoch": 4.936144936144936, + "grad_norm": 0.24715302884578705, + "learning_rate": 3.831303831303831e-07, + "loss": 0.0484, "step": 16620 }, { - "epoch": 4.198434738702348, - "grad_norm": 0.43185538053512573, - "learning_rate": 4.809391567785912e-06, - "loss": 0.0711, + "epoch": 4.939114939114939, + "grad_norm": 0.47398287057876587, + "learning_rate": 3.653103653103653e-07, + "loss": 0.0638, "step": 16630 }, { - "epoch": 4.200959353698561, - "grad_norm": 0.7169790267944336, - "learning_rate": 4.794243877808635e-06, - "loss": 0.0692, + "epoch": 4.942084942084942, + "grad_norm": 0.31817615032196045, + "learning_rate": 3.4749034749034746e-07, + "loss": 0.0498, "step": 16640 }, { - "epoch": 4.2034839686947745, - "grad_norm": 0.3428625464439392, - "learning_rate": 4.779096187831356e-06, - "loss": 0.0659, + "epoch": 4.945054945054945, + "grad_norm": 0.30618026852607727, + "learning_rate": 3.296703296703297e-07, + "loss": 0.0606, "step": 16650 }, { - "epoch": 4.206008583690987, - "grad_norm": 0.39647018909454346, - "learning_rate": 4.763948497854077e-06, - "loss": 0.055, + "epoch": 4.948024948024948, + "grad_norm": 0.29916951060295105, + "learning_rate": 3.1185031185031186e-07, + "loss": 0.0471, "step": 16660 }, { - "epoch": 4.2085331986872, - "grad_norm": 0.40620213747024536, - "learning_rate": 4.748800807876799e-06, - "loss": 0.058, + "epoch": 4.950994950994951, + "grad_norm": 0.711303174495697, + "learning_rate": 2.9403029403029403e-07, + "loss": 0.0548, "step": 16670 }, { - "epoch": 4.211057813683413, - "grad_norm": 0.5194025635719299, - "learning_rate": 4.73365311789952e-06, - "loss": 0.0574, + "epoch": 4.953964953964954, + "grad_norm": 0.34268659353256226, + "learning_rate": 2.762102762102762e-07, + "loss": 0.052, "step": 16680 }, { - "epoch": 4.213582428679627, - "grad_norm": 0.4074387848377228, - "learning_rate": 4.718505427922242e-06, - "loss": 0.0525, + "epoch": 4.956934956934957, + "grad_norm": 0.5266901850700378, + "learning_rate": 2.5839025839025837e-07, + "loss": 0.061, "step": 16690 }, { - "epoch": 4.216107043675839, - "grad_norm": 0.670364260673523, - "learning_rate": 4.7033577379449636e-06, - "loss": 0.064, + "epoch": 4.95990495990496, + "grad_norm": 0.5868252515792847, + "learning_rate": 2.4057024057024054e-07, + "loss": 0.0563, "step": 16700 }, { - "epoch": 4.218631658672052, - "grad_norm": 0.5439589023590088, - "learning_rate": 4.688210047967685e-06, - "loss": 0.0553, + "epoch": 4.962874962874963, + "grad_norm": 0.24245183169841766, + "learning_rate": 2.2275022275022276e-07, + "loss": 0.0639, "step": 16710 }, { - "epoch": 4.2211562736682655, - "grad_norm": 0.5016559362411499, - "learning_rate": 4.673062357990407e-06, - "loss": 0.0526, + "epoch": 4.9658449658449655, + "grad_norm": 0.4494710862636566, + "learning_rate": 2.0493020493020493e-07, + "loss": 0.0498, "step": 16720 }, { - "epoch": 4.223680888664479, - "grad_norm": 0.5297515988349915, - "learning_rate": 4.657914668013128e-06, - "loss": 0.0533, + "epoch": 4.968814968814969, + "grad_norm": 0.525842010974884, + "learning_rate": 1.8711018711018713e-07, + "loss": 0.0578, "step": 16730 }, { - "epoch": 4.226205503660692, - "grad_norm": 0.5738322734832764, - "learning_rate": 4.64276697803585e-06, - "loss": 0.0653, + "epoch": 4.971784971784972, + "grad_norm": 0.6086418032646179, + "learning_rate": 1.692901692901693e-07, + "loss": 0.0424, "step": 16740 }, { - "epoch": 4.228730118656904, - "grad_norm": 2.0166008472442627, - "learning_rate": 4.627619288058571e-06, - "loss": 0.0644, + "epoch": 4.974754974754974, + "grad_norm": 0.4772314429283142, + "learning_rate": 1.5147015147015147e-07, + "loss": 0.0534, "step": 16750 }, { - "epoch": 4.231254733653118, - "grad_norm": 0.44286590814590454, - "learning_rate": 4.612471598081292e-06, - "loss": 0.0734, + "epoch": 4.977724977724978, + "grad_norm": 0.7088252305984497, + "learning_rate": 1.3365013365013367e-07, + "loss": 0.055, "step": 16760 }, { - "epoch": 4.233779348649331, - "grad_norm": 0.3186633288860321, - "learning_rate": 4.597323908104015e-06, - "loss": 0.044, + "epoch": 4.980694980694981, + "grad_norm": 0.5903290510177612, + "learning_rate": 1.1583011583011584e-07, + "loss": 0.0549, "step": 16770 }, { - "epoch": 4.236303963645544, - "grad_norm": 0.7005212306976318, - "learning_rate": 4.5821762181267355e-06, - "loss": 0.0586, + "epoch": 4.983664983664983, + "grad_norm": 0.48824286460876465, + "learning_rate": 9.801009801009801e-08, + "loss": 0.0468, "step": 16780 }, { - "epoch": 4.238828578641757, - "grad_norm": 0.858946681022644, - "learning_rate": 4.567028528149457e-06, - "loss": 0.0579, + "epoch": 4.986634986634987, + "grad_norm": 0.3941417634487152, + "learning_rate": 8.019008019008019e-08, + "loss": 0.0708, "step": 16790 }, { - "epoch": 4.24135319363797, - "grad_norm": 0.3274882137775421, - "learning_rate": 4.5518808381721795e-06, - "loss": 0.0578, + "epoch": 4.98960498960499, + "grad_norm": 0.5667988657951355, + "learning_rate": 6.237006237006238e-08, + "loss": 0.0532, "step": 16800 }, { - "epoch": 4.243877808634183, - "grad_norm": 0.5176664590835571, - "learning_rate": 4.5367331481949e-06, - "loss": 0.0637, + "epoch": 4.992574992574992, + "grad_norm": 0.4835197627544403, + "learning_rate": 4.4550044550044554e-08, + "loss": 0.0511, "step": 16810 }, { - "epoch": 4.246402423630396, - "grad_norm": 0.6378607749938965, - "learning_rate": 4.521585458217622e-06, - "loss": 0.0527, + "epoch": 4.9955449955449955, + "grad_norm": 0.5256985425949097, + "learning_rate": 2.673002673002673e-08, + "loss": 0.0624, "step": 16820 }, { - "epoch": 4.24892703862661, - "grad_norm": 0.3820461928844452, - "learning_rate": 4.5064377682403434e-06, - "loss": 0.0648, + "epoch": 4.998514998514999, + "grad_norm": 0.398252934217453, + "learning_rate": 8.91000891000891e-09, + "loss": 0.0563, "step": 16830 }, + { + "epoch": 5.0, + "eval_f1": 0.49727767695099817, + "eval_loss": 0.053983673453330994, + "eval_runtime": 176.2895, + "eval_samples_per_second": 215.662, + "eval_steps_per_second": 3.375, + "step": 16835 + }, { "epoch": 4.251451653622823, - "grad_norm": 0.34068912267684937, + "grad_norm": 0.4622216820716858, "learning_rate": 4.491290078263065e-06, - "loss": 0.0542, + "loss": 0.0754, "step": 16840 }, { "epoch": 4.253976268619035, - "grad_norm": 0.4906207323074341, + "grad_norm": 0.38623130321502686, "learning_rate": 4.476142388285787e-06, - "loss": 0.0547, + "loss": 0.0505, "step": 16850 }, { "epoch": 4.2565008836152485, - "grad_norm": 0.7165696024894714, + "grad_norm": 0.32597488164901733, "learning_rate": 4.460994698308508e-06, - "loss": 0.0496, + "loss": 0.0473, "step": 16860 }, { "epoch": 4.259025498611462, - "grad_norm": 0.3498016595840454, + "grad_norm": 0.599904477596283, "learning_rate": 4.44584700833123e-06, - "loss": 0.051, + "loss": 0.0524, "step": 16870 }, { "epoch": 4.261550113607675, - "grad_norm": 0.6449806094169617, + "grad_norm": 0.4074048101902008, "learning_rate": 4.4306993183539506e-06, - "loss": 0.0574, + "loss": 0.0605, "step": 16880 }, { "epoch": 4.264074728603888, - "grad_norm": 0.719499945640564, + "grad_norm": 0.626695454120636, "learning_rate": 4.415551628376672e-06, - "loss": 0.0619, + "loss": 0.0584, "step": 16890 }, { "epoch": 4.266599343600101, - "grad_norm": 0.630200207233429, + "grad_norm": 0.46520286798477173, "learning_rate": 4.400403938399395e-06, - "loss": 0.0521, + "loss": 0.0452, "step": 16900 }, { "epoch": 4.269123958596314, - "grad_norm": 0.982899010181427, + "grad_norm": 0.7951592206954956, "learning_rate": 4.385256248422115e-06, - "loss": 0.0658, + "loss": 0.071, "step": 16910 }, { "epoch": 4.271648573592527, - "grad_norm": 0.555623471736908, + "grad_norm": 0.5409834384918213, "learning_rate": 4.370108558444837e-06, - "loss": 0.0485, + "loss": 0.0467, "step": 16920 }, { "epoch": 4.27417318858874, - "grad_norm": 0.5891562700271606, + "grad_norm": 0.6036372780799866, "learning_rate": 4.354960868467559e-06, - "loss": 0.0513, + "loss": 0.064, "step": 16930 }, { "epoch": 4.276697803584954, - "grad_norm": 0.5145660042762756, + "grad_norm": 0.4542910158634186, "learning_rate": 4.33981317849028e-06, - "loss": 0.0509, + "loss": 0.0547, "step": 16940 }, { "epoch": 4.279222418581166, - "grad_norm": 0.33446022868156433, + "grad_norm": 0.6374622583389282, "learning_rate": 4.324665488513002e-06, - "loss": 0.0481, + "loss": 0.0537, "step": 16950 }, { "epoch": 4.281747033577379, - "grad_norm": 0.493359237909317, + "grad_norm": 0.6870420575141907, "learning_rate": 4.309517798535723e-06, - "loss": 0.0564, + "loss": 0.0639, "step": 16960 }, { "epoch": 4.2842716485735926, - "grad_norm": 0.3073669970035553, + "grad_norm": 0.24296802282333374, "learning_rate": 4.294370108558445e-06, - "loss": 0.062, + "loss": 0.0614, "step": 16970 }, { "epoch": 4.286796263569806, - "grad_norm": 0.609738826751709, + "grad_norm": 0.5068966150283813, "learning_rate": 4.2792224185811665e-06, - "loss": 0.0577, + "loss": 0.0667, "step": 16980 }, { "epoch": 4.289320878566019, - "grad_norm": 0.5454846620559692, + "grad_norm": 0.49634042382240295, "learning_rate": 4.264074728603888e-06, - "loss": 0.0553, + "loss": 0.0482, "step": 16990 }, { "epoch": 4.291845493562231, - "grad_norm": 0.5008363723754883, + "grad_norm": 0.8153424263000488, "learning_rate": 4.24892703862661e-06, - "loss": 0.0517, + "loss": 0.0542, "step": 17000 }, { "epoch": 4.294370108558445, - "grad_norm": 0.4839312434196472, + "grad_norm": 0.19083431363105774, "learning_rate": 4.233779348649331e-06, - "loss": 0.0588, + "loss": 0.0612, "step": 17010 }, { "epoch": 4.296894723554658, - "grad_norm": 0.47222980856895447, + "grad_norm": 0.4229993522167206, "learning_rate": 4.218631658672053e-06, - "loss": 0.0474, + "loss": 0.0518, "step": 17020 }, { "epoch": 4.299419338550871, - "grad_norm": 0.29100242257118225, + "grad_norm": 0.8197377920150757, "learning_rate": 4.2034839686947745e-06, - "loss": 0.0479, + "loss": 0.0547, "step": 17030 }, { "epoch": 4.3019439535470845, - "grad_norm": 0.9103265404701233, + "grad_norm": 0.44996774196624756, "learning_rate": 4.188336278717495e-06, - "loss": 0.0499, + "loss": 0.0495, "step": 17040 }, { "epoch": 4.304468568543297, - "grad_norm": 0.4569959044456482, + "grad_norm": 0.4352714419364929, "learning_rate": 4.173188588740217e-06, - "loss": 0.0439, + "loss": 0.047, "step": 17050 }, { "epoch": 4.30699318353951, - "grad_norm": 0.5705316662788391, + "grad_norm": 0.3896523714065552, "learning_rate": 4.158040898762939e-06, - "loss": 0.0577, + "loss": 0.0599, "step": 17060 }, { "epoch": 4.309517798535723, - "grad_norm": 0.6814060211181641, + "grad_norm": 0.6314728260040283, "learning_rate": 4.14289320878566e-06, - "loss": 0.0535, + "loss": 0.0604, "step": 17070 }, { "epoch": 4.312042413531937, - "grad_norm": 0.5100272297859192, + "grad_norm": 0.6164297461509705, "learning_rate": 4.127745518808382e-06, - "loss": 0.0582, + "loss": 0.0649, "step": 17080 }, { "epoch": 4.314567028528149, - "grad_norm": 0.520350456237793, + "grad_norm": 0.47392478585243225, "learning_rate": 4.112597828831104e-06, - "loss": 0.049, + "loss": 0.0482, "step": 17090 }, { "epoch": 4.317091643524362, - "grad_norm": 0.44908711314201355, + "grad_norm": 0.4184396266937256, "learning_rate": 4.097450138853825e-06, - "loss": 0.059, + "loss": 0.0576, "step": 17100 }, { "epoch": 4.3196162585205755, - "grad_norm": 0.3135251998901367, + "grad_norm": 0.3965582251548767, "learning_rate": 4.082302448876546e-06, - "loss": 0.0627, + "loss": 0.0658, "step": 17110 }, { "epoch": 4.322140873516789, - "grad_norm": 0.5100424885749817, + "grad_norm": 0.4759332835674286, "learning_rate": 4.067154758899268e-06, - "loss": 0.0762, + "loss": 0.0694, "step": 17120 }, { "epoch": 4.324665488513002, - "grad_norm": 1.0668565034866333, + "grad_norm": 0.6103851795196533, "learning_rate": 4.0520070689219896e-06, - "loss": 0.0628, + "loss": 0.0632, "step": 17130 }, { "epoch": 4.327190103509215, - "grad_norm": 0.3573605418205261, + "grad_norm": 0.3435596525669098, "learning_rate": 4.036859378944711e-06, - "loss": 0.0647, + "loss": 0.0632, "step": 17140 }, { "epoch": 4.329714718505428, - "grad_norm": 0.5911782383918762, + "grad_norm": 0.6255317330360413, "learning_rate": 4.021711688967433e-06, - "loss": 0.0554, + "loss": 0.0607, "step": 17150 }, { "epoch": 4.332239333501641, - "grad_norm": 0.7212706804275513, + "grad_norm": 0.8034877181053162, "learning_rate": 4.006563998990154e-06, - "loss": 0.0595, + "loss": 0.0624, "step": 17160 }, { "epoch": 4.334763948497854, - "grad_norm": 0.6161842346191406, + "grad_norm": 0.5104978084564209, "learning_rate": 3.991416309012875e-06, - "loss": 0.0585, + "loss": 0.0723, "step": 17170 }, { "epoch": 4.337288563494067, - "grad_norm": 0.5191289782524109, + "grad_norm": 0.6457841992378235, "learning_rate": 3.976268619035597e-06, - "loss": 0.0534, + "loss": 0.0622, "step": 17180 }, { "epoch": 4.33981317849028, - "grad_norm": 0.4344318211078644, + "grad_norm": 0.5124953985214233, "learning_rate": 3.961120929058319e-06, - "loss": 0.0569, + "loss": 0.0608, "step": 17190 }, { "epoch": 4.342337793486493, - "grad_norm": 0.8711338043212891, + "grad_norm": 0.4378756582736969, "learning_rate": 3.94597323908104e-06, - "loss": 0.0489, + "loss": 0.0483, "step": 17200 }, { "epoch": 4.344862408482706, - "grad_norm": 0.4723449647426605, + "grad_norm": 0.47140154242515564, "learning_rate": 3.9308255491037615e-06, - "loss": 0.0572, + "loss": 0.0649, "step": 17210 }, { "epoch": 4.34738702347892, - "grad_norm": 0.48766812682151794, + "grad_norm": 0.39003312587738037, "learning_rate": 3.915677859126484e-06, - "loss": 0.0609, + "loss": 0.0589, "step": 17220 }, { "epoch": 4.349911638475133, - "grad_norm": 0.41383126378059387, + "grad_norm": 0.5201835036277771, "learning_rate": 3.900530169149205e-06, - "loss": 0.0518, + "loss": 0.0528, "step": 17230 }, { "epoch": 4.352436253471345, - "grad_norm": 0.6354776620864868, + "grad_norm": 0.4116949439048767, "learning_rate": 3.885382479171926e-06, - "loss": 0.0564, + "loss": 0.0531, "step": 17240 }, { "epoch": 4.3549608684675585, - "grad_norm": 0.44549328088760376, + "grad_norm": 0.39697498083114624, "learning_rate": 3.870234789194648e-06, - "loss": 0.0643, + "loss": 0.0733, "step": 17250 }, { "epoch": 4.357485483463772, - "grad_norm": 0.5504968762397766, + "grad_norm": 0.4850797653198242, "learning_rate": 3.8550870992173695e-06, - "loss": 0.0601, + "loss": 0.0654, "step": 17260 }, { "epoch": 4.360010098459985, - "grad_norm": 0.4032590389251709, + "grad_norm": 0.42553943395614624, "learning_rate": 3.839939409240091e-06, - "loss": 0.0435, + "loss": 0.0451, "step": 17270 }, { "epoch": 4.362534713456198, - "grad_norm": 0.3765304982662201, + "grad_norm": 0.27774763107299805, "learning_rate": 3.824791719262813e-06, - "loss": 0.0514, + "loss": 0.0532, "step": 17280 }, { "epoch": 4.365059328452411, - "grad_norm": 0.4660661220550537, + "grad_norm": 0.36856329441070557, "learning_rate": 3.8096440292855342e-06, - "loss": 0.0565, + "loss": 0.0499, "step": 17290 }, { "epoch": 4.367583943448624, - "grad_norm": 0.6698967814445496, + "grad_norm": 0.6865664720535278, "learning_rate": 3.7944963393082554e-06, - "loss": 0.0531, + "loss": 0.0646, "step": 17300 }, { "epoch": 4.370108558444837, - "grad_norm": 0.7271828651428223, + "grad_norm": 0.809834897518158, "learning_rate": 3.7793486493309766e-06, - "loss": 0.057, + "loss": 0.0623, "step": 17310 }, { "epoch": 4.37263317344105, - "grad_norm": 0.2889387607574463, + "grad_norm": 0.5114462971687317, "learning_rate": 3.764200959353699e-06, - "loss": 0.05, + "loss": 0.055, "step": 17320 }, { "epoch": 4.375157788437264, - "grad_norm": 0.8666712045669556, + "grad_norm": 0.6078599095344543, "learning_rate": 3.74905326937642e-06, - "loss": 0.0638, + "loss": 0.0654, "step": 17330 }, { "epoch": 4.377682403433476, - "grad_norm": 0.3147795498371124, + "grad_norm": 0.48811349272727966, "learning_rate": 3.733905579399142e-06, - "loss": 0.0583, + "loss": 0.0555, "step": 17340 }, { "epoch": 4.380207018429689, - "grad_norm": 0.6090699434280396, + "grad_norm": 0.7374588847160339, "learning_rate": 3.718757889421863e-06, - "loss": 0.058, + "loss": 0.0593, "step": 17350 }, { "epoch": 4.3827316334259026, - "grad_norm": 0.3823327422142029, + "grad_norm": 0.6511560678482056, "learning_rate": 3.703610199444585e-06, - "loss": 0.0578, + "loss": 0.0544, "step": 17360 }, { "epoch": 4.385256248422116, - "grad_norm": 0.43010056018829346, + "grad_norm": 0.4263114333152771, "learning_rate": 3.6884625094673066e-06, - "loss": 0.0537, + "loss": 0.064, "step": 17370 }, { "epoch": 4.387780863418329, - "grad_norm": 0.5161568522453308, + "grad_norm": 0.2922056317329407, "learning_rate": 3.6733148194900277e-06, - "loss": 0.0578, + "loss": 0.0573, "step": 17380 }, { "epoch": 4.390305478414541, - "grad_norm": 0.3812313675880432, + "grad_norm": 0.7642768025398254, "learning_rate": 3.6581671295127493e-06, - "loss": 0.0586, + "loss": 0.0558, "step": 17390 }, { "epoch": 4.392830093410755, - "grad_norm": 0.4599802494049072, + "grad_norm": 0.5975914597511292, "learning_rate": 3.643019439535471e-06, - "loss": 0.0554, + "loss": 0.075, "step": 17400 }, { "epoch": 4.395354708406968, - "grad_norm": 0.4805625379085541, + "grad_norm": 0.4351644515991211, "learning_rate": 3.6278717495581925e-06, - "loss": 0.055, + "loss": 0.0635, "step": 17410 }, { "epoch": 4.397879323403181, - "grad_norm": 0.5854561924934387, + "grad_norm": 0.6523928046226501, "learning_rate": 3.612724059580914e-06, - "loss": 0.0554, + "loss": 0.0545, "step": 17420 }, { "epoch": 4.4004039383993945, - "grad_norm": 0.48171231150627136, + "grad_norm": 0.4286153018474579, "learning_rate": 3.5975763696036353e-06, - "loss": 0.0532, + "loss": 0.0507, "step": 17430 }, { "epoch": 4.402928553395607, - "grad_norm": 0.532574474811554, + "grad_norm": 0.402811735868454, "learning_rate": 3.582428679626357e-06, - "loss": 0.0617, + "loss": 0.0595, "step": 17440 }, { "epoch": 4.40545316839182, - "grad_norm": 0.33348578214645386, + "grad_norm": 0.5500208139419556, "learning_rate": 3.567280989649079e-06, - "loss": 0.0514, + "loss": 0.0553, "step": 17450 }, { "epoch": 4.407977783388033, - "grad_norm": 0.49266380071640015, + "grad_norm": 0.7133852243423462, "learning_rate": 3.5521332996718e-06, - "loss": 0.0564, + "loss": 0.059, "step": 17460 }, { "epoch": 4.410502398384247, - "grad_norm": 0.8559315800666809, + "grad_norm": 0.8194918036460876, "learning_rate": 3.5369856096945217e-06, - "loss": 0.0499, + "loss": 0.0521, "step": 17470 }, { "epoch": 4.41302701338046, - "grad_norm": 0.4928590953350067, + "grad_norm": 0.5027428865432739, "learning_rate": 3.521837919717243e-06, - "loss": 0.0422, + "loss": 0.0491, "step": 17480 }, { "epoch": 4.415551628376672, - "grad_norm": 0.37242400646209717, + "grad_norm": 0.46674638986587524, "learning_rate": 3.506690229739965e-06, - "loss": 0.0569, + "loss": 0.0543, "step": 17490 }, { "epoch": 4.4180762433728855, - "grad_norm": 0.5479409694671631, + "grad_norm": 0.6677160263061523, "learning_rate": 3.4915425397626865e-06, - "loss": 0.0498, + "loss": 0.0464, "step": 17500 }, { "epoch": 4.420600858369099, - "grad_norm": 0.5095922350883484, + "grad_norm": 0.3993780314922333, "learning_rate": 3.4763948497854076e-06, - "loss": 0.0629, + "loss": 0.0635, "step": 17510 }, { "epoch": 4.423125473365312, - "grad_norm": 0.49946799874305725, + "grad_norm": 0.44299226999282837, "learning_rate": 3.4612471598081292e-06, - "loss": 0.0719, + "loss": 0.0675, "step": 17520 }, { "epoch": 4.425650088361525, - "grad_norm": 0.34545761346817017, + "grad_norm": 0.47991326451301575, "learning_rate": 3.4460994698308512e-06, - "loss": 0.0471, + "loss": 0.0491, "step": 17530 }, { "epoch": 4.428174703357738, - "grad_norm": 0.3934885263442993, + "grad_norm": 0.5460741519927979, "learning_rate": 3.4309517798535724e-06, - "loss": 0.0645, + "loss": 0.0688, "step": 17540 }, { "epoch": 4.430699318353951, - "grad_norm": 0.5036693215370178, + "grad_norm": 0.5100826621055603, "learning_rate": 3.415804089876294e-06, - "loss": 0.0725, + "loss": 0.0686, "step": 17550 }, { "epoch": 4.433223933350164, - "grad_norm": 0.5618218779563904, + "grad_norm": 0.7981113195419312, "learning_rate": 3.400656399899015e-06, - "loss": 0.0535, + "loss": 0.0605, "step": 17560 }, { "epoch": 4.435748548346377, - "grad_norm": 0.41341492533683777, + "grad_norm": 0.42095330357551575, "learning_rate": 3.385508709921737e-06, - "loss": 0.0559, + "loss": 0.0621, "step": 17570 }, { "epoch": 4.43827316334259, - "grad_norm": 0.5488719940185547, + "grad_norm": 0.4400339722633362, "learning_rate": 3.3703610199444588e-06, - "loss": 0.0619, + "loss": 0.0636, "step": 17580 }, { "epoch": 4.440797778338803, - "grad_norm": 0.6855219006538391, + "grad_norm": 0.4648873805999756, "learning_rate": 3.35521332996718e-06, - "loss": 0.061, + "loss": 0.0746, "step": 17590 }, { "epoch": 4.443322393335016, - "grad_norm": 0.5198519229888916, + "grad_norm": 0.4564558267593384, "learning_rate": 3.3400656399899016e-06, - "loss": 0.0484, + "loss": 0.0546, "step": 17600 }, { "epoch": 4.44584700833123, - "grad_norm": 0.4923730790615082, + "grad_norm": 0.4136642515659332, "learning_rate": 3.324917950012623e-06, - "loss": 0.052, + "loss": 0.0557, "step": 17610 }, { "epoch": 4.448371623327443, - "grad_norm": 0.60850590467453, + "grad_norm": 0.4328581392765045, "learning_rate": 3.3097702600353447e-06, - "loss": 0.0587, + "loss": 0.0565, "step": 17620 }, { "epoch": 4.450896238323656, - "grad_norm": 0.280381441116333, + "grad_norm": 0.3888933062553406, "learning_rate": 3.2946225700580663e-06, - "loss": 0.0626, + "loss": 0.057, "step": 17630 }, { "epoch": 4.4534208533198685, - "grad_norm": 0.6930276155471802, + "grad_norm": 0.5712131857872009, "learning_rate": 3.2794748800807875e-06, - "loss": 0.051, + "loss": 0.0484, "step": 17640 }, { "epoch": 4.455945468316082, - "grad_norm": 0.46625441312789917, + "grad_norm": 0.5881834626197815, "learning_rate": 3.2643271901035095e-06, - "loss": 0.0573, + "loss": 0.0531, "step": 17650 }, { "epoch": 4.458470083312295, - "grad_norm": 0.6228309869766235, + "grad_norm": 0.5216571688652039, "learning_rate": 3.249179500126231e-06, - "loss": 0.0503, + "loss": 0.0522, "step": 17660 }, { "epoch": 4.460994698308508, - "grad_norm": 0.33836373686790466, + "grad_norm": 0.5654059648513794, "learning_rate": 3.2340318101489523e-06, - "loss": 0.0495, + "loss": 0.0498, "step": 17670 }, { "epoch": 4.463519313304721, - "grad_norm": 0.2794092893600464, + "grad_norm": 0.6211521625518799, "learning_rate": 3.218884120171674e-06, - "loss": 0.0465, + "loss": 0.0546, "step": 17680 }, { "epoch": 4.466043928300934, - "grad_norm": 0.5994489192962646, + "grad_norm": 0.48394614458084106, "learning_rate": 3.2037364301943955e-06, "loss": 0.0588, "step": 17690 }, { "epoch": 4.468568543297147, - "grad_norm": 0.7377051711082458, + "grad_norm": 0.7053552269935608, "learning_rate": 3.188588740217117e-06, - "loss": 0.0563, + "loss": 0.0598, "step": 17700 }, { "epoch": 4.47109315829336, - "grad_norm": 0.6417201161384583, + "grad_norm": 0.4579329192638397, "learning_rate": 3.1734410502398387e-06, - "loss": 0.0491, + "loss": 0.0512, "step": 17710 }, { "epoch": 4.473617773289574, - "grad_norm": 0.8163862228393555, + "grad_norm": 0.3756571114063263, "learning_rate": 3.15829336026256e-06, - "loss": 0.0593, + "loss": 0.0512, "step": 17720 }, { "epoch": 4.476142388285786, - "grad_norm": 0.4559797942638397, + "grad_norm": 0.3513215482234955, "learning_rate": 3.1431456702852814e-06, - "loss": 0.0553, + "loss": 0.0557, "step": 17730 }, { "epoch": 4.478667003281999, - "grad_norm": 0.26883384585380554, + "grad_norm": 0.5204163193702698, "learning_rate": 3.127997980308003e-06, - "loss": 0.0763, + "loss": 0.0799, "step": 17740 }, { "epoch": 4.4811916182782126, - "grad_norm": 0.5811520218849182, + "grad_norm": 0.5801984071731567, "learning_rate": 3.1128502903307246e-06, - "loss": 0.0466, + "loss": 0.0419, "step": 17750 }, { "epoch": 4.483716233274426, - "grad_norm": 0.5414985418319702, + "grad_norm": 0.6490535140037537, "learning_rate": 3.0977026003534462e-06, - "loss": 0.0472, + "loss": 0.0551, "step": 17760 }, { "epoch": 4.486240848270639, - "grad_norm": 0.2951160669326782, + "grad_norm": 0.5970304012298584, "learning_rate": 3.0825549103761674e-06, - "loss": 0.0573, + "loss": 0.061, "step": 17770 }, { "epoch": 4.488765463266851, - "grad_norm": 0.5405964851379395, + "grad_norm": 0.8191946744918823, "learning_rate": 3.0674072203988894e-06, - "loss": 0.0562, + "loss": 0.058, "step": 17780 }, { "epoch": 4.491290078263065, - "grad_norm": 0.6366590261459351, + "grad_norm": 0.7532091736793518, "learning_rate": 3.052259530421611e-06, - "loss": 0.0531, + "loss": 0.0578, "step": 17790 }, { "epoch": 4.493814693259278, - "grad_norm": 0.4919568598270416, + "grad_norm": 0.6891248226165771, "learning_rate": 3.037111840444332e-06, - "loss": 0.0679, + "loss": 0.0776, "step": 17800 }, { "epoch": 4.496339308255491, - "grad_norm": 0.4891291856765747, + "grad_norm": 0.3589613139629364, "learning_rate": 3.0219641504670538e-06, - "loss": 0.057, + "loss": 0.0578, "step": 17810 }, { "epoch": 4.4988639232517045, - "grad_norm": 0.5892491340637207, + "grad_norm": 0.4397825598716736, "learning_rate": 3.0068164604897754e-06, - "loss": 0.0519, + "loss": 0.0477, "step": 17820 }, { "epoch": 4.501388538247917, - "grad_norm": 0.3350531756877899, + "grad_norm": 0.6630678772926331, "learning_rate": 2.991668770512497e-06, - "loss": 0.0572, + "loss": 0.0621, "step": 17830 }, { "epoch": 4.50391315324413, - "grad_norm": 0.4016503095626831, + "grad_norm": 0.4310142695903778, "learning_rate": 2.9765210805352185e-06, "loss": 0.0486, "step": 17840 }, { "epoch": 4.506437768240343, - "grad_norm": 0.41794684529304504, + "grad_norm": 0.5123319625854492, "learning_rate": 2.9613733905579397e-06, - "loss": 0.0423, + "loss": 0.0419, "step": 17850 }, { "epoch": 4.508962383236557, - "grad_norm": 0.5687925815582275, + "grad_norm": 0.8451969027519226, "learning_rate": 2.9462257005806617e-06, - "loss": 0.052, + "loss": 0.0636, "step": 17860 }, { "epoch": 4.51148699823277, - "grad_norm": 0.42348799109458923, + "grad_norm": 0.5869598388671875, "learning_rate": 2.9310780106033833e-06, - "loss": 0.0478, + "loss": 0.0592, "step": 17870 }, { "epoch": 4.514011613228982, - "grad_norm": 1.0323283672332764, + "grad_norm": 0.8282822370529175, "learning_rate": 2.9159303206261045e-06, - "loss": 0.0659, + "loss": 0.0614, "step": 17880 }, { "epoch": 4.5165362282251955, - "grad_norm": 0.5422408580780029, + "grad_norm": 0.5392325520515442, "learning_rate": 2.900782630648826e-06, - "loss": 0.0584, + "loss": 0.0537, "step": 17890 }, { "epoch": 4.519060843221409, - "grad_norm": 0.3516719341278076, + "grad_norm": 0.6844165325164795, "learning_rate": 2.8856349406715477e-06, - "loss": 0.0651, + "loss": 0.0628, "step": 17900 }, { "epoch": 4.521585458217622, - "grad_norm": 0.5614657402038574, + "grad_norm": 0.5177090764045715, "learning_rate": 2.8704872506942693e-06, - "loss": 0.0513, + "loss": 0.0536, "step": 17910 }, { "epoch": 4.524110073213835, - "grad_norm": 0.43666282296180725, + "grad_norm": 0.395877480506897, "learning_rate": 2.855339560716991e-06, - "loss": 0.0607, + "loss": 0.0602, "step": 17920 }, { "epoch": 4.526634688210048, - "grad_norm": 0.7072877883911133, + "grad_norm": 0.30185338854789734, "learning_rate": 2.840191870739712e-06, - "loss": 0.0625, + "loss": 0.0624, "step": 17930 }, { "epoch": 4.529159303206261, - "grad_norm": 0.5805211663246155, + "grad_norm": 0.5236132740974426, "learning_rate": 2.825044180762434e-06, - "loss": 0.0541, + "loss": 0.0648, "step": 17940 }, { "epoch": 4.531683918202474, - "grad_norm": 0.35863280296325684, + "grad_norm": 0.5160847306251526, "learning_rate": 2.8098964907851552e-06, - "loss": 0.0525, + "loss": 0.0554, "step": 17950 }, { "epoch": 4.534208533198687, - "grad_norm": 0.6086267232894897, + "grad_norm": 0.5891533493995667, "learning_rate": 2.794748800807877e-06, - "loss": 0.0673, + "loss": 0.065, "step": 17960 }, { "epoch": 4.5367331481949, - "grad_norm": 0.33207792043685913, + "grad_norm": 0.3929848074913025, "learning_rate": 2.7796011108305984e-06, - "loss": 0.0628, + "loss": 0.064, "step": 17970 }, { "epoch": 4.539257763191113, - "grad_norm": 0.3145606517791748, + "grad_norm": 0.4245711863040924, "learning_rate": 2.76445342085332e-06, - "loss": 0.056, + "loss": 0.0589, "step": 17980 }, { "epoch": 4.541782378187326, - "grad_norm": 0.3434918224811554, + "grad_norm": 0.3840174376964569, "learning_rate": 2.7493057308760416e-06, - "loss": 0.0569, + "loss": 0.0533, "step": 17990 }, { "epoch": 4.54430699318354, - "grad_norm": 0.6462843418121338, + "grad_norm": 0.5826268196105957, "learning_rate": 2.734158040898763e-06, - "loss": 0.0615, + "loss": 0.0574, "step": 18000 }, { "epoch": 4.546831608179753, - "grad_norm": 0.5379069447517395, + "grad_norm": 0.34730613231658936, "learning_rate": 2.7190103509214844e-06, - "loss": 0.0618, + "loss": 0.0567, "step": 18010 }, { "epoch": 4.549356223175966, - "grad_norm": 0.36423930525779724, + "grad_norm": 0.45773857831954956, "learning_rate": 2.703862660944206e-06, - "loss": 0.0528, + "loss": 0.0601, "step": 18020 }, { "epoch": 4.5518808381721785, - "grad_norm": 0.5888210535049438, + "grad_norm": 0.3971637189388275, "learning_rate": 2.6887149709669276e-06, - "loss": 0.0581, + "loss": 0.0551, "step": 18030 }, { "epoch": 4.554405453168392, - "grad_norm": 0.3849516808986664, + "grad_norm": 0.46903976798057556, "learning_rate": 2.673567280989649e-06, - "loss": 0.0518, + "loss": 0.0553, "step": 18040 }, { "epoch": 4.556930068164605, - "grad_norm": 0.4130467474460602, + "grad_norm": 0.5735597014427185, "learning_rate": 2.6584195910123708e-06, - "loss": 0.0486, + "loss": 0.0544, "step": 18050 }, { "epoch": 4.559454683160818, - "grad_norm": 0.4366299510002136, + "grad_norm": 0.7702049612998962, "learning_rate": 2.643271901035092e-06, - "loss": 0.0449, + "loss": 0.0424, "step": 18060 }, { "epoch": 4.561979298157031, - "grad_norm": 0.6537075042724609, + "grad_norm": 0.7742976546287537, "learning_rate": 2.628124211057814e-06, - "loss": 0.0607, + "loss": 0.0625, "step": 18070 }, { "epoch": 4.564503913153244, - "grad_norm": 0.633960485458374, + "grad_norm": 0.7160853147506714, "learning_rate": 2.612976521080535e-06, - "loss": 0.0599, + "loss": 0.058, "step": 18080 }, { "epoch": 4.567028528149457, - "grad_norm": 0.3362826406955719, + "grad_norm": 0.23403172194957733, "learning_rate": 2.5978288311032567e-06, - "loss": 0.044, + "loss": 0.0477, "step": 18090 }, { "epoch": 4.56955314314567, - "grad_norm": 0.4045182764530182, + "grad_norm": 0.25679340958595276, "learning_rate": 2.5826811411259783e-06, - "loss": 0.0486, + "loss": 0.0578, "step": 18100 }, { "epoch": 4.572077758141884, - "grad_norm": 0.4667646884918213, + "grad_norm": 0.6108934879302979, "learning_rate": 2.5675334511487e-06, - "loss": 0.05, + "loss": 0.0567, "step": 18110 }, { "epoch": 4.574602373138097, - "grad_norm": 0.24448010325431824, + "grad_norm": 0.570832371711731, "learning_rate": 2.5523857611714215e-06, - "loss": 0.05, + "loss": 0.0503, "step": 18120 }, { "epoch": 4.577126988134309, - "grad_norm": 0.5539112091064453, + "grad_norm": 0.49613040685653687, "learning_rate": 2.537238071194143e-06, - "loss": 0.0607, + "loss": 0.0552, "step": 18130 }, { "epoch": 4.5796516031305226, - "grad_norm": 0.6582273244857788, + "grad_norm": 0.43599942326545715, "learning_rate": 2.5220903812168643e-06, - "loss": 0.0711, + "loss": 0.0736, "step": 18140 }, { "epoch": 4.582176218126736, - "grad_norm": 0.42286741733551025, + "grad_norm": 0.5823941230773926, "learning_rate": 2.5069426912395863e-06, - "loss": 0.0504, + "loss": 0.0452, "step": 18150 }, { "epoch": 4.584700833122949, - "grad_norm": 0.4873209595680237, + "grad_norm": 0.6966807842254639, "learning_rate": 2.4917950012623075e-06, - "loss": 0.0522, + "loss": 0.0567, "step": 18160 }, { "epoch": 4.587225448119161, - "grad_norm": 0.4046306014060974, + "grad_norm": 0.9933467507362366, "learning_rate": 2.476647311285029e-06, - "loss": 0.0562, + "loss": 0.0621, "step": 18170 }, { "epoch": 4.589750063115375, - "grad_norm": 0.3709363043308258, + "grad_norm": 0.44380226731300354, "learning_rate": 2.4614996213077506e-06, - "loss": 0.0635, + "loss": 0.0607, "step": 18180 }, { "epoch": 4.592274678111588, - "grad_norm": 0.2947100102901459, + "grad_norm": 0.3192310631275177, "learning_rate": 2.4463519313304722e-06, - "loss": 0.0619, + "loss": 0.0601, "step": 18190 }, { "epoch": 4.594799293107801, - "grad_norm": 0.6312539577484131, + "grad_norm": 0.5151782035827637, "learning_rate": 2.431204241353194e-06, - "loss": 0.0455, + "loss": 0.0441, "step": 18200 }, { "epoch": 4.5973239081040145, - "grad_norm": 0.5368882417678833, + "grad_norm": 0.8137912154197693, "learning_rate": 2.4160565513759154e-06, - "loss": 0.0613, + "loss": 0.0708, "step": 18210 }, { "epoch": 4.599848523100227, - "grad_norm": 0.760257363319397, + "grad_norm": 0.4802444875240326, "learning_rate": 2.4009088613986366e-06, - "loss": 0.0493, + "loss": 0.0553, "step": 18220 }, { "epoch": 4.60237313809644, - "grad_norm": 0.5237419009208679, + "grad_norm": 0.41935741901397705, "learning_rate": 2.3857611714213586e-06, - "loss": 0.0602, + "loss": 0.057, "step": 18230 }, { "epoch": 4.604897753092653, - "grad_norm": 0.6729007959365845, + "grad_norm": 0.42669227719306946, "learning_rate": 2.3706134814440798e-06, - "loss": 0.0576, + "loss": 0.0671, "step": 18240 }, { "epoch": 4.607422368088867, - "grad_norm": 0.30550557374954224, + "grad_norm": 0.5261390209197998, "learning_rate": 2.3554657914668014e-06, - "loss": 0.0431, + "loss": 0.0561, "step": 18250 }, { "epoch": 4.60994698308508, - "grad_norm": 0.6244345903396606, + "grad_norm": 0.495779424905777, "learning_rate": 2.340318101489523e-06, - "loss": 0.0657, + "loss": 0.059, "step": 18260 }, { "epoch": 4.612471598081292, - "grad_norm": 0.6878501772880554, + "grad_norm": 0.5515862107276917, "learning_rate": 2.325170411512244e-06, - "loss": 0.056, + "loss": 0.061, "step": 18270 }, { "epoch": 4.6149962130775055, - "grad_norm": 0.40880918502807617, + "grad_norm": 0.8136048913002014, "learning_rate": 2.310022721534966e-06, - "loss": 0.0512, + "loss": 0.0572, "step": 18280 }, { "epoch": 4.617520828073719, - "grad_norm": 0.5096030235290527, + "grad_norm": 0.393250972032547, "learning_rate": 2.2948750315576873e-06, - "loss": 0.0471, + "loss": 0.0523, "step": 18290 }, { "epoch": 4.620045443069932, - "grad_norm": 0.3214782178401947, + "grad_norm": 0.5420840978622437, "learning_rate": 2.279727341580409e-06, - "loss": 0.0517, + "loss": 0.0605, "step": 18300 }, { "epoch": 4.622570058066145, - "grad_norm": 0.3744233250617981, + "grad_norm": 0.5676819086074829, "learning_rate": 2.2645796516031305e-06, - "loss": 0.0517, + "loss": 0.0523, "step": 18310 }, { "epoch": 4.625094673062358, - "grad_norm": 0.45410263538360596, + "grad_norm": 0.36500880122184753, "learning_rate": 2.249431961625852e-06, - "loss": 0.0592, + "loss": 0.0562, "step": 18320 }, { "epoch": 4.627619288058571, - "grad_norm": 0.4763607680797577, + "grad_norm": 0.5303543210029602, "learning_rate": 2.2342842716485737e-06, - "loss": 0.0547, + "loss": 0.0474, "step": 18330 }, { "epoch": 4.630143903054784, - "grad_norm": 0.36201509833335876, + "grad_norm": 0.4387858510017395, "learning_rate": 2.2191365816712953e-06, - "loss": 0.0486, + "loss": 0.0438, "step": 18340 }, { "epoch": 4.632668518050997, - "grad_norm": 0.5421953797340393, + "grad_norm": 0.2990294098854065, "learning_rate": 2.2039888916940165e-06, - "loss": 0.0491, + "loss": 0.0447, "step": 18350 }, { "epoch": 4.63519313304721, - "grad_norm": 0.39594170451164246, + "grad_norm": 0.37644967436790466, "learning_rate": 2.1888412017167385e-06, - "loss": 0.0479, + "loss": 0.049, "step": 18360 }, { "epoch": 4.637717748043423, - "grad_norm": 0.6088730692863464, + "grad_norm": 0.5996664762496948, "learning_rate": 2.1736935117394597e-06, - "loss": 0.0594, + "loss": 0.0618, "step": 18370 }, { "epoch": 4.640242363039636, - "grad_norm": 0.3005998432636261, + "grad_norm": 0.5396886467933655, "learning_rate": 2.1585458217621813e-06, - "loss": 0.0537, + "loss": 0.054, "step": 18380 }, { "epoch": 4.64276697803585, - "grad_norm": 0.5010228753089905, + "grad_norm": 0.5311841368675232, "learning_rate": 2.143398131784903e-06, - "loss": 0.0609, + "loss": 0.059, "step": 18390 }, { "epoch": 4.645291593032063, - "grad_norm": 0.32724529504776, + "grad_norm": 0.6080347299575806, "learning_rate": 2.1282504418076244e-06, - "loss": 0.0639, + "loss": 0.063, "step": 18400 }, { "epoch": 4.647816208028276, - "grad_norm": 0.6042733192443848, + "grad_norm": 0.720029354095459, "learning_rate": 2.113102751830346e-06, - "loss": 0.0618, + "loss": 0.0674, "step": 18410 }, { "epoch": 4.6503408230244885, - "grad_norm": 0.3925885558128357, + "grad_norm": 0.26142123341560364, "learning_rate": 2.0979550618530672e-06, - "loss": 0.0482, + "loss": 0.043, "step": 18420 }, { "epoch": 4.652865438020702, - "grad_norm": 0.3752647936344147, + "grad_norm": 0.8284344673156738, "learning_rate": 2.082807371875789e-06, - "loss": 0.0607, + "loss": 0.0632, "step": 18430 }, { "epoch": 4.655390053016915, - "grad_norm": 0.5499758720397949, + "grad_norm": 0.5045512914657593, "learning_rate": 2.067659681898511e-06, - "loss": 0.0629, + "loss": 0.0558, "step": 18440 }, { "epoch": 4.657914668013128, - "grad_norm": 0.46264201402664185, + "grad_norm": 0.3474113345146179, "learning_rate": 2.052511991921232e-06, - "loss": 0.0515, + "loss": 0.0484, "step": 18450 }, { "epoch": 4.660439283009341, - "grad_norm": 0.4616946876049042, + "grad_norm": 0.4987110197544098, "learning_rate": 2.0373643019439536e-06, - "loss": 0.0563, + "loss": 0.0542, "step": 18460 }, { "epoch": 4.662963898005554, - "grad_norm": 0.5082404017448425, + "grad_norm": 0.48412591218948364, "learning_rate": 2.022216611966675e-06, - "loss": 0.0538, + "loss": 0.0585, "step": 18470 }, { "epoch": 4.665488513001767, - "grad_norm": 0.42462125420570374, + "grad_norm": 0.48798561096191406, "learning_rate": 2.0070689219893968e-06, - "loss": 0.05, + "loss": 0.0512, "step": 18480 }, { "epoch": 4.66801312799798, - "grad_norm": 0.3045310080051422, + "grad_norm": 0.3808564245700836, "learning_rate": 1.9919212320121184e-06, - "loss": 0.054, + "loss": 0.0607, "step": 18490 }, { "epoch": 4.670537742994194, - "grad_norm": 0.4136578142642975, + "grad_norm": 0.3918999135494232, "learning_rate": 1.9767735420348395e-06, - "loss": 0.0675, + "loss": 0.061, "step": 18500 }, { "epoch": 4.673062357990407, - "grad_norm": 0.644835352897644, + "grad_norm": 0.7011407017707825, "learning_rate": 1.961625852057561e-06, - "loss": 0.0574, + "loss": 0.0621, "step": 18510 }, { "epoch": 4.675586972986619, - "grad_norm": 0.4754480719566345, + "grad_norm": 0.31626319885253906, "learning_rate": 1.946478162080283e-06, - "loss": 0.0496, + "loss": 0.0564, "step": 18520 }, { "epoch": 4.6781115879828326, - "grad_norm": 0.6022984981536865, + "grad_norm": 0.5955636501312256, "learning_rate": 1.9313304721030043e-06, - "loss": 0.0586, + "loss": 0.0473, "step": 18530 }, { "epoch": 4.680636202979046, - "grad_norm": 0.5673237442970276, + "grad_norm": 0.50102698802948, "learning_rate": 1.916182782125726e-06, - "loss": 0.0565, + "loss": 0.0585, "step": 18540 }, { "epoch": 4.683160817975259, - "grad_norm": 0.42253974080085754, + "grad_norm": 0.46877047419548035, "learning_rate": 1.9010350921484475e-06, - "loss": 0.0555, + "loss": 0.0519, "step": 18550 }, { "epoch": 4.685685432971471, - "grad_norm": 0.1553160846233368, + "grad_norm": 0.45812228322029114, "learning_rate": 1.885887402171169e-06, - "loss": 0.0469, + "loss": 0.0466, "step": 18560 }, { "epoch": 4.688210047967685, - "grad_norm": 0.5742431282997131, + "grad_norm": 0.8704932332038879, "learning_rate": 1.8707397121938903e-06, - "loss": 0.0466, + "loss": 0.0538, "step": 18570 }, { "epoch": 4.690734662963898, - "grad_norm": 0.6274034380912781, + "grad_norm": 0.61441969871521, "learning_rate": 1.855592022216612e-06, - "loss": 0.0513, + "loss": 0.0577, "step": 18580 }, { "epoch": 4.693259277960111, - "grad_norm": 0.42444777488708496, + "grad_norm": 0.32448869943618774, "learning_rate": 1.8404443322393335e-06, - "loss": 0.0516, + "loss": 0.0501, "step": 18590 }, { "epoch": 4.6957838929563245, - "grad_norm": 0.8592566251754761, + "grad_norm": 0.5979995727539062, "learning_rate": 1.825296642262055e-06, - "loss": 0.0557, + "loss": 0.0502, "step": 18600 }, { "epoch": 4.698308507952537, - "grad_norm": 0.37311816215515137, + "grad_norm": 0.38927385210990906, "learning_rate": 1.8101489522847765e-06, - "loss": 0.0566, + "loss": 0.0562, "step": 18610 }, { "epoch": 4.70083312294875, - "grad_norm": 0.5313109159469604, + "grad_norm": 0.6443197131156921, "learning_rate": 1.7950012623074983e-06, - "loss": 0.0535, + "loss": 0.0485, "step": 18620 }, { "epoch": 4.703357737944963, - "grad_norm": 0.5695617198944092, + "grad_norm": 0.5753923654556274, "learning_rate": 1.7798535723302196e-06, - "loss": 0.0524, + "loss": 0.0455, "step": 18630 }, { "epoch": 4.705882352941177, - "grad_norm": 0.4023250937461853, + "grad_norm": 0.5932863354682922, "learning_rate": 1.7647058823529412e-06, - "loss": 0.0598, + "loss": 0.0651, "step": 18640 }, { "epoch": 4.70840696793739, - "grad_norm": 0.36584198474884033, + "grad_norm": 0.3984706401824951, "learning_rate": 1.7495581923756626e-06, - "loss": 0.0547, + "loss": 0.0575, "step": 18650 }, { "epoch": 4.710931582933602, - "grad_norm": 0.6956744194030762, + "grad_norm": 0.5167907476425171, "learning_rate": 1.7344105023983844e-06, - "loss": 0.0551, + "loss": 0.0639, "step": 18660 }, { "epoch": 4.7134561979298155, - "grad_norm": 0.5592136979103088, + "grad_norm": 0.5363221764564514, "learning_rate": 1.7192628124211058e-06, - "loss": 0.0428, + "loss": 0.0418, "step": 18670 }, { "epoch": 4.715980812926029, - "grad_norm": 0.7112722396850586, + "grad_norm": 0.584365963935852, "learning_rate": 1.7041151224438274e-06, - "loss": 0.0538, + "loss": 0.0477, "step": 18680 }, { "epoch": 4.718505427922242, - "grad_norm": 0.44742482900619507, + "grad_norm": 0.2417188286781311, "learning_rate": 1.6889674324665488e-06, - "loss": 0.0583, + "loss": 0.0543, "step": 18690 }, { "epoch": 4.721030042918455, - "grad_norm": 0.7442821264266968, + "grad_norm": 0.5733222365379333, "learning_rate": 1.6738197424892706e-06, - "loss": 0.066, + "loss": 0.0684, "step": 18700 }, { "epoch": 4.723554657914668, - "grad_norm": 0.5363455414772034, + "grad_norm": 0.7107726335525513, "learning_rate": 1.658672052511992e-06, - "loss": 0.0537, + "loss": 0.0543, "step": 18710 }, { "epoch": 4.726079272910881, - "grad_norm": 0.3161306381225586, + "grad_norm": 0.7579614520072937, "learning_rate": 1.6435243625347136e-06, - "loss": 0.0438, + "loss": 0.0507, "step": 18720 }, { "epoch": 4.728603887907094, - "grad_norm": 0.5690962076187134, + "grad_norm": 0.4801480174064636, "learning_rate": 1.628376672557435e-06, - "loss": 0.064, + "loss": 0.0611, "step": 18730 }, { "epoch": 4.731128502903307, - "grad_norm": 0.28230124711990356, + "grad_norm": 0.5679193139076233, "learning_rate": 1.6132289825801565e-06, - "loss": 0.0576, + "loss": 0.0555, "step": 18740 }, { "epoch": 4.733653117899521, - "grad_norm": 0.3042181134223938, + "grad_norm": 0.4143296480178833, "learning_rate": 1.5980812926028781e-06, - "loss": 0.0589, + "loss": 0.0564, "step": 18750 }, { "epoch": 4.736177732895733, - "grad_norm": 0.42241615056991577, + "grad_norm": 0.5309060215950012, "learning_rate": 1.5829336026255997e-06, - "loss": 0.0532, + "loss": 0.047, "step": 18760 }, { "epoch": 4.738702347891946, - "grad_norm": 0.6024519801139832, + "grad_norm": 0.49305611848831177, "learning_rate": 1.5677859126483211e-06, - "loss": 0.0586, + "loss": 0.053, "step": 18770 }, { "epoch": 4.74122696288816, - "grad_norm": 0.36207395792007446, + "grad_norm": 0.5996381044387817, "learning_rate": 1.5526382226710427e-06, - "loss": 0.0604, + "loss": 0.0659, "step": 18780 }, { "epoch": 4.743751577884373, - "grad_norm": 0.5053508877754211, + "grad_norm": 0.6321601271629333, "learning_rate": 1.5374905326937643e-06, - "loss": 0.0523, + "loss": 0.0447, "step": 18790 }, { "epoch": 4.746276192880586, - "grad_norm": 0.48265892267227173, + "grad_norm": 0.7180649638175964, "learning_rate": 1.522342842716486e-06, - "loss": 0.0594, + "loss": 0.0489, "step": 18800 }, { "epoch": 4.7488008078767985, - "grad_norm": 0.5995743274688721, + "grad_norm": 0.40703126788139343, "learning_rate": 1.5071951527392073e-06, - "loss": 0.0548, + "loss": 0.0584, "step": 18810 }, { "epoch": 4.751325422873012, - "grad_norm": 0.45999419689178467, + "grad_norm": 0.5110107064247131, "learning_rate": 1.4920474627619289e-06, - "loss": 0.0478, + "loss": 0.0486, "step": 18820 }, { "epoch": 4.753850037869225, - "grad_norm": 0.24280232191085815, + "grad_norm": 0.5644401907920837, "learning_rate": 1.4768997727846505e-06, - "loss": 0.0485, + "loss": 0.0562, "step": 18830 }, { "epoch": 4.756374652865438, - "grad_norm": 0.41278672218322754, + "grad_norm": 0.5056483745574951, "learning_rate": 1.4617520828073719e-06, - "loss": 0.0644, + "loss": 0.0705, "step": 18840 }, { "epoch": 4.758899267861651, - "grad_norm": 0.41428983211517334, + "grad_norm": 0.48723912239074707, "learning_rate": 1.4466043928300934e-06, - "loss": 0.0483, + "loss": 0.0561, "step": 18850 }, { "epoch": 4.761423882857864, - "grad_norm": 0.6497278809547424, + "grad_norm": 0.47025883197784424, "learning_rate": 1.4314567028528148e-06, - "loss": 0.0596, + "loss": 0.0602, "step": 18860 }, { "epoch": 4.763948497854077, - "grad_norm": 0.49583032727241516, + "grad_norm": 0.3964708745479584, "learning_rate": 1.4163090128755366e-06, - "loss": 0.0636, + "loss": 0.0634, "step": 18870 }, { "epoch": 4.76647311285029, - "grad_norm": 0.37153711915016174, + "grad_norm": 0.7490302920341492, "learning_rate": 1.401161322898258e-06, - "loss": 0.0581, + "loss": 0.0648, "step": 18880 }, { "epoch": 4.768997727846504, - "grad_norm": 0.14342792332172394, + "grad_norm": 0.3066612184047699, "learning_rate": 1.3860136329209796e-06, - "loss": 0.057, + "loss": 0.063, "step": 18890 }, { "epoch": 4.771522342842717, - "grad_norm": 0.5583682656288147, + "grad_norm": 0.4892236590385437, "learning_rate": 1.370865942943701e-06, - "loss": 0.0539, + "loss": 0.0498, "step": 18900 }, { "epoch": 4.774046957838929, - "grad_norm": 0.45823413133621216, + "grad_norm": 0.5352323651313782, "learning_rate": 1.3557182529664226e-06, - "loss": 0.052, + "loss": 0.0514, "step": 18910 }, { "epoch": 4.7765715728351426, - "grad_norm": 0.5518306493759155, + "grad_norm": 0.6631128191947937, "learning_rate": 1.3405705629891442e-06, - "loss": 0.0494, + "loss": 0.0567, "step": 18920 }, { "epoch": 4.779096187831356, - "grad_norm": 0.3273313343524933, + "grad_norm": 0.49421215057373047, "learning_rate": 1.3254228730118658e-06, - "loss": 0.0567, + "loss": 0.0556, "step": 18930 }, { "epoch": 4.781620802827569, - "grad_norm": 0.862960159778595, + "grad_norm": 0.36637288331985474, "learning_rate": 1.3102751830345872e-06, - "loss": 0.0562, + "loss": 0.0569, "step": 18940 }, { "epoch": 4.784145417823781, - "grad_norm": 0.590394914150238, + "grad_norm": 0.32764676213264465, "learning_rate": 1.2951274930573088e-06, - "loss": 0.0538, + "loss": 0.0584, "step": 18950 }, { "epoch": 4.786670032819995, - "grad_norm": 0.2639109194278717, + "grad_norm": 0.7302456498146057, "learning_rate": 1.2799798030800304e-06, - "loss": 0.0514, + "loss": 0.0535, "step": 18960 }, { "epoch": 4.789194647816208, - "grad_norm": 0.4883747100830078, + "grad_norm": 0.5171737670898438, "learning_rate": 1.264832113102752e-06, - "loss": 0.0621, + "loss": 0.0524, "step": 18970 }, { "epoch": 4.791719262812421, - "grad_norm": 0.6707940697669983, + "grad_norm": 0.6158316135406494, "learning_rate": 1.2496844231254733e-06, - "loss": 0.0583, + "loss": 0.0626, "step": 18980 }, { "epoch": 4.7942438778086345, - "grad_norm": 0.4944520890712738, + "grad_norm": 0.5882306694984436, "learning_rate": 1.234536733148195e-06, - "loss": 0.0536, + "loss": 0.0542, "step": 18990 }, { "epoch": 4.796768492804848, - "grad_norm": 0.5789937973022461, + "grad_norm": 0.6305384039878845, "learning_rate": 1.2193890431709165e-06, - "loss": 0.0616, + "loss": 0.0576, "step": 19000 }, { "epoch": 4.79929310780106, - "grad_norm": 0.3722054958343506, + "grad_norm": 0.46403953433036804, "learning_rate": 1.2042413531936381e-06, - "loss": 0.0358, + "loss": 0.0453, "step": 19010 }, { "epoch": 4.801817722797273, - "grad_norm": 0.5968698859214783, + "grad_norm": 0.6074075698852539, "learning_rate": 1.1890936632163595e-06, - "loss": 0.0621, + "loss": 0.0682, "step": 19020 }, { "epoch": 4.804342337793487, - "grad_norm": 0.41492170095443726, + "grad_norm": 0.43722423911094666, "learning_rate": 1.173945973239081e-06, - "loss": 0.0455, + "loss": 0.0542, "step": 19030 }, { "epoch": 4.8068669527897, - "grad_norm": 0.5387087464332581, + "grad_norm": 0.35191863775253296, "learning_rate": 1.1587982832618027e-06, - "loss": 0.0549, + "loss": 0.0639, "step": 19040 }, { "epoch": 4.809391567785912, - "grad_norm": 0.7498044967651367, + "grad_norm": 0.6911765336990356, "learning_rate": 1.1436505932845243e-06, - "loss": 0.056, + "loss": 0.0553, "step": 19050 }, { "epoch": 4.8119161827821255, - "grad_norm": 0.30329465866088867, + "grad_norm": 0.22319677472114563, "learning_rate": 1.1285029033072457e-06, - "loss": 0.0619, + "loss": 0.066, "step": 19060 }, { "epoch": 4.814440797778339, - "grad_norm": 0.6062590479850769, + "grad_norm": 0.520487904548645, "learning_rate": 1.1133552133299673e-06, - "loss": 0.0619, + "loss": 0.0596, "step": 19070 }, { "epoch": 4.816965412774552, - "grad_norm": 0.36201271414756775, + "grad_norm": 0.40240347385406494, "learning_rate": 1.0982075233526886e-06, - "loss": 0.0615, + "loss": 0.0618, "step": 19080 }, { "epoch": 4.819490027770765, - "grad_norm": 0.612701416015625, + "grad_norm": 0.6012730598449707, "learning_rate": 1.0830598333754104e-06, - "loss": 0.0393, + "loss": 0.0401, "step": 19090 }, { "epoch": 4.822014642766978, - "grad_norm": 0.48930907249450684, + "grad_norm": 0.6411862373352051, "learning_rate": 1.0679121433981318e-06, - "loss": 0.0508, + "loss": 0.0569, "step": 19100 }, { "epoch": 4.824539257763191, - "grad_norm": 0.5168036818504333, + "grad_norm": 0.7546837329864502, "learning_rate": 1.0527644534208532e-06, - "loss": 0.0613, + "loss": 0.0619, "step": 19110 }, { "epoch": 4.827063872759404, - "grad_norm": 0.7766786217689514, + "grad_norm": 0.5956974625587463, "learning_rate": 1.0376167634435748e-06, - "loss": 0.0644, + "loss": 0.0651, "step": 19120 }, { "epoch": 4.829588487755617, - "grad_norm": 0.504666268825531, + "grad_norm": 0.41826269030570984, "learning_rate": 1.0224690734662964e-06, - "loss": 0.0494, + "loss": 0.0459, "step": 19130 }, { "epoch": 4.832113102751831, - "grad_norm": 0.5199721455574036, + "grad_norm": 0.39252969622612, "learning_rate": 1.007321383489018e-06, - "loss": 0.0508, + "loss": 0.0549, "step": 19140 }, { "epoch": 4.834637717748043, - "grad_norm": 0.31914207339286804, + "grad_norm": 0.45689401030540466, "learning_rate": 9.921736935117394e-07, - "loss": 0.0478, + "loss": 0.048, "step": 19150 }, { "epoch": 4.837162332744256, - "grad_norm": 0.3759099841117859, + "grad_norm": 0.47611868381500244, "learning_rate": 9.77026003534461e-07, - "loss": 0.0457, + "loss": 0.0472, "step": 19160 }, { "epoch": 4.83968694774047, - "grad_norm": 0.5880575180053711, + "grad_norm": 0.5146605968475342, "learning_rate": 9.618783135571826e-07, - "loss": 0.0573, + "loss": 0.0537, "step": 19170 }, { "epoch": 4.842211562736683, - "grad_norm": 0.6293249726295471, + "grad_norm": 0.5189658999443054, "learning_rate": 9.467306235799042e-07, - "loss": 0.0545, + "loss": 0.0682, "step": 19180 }, { "epoch": 4.844736177732896, - "grad_norm": 0.5560355186462402, + "grad_norm": 0.37280428409576416, "learning_rate": 9.315829336026256e-07, - "loss": 0.0524, + "loss": 0.0581, "step": 19190 }, { "epoch": 4.8472607927291085, - "grad_norm": 0.48308777809143066, + "grad_norm": 0.5254796743392944, "learning_rate": 9.164352436253472e-07, - "loss": 0.054, + "loss": 0.0653, "step": 19200 }, { "epoch": 4.849785407725322, - "grad_norm": 0.6635215282440186, + "grad_norm": 0.5634022951126099, "learning_rate": 9.012875536480687e-07, - "loss": 0.066, + "loss": 0.0641, "step": 19210 }, { "epoch": 4.852310022721535, - "grad_norm": 0.637913703918457, + "grad_norm": 0.6558578014373779, "learning_rate": 8.861398636707903e-07, - "loss": 0.0654, + "loss": 0.0665, "step": 19220 }, { "epoch": 4.854834637717748, - "grad_norm": 0.3939951956272125, + "grad_norm": 0.7143204808235168, "learning_rate": 8.709921736935118e-07, - "loss": 0.0638, + "loss": 0.0615, "step": 19230 }, { "epoch": 4.857359252713961, - "grad_norm": 0.6704174876213074, + "grad_norm": 0.40588563680648804, "learning_rate": 8.558444837162333e-07, - "loss": 0.0572, + "loss": 0.059, "step": 19240 }, { "epoch": 4.859883867710174, - "grad_norm": 0.7500046491622925, + "grad_norm": 0.5825479030609131, "learning_rate": 8.406967937389548e-07, - "loss": 0.0581, + "loss": 0.0575, "step": 19250 }, { "epoch": 4.862408482706387, - "grad_norm": 0.5089032649993896, + "grad_norm": 0.6735103726387024, "learning_rate": 8.255491037616763e-07, - "loss": 0.0607, + "loss": 0.0667, "step": 19260 }, { "epoch": 4.8649330977026, - "grad_norm": 0.4917643368244171, + "grad_norm": 0.5344639420509338, "learning_rate": 8.104014137843979e-07, - "loss": 0.0484, + "loss": 0.0563, "step": 19270 }, { "epoch": 4.867457712698814, - "grad_norm": 0.4157859683036804, + "grad_norm": 0.611815869808197, "learning_rate": 7.952537238071194e-07, - "loss": 0.0562, + "loss": 0.0589, "step": 19280 }, { "epoch": 4.869982327695027, - "grad_norm": 0.8012222647666931, + "grad_norm": 0.5727031826972961, "learning_rate": 7.80106033829841e-07, - "loss": 0.0565, + "loss": 0.0601, "step": 19290 }, { "epoch": 4.872506942691239, - "grad_norm": 0.4449537694454193, + "grad_norm": 0.39414718747138977, "learning_rate": 7.649583438525624e-07, - "loss": 0.0579, + "loss": 0.0542, "step": 19300 }, { "epoch": 4.8750315576874526, - "grad_norm": 0.6649788022041321, + "grad_norm": 0.49244511127471924, "learning_rate": 7.49810653875284e-07, - "loss": 0.0713, + "loss": 0.0598, "step": 19310 }, { "epoch": 4.877556172683666, - "grad_norm": 0.3748777508735657, + "grad_norm": 0.5638169050216675, "learning_rate": 7.346629638980055e-07, - "loss": 0.0552, + "loss": 0.0537, "step": 19320 }, { "epoch": 4.880080787679879, - "grad_norm": 0.6150493621826172, + "grad_norm": 0.4944647550582886, "learning_rate": 7.195152739207271e-07, - "loss": 0.0476, + "loss": 0.0515, "step": 19330 }, { "epoch": 4.882605402676091, - "grad_norm": 0.5284005999565125, + "grad_norm": 0.847815215587616, "learning_rate": 7.043675839434486e-07, - "loss": 0.0556, + "loss": 0.0653, "step": 19340 }, { "epoch": 4.885130017672305, - "grad_norm": 0.3671182692050934, + "grad_norm": 0.7950305938720703, "learning_rate": 6.892198939661702e-07, - "loss": 0.0674, + "loss": 0.057, "step": 19350 }, { "epoch": 4.887654632668518, - "grad_norm": 0.6458919644355774, + "grad_norm": 0.680915892124176, "learning_rate": 6.740722039888917e-07, - "loss": 0.0611, + "loss": 0.0554, "step": 19360 }, { "epoch": 4.890179247664731, - "grad_norm": 0.47282344102859497, + "grad_norm": 0.42906680703163147, "learning_rate": 6.589245140116133e-07, - "loss": 0.0537, + "loss": 0.0535, "step": 19370 }, { "epoch": 4.8927038626609445, - "grad_norm": 0.8219795823097229, + "grad_norm": 0.872386634349823, "learning_rate": 6.437768240343348e-07, - "loss": 0.0607, + "loss": 0.0622, "step": 19380 }, { "epoch": 4.895228477657158, - "grad_norm": 0.7187753319740295, + "grad_norm": 0.619981586933136, "learning_rate": 6.286291340570563e-07, - "loss": 0.0509, + "loss": 0.0672, "step": 19390 }, { "epoch": 4.89775309265337, - "grad_norm": 0.3172253668308258, + "grad_norm": 0.538330614566803, "learning_rate": 6.134814440797779e-07, - "loss": 0.0483, + "loss": 0.0525, "step": 19400 }, { "epoch": 4.900277707649583, - "grad_norm": 0.5177561044692993, + "grad_norm": 0.4021759033203125, "learning_rate": 5.983337541024993e-07, - "loss": 0.056, + "loss": 0.0546, "step": 19410 }, { "epoch": 4.902802322645797, - "grad_norm": 0.27060073614120483, + "grad_norm": 0.6232868432998657, "learning_rate": 5.831860641252209e-07, - "loss": 0.0564, + "loss": 0.0581, "step": 19420 }, { "epoch": 4.90532693764201, - "grad_norm": 0.5498465895652771, + "grad_norm": 0.6456800699234009, "learning_rate": 5.680383741479424e-07, - "loss": 0.0472, + "loss": 0.0545, "step": 19430 }, { "epoch": 4.907851552638222, - "grad_norm": 0.48039332032203674, + "grad_norm": 0.5507019758224487, "learning_rate": 5.52890684170664e-07, - "loss": 0.0469, + "loss": 0.0576, "step": 19440 }, { "epoch": 4.9103761676344355, - "grad_norm": 0.27178895473480225, + "grad_norm": 0.2918814718723297, "learning_rate": 5.377429941933855e-07, - "loss": 0.0609, + "loss": 0.0527, "step": 19450 }, { "epoch": 4.912900782630649, - "grad_norm": 0.6592223048210144, + "grad_norm": 0.35016146302223206, "learning_rate": 5.225953042161071e-07, - "loss": 0.0593, + "loss": 0.0636, "step": 19460 }, { "epoch": 4.915425397626862, - "grad_norm": 0.547122061252594, + "grad_norm": 0.5368366837501526, "learning_rate": 5.074476142388286e-07, - "loss": 0.0509, + "loss": 0.0569, "step": 19470 }, { "epoch": 4.917950012623075, - "grad_norm": 0.44897332787513733, + "grad_norm": 0.5466439723968506, "learning_rate": 4.922999242615502e-07, - "loss": 0.0456, + "loss": 0.0508, "step": 19480 }, { "epoch": 4.9204746276192886, - "grad_norm": 0.6687337756156921, + "grad_norm": 0.6173040270805359, "learning_rate": 4.771522342842717e-07, - "loss": 0.0444, + "loss": 0.0504, "step": 19490 }, { "epoch": 4.922999242615501, - "grad_norm": 0.4966764450073242, + "grad_norm": 0.28498920798301697, "learning_rate": 4.6200454430699317e-07, - "loss": 0.0505, + "loss": 0.0572, "step": 19500 }, { "epoch": 4.925523857611714, - "grad_norm": 0.7039111256599426, + "grad_norm": 0.7897679209709167, "learning_rate": 4.468568543297147e-07, - "loss": 0.0537, + "loss": 0.053, "step": 19510 }, { "epoch": 4.928048472607927, - "grad_norm": 0.5682207345962524, + "grad_norm": 0.4405366778373718, "learning_rate": 4.3170916435243625e-07, - "loss": 0.0457, + "loss": 0.051, "step": 19520 }, { "epoch": 4.930573087604141, - "grad_norm": 0.3622889220714569, + "grad_norm": 0.7264717221260071, "learning_rate": 4.165614743751578e-07, - "loss": 0.0466, + "loss": 0.0535, "step": 19530 }, { "epoch": 4.933097702600353, - "grad_norm": 0.38879507780075073, + "grad_norm": 0.47195565700531006, "learning_rate": 4.0141378439787934e-07, - "loss": 0.0453, + "loss": 0.0416, "step": 19540 }, { "epoch": 4.935622317596566, - "grad_norm": 0.185908704996109, + "grad_norm": 0.4767369031906128, "learning_rate": 3.862660944206009e-07, - "loss": 0.041, + "loss": 0.049, "step": 19550 }, { "epoch": 4.93814693259278, - "grad_norm": 0.4912923276424408, + "grad_norm": 0.5228800177574158, "learning_rate": 3.711184044433224e-07, "loss": 0.0568, "step": 19560 }, { "epoch": 4.940671547588993, - "grad_norm": 0.22671930491924286, + "grad_norm": 0.5455029010772705, "learning_rate": 3.5597071446604396e-07, - "loss": 0.055, + "loss": 0.061, "step": 19570 }, { "epoch": 4.943196162585206, - "grad_norm": 0.43126681447029114, + "grad_norm": 0.4548329710960388, "learning_rate": 3.408230244887655e-07, - "loss": 0.0638, + "loss": 0.0543, "step": 19580 }, { "epoch": 4.9457207775814185, - "grad_norm": 0.42626145482063293, + "grad_norm": 0.41128185391426086, "learning_rate": 3.2567533451148704e-07, - "loss": 0.0517, + "loss": 0.0568, "step": 19590 }, { "epoch": 4.948245392577632, - "grad_norm": 0.6885790228843689, + "grad_norm": 0.3675704598426819, "learning_rate": 3.1052764453420853e-07, - "loss": 0.0595, + "loss": 0.0646, "step": 19600 }, { "epoch": 4.950770007573845, - "grad_norm": 0.5854836702346802, + "grad_norm": 0.49481600522994995, "learning_rate": 2.9537995455693007e-07, - "loss": 0.0653, + "loss": 0.0659, "step": 19610 }, { "epoch": 4.953294622570058, - "grad_norm": 0.4316968321800232, + "grad_norm": 0.3610905706882477, "learning_rate": 2.802322645796516e-07, - "loss": 0.0575, + "loss": 0.0606, "step": 19620 }, { "epoch": 4.9558192375662715, - "grad_norm": 0.31033676862716675, + "grad_norm": 0.4303690493106842, "learning_rate": 2.6508457460237316e-07, - "loss": 0.0416, + "loss": 0.0485, "step": 19630 }, { "epoch": 4.958343852562484, - "grad_norm": 0.37612465023994446, + "grad_norm": 0.4692881405353546, "learning_rate": 2.4993688462509464e-07, - "loss": 0.0531, + "loss": 0.0552, "step": 19640 }, { "epoch": 4.960868467558697, - "grad_norm": 0.5763729810714722, + "grad_norm": 0.7063325047492981, "learning_rate": 2.3478919464781619e-07, - "loss": 0.0602, + "loss": 0.0614, "step": 19650 }, { "epoch": 4.96339308255491, - "grad_norm": 0.8366661071777344, + "grad_norm": 0.6039048433303833, "learning_rate": 2.1964150467053775e-07, - "loss": 0.0678, + "loss": 0.0729, "step": 19660 }, { "epoch": 4.965917697551124, - "grad_norm": 0.517417848110199, + "grad_norm": 0.38355937600135803, "learning_rate": 2.044938146932593e-07, - "loss": 0.0524, + "loss": 0.054, "step": 19670 }, { "epoch": 4.968442312547337, - "grad_norm": 0.6865304708480835, + "grad_norm": 0.7297325134277344, "learning_rate": 1.893461247159808e-07, - "loss": 0.0574, + "loss": 0.057, "step": 19680 }, { "epoch": 4.970966927543549, - "grad_norm": 0.3418879508972168, + "grad_norm": 0.618418276309967, "learning_rate": 1.7419843473870235e-07, - "loss": 0.0531, + "loss": 0.0533, "step": 19690 }, { "epoch": 4.9734915425397626, - "grad_norm": 1.0368373394012451, + "grad_norm": 0.44941627979278564, "learning_rate": 1.5905074476142387e-07, - "loss": 0.0613, + "loss": 0.0626, "step": 19700 }, { "epoch": 4.976016157535976, - "grad_norm": 0.6324265599250793, + "grad_norm": 0.5745902061462402, "learning_rate": 1.439030547841454e-07, - "loss": 0.0472, + "loss": 0.0528, "step": 19710 }, { "epoch": 4.978540772532189, - "grad_norm": 0.44766560196876526, + "grad_norm": 0.6372010707855225, "learning_rate": 1.2875536480686695e-07, - "loss": 0.0465, + "loss": 0.0548, "step": 19720 }, { "epoch": 4.9810653875284014, - "grad_norm": 0.1590045988559723, + "grad_norm": 0.5590953826904297, "learning_rate": 1.1360767482958849e-07, - "loss": 0.0439, + "loss": 0.0515, "step": 19730 }, { "epoch": 4.983590002524615, - "grad_norm": 0.5014932155609131, + "grad_norm": 0.3603893518447876, "learning_rate": 9.845998485231003e-08, - "loss": 0.052, + "loss": 0.057, "step": 19740 }, { "epoch": 4.986114617520828, - "grad_norm": 0.703318178653717, + "grad_norm": 0.42396554350852966, "learning_rate": 8.331229487503156e-08, - "loss": 0.0569, + "loss": 0.0522, "step": 19750 }, { "epoch": 4.988639232517041, - "grad_norm": 0.5787055492401123, + "grad_norm": 0.6315743327140808, "learning_rate": 6.816460489775309e-08, - "loss": 0.0561, + "loss": 0.0475, "step": 19760 }, { "epoch": 4.9911638475132545, - "grad_norm": 0.5603996515274048, + "grad_norm": 0.535829484462738, "learning_rate": 5.301691492047463e-08, - "loss": 0.0532, + "loss": 0.0602, "step": 19770 }, { "epoch": 4.993688462509468, - "grad_norm": 0.7136797904968262, + "grad_norm": 0.706295371055603, "learning_rate": 3.786922494319616e-08, - "loss": 0.0509, + "loss": 0.0476, "step": 19780 }, { "epoch": 4.99621307750568, - "grad_norm": 0.5492009520530701, + "grad_norm": 0.5887550711631775, "learning_rate": 2.2721534965917698e-08, - "loss": 0.0573, + "loss": 0.0555, "step": 19790 }, { "epoch": 4.998737692501893, - "grad_norm": 0.364566445350647, + "grad_norm": 0.2900368273258209, "learning_rate": 7.573844988639233e-09, - "loss": 0.0622, + "loss": 0.0532, "step": 19800 }, { "epoch": 5.0, "eval_f1": 0.9705180789481339, - "eval_loss": 0.04460228607058525, - "eval_runtime": 901.9248, - "eval_samples_per_second": 228.691, - "eval_steps_per_second": 3.573, + "eval_loss": 0.04290741682052612, + "eval_runtime": 1160.2076, + "eval_samples_per_second": 177.78, + "eval_steps_per_second": 2.778, "step": 19805 }, { "epoch": 5.0, "step": 19805, - "total_flos": 9.820471825285631e+19, - "train_loss": 0.06684475449172272, - "train_runtime": 18247.0343, - "train_samples_per_second": 69.452, - "train_steps_per_second": 1.085 + "total_flos": 9.82001462664467e+19, + "train_loss": 0.0, + "train_runtime": 0.0662, + "train_samples_per_second": 19152190.372, + "train_steps_per_second": 299304.924 } ], "logging_steps": 10, @@ -13940,7 +13949,7 @@ "attributes": {} } }, - "total_flos": 9.820471825285631e+19, + "total_flos": 9.82001462664467e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null