{ "best_metric": 0.041273970156908035, "best_model_checkpoint": "./microsoft_dit/checkpoint-16835", "epoch": 5.0, "eval_steps": 500, "global_step": 16835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00297000297000297, "grad_norm": 1.4537379741668701, "learning_rate": 2.9982179982179983e-05, "loss": 0.3085, "step": 10 }, { "epoch": 0.00594000594000594, "grad_norm": 1.9534900188446045, "learning_rate": 2.9964359964359965e-05, "loss": 0.2158, "step": 20 }, { "epoch": 0.00891000891000891, "grad_norm": 2.2834959030151367, "learning_rate": 2.9946539946539947e-05, "loss": 0.1555, "step": 30 }, { "epoch": 0.01188001188001188, "grad_norm": 2.4054393768310547, "learning_rate": 2.992871992871993e-05, "loss": 0.1099, "step": 40 }, { "epoch": 0.01485001485001485, "grad_norm": 1.9460220336914062, "learning_rate": 2.991089991089991e-05, "loss": 0.1214, "step": 50 }, { "epoch": 0.01782001782001782, "grad_norm": 1.1060905456542969, "learning_rate": 2.9893079893079894e-05, "loss": 0.1046, "step": 60 }, { "epoch": 0.02079002079002079, "grad_norm": 1.0159635543823242, "learning_rate": 2.9875259875259876e-05, "loss": 0.1055, "step": 70 }, { "epoch": 0.02376002376002376, "grad_norm": 2.2551214694976807, "learning_rate": 2.9857439857439858e-05, "loss": 0.099, "step": 80 }, { "epoch": 0.02673002673002673, "grad_norm": 1.3134405612945557, "learning_rate": 2.983961983961984e-05, "loss": 0.0975, "step": 90 }, { "epoch": 0.0297000297000297, "grad_norm": 1.055510401725769, "learning_rate": 2.9821799821799822e-05, "loss": 0.0945, "step": 100 }, { "epoch": 0.03267003267003267, "grad_norm": 1.4456548690795898, "learning_rate": 2.9803979803979805e-05, "loss": 0.1058, "step": 110 }, { "epoch": 0.03564003564003564, "grad_norm": 1.4712682962417603, "learning_rate": 2.9786159786159787e-05, "loss": 0.0901, "step": 120 }, { "epoch": 0.03861003861003861, "grad_norm": 2.145982265472412, "learning_rate": 2.976833976833977e-05, "loss": 0.1149, "step": 130 }, { "epoch": 0.04158004158004158, "grad_norm": 2.104418992996216, "learning_rate": 2.975051975051975e-05, "loss": 0.102, "step": 140 }, { "epoch": 0.04455004455004455, "grad_norm": 1.3641546964645386, "learning_rate": 2.9732699732699733e-05, "loss": 0.0996, "step": 150 }, { "epoch": 0.04752004752004752, "grad_norm": 2.1951277256011963, "learning_rate": 2.9714879714879715e-05, "loss": 0.092, "step": 160 }, { "epoch": 0.05049005049005049, "grad_norm": 1.2363626956939697, "learning_rate": 2.9697059697059698e-05, "loss": 0.0971, "step": 170 }, { "epoch": 0.05346005346005346, "grad_norm": 1.299441933631897, "learning_rate": 2.967923967923968e-05, "loss": 0.1, "step": 180 }, { "epoch": 0.05643005643005643, "grad_norm": 0.9083341956138611, "learning_rate": 2.9661419661419662e-05, "loss": 0.0997, "step": 190 }, { "epoch": 0.0594000594000594, "grad_norm": 1.1018556356430054, "learning_rate": 2.9643599643599644e-05, "loss": 0.0916, "step": 200 }, { "epoch": 0.062370062370062374, "grad_norm": 1.025562047958374, "learning_rate": 2.9625779625779626e-05, "loss": 0.0995, "step": 210 }, { "epoch": 0.06534006534006534, "grad_norm": 1.1609880924224854, "learning_rate": 2.960795960795961e-05, "loss": 0.0844, "step": 220 }, { "epoch": 0.0683100683100683, "grad_norm": 1.7311973571777344, "learning_rate": 2.959013959013959e-05, "loss": 0.0862, "step": 230 }, { "epoch": 0.07128007128007129, "grad_norm": 1.5643036365509033, "learning_rate": 2.9572319572319573e-05, "loss": 0.0999, "step": 240 }, { "epoch": 0.07425007425007425, "grad_norm": 1.468729019165039, "learning_rate": 2.9554499554499555e-05, "loss": 0.0989, "step": 250 }, { "epoch": 0.07722007722007722, "grad_norm": 0.8540601134300232, "learning_rate": 2.9536679536679537e-05, "loss": 0.0822, "step": 260 }, { "epoch": 0.08019008019008018, "grad_norm": 1.0522710084915161, "learning_rate": 2.951885951885952e-05, "loss": 0.0818, "step": 270 }, { "epoch": 0.08316008316008316, "grad_norm": 1.6779365539550781, "learning_rate": 2.95010395010395e-05, "loss": 0.0804, "step": 280 }, { "epoch": 0.08613008613008613, "grad_norm": 0.6255664825439453, "learning_rate": 2.9483219483219484e-05, "loss": 0.0958, "step": 290 }, { "epoch": 0.0891000891000891, "grad_norm": 0.9004175662994385, "learning_rate": 2.9465399465399466e-05, "loss": 0.0924, "step": 300 }, { "epoch": 0.09207009207009206, "grad_norm": 1.5575037002563477, "learning_rate": 2.9447579447579448e-05, "loss": 0.0927, "step": 310 }, { "epoch": 0.09504009504009504, "grad_norm": 1.6837743520736694, "learning_rate": 2.942975942975943e-05, "loss": 0.0997, "step": 320 }, { "epoch": 0.09801009801009801, "grad_norm": 0.6957836151123047, "learning_rate": 2.9411939411939412e-05, "loss": 0.07, "step": 330 }, { "epoch": 0.10098010098010098, "grad_norm": 1.7029621601104736, "learning_rate": 2.9394119394119395e-05, "loss": 0.109, "step": 340 }, { "epoch": 0.10395010395010396, "grad_norm": 1.519026756286621, "learning_rate": 2.9376299376299377e-05, "loss": 0.093, "step": 350 }, { "epoch": 0.10692010692010692, "grad_norm": 0.8110401034355164, "learning_rate": 2.935847935847936e-05, "loss": 0.0795, "step": 360 }, { "epoch": 0.10989010989010989, "grad_norm": 0.5369464159011841, "learning_rate": 2.934065934065934e-05, "loss": 0.0877, "step": 370 }, { "epoch": 0.11286011286011285, "grad_norm": 2.6125996112823486, "learning_rate": 2.9322839322839323e-05, "loss": 0.0852, "step": 380 }, { "epoch": 0.11583011583011583, "grad_norm": 0.8178906440734863, "learning_rate": 2.930501930501931e-05, "loss": 0.0708, "step": 390 }, { "epoch": 0.1188001188001188, "grad_norm": 1.6640368700027466, "learning_rate": 2.9287199287199288e-05, "loss": 0.0951, "step": 400 }, { "epoch": 0.12177012177012177, "grad_norm": 0.5659627318382263, "learning_rate": 2.926937926937927e-05, "loss": 0.0927, "step": 410 }, { "epoch": 0.12474012474012475, "grad_norm": 1.324707269668579, "learning_rate": 2.9251559251559252e-05, "loss": 0.0749, "step": 420 }, { "epoch": 0.1277101277101277, "grad_norm": 1.9675109386444092, "learning_rate": 2.9233739233739234e-05, "loss": 0.0827, "step": 430 }, { "epoch": 0.13068013068013068, "grad_norm": 1.659543752670288, "learning_rate": 2.9215919215919216e-05, "loss": 0.0938, "step": 440 }, { "epoch": 0.13365013365013365, "grad_norm": 1.0365532636642456, "learning_rate": 2.91980991980992e-05, "loss": 0.0922, "step": 450 }, { "epoch": 0.1366201366201366, "grad_norm": 1.2649776935577393, "learning_rate": 2.9180279180279184e-05, "loss": 0.0741, "step": 460 }, { "epoch": 0.13959013959013958, "grad_norm": 0.6527109146118164, "learning_rate": 2.9162459162459163e-05, "loss": 0.0865, "step": 470 }, { "epoch": 0.14256014256014257, "grad_norm": 1.706261396408081, "learning_rate": 2.9144639144639145e-05, "loss": 0.0805, "step": 480 }, { "epoch": 0.14553014553014554, "grad_norm": 1.399427890777588, "learning_rate": 2.9126819126819127e-05, "loss": 0.0821, "step": 490 }, { "epoch": 0.1485001485001485, "grad_norm": 1.1516929864883423, "learning_rate": 2.910899910899911e-05, "loss": 0.0877, "step": 500 }, { "epoch": 0.15147015147015147, "grad_norm": 0.6054642200469971, "learning_rate": 2.909117909117909e-05, "loss": 0.0732, "step": 510 }, { "epoch": 0.15444015444015444, "grad_norm": 1.518789291381836, "learning_rate": 2.9073359073359074e-05, "loss": 0.0929, "step": 520 }, { "epoch": 0.1574101574101574, "grad_norm": 0.8014304637908936, "learning_rate": 2.905553905553906e-05, "loss": 0.0948, "step": 530 }, { "epoch": 0.16038016038016037, "grad_norm": 1.0207316875457764, "learning_rate": 2.9037719037719038e-05, "loss": 0.083, "step": 540 }, { "epoch": 0.16335016335016336, "grad_norm": 0.8969537019729614, "learning_rate": 2.901989901989902e-05, "loss": 0.0819, "step": 550 }, { "epoch": 0.16632016632016633, "grad_norm": 1.3450151681900024, "learning_rate": 2.9002079002079002e-05, "loss": 0.0886, "step": 560 }, { "epoch": 0.1692901692901693, "grad_norm": 1.7601408958435059, "learning_rate": 2.8984258984258984e-05, "loss": 0.0898, "step": 570 }, { "epoch": 0.17226017226017226, "grad_norm": 1.2938804626464844, "learning_rate": 2.8966438966438967e-05, "loss": 0.0776, "step": 580 }, { "epoch": 0.17523017523017523, "grad_norm": 0.87087082862854, "learning_rate": 2.894861894861895e-05, "loss": 0.0869, "step": 590 }, { "epoch": 0.1782001782001782, "grad_norm": 0.7267048358917236, "learning_rate": 2.8930798930798934e-05, "loss": 0.0877, "step": 600 }, { "epoch": 0.18117018117018116, "grad_norm": 0.9188293218612671, "learning_rate": 2.8912978912978913e-05, "loss": 0.0701, "step": 610 }, { "epoch": 0.18414018414018413, "grad_norm": 1.1492931842803955, "learning_rate": 2.8895158895158895e-05, "loss": 0.0744, "step": 620 }, { "epoch": 0.18711018711018712, "grad_norm": 1.3128528594970703, "learning_rate": 2.8877338877338877e-05, "loss": 0.0807, "step": 630 }, { "epoch": 0.1900801900801901, "grad_norm": 0.9953811168670654, "learning_rate": 2.885951885951886e-05, "loss": 0.0727, "step": 640 }, { "epoch": 0.19305019305019305, "grad_norm": 0.6890339851379395, "learning_rate": 2.8841698841698842e-05, "loss": 0.0888, "step": 650 }, { "epoch": 0.19602019602019602, "grad_norm": 0.5367225408554077, "learning_rate": 2.8823878823878824e-05, "loss": 0.0912, "step": 660 }, { "epoch": 0.19899019899019899, "grad_norm": 0.6967876553535461, "learning_rate": 2.880605880605881e-05, "loss": 0.0831, "step": 670 }, { "epoch": 0.20196020196020195, "grad_norm": 1.0245673656463623, "learning_rate": 2.878823878823879e-05, "loss": 0.0906, "step": 680 }, { "epoch": 0.20493020493020492, "grad_norm": 0.6193961501121521, "learning_rate": 2.877041877041877e-05, "loss": 0.0798, "step": 690 }, { "epoch": 0.2079002079002079, "grad_norm": 1.8188307285308838, "learning_rate": 2.8752598752598753e-05, "loss": 0.0859, "step": 700 }, { "epoch": 0.21087021087021088, "grad_norm": 1.4549452066421509, "learning_rate": 2.8734778734778735e-05, "loss": 0.0759, "step": 710 }, { "epoch": 0.21384021384021384, "grad_norm": 1.0474627017974854, "learning_rate": 2.8716958716958717e-05, "loss": 0.0809, "step": 720 }, { "epoch": 0.2168102168102168, "grad_norm": 0.8879497051239014, "learning_rate": 2.86991386991387e-05, "loss": 0.0725, "step": 730 }, { "epoch": 0.21978021978021978, "grad_norm": 0.6862474679946899, "learning_rate": 2.8681318681318685e-05, "loss": 0.08, "step": 740 }, { "epoch": 0.22275022275022274, "grad_norm": 1.7506073713302612, "learning_rate": 2.8663498663498664e-05, "loss": 0.0992, "step": 750 }, { "epoch": 0.2257202257202257, "grad_norm": 0.9330048561096191, "learning_rate": 2.8645678645678646e-05, "loss": 0.0936, "step": 760 }, { "epoch": 0.2286902286902287, "grad_norm": 1.0609358549118042, "learning_rate": 2.8627858627858628e-05, "loss": 0.0816, "step": 770 }, { "epoch": 0.23166023166023167, "grad_norm": 0.6896756291389465, "learning_rate": 2.861003861003861e-05, "loss": 0.073, "step": 780 }, { "epoch": 0.23463023463023464, "grad_norm": 1.0784275531768799, "learning_rate": 2.8592218592218592e-05, "loss": 0.0799, "step": 790 }, { "epoch": 0.2376002376002376, "grad_norm": 1.0042153596878052, "learning_rate": 2.8574398574398574e-05, "loss": 0.079, "step": 800 }, { "epoch": 0.24057024057024057, "grad_norm": 1.3999987840652466, "learning_rate": 2.855657855657856e-05, "loss": 0.0841, "step": 810 }, { "epoch": 0.24354024354024353, "grad_norm": 0.6107861995697021, "learning_rate": 2.853875853875854e-05, "loss": 0.067, "step": 820 }, { "epoch": 0.2465102465102465, "grad_norm": 0.8890926837921143, "learning_rate": 2.852093852093852e-05, "loss": 0.0639, "step": 830 }, { "epoch": 0.2494802494802495, "grad_norm": 0.7977913618087769, "learning_rate": 2.8503118503118503e-05, "loss": 0.0745, "step": 840 }, { "epoch": 0.25245025245025243, "grad_norm": 1.1050574779510498, "learning_rate": 2.8485298485298485e-05, "loss": 0.0828, "step": 850 }, { "epoch": 0.2554202554202554, "grad_norm": 1.4865517616271973, "learning_rate": 2.8467478467478467e-05, "loss": 0.0971, "step": 860 }, { "epoch": 0.25839025839025836, "grad_norm": 0.8170812726020813, "learning_rate": 2.844965844965845e-05, "loss": 0.0736, "step": 870 }, { "epoch": 0.26136026136026136, "grad_norm": 0.8887490630149841, "learning_rate": 2.8431838431838435e-05, "loss": 0.0843, "step": 880 }, { "epoch": 0.26433026433026435, "grad_norm": 1.2814133167266846, "learning_rate": 2.8414018414018414e-05, "loss": 0.07, "step": 890 }, { "epoch": 0.2673002673002673, "grad_norm": 1.0275837182998657, "learning_rate": 2.8396198396198396e-05, "loss": 0.1014, "step": 900 }, { "epoch": 0.2702702702702703, "grad_norm": 2.31121563911438, "learning_rate": 2.8378378378378378e-05, "loss": 0.0798, "step": 910 }, { "epoch": 0.2732402732402732, "grad_norm": 0.7157280445098877, "learning_rate": 2.836055836055836e-05, "loss": 0.0856, "step": 920 }, { "epoch": 0.2762102762102762, "grad_norm": 1.51650071144104, "learning_rate": 2.8342738342738343e-05, "loss": 0.0886, "step": 930 }, { "epoch": 0.27918027918027916, "grad_norm": 1.0991870164871216, "learning_rate": 2.8324918324918325e-05, "loss": 0.0832, "step": 940 }, { "epoch": 0.28215028215028215, "grad_norm": 1.0901234149932861, "learning_rate": 2.830709830709831e-05, "loss": 0.0763, "step": 950 }, { "epoch": 0.28512028512028514, "grad_norm": 1.1311888694763184, "learning_rate": 2.8289278289278293e-05, "loss": 0.0766, "step": 960 }, { "epoch": 0.2880902880902881, "grad_norm": 0.6374753713607788, "learning_rate": 2.827145827145827e-05, "loss": 0.0736, "step": 970 }, { "epoch": 0.2910602910602911, "grad_norm": 1.2088044881820679, "learning_rate": 2.8253638253638253e-05, "loss": 0.0681, "step": 980 }, { "epoch": 0.294030294030294, "grad_norm": 0.7970484495162964, "learning_rate": 2.8235818235818236e-05, "loss": 0.0803, "step": 990 }, { "epoch": 0.297000297000297, "grad_norm": 1.2192399501800537, "learning_rate": 2.8217998217998218e-05, "loss": 0.0896, "step": 1000 }, { "epoch": 0.29997029997029995, "grad_norm": 1.2083388566970825, "learning_rate": 2.82001782001782e-05, "loss": 0.0795, "step": 1010 }, { "epoch": 0.30294030294030294, "grad_norm": 1.0027815103530884, "learning_rate": 2.8182358182358186e-05, "loss": 0.0874, "step": 1020 }, { "epoch": 0.30591030591030594, "grad_norm": 1.0552337169647217, "learning_rate": 2.8164538164538168e-05, "loss": 0.0718, "step": 1030 }, { "epoch": 0.3088803088803089, "grad_norm": 1.1285655498504639, "learning_rate": 2.8146718146718146e-05, "loss": 0.0723, "step": 1040 }, { "epoch": 0.31185031185031187, "grad_norm": 1.0405571460723877, "learning_rate": 2.812889812889813e-05, "loss": 0.0929, "step": 1050 }, { "epoch": 0.3148203148203148, "grad_norm": 0.9266412258148193, "learning_rate": 2.811107811107811e-05, "loss": 0.0831, "step": 1060 }, { "epoch": 0.3177903177903178, "grad_norm": 1.0532838106155396, "learning_rate": 2.8093258093258093e-05, "loss": 0.0767, "step": 1070 }, { "epoch": 0.32076032076032074, "grad_norm": 0.8444746136665344, "learning_rate": 2.8075438075438075e-05, "loss": 0.0779, "step": 1080 }, { "epoch": 0.32373032373032373, "grad_norm": 0.7850976586341858, "learning_rate": 2.805761805761806e-05, "loss": 0.0803, "step": 1090 }, { "epoch": 0.3267003267003267, "grad_norm": 1.0749289989471436, "learning_rate": 2.8039798039798043e-05, "loss": 0.0891, "step": 1100 }, { "epoch": 0.32967032967032966, "grad_norm": 0.9671429395675659, "learning_rate": 2.802197802197802e-05, "loss": 0.0734, "step": 1110 }, { "epoch": 0.33264033264033266, "grad_norm": 0.665507435798645, "learning_rate": 2.8004158004158004e-05, "loss": 0.0771, "step": 1120 }, { "epoch": 0.3356103356103356, "grad_norm": 0.6605273485183716, "learning_rate": 2.7986337986337986e-05, "loss": 0.0698, "step": 1130 }, { "epoch": 0.3385803385803386, "grad_norm": 0.6776021122932434, "learning_rate": 2.7968517968517968e-05, "loss": 0.0694, "step": 1140 }, { "epoch": 0.34155034155034153, "grad_norm": 0.8488766551017761, "learning_rate": 2.795069795069795e-05, "loss": 0.0664, "step": 1150 }, { "epoch": 0.3445203445203445, "grad_norm": 1.0404934883117676, "learning_rate": 2.7932877932877936e-05, "loss": 0.0759, "step": 1160 }, { "epoch": 0.3474903474903475, "grad_norm": 1.0605899095535278, "learning_rate": 2.7915057915057918e-05, "loss": 0.0863, "step": 1170 }, { "epoch": 0.35046035046035046, "grad_norm": 1.034513235092163, "learning_rate": 2.7897237897237897e-05, "loss": 0.0815, "step": 1180 }, { "epoch": 0.35343035343035345, "grad_norm": 1.6698535680770874, "learning_rate": 2.787941787941788e-05, "loss": 0.0869, "step": 1190 }, { "epoch": 0.3564003564003564, "grad_norm": 0.8041000366210938, "learning_rate": 2.786159786159786e-05, "loss": 0.0739, "step": 1200 }, { "epoch": 0.3593703593703594, "grad_norm": 0.8568106889724731, "learning_rate": 2.7843777843777843e-05, "loss": 0.0697, "step": 1210 }, { "epoch": 0.3623403623403623, "grad_norm": 1.3246437311172485, "learning_rate": 2.7825957825957826e-05, "loss": 0.0761, "step": 1220 }, { "epoch": 0.3653103653103653, "grad_norm": 1.048767328262329, "learning_rate": 2.780813780813781e-05, "loss": 0.0806, "step": 1230 }, { "epoch": 0.36828036828036825, "grad_norm": 0.8982730507850647, "learning_rate": 2.7790317790317793e-05, "loss": 0.0751, "step": 1240 }, { "epoch": 0.37125037125037125, "grad_norm": 0.6694151759147644, "learning_rate": 2.7772497772497772e-05, "loss": 0.0729, "step": 1250 }, { "epoch": 0.37422037422037424, "grad_norm": 0.5974312424659729, "learning_rate": 2.7754677754677754e-05, "loss": 0.0839, "step": 1260 }, { "epoch": 0.3771903771903772, "grad_norm": 1.6992155313491821, "learning_rate": 2.7736857736857736e-05, "loss": 0.0683, "step": 1270 }, { "epoch": 0.3801603801603802, "grad_norm": 1.6791558265686035, "learning_rate": 2.771903771903772e-05, "loss": 0.0809, "step": 1280 }, { "epoch": 0.3831303831303831, "grad_norm": 1.4574382305145264, "learning_rate": 2.77012177012177e-05, "loss": 0.0723, "step": 1290 }, { "epoch": 0.3861003861003861, "grad_norm": 0.6830180287361145, "learning_rate": 2.7683397683397686e-05, "loss": 0.0619, "step": 1300 }, { "epoch": 0.38907038907038904, "grad_norm": 0.7997535467147827, "learning_rate": 2.766557766557767e-05, "loss": 0.0797, "step": 1310 }, { "epoch": 0.39204039204039204, "grad_norm": 0.8323992490768433, "learning_rate": 2.7647757647757647e-05, "loss": 0.0717, "step": 1320 }, { "epoch": 0.39501039501039503, "grad_norm": 0.5575417876243591, "learning_rate": 2.762993762993763e-05, "loss": 0.072, "step": 1330 }, { "epoch": 0.39798039798039797, "grad_norm": 0.9314611554145813, "learning_rate": 2.761211761211761e-05, "loss": 0.0594, "step": 1340 }, { "epoch": 0.40095040095040096, "grad_norm": 1.7011877298355103, "learning_rate": 2.7594297594297594e-05, "loss": 0.0702, "step": 1350 }, { "epoch": 0.4039204039204039, "grad_norm": 0.8633953332901001, "learning_rate": 2.7576477576477576e-05, "loss": 0.0769, "step": 1360 }, { "epoch": 0.4068904068904069, "grad_norm": 0.6947808265686035, "learning_rate": 2.755865755865756e-05, "loss": 0.069, "step": 1370 }, { "epoch": 0.40986040986040984, "grad_norm": 0.5774619579315186, "learning_rate": 2.7540837540837544e-05, "loss": 0.0642, "step": 1380 }, { "epoch": 0.41283041283041283, "grad_norm": 1.0650389194488525, "learning_rate": 2.7523017523017522e-05, "loss": 0.0603, "step": 1390 }, { "epoch": 0.4158004158004158, "grad_norm": 0.9271636605262756, "learning_rate": 2.7505197505197505e-05, "loss": 0.0695, "step": 1400 }, { "epoch": 0.41877041877041876, "grad_norm": 1.1117199659347534, "learning_rate": 2.7487377487377487e-05, "loss": 0.0691, "step": 1410 }, { "epoch": 0.42174042174042176, "grad_norm": 0.7966795563697815, "learning_rate": 2.746955746955747e-05, "loss": 0.068, "step": 1420 }, { "epoch": 0.4247104247104247, "grad_norm": 1.0240360498428345, "learning_rate": 2.745173745173745e-05, "loss": 0.0656, "step": 1430 }, { "epoch": 0.4276804276804277, "grad_norm": 0.9488998651504517, "learning_rate": 2.7433917433917437e-05, "loss": 0.0666, "step": 1440 }, { "epoch": 0.4306504306504306, "grad_norm": 1.2475918531417847, "learning_rate": 2.741609741609742e-05, "loss": 0.0791, "step": 1450 }, { "epoch": 0.4336204336204336, "grad_norm": 1.125158667564392, "learning_rate": 2.7398277398277398e-05, "loss": 0.0768, "step": 1460 }, { "epoch": 0.4365904365904366, "grad_norm": 0.8701415657997131, "learning_rate": 2.738045738045738e-05, "loss": 0.0869, "step": 1470 }, { "epoch": 0.43956043956043955, "grad_norm": 1.3756756782531738, "learning_rate": 2.7362637362637362e-05, "loss": 0.0814, "step": 1480 }, { "epoch": 0.44253044253044255, "grad_norm": 1.0403056144714355, "learning_rate": 2.7344817344817344e-05, "loss": 0.0678, "step": 1490 }, { "epoch": 0.4455004455004455, "grad_norm": 0.45081251859664917, "learning_rate": 2.7326997326997326e-05, "loss": 0.0598, "step": 1500 }, { "epoch": 0.4484704484704485, "grad_norm": 0.4850378930568695, "learning_rate": 2.7309177309177312e-05, "loss": 0.0733, "step": 1510 }, { "epoch": 0.4514404514404514, "grad_norm": 1.3777422904968262, "learning_rate": 2.7291357291357294e-05, "loss": 0.068, "step": 1520 }, { "epoch": 0.4544104544104544, "grad_norm": 1.1350257396697998, "learning_rate": 2.7273537273537276e-05, "loss": 0.0789, "step": 1530 }, { "epoch": 0.4573804573804574, "grad_norm": 0.4942154288291931, "learning_rate": 2.7255717255717255e-05, "loss": 0.0721, "step": 1540 }, { "epoch": 0.46035046035046034, "grad_norm": 0.8224741816520691, "learning_rate": 2.7237897237897237e-05, "loss": 0.0683, "step": 1550 }, { "epoch": 0.46332046332046334, "grad_norm": 0.7868162393569946, "learning_rate": 2.722007722007722e-05, "loss": 0.0795, "step": 1560 }, { "epoch": 0.4662904662904663, "grad_norm": 0.9320521354675293, "learning_rate": 2.72022572022572e-05, "loss": 0.0809, "step": 1570 }, { "epoch": 0.46926046926046927, "grad_norm": 1.2369945049285889, "learning_rate": 2.7184437184437187e-05, "loss": 0.0606, "step": 1580 }, { "epoch": 0.4722304722304722, "grad_norm": 0.934948742389679, "learning_rate": 2.716661716661717e-05, "loss": 0.0659, "step": 1590 }, { "epoch": 0.4752004752004752, "grad_norm": 0.8786292672157288, "learning_rate": 2.714879714879715e-05, "loss": 0.0979, "step": 1600 }, { "epoch": 0.4781704781704782, "grad_norm": 0.5932102799415588, "learning_rate": 2.713097713097713e-05, "loss": 0.0868, "step": 1610 }, { "epoch": 0.48114048114048114, "grad_norm": 0.9076425433158875, "learning_rate": 2.7113157113157112e-05, "loss": 0.0734, "step": 1620 }, { "epoch": 0.48411048411048413, "grad_norm": 0.8655311465263367, "learning_rate": 2.7095337095337095e-05, "loss": 0.0582, "step": 1630 }, { "epoch": 0.48708048708048707, "grad_norm": 0.6988229155540466, "learning_rate": 2.7077517077517077e-05, "loss": 0.0815, "step": 1640 }, { "epoch": 0.49005049005049006, "grad_norm": 0.9337241649627686, "learning_rate": 2.7059697059697062e-05, "loss": 0.0675, "step": 1650 }, { "epoch": 0.493020493020493, "grad_norm": 1.3023432493209839, "learning_rate": 2.7041877041877044e-05, "loss": 0.0777, "step": 1660 }, { "epoch": 0.495990495990496, "grad_norm": 1.0824613571166992, "learning_rate": 2.7024057024057027e-05, "loss": 0.0707, "step": 1670 }, { "epoch": 0.498960498960499, "grad_norm": 0.7362037897109985, "learning_rate": 2.7006237006237005e-05, "loss": 0.0653, "step": 1680 }, { "epoch": 0.5019305019305019, "grad_norm": 0.8872225284576416, "learning_rate": 2.6988416988416988e-05, "loss": 0.0737, "step": 1690 }, { "epoch": 0.5049005049005049, "grad_norm": 1.3285760879516602, "learning_rate": 2.697059697059697e-05, "loss": 0.074, "step": 1700 }, { "epoch": 0.5078705078705079, "grad_norm": 0.8795962929725647, "learning_rate": 2.6952776952776952e-05, "loss": 0.0825, "step": 1710 }, { "epoch": 0.5108405108405109, "grad_norm": 1.0784797668457031, "learning_rate": 2.6934956934956937e-05, "loss": 0.0792, "step": 1720 }, { "epoch": 0.5138105138105138, "grad_norm": 1.0349229574203491, "learning_rate": 2.691713691713692e-05, "loss": 0.0775, "step": 1730 }, { "epoch": 0.5167805167805167, "grad_norm": 0.4279988706111908, "learning_rate": 2.6899316899316902e-05, "loss": 0.0746, "step": 1740 }, { "epoch": 0.5197505197505198, "grad_norm": 0.7496609091758728, "learning_rate": 2.688149688149688e-05, "loss": 0.0501, "step": 1750 }, { "epoch": 0.5227205227205227, "grad_norm": 0.5489044785499573, "learning_rate": 2.6863676863676863e-05, "loss": 0.0464, "step": 1760 }, { "epoch": 0.5256905256905257, "grad_norm": 0.6475281119346619, "learning_rate": 2.6845856845856845e-05, "loss": 0.0732, "step": 1770 }, { "epoch": 0.5286605286605287, "grad_norm": 0.9776890873908997, "learning_rate": 2.6828036828036827e-05, "loss": 0.0755, "step": 1780 }, { "epoch": 0.5316305316305316, "grad_norm": 0.8436118364334106, "learning_rate": 2.6810216810216813e-05, "loss": 0.0743, "step": 1790 }, { "epoch": 0.5346005346005346, "grad_norm": 1.0720716714859009, "learning_rate": 2.6792396792396795e-05, "loss": 0.0821, "step": 1800 }, { "epoch": 0.5375705375705375, "grad_norm": 1.699569582939148, "learning_rate": 2.6774576774576777e-05, "loss": 0.0664, "step": 1810 }, { "epoch": 0.5405405405405406, "grad_norm": 0.975058376789093, "learning_rate": 2.6756756756756756e-05, "loss": 0.0699, "step": 1820 }, { "epoch": 0.5435105435105435, "grad_norm": 0.9048154950141907, "learning_rate": 2.6738936738936738e-05, "loss": 0.0862, "step": 1830 }, { "epoch": 0.5464805464805464, "grad_norm": 1.2828189134597778, "learning_rate": 2.672111672111672e-05, "loss": 0.0683, "step": 1840 }, { "epoch": 0.5494505494505495, "grad_norm": 1.5076489448547363, "learning_rate": 2.6703296703296702e-05, "loss": 0.0827, "step": 1850 }, { "epoch": 0.5524205524205524, "grad_norm": 0.8987582325935364, "learning_rate": 2.6685476685476688e-05, "loss": 0.0637, "step": 1860 }, { "epoch": 0.5553905553905554, "grad_norm": 0.7532503008842468, "learning_rate": 2.666765666765667e-05, "loss": 0.0847, "step": 1870 }, { "epoch": 0.5583605583605583, "grad_norm": 0.8142328858375549, "learning_rate": 2.6649836649836652e-05, "loss": 0.0732, "step": 1880 }, { "epoch": 0.5613305613305614, "grad_norm": 0.6130629181861877, "learning_rate": 2.663201663201663e-05, "loss": 0.0858, "step": 1890 }, { "epoch": 0.5643005643005643, "grad_norm": 1.0010277032852173, "learning_rate": 2.6614196614196613e-05, "loss": 0.0773, "step": 1900 }, { "epoch": 0.5672705672705672, "grad_norm": 0.8659054040908813, "learning_rate": 2.6596376596376595e-05, "loss": 0.0778, "step": 1910 }, { "epoch": 0.5702405702405703, "grad_norm": 0.7538540959358215, "learning_rate": 2.6578556578556577e-05, "loss": 0.071, "step": 1920 }, { "epoch": 0.5732105732105732, "grad_norm": 0.9332623481750488, "learning_rate": 2.6560736560736563e-05, "loss": 0.0795, "step": 1930 }, { "epoch": 0.5761805761805762, "grad_norm": 1.3811964988708496, "learning_rate": 2.6542916542916545e-05, "loss": 0.0782, "step": 1940 }, { "epoch": 0.5791505791505791, "grad_norm": 0.8431365489959717, "learning_rate": 2.6525096525096527e-05, "loss": 0.0727, "step": 1950 }, { "epoch": 0.5821205821205822, "grad_norm": 0.6309846639633179, "learning_rate": 2.6507276507276506e-05, "loss": 0.074, "step": 1960 }, { "epoch": 0.5850905850905851, "grad_norm": 1.0400676727294922, "learning_rate": 2.648945648945649e-05, "loss": 0.0683, "step": 1970 }, { "epoch": 0.588060588060588, "grad_norm": 1.1572082042694092, "learning_rate": 2.647163647163647e-05, "loss": 0.0718, "step": 1980 }, { "epoch": 0.5910305910305911, "grad_norm": 0.6798633337020874, "learning_rate": 2.6453816453816453e-05, "loss": 0.0695, "step": 1990 }, { "epoch": 0.594000594000594, "grad_norm": 0.49746111035346985, "learning_rate": 2.6435996435996438e-05, "loss": 0.0726, "step": 2000 }, { "epoch": 0.596970596970597, "grad_norm": 0.7728568315505981, "learning_rate": 2.641817641817642e-05, "loss": 0.0681, "step": 2010 }, { "epoch": 0.5999405999405999, "grad_norm": 1.0050711631774902, "learning_rate": 2.6400356400356403e-05, "loss": 0.0783, "step": 2020 }, { "epoch": 0.6029106029106029, "grad_norm": 0.6406471729278564, "learning_rate": 2.638253638253638e-05, "loss": 0.0743, "step": 2030 }, { "epoch": 0.6058806058806059, "grad_norm": 0.5787233710289001, "learning_rate": 2.6364716364716364e-05, "loss": 0.0692, "step": 2040 }, { "epoch": 0.6088506088506088, "grad_norm": 1.646081566810608, "learning_rate": 2.6346896346896346e-05, "loss": 0.0712, "step": 2050 }, { "epoch": 0.6118206118206119, "grad_norm": 1.0009373426437378, "learning_rate": 2.6329076329076328e-05, "loss": 0.0775, "step": 2060 }, { "epoch": 0.6147906147906148, "grad_norm": 0.7711787819862366, "learning_rate": 2.6311256311256313e-05, "loss": 0.0649, "step": 2070 }, { "epoch": 0.6177606177606177, "grad_norm": 0.6170060038566589, "learning_rate": 2.6293436293436296e-05, "loss": 0.0722, "step": 2080 }, { "epoch": 0.6207306207306207, "grad_norm": 0.5135714411735535, "learning_rate": 2.6275616275616278e-05, "loss": 0.0676, "step": 2090 }, { "epoch": 0.6237006237006237, "grad_norm": 0.6718676090240479, "learning_rate": 2.625779625779626e-05, "loss": 0.0748, "step": 2100 }, { "epoch": 0.6266706266706267, "grad_norm": 0.47525379061698914, "learning_rate": 2.623997623997624e-05, "loss": 0.0685, "step": 2110 }, { "epoch": 0.6296406296406296, "grad_norm": 1.864790916442871, "learning_rate": 2.622215622215622e-05, "loss": 0.0934, "step": 2120 }, { "epoch": 0.6326106326106327, "grad_norm": 1.857503890991211, "learning_rate": 2.6204336204336203e-05, "loss": 0.0736, "step": 2130 }, { "epoch": 0.6355806355806356, "grad_norm": 0.8403912782669067, "learning_rate": 2.618651618651619e-05, "loss": 0.0686, "step": 2140 }, { "epoch": 0.6385506385506385, "grad_norm": 1.2553832530975342, "learning_rate": 2.616869616869617e-05, "loss": 0.075, "step": 2150 }, { "epoch": 0.6415206415206415, "grad_norm": 0.75575190782547, "learning_rate": 2.6150876150876153e-05, "loss": 0.072, "step": 2160 }, { "epoch": 0.6444906444906445, "grad_norm": 0.7265666723251343, "learning_rate": 2.6133056133056135e-05, "loss": 0.081, "step": 2170 }, { "epoch": 0.6474606474606475, "grad_norm": 0.4849323332309723, "learning_rate": 2.6115236115236114e-05, "loss": 0.0801, "step": 2180 }, { "epoch": 0.6504306504306504, "grad_norm": 1.0659674406051636, "learning_rate": 2.6097416097416096e-05, "loss": 0.07, "step": 2190 }, { "epoch": 0.6534006534006535, "grad_norm": 1.1422368288040161, "learning_rate": 2.6079596079596078e-05, "loss": 0.0689, "step": 2200 }, { "epoch": 0.6563706563706564, "grad_norm": 0.6629343032836914, "learning_rate": 2.6061776061776064e-05, "loss": 0.0647, "step": 2210 }, { "epoch": 0.6593406593406593, "grad_norm": 1.0721797943115234, "learning_rate": 2.6043956043956046e-05, "loss": 0.065, "step": 2220 }, { "epoch": 0.6623106623106623, "grad_norm": 1.3788233995437622, "learning_rate": 2.6026136026136028e-05, "loss": 0.0743, "step": 2230 }, { "epoch": 0.6652806652806653, "grad_norm": 1.3036271333694458, "learning_rate": 2.600831600831601e-05, "loss": 0.0883, "step": 2240 }, { "epoch": 0.6682506682506683, "grad_norm": 0.8231455683708191, "learning_rate": 2.599049599049599e-05, "loss": 0.0684, "step": 2250 }, { "epoch": 0.6712206712206712, "grad_norm": 1.1133084297180176, "learning_rate": 2.597267597267597e-05, "loss": 0.0658, "step": 2260 }, { "epoch": 0.6741906741906742, "grad_norm": 1.1839704513549805, "learning_rate": 2.5954855954855953e-05, "loss": 0.0613, "step": 2270 }, { "epoch": 0.6771606771606772, "grad_norm": 0.9162219762802124, "learning_rate": 2.593703593703594e-05, "loss": 0.0611, "step": 2280 }, { "epoch": 0.6801306801306801, "grad_norm": 0.9693069458007812, "learning_rate": 2.591921591921592e-05, "loss": 0.0738, "step": 2290 }, { "epoch": 0.6831006831006831, "grad_norm": 1.2180196046829224, "learning_rate": 2.5901395901395903e-05, "loss": 0.0706, "step": 2300 }, { "epoch": 0.6860706860706861, "grad_norm": 1.0316444635391235, "learning_rate": 2.5883575883575886e-05, "loss": 0.0678, "step": 2310 }, { "epoch": 0.689040689040689, "grad_norm": 0.9395328164100647, "learning_rate": 2.5865755865755864e-05, "loss": 0.0745, "step": 2320 }, { "epoch": 0.692010692010692, "grad_norm": 0.9199443459510803, "learning_rate": 2.5847935847935846e-05, "loss": 0.0736, "step": 2330 }, { "epoch": 0.694980694980695, "grad_norm": 0.6675525307655334, "learning_rate": 2.583011583011583e-05, "loss": 0.07, "step": 2340 }, { "epoch": 0.697950697950698, "grad_norm": 1.0296847820281982, "learning_rate": 2.5812295812295814e-05, "loss": 0.0667, "step": 2350 }, { "epoch": 0.7009207009207009, "grad_norm": 0.7614896893501282, "learning_rate": 2.5794475794475796e-05, "loss": 0.0598, "step": 2360 }, { "epoch": 0.7038907038907039, "grad_norm": 0.3479407727718353, "learning_rate": 2.577665577665578e-05, "loss": 0.073, "step": 2370 }, { "epoch": 0.7068607068607069, "grad_norm": 0.906836986541748, "learning_rate": 2.575883575883576e-05, "loss": 0.066, "step": 2380 }, { "epoch": 0.7098307098307098, "grad_norm": 1.1523551940917969, "learning_rate": 2.574101574101574e-05, "loss": 0.0752, "step": 2390 }, { "epoch": 0.7128007128007128, "grad_norm": 1.5044126510620117, "learning_rate": 2.572319572319572e-05, "loss": 0.0629, "step": 2400 }, { "epoch": 0.7157707157707157, "grad_norm": 0.7415446639060974, "learning_rate": 2.5705375705375707e-05, "loss": 0.0851, "step": 2410 }, { "epoch": 0.7187407187407188, "grad_norm": 0.9454631805419922, "learning_rate": 2.568755568755569e-05, "loss": 0.0818, "step": 2420 }, { "epoch": 0.7217107217107217, "grad_norm": 1.3333286046981812, "learning_rate": 2.566973566973567e-05, "loss": 0.0781, "step": 2430 }, { "epoch": 0.7246807246807246, "grad_norm": 1.030158519744873, "learning_rate": 2.5651915651915654e-05, "loss": 0.0752, "step": 2440 }, { "epoch": 0.7276507276507277, "grad_norm": 0.9422637820243835, "learning_rate": 2.5634095634095636e-05, "loss": 0.0779, "step": 2450 }, { "epoch": 0.7306207306207306, "grad_norm": 0.8229959011077881, "learning_rate": 2.5616275616275615e-05, "loss": 0.0644, "step": 2460 }, { "epoch": 0.7335907335907336, "grad_norm": 0.7680384516716003, "learning_rate": 2.5598455598455597e-05, "loss": 0.0717, "step": 2470 }, { "epoch": 0.7365607365607365, "grad_norm": 1.118505835533142, "learning_rate": 2.5580635580635582e-05, "loss": 0.0777, "step": 2480 }, { "epoch": 0.7395307395307396, "grad_norm": 0.7873930335044861, "learning_rate": 2.5562815562815565e-05, "loss": 0.0843, "step": 2490 }, { "epoch": 0.7425007425007425, "grad_norm": 0.688714861869812, "learning_rate": 2.5544995544995547e-05, "loss": 0.0664, "step": 2500 }, { "epoch": 0.7454707454707454, "grad_norm": 0.6850858330726624, "learning_rate": 2.552717552717553e-05, "loss": 0.0715, "step": 2510 }, { "epoch": 0.7484407484407485, "grad_norm": 0.71670001745224, "learning_rate": 2.550935550935551e-05, "loss": 0.0579, "step": 2520 }, { "epoch": 0.7514107514107514, "grad_norm": 0.8807306885719299, "learning_rate": 2.549153549153549e-05, "loss": 0.0568, "step": 2530 }, { "epoch": 0.7543807543807544, "grad_norm": 1.108122706413269, "learning_rate": 2.5473715473715472e-05, "loss": 0.0726, "step": 2540 }, { "epoch": 0.7573507573507573, "grad_norm": 0.91880863904953, "learning_rate": 2.5455895455895458e-05, "loss": 0.0697, "step": 2550 }, { "epoch": 0.7603207603207603, "grad_norm": 1.2890892028808594, "learning_rate": 2.543807543807544e-05, "loss": 0.0742, "step": 2560 }, { "epoch": 0.7632907632907633, "grad_norm": 1.3207858800888062, "learning_rate": 2.5420255420255422e-05, "loss": 0.0602, "step": 2570 }, { "epoch": 0.7662607662607662, "grad_norm": 1.0641006231307983, "learning_rate": 2.5402435402435404e-05, "loss": 0.0783, "step": 2580 }, { "epoch": 0.7692307692307693, "grad_norm": 0.7104412317276001, "learning_rate": 2.5384615384615386e-05, "loss": 0.0622, "step": 2590 }, { "epoch": 0.7722007722007722, "grad_norm": 1.2291561365127563, "learning_rate": 2.5366795366795365e-05, "loss": 0.0789, "step": 2600 }, { "epoch": 0.7751707751707752, "grad_norm": 0.9593638181686401, "learning_rate": 2.5348975348975347e-05, "loss": 0.0604, "step": 2610 }, { "epoch": 0.7781407781407781, "grad_norm": 0.7314882278442383, "learning_rate": 2.5331155331155333e-05, "loss": 0.0679, "step": 2620 }, { "epoch": 0.7811107811107811, "grad_norm": 1.0600396394729614, "learning_rate": 2.5313335313335315e-05, "loss": 0.0723, "step": 2630 }, { "epoch": 0.7840807840807841, "grad_norm": 0.8725117444992065, "learning_rate": 2.5295515295515297e-05, "loss": 0.0627, "step": 2640 }, { "epoch": 0.787050787050787, "grad_norm": 0.7169741988182068, "learning_rate": 2.527769527769528e-05, "loss": 0.0674, "step": 2650 }, { "epoch": 0.7900207900207901, "grad_norm": 0.8746957182884216, "learning_rate": 2.525987525987526e-05, "loss": 0.0729, "step": 2660 }, { "epoch": 0.792990792990793, "grad_norm": 0.5545559525489807, "learning_rate": 2.524205524205524e-05, "loss": 0.0829, "step": 2670 }, { "epoch": 0.7959607959607959, "grad_norm": 1.095009207725525, "learning_rate": 2.5224235224235222e-05, "loss": 0.0705, "step": 2680 }, { "epoch": 0.7989307989307989, "grad_norm": 1.138685941696167, "learning_rate": 2.5206415206415208e-05, "loss": 0.0683, "step": 2690 }, { "epoch": 0.8019008019008019, "grad_norm": 0.8513908386230469, "learning_rate": 2.518859518859519e-05, "loss": 0.0721, "step": 2700 }, { "epoch": 0.8048708048708049, "grad_norm": 0.6944445967674255, "learning_rate": 2.5170775170775172e-05, "loss": 0.0666, "step": 2710 }, { "epoch": 0.8078408078408078, "grad_norm": 0.7438215613365173, "learning_rate": 2.5152955152955155e-05, "loss": 0.0587, "step": 2720 }, { "epoch": 0.8108108108108109, "grad_norm": 1.1805411577224731, "learning_rate": 2.5135135135135137e-05, "loss": 0.0701, "step": 2730 }, { "epoch": 0.8137808137808138, "grad_norm": 0.8039736151695251, "learning_rate": 2.511731511731512e-05, "loss": 0.0611, "step": 2740 }, { "epoch": 0.8167508167508167, "grad_norm": 0.9000430107116699, "learning_rate": 2.5099495099495098e-05, "loss": 0.076, "step": 2750 }, { "epoch": 0.8197208197208197, "grad_norm": 1.0777641534805298, "learning_rate": 2.5081675081675083e-05, "loss": 0.0683, "step": 2760 }, { "epoch": 0.8226908226908227, "grad_norm": 1.0703097581863403, "learning_rate": 2.5063855063855065e-05, "loss": 0.0684, "step": 2770 }, { "epoch": 0.8256608256608257, "grad_norm": 1.3522082567214966, "learning_rate": 2.5046035046035048e-05, "loss": 0.0687, "step": 2780 }, { "epoch": 0.8286308286308286, "grad_norm": 0.916243314743042, "learning_rate": 2.502821502821503e-05, "loss": 0.071, "step": 2790 }, { "epoch": 0.8316008316008316, "grad_norm": 0.844256579875946, "learning_rate": 2.5010395010395012e-05, "loss": 0.0819, "step": 2800 }, { "epoch": 0.8345708345708346, "grad_norm": 0.5879322290420532, "learning_rate": 2.4992574992574994e-05, "loss": 0.0723, "step": 2810 }, { "epoch": 0.8375408375408375, "grad_norm": 1.0017356872558594, "learning_rate": 2.4974754974754973e-05, "loss": 0.0723, "step": 2820 }, { "epoch": 0.8405108405108405, "grad_norm": 1.0885043144226074, "learning_rate": 2.495693495693496e-05, "loss": 0.0607, "step": 2830 }, { "epoch": 0.8434808434808435, "grad_norm": 0.7798848748207092, "learning_rate": 2.493911493911494e-05, "loss": 0.057, "step": 2840 }, { "epoch": 0.8464508464508465, "grad_norm": 1.2266145944595337, "learning_rate": 2.4921294921294923e-05, "loss": 0.0785, "step": 2850 }, { "epoch": 0.8494208494208494, "grad_norm": 1.3794151544570923, "learning_rate": 2.4903474903474905e-05, "loss": 0.0711, "step": 2860 }, { "epoch": 0.8523908523908524, "grad_norm": 1.2152888774871826, "learning_rate": 2.4885654885654887e-05, "loss": 0.0783, "step": 2870 }, { "epoch": 0.8553608553608554, "grad_norm": 0.5685151219367981, "learning_rate": 2.486783486783487e-05, "loss": 0.0818, "step": 2880 }, { "epoch": 0.8583308583308583, "grad_norm": 1.0047805309295654, "learning_rate": 2.4850014850014848e-05, "loss": 0.0768, "step": 2890 }, { "epoch": 0.8613008613008613, "grad_norm": 0.4588945806026459, "learning_rate": 2.4832194832194834e-05, "loss": 0.0616, "step": 2900 }, { "epoch": 0.8642708642708643, "grad_norm": 0.815482497215271, "learning_rate": 2.4814374814374816e-05, "loss": 0.0734, "step": 2910 }, { "epoch": 0.8672408672408672, "grad_norm": 1.1462020874023438, "learning_rate": 2.4796554796554798e-05, "loss": 0.0716, "step": 2920 }, { "epoch": 0.8702108702108702, "grad_norm": 1.8708317279815674, "learning_rate": 2.477873477873478e-05, "loss": 0.0653, "step": 2930 }, { "epoch": 0.8731808731808732, "grad_norm": 0.5544032454490662, "learning_rate": 2.4760914760914762e-05, "loss": 0.07, "step": 2940 }, { "epoch": 0.8761508761508762, "grad_norm": 0.7917482256889343, "learning_rate": 2.4743094743094744e-05, "loss": 0.0656, "step": 2950 }, { "epoch": 0.8791208791208791, "grad_norm": 1.3552825450897217, "learning_rate": 2.4725274725274723e-05, "loss": 0.064, "step": 2960 }, { "epoch": 0.882090882090882, "grad_norm": 0.7009034752845764, "learning_rate": 2.470745470745471e-05, "loss": 0.0707, "step": 2970 }, { "epoch": 0.8850608850608851, "grad_norm": 0.835366427898407, "learning_rate": 2.468963468963469e-05, "loss": 0.0724, "step": 2980 }, { "epoch": 0.888030888030888, "grad_norm": 0.8953189253807068, "learning_rate": 2.4671814671814673e-05, "loss": 0.0692, "step": 2990 }, { "epoch": 0.891000891000891, "grad_norm": 1.1693567037582397, "learning_rate": 2.4653994653994655e-05, "loss": 0.0737, "step": 3000 }, { "epoch": 0.893970893970894, "grad_norm": 0.7871369123458862, "learning_rate": 2.4636174636174637e-05, "loss": 0.0812, "step": 3010 }, { "epoch": 0.896940896940897, "grad_norm": 0.7140387296676636, "learning_rate": 2.461835461835462e-05, "loss": 0.0703, "step": 3020 }, { "epoch": 0.8999108999108999, "grad_norm": 0.7188717126846313, "learning_rate": 2.46005346005346e-05, "loss": 0.062, "step": 3030 }, { "epoch": 0.9028809028809028, "grad_norm": 0.5643990635871887, "learning_rate": 2.4582714582714584e-05, "loss": 0.066, "step": 3040 }, { "epoch": 0.9058509058509059, "grad_norm": 0.8118051886558533, "learning_rate": 2.4564894564894566e-05, "loss": 0.0777, "step": 3050 }, { "epoch": 0.9088209088209088, "grad_norm": 0.5903987884521484, "learning_rate": 2.454707454707455e-05, "loss": 0.0691, "step": 3060 }, { "epoch": 0.9117909117909118, "grad_norm": 0.4959113299846649, "learning_rate": 2.452925452925453e-05, "loss": 0.0706, "step": 3070 }, { "epoch": 0.9147609147609148, "grad_norm": 0.6881820559501648, "learning_rate": 2.4511434511434513e-05, "loss": 0.0793, "step": 3080 }, { "epoch": 0.9177309177309178, "grad_norm": 1.0045626163482666, "learning_rate": 2.4493614493614495e-05, "loss": 0.0669, "step": 3090 }, { "epoch": 0.9207009207009207, "grad_norm": 0.6501134037971497, "learning_rate": 2.4475794475794474e-05, "loss": 0.0618, "step": 3100 }, { "epoch": 0.9236709236709236, "grad_norm": 0.7777647972106934, "learning_rate": 2.445797445797446e-05, "loss": 0.0802, "step": 3110 }, { "epoch": 0.9266409266409267, "grad_norm": 0.9002485275268555, "learning_rate": 2.444015444015444e-05, "loss": 0.0686, "step": 3120 }, { "epoch": 0.9296109296109296, "grad_norm": 0.6177936792373657, "learning_rate": 2.4422334422334424e-05, "loss": 0.0688, "step": 3130 }, { "epoch": 0.9325809325809326, "grad_norm": 0.6512916088104248, "learning_rate": 2.4404514404514406e-05, "loss": 0.068, "step": 3140 }, { "epoch": 0.9355509355509356, "grad_norm": 0.670874297618866, "learning_rate": 2.4386694386694388e-05, "loss": 0.0804, "step": 3150 }, { "epoch": 0.9385209385209385, "grad_norm": 0.9932950139045715, "learning_rate": 2.436887436887437e-05, "loss": 0.0639, "step": 3160 }, { "epoch": 0.9414909414909415, "grad_norm": 0.8478681445121765, "learning_rate": 2.435105435105435e-05, "loss": 0.0617, "step": 3170 }, { "epoch": 0.9444609444609444, "grad_norm": 0.6210806369781494, "learning_rate": 2.4333234333234334e-05, "loss": 0.0651, "step": 3180 }, { "epoch": 0.9474309474309475, "grad_norm": 0.8721588850021362, "learning_rate": 2.4315414315414317e-05, "loss": 0.0849, "step": 3190 }, { "epoch": 0.9504009504009504, "grad_norm": 0.5963863730430603, "learning_rate": 2.42975942975943e-05, "loss": 0.063, "step": 3200 }, { "epoch": 0.9533709533709533, "grad_norm": 1.1241910457611084, "learning_rate": 2.427977427977428e-05, "loss": 0.0669, "step": 3210 }, { "epoch": 0.9563409563409564, "grad_norm": 1.101955771446228, "learning_rate": 2.4261954261954263e-05, "loss": 0.067, "step": 3220 }, { "epoch": 0.9593109593109593, "grad_norm": 1.1508077383041382, "learning_rate": 2.4244134244134245e-05, "loss": 0.0666, "step": 3230 }, { "epoch": 0.9622809622809623, "grad_norm": 0.8571543097496033, "learning_rate": 2.4226314226314224e-05, "loss": 0.0811, "step": 3240 }, { "epoch": 0.9652509652509652, "grad_norm": 0.7995705008506775, "learning_rate": 2.420849420849421e-05, "loss": 0.0602, "step": 3250 }, { "epoch": 0.9682209682209683, "grad_norm": 0.6367846727371216, "learning_rate": 2.4190674190674192e-05, "loss": 0.0635, "step": 3260 }, { "epoch": 0.9711909711909712, "grad_norm": 0.6590407490730286, "learning_rate": 2.4172854172854174e-05, "loss": 0.0599, "step": 3270 }, { "epoch": 0.9741609741609741, "grad_norm": 1.3518911600112915, "learning_rate": 2.4155034155034156e-05, "loss": 0.0816, "step": 3280 }, { "epoch": 0.9771309771309772, "grad_norm": 0.7235366106033325, "learning_rate": 2.4137214137214138e-05, "loss": 0.0585, "step": 3290 }, { "epoch": 0.9801009801009801, "grad_norm": 0.7149487733840942, "learning_rate": 2.411939411939412e-05, "loss": 0.0611, "step": 3300 }, { "epoch": 0.9830709830709831, "grad_norm": 0.5758341550827026, "learning_rate": 2.4101574101574103e-05, "loss": 0.0575, "step": 3310 }, { "epoch": 0.986040986040986, "grad_norm": 0.6312692761421204, "learning_rate": 2.4083754083754085e-05, "loss": 0.0696, "step": 3320 }, { "epoch": 0.989010989010989, "grad_norm": 0.6071924567222595, "learning_rate": 2.4065934065934067e-05, "loss": 0.0736, "step": 3330 }, { "epoch": 0.991980991980992, "grad_norm": 1.0844005346298218, "learning_rate": 2.404811404811405e-05, "loss": 0.0608, "step": 3340 }, { "epoch": 0.9949509949509949, "grad_norm": 0.7785583734512329, "learning_rate": 2.403029403029403e-05, "loss": 0.0737, "step": 3350 }, { "epoch": 0.997920997920998, "grad_norm": 0.7778565287590027, "learning_rate": 2.4012474012474013e-05, "loss": 0.0584, "step": 3360 }, { "epoch": 1.0, "eval_f1": 0.33031292965957215, "eval_loss": 0.05296385660767555, "eval_runtime": 760.7693, "eval_samples_per_second": 49.974, "eval_steps_per_second": 0.782, "step": 3367 }, { "epoch": 1.0008910008910008, "grad_norm": 0.5195960402488708, "learning_rate": 2.3994653994653996e-05, "loss": 0.0643, "step": 3370 }, { "epoch": 1.0038610038610039, "grad_norm": 1.0091626644134521, "learning_rate": 2.3976833976833978e-05, "loss": 0.0659, "step": 3380 }, { "epoch": 1.006831006831007, "grad_norm": 0.6962494254112244, "learning_rate": 2.395901395901396e-05, "loss": 0.0741, "step": 3390 }, { "epoch": 1.0098010098010097, "grad_norm": 0.5195145010948181, "learning_rate": 2.3941193941193942e-05, "loss": 0.0661, "step": 3400 }, { "epoch": 1.0127710127710128, "grad_norm": 0.40490421652793884, "learning_rate": 2.3923373923373924e-05, "loss": 0.0667, "step": 3410 }, { "epoch": 1.0157410157410158, "grad_norm": 0.8673891425132751, "learning_rate": 2.3905553905553906e-05, "loss": 0.0648, "step": 3420 }, { "epoch": 1.0187110187110187, "grad_norm": 0.6482574343681335, "learning_rate": 2.388773388773389e-05, "loss": 0.0683, "step": 3430 }, { "epoch": 1.0216810216810217, "grad_norm": 1.0671459436416626, "learning_rate": 2.386991386991387e-05, "loss": 0.0731, "step": 3440 }, { "epoch": 1.0246510246510248, "grad_norm": 1.1997050046920776, "learning_rate": 2.3852093852093853e-05, "loss": 0.0824, "step": 3450 }, { "epoch": 1.0276210276210276, "grad_norm": 1.1399210691452026, "learning_rate": 2.3834273834273835e-05, "loss": 0.0631, "step": 3460 }, { "epoch": 1.0305910305910306, "grad_norm": 1.4414496421813965, "learning_rate": 2.3816453816453817e-05, "loss": 0.0644, "step": 3470 }, { "epoch": 1.0335610335610335, "grad_norm": 0.7612940669059753, "learning_rate": 2.37986337986338e-05, "loss": 0.0608, "step": 3480 }, { "epoch": 1.0365310365310365, "grad_norm": 1.8347103595733643, "learning_rate": 2.378081378081378e-05, "loss": 0.0853, "step": 3490 }, { "epoch": 1.0395010395010396, "grad_norm": 0.7422579526901245, "learning_rate": 2.3762993762993764e-05, "loss": 0.065, "step": 3500 }, { "epoch": 1.0424710424710424, "grad_norm": 0.4676852524280548, "learning_rate": 2.3745173745173746e-05, "loss": 0.0686, "step": 3510 }, { "epoch": 1.0454410454410454, "grad_norm": 0.7909456491470337, "learning_rate": 2.3727353727353728e-05, "loss": 0.0808, "step": 3520 }, { "epoch": 1.0484110484110485, "grad_norm": 1.0909233093261719, "learning_rate": 2.370953370953371e-05, "loss": 0.0786, "step": 3530 }, { "epoch": 1.0513810513810513, "grad_norm": 0.9881356358528137, "learning_rate": 2.3691713691713692e-05, "loss": 0.0745, "step": 3540 }, { "epoch": 1.0543510543510544, "grad_norm": 0.6421000361442566, "learning_rate": 2.3673893673893675e-05, "loss": 0.0681, "step": 3550 }, { "epoch": 1.0573210573210574, "grad_norm": 1.2213356494903564, "learning_rate": 2.3656073656073657e-05, "loss": 0.0691, "step": 3560 }, { "epoch": 1.0602910602910602, "grad_norm": 1.1838406324386597, "learning_rate": 2.363825363825364e-05, "loss": 0.0727, "step": 3570 }, { "epoch": 1.0632610632610633, "grad_norm": 0.8688098192214966, "learning_rate": 2.362043362043362e-05, "loss": 0.059, "step": 3580 }, { "epoch": 1.0662310662310661, "grad_norm": 0.891708493232727, "learning_rate": 2.3602613602613603e-05, "loss": 0.0652, "step": 3590 }, { "epoch": 1.0692010692010692, "grad_norm": 0.8974260091781616, "learning_rate": 2.3584793584793586e-05, "loss": 0.0627, "step": 3600 }, { "epoch": 1.0721710721710722, "grad_norm": 0.9412424564361572, "learning_rate": 2.3566973566973568e-05, "loss": 0.0773, "step": 3610 }, { "epoch": 1.075141075141075, "grad_norm": 0.8957776427268982, "learning_rate": 2.354915354915355e-05, "loss": 0.0771, "step": 3620 }, { "epoch": 1.078111078111078, "grad_norm": 0.9389250874519348, "learning_rate": 2.3531333531333532e-05, "loss": 0.0631, "step": 3630 }, { "epoch": 1.0810810810810811, "grad_norm": 0.716650664806366, "learning_rate": 2.3513513513513514e-05, "loss": 0.0525, "step": 3640 }, { "epoch": 1.084051084051084, "grad_norm": 0.779859721660614, "learning_rate": 2.3495693495693496e-05, "loss": 0.0824, "step": 3650 }, { "epoch": 1.087021087021087, "grad_norm": 0.5671702027320862, "learning_rate": 2.347787347787348e-05, "loss": 0.0783, "step": 3660 }, { "epoch": 1.08999108999109, "grad_norm": 0.8227300643920898, "learning_rate": 2.346005346005346e-05, "loss": 0.0463, "step": 3670 }, { "epoch": 1.092961092961093, "grad_norm": 0.6845151782035828, "learning_rate": 2.3442233442233443e-05, "loss": 0.0614, "step": 3680 }, { "epoch": 1.095931095931096, "grad_norm": 1.5416568517684937, "learning_rate": 2.3424413424413425e-05, "loss": 0.0752, "step": 3690 }, { "epoch": 1.098901098901099, "grad_norm": 0.8240406513214111, "learning_rate": 2.3406593406593407e-05, "loss": 0.0693, "step": 3700 }, { "epoch": 1.1018711018711018, "grad_norm": 0.713246762752533, "learning_rate": 2.338877338877339e-05, "loss": 0.051, "step": 3710 }, { "epoch": 1.1048411048411049, "grad_norm": 0.9789752960205078, "learning_rate": 2.337095337095337e-05, "loss": 0.0681, "step": 3720 }, { "epoch": 1.107811107811108, "grad_norm": 1.0538547039031982, "learning_rate": 2.3353133353133354e-05, "loss": 0.0622, "step": 3730 }, { "epoch": 1.1107811107811107, "grad_norm": 0.9167300462722778, "learning_rate": 2.3335313335313336e-05, "loss": 0.0695, "step": 3740 }, { "epoch": 1.1137511137511138, "grad_norm": 0.740674614906311, "learning_rate": 2.3317493317493318e-05, "loss": 0.0666, "step": 3750 }, { "epoch": 1.1167211167211166, "grad_norm": 0.6684421300888062, "learning_rate": 2.32996732996733e-05, "loss": 0.0711, "step": 3760 }, { "epoch": 1.1196911196911197, "grad_norm": 0.6935126781463623, "learning_rate": 2.3281853281853282e-05, "loss": 0.0653, "step": 3770 }, { "epoch": 1.1226611226611227, "grad_norm": 1.0922201871871948, "learning_rate": 2.3264033264033265e-05, "loss": 0.0653, "step": 3780 }, { "epoch": 1.1256311256311256, "grad_norm": 0.747627854347229, "learning_rate": 2.3246213246213247e-05, "loss": 0.0672, "step": 3790 }, { "epoch": 1.1286011286011286, "grad_norm": 1.3024978637695312, "learning_rate": 2.322839322839323e-05, "loss": 0.0725, "step": 3800 }, { "epoch": 1.1315711315711316, "grad_norm": 0.9191585779190063, "learning_rate": 2.321057321057321e-05, "loss": 0.0662, "step": 3810 }, { "epoch": 1.1345411345411345, "grad_norm": 1.0653009414672852, "learning_rate": 2.3192753192753193e-05, "loss": 0.0685, "step": 3820 }, { "epoch": 1.1375111375111375, "grad_norm": 1.1122196912765503, "learning_rate": 2.3174933174933175e-05, "loss": 0.073, "step": 3830 }, { "epoch": 1.1404811404811406, "grad_norm": 0.8093435764312744, "learning_rate": 2.3157113157113158e-05, "loss": 0.0709, "step": 3840 }, { "epoch": 1.1434511434511434, "grad_norm": 0.4847230613231659, "learning_rate": 2.313929313929314e-05, "loss": 0.0625, "step": 3850 }, { "epoch": 1.1464211464211465, "grad_norm": 0.8691696524620056, "learning_rate": 2.3121473121473122e-05, "loss": 0.0558, "step": 3860 }, { "epoch": 1.1493911493911493, "grad_norm": 1.140037178993225, "learning_rate": 2.3103653103653104e-05, "loss": 0.0704, "step": 3870 }, { "epoch": 1.1523611523611523, "grad_norm": 0.8088730573654175, "learning_rate": 2.3085833085833086e-05, "loss": 0.0666, "step": 3880 }, { "epoch": 1.1553311553311554, "grad_norm": 0.6530190110206604, "learning_rate": 2.306801306801307e-05, "loss": 0.0658, "step": 3890 }, { "epoch": 1.1583011583011582, "grad_norm": 0.7550728917121887, "learning_rate": 2.305019305019305e-05, "loss": 0.0715, "step": 3900 }, { "epoch": 1.1612711612711613, "grad_norm": 0.6546505689620972, "learning_rate": 2.3032373032373033e-05, "loss": 0.0682, "step": 3910 }, { "epoch": 1.1642411642411643, "grad_norm": 1.3673691749572754, "learning_rate": 2.3014553014553015e-05, "loss": 0.0628, "step": 3920 }, { "epoch": 1.1672111672111671, "grad_norm": 1.3295186758041382, "learning_rate": 2.2996732996732997e-05, "loss": 0.0632, "step": 3930 }, { "epoch": 1.1701811701811702, "grad_norm": 0.8255389332771301, "learning_rate": 2.297891297891298e-05, "loss": 0.0598, "step": 3940 }, { "epoch": 1.1731511731511732, "grad_norm": 0.7321466207504272, "learning_rate": 2.2961092961092965e-05, "loss": 0.0521, "step": 3950 }, { "epoch": 1.176121176121176, "grad_norm": 1.0865906476974487, "learning_rate": 2.2943272943272944e-05, "loss": 0.0705, "step": 3960 }, { "epoch": 1.179091179091179, "grad_norm": 0.691181480884552, "learning_rate": 2.2925452925452926e-05, "loss": 0.0805, "step": 3970 }, { "epoch": 1.1820611820611822, "grad_norm": 0.6251980662345886, "learning_rate": 2.2907632907632908e-05, "loss": 0.061, "step": 3980 }, { "epoch": 1.185031185031185, "grad_norm": 0.7137186527252197, "learning_rate": 2.288981288981289e-05, "loss": 0.0657, "step": 3990 }, { "epoch": 1.188001188001188, "grad_norm": 0.5930067896842957, "learning_rate": 2.2871992871992872e-05, "loss": 0.065, "step": 4000 }, { "epoch": 1.190971190971191, "grad_norm": 0.8055261373519897, "learning_rate": 2.2854172854172855e-05, "loss": 0.0605, "step": 4010 }, { "epoch": 1.193941193941194, "grad_norm": 0.8491489887237549, "learning_rate": 2.283635283635284e-05, "loss": 0.0768, "step": 4020 }, { "epoch": 1.196911196911197, "grad_norm": 0.5159242749214172, "learning_rate": 2.281853281853282e-05, "loss": 0.0577, "step": 4030 }, { "epoch": 1.1998811998811998, "grad_norm": 1.0682775974273682, "learning_rate": 2.28007128007128e-05, "loss": 0.0669, "step": 4040 }, { "epoch": 1.2028512028512028, "grad_norm": 0.7418850660324097, "learning_rate": 2.2782892782892783e-05, "loss": 0.064, "step": 4050 }, { "epoch": 1.2058212058212059, "grad_norm": 1.3865784406661987, "learning_rate": 2.2765072765072765e-05, "loss": 0.056, "step": 4060 }, { "epoch": 1.2087912087912087, "grad_norm": 1.0742945671081543, "learning_rate": 2.2747252747252748e-05, "loss": 0.0495, "step": 4070 }, { "epoch": 1.2117612117612118, "grad_norm": 1.8992480039596558, "learning_rate": 2.272943272943273e-05, "loss": 0.0603, "step": 4080 }, { "epoch": 1.2147312147312148, "grad_norm": 0.701547384262085, "learning_rate": 2.2711612711612715e-05, "loss": 0.0733, "step": 4090 }, { "epoch": 1.2177012177012176, "grad_norm": 0.7614421844482422, "learning_rate": 2.2693792693792694e-05, "loss": 0.0634, "step": 4100 }, { "epoch": 1.2206712206712207, "grad_norm": 0.6779104471206665, "learning_rate": 2.2675972675972676e-05, "loss": 0.0631, "step": 4110 }, { "epoch": 1.2236412236412235, "grad_norm": 0.7199757695198059, "learning_rate": 2.265815265815266e-05, "loss": 0.0628, "step": 4120 }, { "epoch": 1.2266112266112266, "grad_norm": 0.8466992378234863, "learning_rate": 2.264033264033264e-05, "loss": 0.0784, "step": 4130 }, { "epoch": 1.2295812295812296, "grad_norm": 0.5136345028877258, "learning_rate": 2.2622512622512623e-05, "loss": 0.0866, "step": 4140 }, { "epoch": 1.2325512325512324, "grad_norm": 0.7699841260910034, "learning_rate": 2.2604692604692605e-05, "loss": 0.0607, "step": 4150 }, { "epoch": 1.2355212355212355, "grad_norm": 0.61440509557724, "learning_rate": 2.258687258687259e-05, "loss": 0.0721, "step": 4160 }, { "epoch": 1.2384912384912385, "grad_norm": 0.5147203803062439, "learning_rate": 2.256905256905257e-05, "loss": 0.0615, "step": 4170 }, { "epoch": 1.2414612414612414, "grad_norm": 1.0179754495620728, "learning_rate": 2.255123255123255e-05, "loss": 0.0669, "step": 4180 }, { "epoch": 1.2444312444312444, "grad_norm": 1.2487351894378662, "learning_rate": 2.2533412533412534e-05, "loss": 0.0705, "step": 4190 }, { "epoch": 1.2474012474012475, "grad_norm": 0.8838121294975281, "learning_rate": 2.2515592515592516e-05, "loss": 0.0582, "step": 4200 }, { "epoch": 1.2503712503712503, "grad_norm": 0.5740695595741272, "learning_rate": 2.2497772497772498e-05, "loss": 0.0704, "step": 4210 }, { "epoch": 1.2533412533412533, "grad_norm": 1.0215144157409668, "learning_rate": 2.247995247995248e-05, "loss": 0.0672, "step": 4220 }, { "epoch": 1.2563112563112564, "grad_norm": 0.5340459942817688, "learning_rate": 2.2462132462132466e-05, "loss": 0.0637, "step": 4230 }, { "epoch": 1.2592812592812592, "grad_norm": 1.070460557937622, "learning_rate": 2.2444312444312444e-05, "loss": 0.0545, "step": 4240 }, { "epoch": 1.2622512622512623, "grad_norm": 0.8050958514213562, "learning_rate": 2.2426492426492427e-05, "loss": 0.0732, "step": 4250 }, { "epoch": 1.2652212652212653, "grad_norm": 1.1751881837844849, "learning_rate": 2.240867240867241e-05, "loss": 0.0643, "step": 4260 }, { "epoch": 1.2681912681912682, "grad_norm": 0.7636763453483582, "learning_rate": 2.239085239085239e-05, "loss": 0.0653, "step": 4270 }, { "epoch": 1.2711612711612712, "grad_norm": 0.9791406393051147, "learning_rate": 2.2373032373032373e-05, "loss": 0.0584, "step": 4280 }, { "epoch": 1.2741312741312742, "grad_norm": 0.7061544060707092, "learning_rate": 2.2355212355212355e-05, "loss": 0.0702, "step": 4290 }, { "epoch": 1.277101277101277, "grad_norm": 1.0918234586715698, "learning_rate": 2.233739233739234e-05, "loss": 0.0597, "step": 4300 }, { "epoch": 1.2800712800712801, "grad_norm": 1.0463823080062866, "learning_rate": 2.231957231957232e-05, "loss": 0.0571, "step": 4310 }, { "epoch": 1.2830412830412832, "grad_norm": 0.8148181438446045, "learning_rate": 2.2301752301752302e-05, "loss": 0.0825, "step": 4320 }, { "epoch": 1.286011286011286, "grad_norm": 0.9247533082962036, "learning_rate": 2.2283932283932284e-05, "loss": 0.0643, "step": 4330 }, { "epoch": 1.288981288981289, "grad_norm": 0.8927013278007507, "learning_rate": 2.2266112266112266e-05, "loss": 0.0695, "step": 4340 }, { "epoch": 1.2919512919512919, "grad_norm": 1.1298048496246338, "learning_rate": 2.2248292248292248e-05, "loss": 0.0601, "step": 4350 }, { "epoch": 1.294921294921295, "grad_norm": 0.40580281615257263, "learning_rate": 2.223047223047223e-05, "loss": 0.0742, "step": 4360 }, { "epoch": 1.2978912978912978, "grad_norm": 0.471123069524765, "learning_rate": 2.2212652212652216e-05, "loss": 0.0638, "step": 4370 }, { "epoch": 1.3008613008613008, "grad_norm": 1.53095543384552, "learning_rate": 2.2194832194832195e-05, "loss": 0.0695, "step": 4380 }, { "epoch": 1.3038313038313039, "grad_norm": 0.8293925523757935, "learning_rate": 2.2177012177012177e-05, "loss": 0.0526, "step": 4390 }, { "epoch": 1.3068013068013067, "grad_norm": 0.66612708568573, "learning_rate": 2.215919215919216e-05, "loss": 0.0776, "step": 4400 }, { "epoch": 1.3097713097713097, "grad_norm": 0.9548154473304749, "learning_rate": 2.214137214137214e-05, "loss": 0.0729, "step": 4410 }, { "epoch": 1.3127413127413128, "grad_norm": 0.5580644011497498, "learning_rate": 2.2123552123552123e-05, "loss": 0.0831, "step": 4420 }, { "epoch": 1.3157113157113156, "grad_norm": 0.5851168036460876, "learning_rate": 2.2105732105732106e-05, "loss": 0.0569, "step": 4430 }, { "epoch": 1.3186813186813187, "grad_norm": 0.4882819652557373, "learning_rate": 2.208791208791209e-05, "loss": 0.0692, "step": 4440 }, { "epoch": 1.3216513216513217, "grad_norm": 0.44905969500541687, "learning_rate": 2.207009207009207e-05, "loss": 0.0577, "step": 4450 }, { "epoch": 1.3246213246213245, "grad_norm": 0.8258479237556458, "learning_rate": 2.2052272052272052e-05, "loss": 0.0516, "step": 4460 }, { "epoch": 1.3275913275913276, "grad_norm": 0.8680855631828308, "learning_rate": 2.2034452034452034e-05, "loss": 0.0445, "step": 4470 }, { "epoch": 1.3305613305613306, "grad_norm": 0.6597207188606262, "learning_rate": 2.2016632016632017e-05, "loss": 0.0558, "step": 4480 }, { "epoch": 1.3335313335313335, "grad_norm": 1.0117040872573853, "learning_rate": 2.1998811998812e-05, "loss": 0.0676, "step": 4490 }, { "epoch": 1.3365013365013365, "grad_norm": 0.6703979969024658, "learning_rate": 2.198099198099198e-05, "loss": 0.0517, "step": 4500 }, { "epoch": 1.3394713394713396, "grad_norm": 1.2186622619628906, "learning_rate": 2.1963171963171966e-05, "loss": 0.0653, "step": 4510 }, { "epoch": 1.3424413424413424, "grad_norm": 0.6504166722297668, "learning_rate": 2.1945351945351945e-05, "loss": 0.0737, "step": 4520 }, { "epoch": 1.3454113454113454, "grad_norm": 0.8014166355133057, "learning_rate": 2.1927531927531927e-05, "loss": 0.0667, "step": 4530 }, { "epoch": 1.3483813483813485, "grad_norm": 0.6877809166908264, "learning_rate": 2.190971190971191e-05, "loss": 0.0683, "step": 4540 }, { "epoch": 1.3513513513513513, "grad_norm": 1.1891028881072998, "learning_rate": 2.1891891891891892e-05, "loss": 0.0659, "step": 4550 }, { "epoch": 1.3543213543213544, "grad_norm": 1.254156470298767, "learning_rate": 2.1874071874071874e-05, "loss": 0.065, "step": 4560 }, { "epoch": 1.3572913572913574, "grad_norm": 1.4208029508590698, "learning_rate": 2.1856251856251856e-05, "loss": 0.0733, "step": 4570 }, { "epoch": 1.3602613602613602, "grad_norm": 1.2454571723937988, "learning_rate": 2.183843183843184e-05, "loss": 0.0639, "step": 4580 }, { "epoch": 1.3632313632313633, "grad_norm": 0.7917842864990234, "learning_rate": 2.1820611820611824e-05, "loss": 0.0719, "step": 4590 }, { "epoch": 1.3662013662013661, "grad_norm": 0.7804074287414551, "learning_rate": 2.1802791802791803e-05, "loss": 0.0667, "step": 4600 }, { "epoch": 1.3691713691713692, "grad_norm": 0.9394906163215637, "learning_rate": 2.1784971784971785e-05, "loss": 0.0772, "step": 4610 }, { "epoch": 1.3721413721413722, "grad_norm": 0.9124456644058228, "learning_rate": 2.1767151767151767e-05, "loss": 0.0641, "step": 4620 }, { "epoch": 1.375111375111375, "grad_norm": 0.8035851716995239, "learning_rate": 2.174933174933175e-05, "loss": 0.0537, "step": 4630 }, { "epoch": 1.378081378081378, "grad_norm": 0.603728711605072, "learning_rate": 2.173151173151173e-05, "loss": 0.0671, "step": 4640 }, { "epoch": 1.381051381051381, "grad_norm": 0.5484776496887207, "learning_rate": 2.1713691713691717e-05, "loss": 0.0641, "step": 4650 }, { "epoch": 1.384021384021384, "grad_norm": 1.0918580293655396, "learning_rate": 2.16958716958717e-05, "loss": 0.0701, "step": 4660 }, { "epoch": 1.386991386991387, "grad_norm": 0.7597218751907349, "learning_rate": 2.1678051678051678e-05, "loss": 0.0612, "step": 4670 }, { "epoch": 1.3899613899613898, "grad_norm": 1.020501971244812, "learning_rate": 2.166023166023166e-05, "loss": 0.0683, "step": 4680 }, { "epoch": 1.392931392931393, "grad_norm": 0.3588350713253021, "learning_rate": 2.1642411642411642e-05, "loss": 0.0578, "step": 4690 }, { "epoch": 1.395901395901396, "grad_norm": 0.6665700674057007, "learning_rate": 2.1624591624591624e-05, "loss": 0.0586, "step": 4700 }, { "epoch": 1.3988713988713988, "grad_norm": 0.5543581247329712, "learning_rate": 2.1606771606771606e-05, "loss": 0.0572, "step": 4710 }, { "epoch": 1.4018414018414018, "grad_norm": 0.8856662511825562, "learning_rate": 2.1588951588951592e-05, "loss": 0.0647, "step": 4720 }, { "epoch": 1.4048114048114049, "grad_norm": 0.7327395081520081, "learning_rate": 2.1571131571131574e-05, "loss": 0.055, "step": 4730 }, { "epoch": 1.4077814077814077, "grad_norm": 1.0437580347061157, "learning_rate": 2.1553311553311553e-05, "loss": 0.052, "step": 4740 }, { "epoch": 1.4107514107514108, "grad_norm": 0.6243680119514465, "learning_rate": 2.1535491535491535e-05, "loss": 0.0606, "step": 4750 }, { "epoch": 1.4137214137214138, "grad_norm": 0.8211525678634644, "learning_rate": 2.1517671517671517e-05, "loss": 0.0586, "step": 4760 }, { "epoch": 1.4166914166914166, "grad_norm": 0.8621878027915955, "learning_rate": 2.14998514998515e-05, "loss": 0.0732, "step": 4770 }, { "epoch": 1.4196614196614197, "grad_norm": 0.9054310321807861, "learning_rate": 2.148203148203148e-05, "loss": 0.0548, "step": 4780 }, { "epoch": 1.4226314226314227, "grad_norm": 1.122381567955017, "learning_rate": 2.1464211464211467e-05, "loss": 0.0588, "step": 4790 }, { "epoch": 1.4256014256014256, "grad_norm": 1.5067578554153442, "learning_rate": 2.144639144639145e-05, "loss": 0.0592, "step": 4800 }, { "epoch": 1.4285714285714286, "grad_norm": 0.9946634769439697, "learning_rate": 2.1428571428571428e-05, "loss": 0.0588, "step": 4810 }, { "epoch": 1.4315414315414317, "grad_norm": 0.7225719094276428, "learning_rate": 2.141075141075141e-05, "loss": 0.0669, "step": 4820 }, { "epoch": 1.4345114345114345, "grad_norm": 0.7336903810501099, "learning_rate": 2.1392931392931392e-05, "loss": 0.0707, "step": 4830 }, { "epoch": 1.4374814374814375, "grad_norm": 1.2703900337219238, "learning_rate": 2.1375111375111375e-05, "loss": 0.0666, "step": 4840 }, { "epoch": 1.4404514404514406, "grad_norm": 1.0433528423309326, "learning_rate": 2.1357291357291357e-05, "loss": 0.0754, "step": 4850 }, { "epoch": 1.4434214434214434, "grad_norm": 0.6614531874656677, "learning_rate": 2.1339471339471342e-05, "loss": 0.0685, "step": 4860 }, { "epoch": 1.4463914463914465, "grad_norm": 0.7256604433059692, "learning_rate": 2.1321651321651325e-05, "loss": 0.0488, "step": 4870 }, { "epoch": 1.4493614493614493, "grad_norm": 0.781690776348114, "learning_rate": 2.1303831303831303e-05, "loss": 0.0599, "step": 4880 }, { "epoch": 1.4523314523314523, "grad_norm": 0.5681362748146057, "learning_rate": 2.1286011286011286e-05, "loss": 0.0654, "step": 4890 }, { "epoch": 1.4553014553014554, "grad_norm": 0.6243995428085327, "learning_rate": 2.1268191268191268e-05, "loss": 0.0695, "step": 4900 }, { "epoch": 1.4582714582714582, "grad_norm": 0.7592119574546814, "learning_rate": 2.125037125037125e-05, "loss": 0.0562, "step": 4910 }, { "epoch": 1.4612414612414613, "grad_norm": 0.740118682384491, "learning_rate": 2.1232551232551232e-05, "loss": 0.0617, "step": 4920 }, { "epoch": 1.464211464211464, "grad_norm": 0.8962658643722534, "learning_rate": 2.1214731214731218e-05, "loss": 0.0657, "step": 4930 }, { "epoch": 1.4671814671814671, "grad_norm": 0.6510963439941406, "learning_rate": 2.11969111969112e-05, "loss": 0.0639, "step": 4940 }, { "epoch": 1.4701514701514702, "grad_norm": 1.0592265129089355, "learning_rate": 2.117909117909118e-05, "loss": 0.0644, "step": 4950 }, { "epoch": 1.473121473121473, "grad_norm": 0.9842652678489685, "learning_rate": 2.116127116127116e-05, "loss": 0.0573, "step": 4960 }, { "epoch": 1.476091476091476, "grad_norm": 0.7804544568061829, "learning_rate": 2.1143451143451143e-05, "loss": 0.0605, "step": 4970 }, { "epoch": 1.4790614790614791, "grad_norm": 0.7872318625450134, "learning_rate": 2.1125631125631125e-05, "loss": 0.0561, "step": 4980 }, { "epoch": 1.482031482031482, "grad_norm": 0.9400390982627869, "learning_rate": 2.1107811107811107e-05, "loss": 0.0684, "step": 4990 }, { "epoch": 1.485001485001485, "grad_norm": 1.317718505859375, "learning_rate": 2.1089991089991093e-05, "loss": 0.0586, "step": 5000 }, { "epoch": 1.487971487971488, "grad_norm": 0.41417181491851807, "learning_rate": 2.1072171072171075e-05, "loss": 0.0671, "step": 5010 }, { "epoch": 1.4909414909414909, "grad_norm": 0.775679886341095, "learning_rate": 2.1054351054351054e-05, "loss": 0.0753, "step": 5020 }, { "epoch": 1.493911493911494, "grad_norm": 1.1018742322921753, "learning_rate": 2.1036531036531036e-05, "loss": 0.0684, "step": 5030 }, { "epoch": 1.496881496881497, "grad_norm": 0.877463161945343, "learning_rate": 2.1018711018711018e-05, "loss": 0.0622, "step": 5040 }, { "epoch": 1.4998514998514998, "grad_norm": 0.44593024253845215, "learning_rate": 2.1000891000891e-05, "loss": 0.0829, "step": 5050 }, { "epoch": 1.5028215028215028, "grad_norm": 0.7530653476715088, "learning_rate": 2.0983070983070982e-05, "loss": 0.0573, "step": 5060 }, { "epoch": 1.505791505791506, "grad_norm": 0.6737465262413025, "learning_rate": 2.0965250965250968e-05, "loss": 0.0657, "step": 5070 }, { "epoch": 1.5087615087615087, "grad_norm": 1.2247141599655151, "learning_rate": 2.094743094743095e-05, "loss": 0.0715, "step": 5080 }, { "epoch": 1.5117315117315118, "grad_norm": 0.5894985795021057, "learning_rate": 2.092961092961093e-05, "loss": 0.0635, "step": 5090 }, { "epoch": 1.5147015147015148, "grad_norm": 0.6736055016517639, "learning_rate": 2.091179091179091e-05, "loss": 0.0699, "step": 5100 }, { "epoch": 1.5176715176715176, "grad_norm": 1.1154353618621826, "learning_rate": 2.0893970893970893e-05, "loss": 0.0662, "step": 5110 }, { "epoch": 1.5206415206415207, "grad_norm": 1.1602790355682373, "learning_rate": 2.0876150876150875e-05, "loss": 0.0611, "step": 5120 }, { "epoch": 1.5236115236115237, "grad_norm": 0.6141194701194763, "learning_rate": 2.0858330858330858e-05, "loss": 0.0655, "step": 5130 }, { "epoch": 1.5265815265815266, "grad_norm": 0.7376190423965454, "learning_rate": 2.0840510840510843e-05, "loss": 0.055, "step": 5140 }, { "epoch": 1.5295515295515294, "grad_norm": 1.0901671648025513, "learning_rate": 2.0822690822690825e-05, "loss": 0.0678, "step": 5150 }, { "epoch": 1.5325215325215327, "grad_norm": 0.7825741171836853, "learning_rate": 2.0804870804870808e-05, "loss": 0.0722, "step": 5160 }, { "epoch": 1.5354915354915355, "grad_norm": 1.3154313564300537, "learning_rate": 2.0787050787050786e-05, "loss": 0.0651, "step": 5170 }, { "epoch": 1.5384615384615383, "grad_norm": 0.9979254603385925, "learning_rate": 2.076923076923077e-05, "loss": 0.0623, "step": 5180 }, { "epoch": 1.5414315414315416, "grad_norm": 0.6638979911804199, "learning_rate": 2.075141075141075e-05, "loss": 0.0724, "step": 5190 }, { "epoch": 1.5444015444015444, "grad_norm": 0.9916415214538574, "learning_rate": 2.0733590733590733e-05, "loss": 0.0665, "step": 5200 }, { "epoch": 1.5473715473715473, "grad_norm": 1.4179552793502808, "learning_rate": 2.071577071577072e-05, "loss": 0.062, "step": 5210 }, { "epoch": 1.5503415503415503, "grad_norm": 0.7684439420700073, "learning_rate": 2.06979506979507e-05, "loss": 0.0575, "step": 5220 }, { "epoch": 1.5533115533115534, "grad_norm": 0.7808251976966858, "learning_rate": 2.0680130680130683e-05, "loss": 0.065, "step": 5230 }, { "epoch": 1.5562815562815562, "grad_norm": 0.6247586011886597, "learning_rate": 2.066231066231066e-05, "loss": 0.0611, "step": 5240 }, { "epoch": 1.5592515592515592, "grad_norm": 0.9080334305763245, "learning_rate": 2.0644490644490644e-05, "loss": 0.0591, "step": 5250 }, { "epoch": 1.5622215622215623, "grad_norm": 1.0106734037399292, "learning_rate": 2.0626670626670626e-05, "loss": 0.055, "step": 5260 }, { "epoch": 1.565191565191565, "grad_norm": 0.8516258001327515, "learning_rate": 2.0608850608850608e-05, "loss": 0.0609, "step": 5270 }, { "epoch": 1.5681615681615682, "grad_norm": 0.7109993100166321, "learning_rate": 2.0591030591030594e-05, "loss": 0.0644, "step": 5280 }, { "epoch": 1.5711315711315712, "grad_norm": 1.2328643798828125, "learning_rate": 2.0573210573210576e-05, "loss": 0.0654, "step": 5290 }, { "epoch": 1.574101574101574, "grad_norm": 0.8040093183517456, "learning_rate": 2.0555390555390558e-05, "loss": 0.0558, "step": 5300 }, { "epoch": 1.577071577071577, "grad_norm": 0.4768785834312439, "learning_rate": 2.0537570537570537e-05, "loss": 0.0486, "step": 5310 }, { "epoch": 1.5800415800415801, "grad_norm": 0.8780914545059204, "learning_rate": 2.051975051975052e-05, "loss": 0.0597, "step": 5320 }, { "epoch": 1.583011583011583, "grad_norm": 0.5771424770355225, "learning_rate": 2.05019305019305e-05, "loss": 0.0696, "step": 5330 }, { "epoch": 1.585981585981586, "grad_norm": 0.8761149644851685, "learning_rate": 2.0484110484110483e-05, "loss": 0.0617, "step": 5340 }, { "epoch": 1.588951588951589, "grad_norm": 0.8866630792617798, "learning_rate": 2.046629046629047e-05, "loss": 0.0619, "step": 5350 }, { "epoch": 1.5919215919215919, "grad_norm": 0.8091713190078735, "learning_rate": 2.044847044847045e-05, "loss": 0.0705, "step": 5360 }, { "epoch": 1.594891594891595, "grad_norm": 1.079195261001587, "learning_rate": 2.0430650430650433e-05, "loss": 0.0555, "step": 5370 }, { "epoch": 1.597861597861598, "grad_norm": 0.5570653676986694, "learning_rate": 2.0412830412830412e-05, "loss": 0.0759, "step": 5380 }, { "epoch": 1.6008316008316008, "grad_norm": 0.8151302933692932, "learning_rate": 2.0395010395010394e-05, "loss": 0.0644, "step": 5390 }, { "epoch": 1.6038016038016036, "grad_norm": 0.9399856925010681, "learning_rate": 2.0377190377190376e-05, "loss": 0.0507, "step": 5400 }, { "epoch": 1.606771606771607, "grad_norm": 0.7044445872306824, "learning_rate": 2.035937035937036e-05, "loss": 0.0638, "step": 5410 }, { "epoch": 1.6097416097416097, "grad_norm": 0.9745432734489441, "learning_rate": 2.0341550341550344e-05, "loss": 0.0588, "step": 5420 }, { "epoch": 1.6127116127116126, "grad_norm": 0.8628720045089722, "learning_rate": 2.0323730323730326e-05, "loss": 0.0627, "step": 5430 }, { "epoch": 1.6156816156816158, "grad_norm": 0.8440065979957581, "learning_rate": 2.0305910305910308e-05, "loss": 0.056, "step": 5440 }, { "epoch": 1.6186516186516187, "grad_norm": 0.7349764108657837, "learning_rate": 2.0288090288090287e-05, "loss": 0.0524, "step": 5450 }, { "epoch": 1.6216216216216215, "grad_norm": 0.4457996189594269, "learning_rate": 2.027027027027027e-05, "loss": 0.0678, "step": 5460 }, { "epoch": 1.6245916245916245, "grad_norm": 1.1030367612838745, "learning_rate": 2.025245025245025e-05, "loss": 0.075, "step": 5470 }, { "epoch": 1.6275616275616276, "grad_norm": 0.6578485369682312, "learning_rate": 2.0234630234630234e-05, "loss": 0.0586, "step": 5480 }, { "epoch": 1.6305316305316304, "grad_norm": 0.8512071967124939, "learning_rate": 2.021681021681022e-05, "loss": 0.0619, "step": 5490 }, { "epoch": 1.6335016335016335, "grad_norm": 0.3718603551387787, "learning_rate": 2.01989901989902e-05, "loss": 0.077, "step": 5500 }, { "epoch": 1.6364716364716365, "grad_norm": 0.6400578618049622, "learning_rate": 2.0181170181170183e-05, "loss": 0.0443, "step": 5510 }, { "epoch": 1.6394416394416393, "grad_norm": 0.6334373354911804, "learning_rate": 2.0163350163350162e-05, "loss": 0.0749, "step": 5520 }, { "epoch": 1.6424116424116424, "grad_norm": 1.0908116102218628, "learning_rate": 2.0145530145530144e-05, "loss": 0.0597, "step": 5530 }, { "epoch": 1.6453816453816454, "grad_norm": 0.8702746033668518, "learning_rate": 2.0127710127710127e-05, "loss": 0.0622, "step": 5540 }, { "epoch": 1.6483516483516483, "grad_norm": 0.816605269908905, "learning_rate": 2.010989010989011e-05, "loss": 0.0555, "step": 5550 }, { "epoch": 1.6513216513216513, "grad_norm": 1.7561945915222168, "learning_rate": 2.0092070092070094e-05, "loss": 0.0609, "step": 5560 }, { "epoch": 1.6542916542916544, "grad_norm": 1.0770184993743896, "learning_rate": 2.0074250074250076e-05, "loss": 0.0737, "step": 5570 }, { "epoch": 1.6572616572616572, "grad_norm": 0.8192417025566101, "learning_rate": 2.005643005643006e-05, "loss": 0.0607, "step": 5580 }, { "epoch": 1.6602316602316602, "grad_norm": 0.49099376797676086, "learning_rate": 2.0038610038610037e-05, "loss": 0.0621, "step": 5590 }, { "epoch": 1.6632016632016633, "grad_norm": 0.8861784338951111, "learning_rate": 2.002079002079002e-05, "loss": 0.0548, "step": 5600 }, { "epoch": 1.6661716661716661, "grad_norm": 0.6367491483688354, "learning_rate": 2.0002970002970002e-05, "loss": 0.0685, "step": 5610 }, { "epoch": 1.6691416691416692, "grad_norm": 0.639358401298523, "learning_rate": 1.9985149985149984e-05, "loss": 0.0572, "step": 5620 }, { "epoch": 1.6721116721116722, "grad_norm": 0.8413973450660706, "learning_rate": 1.996732996732997e-05, "loss": 0.063, "step": 5630 }, { "epoch": 1.675081675081675, "grad_norm": 1.1640899181365967, "learning_rate": 1.994950994950995e-05, "loss": 0.0683, "step": 5640 }, { "epoch": 1.678051678051678, "grad_norm": 1.299309253692627, "learning_rate": 1.9931689931689934e-05, "loss": 0.0676, "step": 5650 }, { "epoch": 1.6810216810216811, "grad_norm": 1.032827377319336, "learning_rate": 1.9913869913869913e-05, "loss": 0.0658, "step": 5660 }, { "epoch": 1.683991683991684, "grad_norm": 0.36365097761154175, "learning_rate": 1.9896049896049895e-05, "loss": 0.0605, "step": 5670 }, { "epoch": 1.6869616869616868, "grad_norm": 0.8830529451370239, "learning_rate": 1.9878229878229877e-05, "loss": 0.0676, "step": 5680 }, { "epoch": 1.68993168993169, "grad_norm": 0.5479172468185425, "learning_rate": 1.986040986040986e-05, "loss": 0.0798, "step": 5690 }, { "epoch": 1.692901692901693, "grad_norm": 1.1123652458190918, "learning_rate": 1.9842589842589845e-05, "loss": 0.0528, "step": 5700 }, { "epoch": 1.6958716958716957, "grad_norm": 0.9681738018989563, "learning_rate": 1.9824769824769827e-05, "loss": 0.0641, "step": 5710 }, { "epoch": 1.698841698841699, "grad_norm": 0.7309338450431824, "learning_rate": 1.980694980694981e-05, "loss": 0.0764, "step": 5720 }, { "epoch": 1.7018117018117018, "grad_norm": 0.8220120668411255, "learning_rate": 1.978912978912979e-05, "loss": 0.0679, "step": 5730 }, { "epoch": 1.7047817047817047, "grad_norm": 0.9567219018936157, "learning_rate": 1.977130977130977e-05, "loss": 0.0637, "step": 5740 }, { "epoch": 1.7077517077517077, "grad_norm": 0.643904447555542, "learning_rate": 1.9753489753489752e-05, "loss": 0.0672, "step": 5750 }, { "epoch": 1.7107217107217108, "grad_norm": 1.4598885774612427, "learning_rate": 1.9735669735669734e-05, "loss": 0.067, "step": 5760 }, { "epoch": 1.7136917136917136, "grad_norm": 0.8001982569694519, "learning_rate": 1.971784971784972e-05, "loss": 0.0579, "step": 5770 }, { "epoch": 1.7166617166617166, "grad_norm": 0.858055591583252, "learning_rate": 1.9700029700029702e-05, "loss": 0.0548, "step": 5780 }, { "epoch": 1.7196317196317197, "grad_norm": 1.0781798362731934, "learning_rate": 1.9682209682209684e-05, "loss": 0.067, "step": 5790 }, { "epoch": 1.7226017226017225, "grad_norm": 0.9981959462165833, "learning_rate": 1.9664389664389666e-05, "loss": 0.0689, "step": 5800 }, { "epoch": 1.7255717255717256, "grad_norm": 0.9247766137123108, "learning_rate": 1.9646569646569645e-05, "loss": 0.0595, "step": 5810 }, { "epoch": 1.7285417285417286, "grad_norm": 0.8209492564201355, "learning_rate": 1.9628749628749627e-05, "loss": 0.0627, "step": 5820 }, { "epoch": 1.7315117315117314, "grad_norm": 1.1569684743881226, "learning_rate": 1.961092961092961e-05, "loss": 0.0727, "step": 5830 }, { "epoch": 1.7344817344817345, "grad_norm": 0.6315358281135559, "learning_rate": 1.9593109593109595e-05, "loss": 0.0673, "step": 5840 }, { "epoch": 1.7374517374517375, "grad_norm": 0.7351091504096985, "learning_rate": 1.9575289575289577e-05, "loss": 0.0673, "step": 5850 }, { "epoch": 1.7404217404217404, "grad_norm": 0.8699774742126465, "learning_rate": 1.955746955746956e-05, "loss": 0.0732, "step": 5860 }, { "epoch": 1.7433917433917434, "grad_norm": 0.992885172367096, "learning_rate": 1.953964953964954e-05, "loss": 0.0674, "step": 5870 }, { "epoch": 1.7463617463617465, "grad_norm": 0.8643231391906738, "learning_rate": 1.952182952182952e-05, "loss": 0.0755, "step": 5880 }, { "epoch": 1.7493317493317493, "grad_norm": 0.8017232418060303, "learning_rate": 1.9504009504009503e-05, "loss": 0.0671, "step": 5890 }, { "epoch": 1.7523017523017523, "grad_norm": 0.5877105593681335, "learning_rate": 1.9486189486189485e-05, "loss": 0.0658, "step": 5900 }, { "epoch": 1.7552717552717554, "grad_norm": 1.359218716621399, "learning_rate": 1.946836946836947e-05, "loss": 0.0626, "step": 5910 }, { "epoch": 1.7582417582417582, "grad_norm": 0.6869223117828369, "learning_rate": 1.9450549450549452e-05, "loss": 0.0775, "step": 5920 }, { "epoch": 1.7612117612117613, "grad_norm": 0.7375755906105042, "learning_rate": 1.9432729432729435e-05, "loss": 0.0599, "step": 5930 }, { "epoch": 1.7641817641817643, "grad_norm": 0.9355472326278687, "learning_rate": 1.9414909414909417e-05, "loss": 0.0632, "step": 5940 }, { "epoch": 1.7671517671517671, "grad_norm": 1.0422033071517944, "learning_rate": 1.9397089397089396e-05, "loss": 0.0644, "step": 5950 }, { "epoch": 1.77012177012177, "grad_norm": 0.7586382627487183, "learning_rate": 1.9379269379269378e-05, "loss": 0.0615, "step": 5960 }, { "epoch": 1.7730917730917732, "grad_norm": 1.0248056650161743, "learning_rate": 1.936144936144936e-05, "loss": 0.0708, "step": 5970 }, { "epoch": 1.776061776061776, "grad_norm": 0.9860632419586182, "learning_rate": 1.9343629343629345e-05, "loss": 0.0735, "step": 5980 }, { "epoch": 1.779031779031779, "grad_norm": 0.8044900894165039, "learning_rate": 1.9325809325809328e-05, "loss": 0.051, "step": 5990 }, { "epoch": 1.7820017820017822, "grad_norm": 0.7025960087776184, "learning_rate": 1.930798930798931e-05, "loss": 0.0535, "step": 6000 }, { "epoch": 1.784971784971785, "grad_norm": 0.9908897876739502, "learning_rate": 1.9290169290169292e-05, "loss": 0.0578, "step": 6010 }, { "epoch": 1.7879417879417878, "grad_norm": 1.2506797313690186, "learning_rate": 1.927234927234927e-05, "loss": 0.065, "step": 6020 }, { "epoch": 1.7909117909117909, "grad_norm": 1.023766040802002, "learning_rate": 1.9254529254529253e-05, "loss": 0.0631, "step": 6030 }, { "epoch": 1.793881793881794, "grad_norm": 0.8149070739746094, "learning_rate": 1.9236709236709235e-05, "loss": 0.0621, "step": 6040 }, { "epoch": 1.7968517968517967, "grad_norm": 0.6476550698280334, "learning_rate": 1.921888921888922e-05, "loss": 0.066, "step": 6050 }, { "epoch": 1.7998217998217998, "grad_norm": 0.65140700340271, "learning_rate": 1.9201069201069203e-05, "loss": 0.0611, "step": 6060 }, { "epoch": 1.8027918027918028, "grad_norm": 0.9502988457679749, "learning_rate": 1.9183249183249185e-05, "loss": 0.0615, "step": 6070 }, { "epoch": 1.8057618057618057, "grad_norm": 0.916491687297821, "learning_rate": 1.9165429165429167e-05, "loss": 0.0643, "step": 6080 }, { "epoch": 1.8087318087318087, "grad_norm": 1.169666051864624, "learning_rate": 1.9147609147609146e-05, "loss": 0.0715, "step": 6090 }, { "epoch": 1.8117018117018118, "grad_norm": 1.0344257354736328, "learning_rate": 1.9129789129789128e-05, "loss": 0.0605, "step": 6100 }, { "epoch": 1.8146718146718146, "grad_norm": 0.49185994267463684, "learning_rate": 1.911196911196911e-05, "loss": 0.0621, "step": 6110 }, { "epoch": 1.8176418176418176, "grad_norm": 0.9639203548431396, "learning_rate": 1.9094149094149096e-05, "loss": 0.0655, "step": 6120 }, { "epoch": 1.8206118206118207, "grad_norm": 0.6532425284385681, "learning_rate": 1.9076329076329078e-05, "loss": 0.0542, "step": 6130 }, { "epoch": 1.8235818235818235, "grad_norm": 0.455677330493927, "learning_rate": 1.905850905850906e-05, "loss": 0.0791, "step": 6140 }, { "epoch": 1.8265518265518266, "grad_norm": 0.5976959466934204, "learning_rate": 1.9040689040689042e-05, "loss": 0.0586, "step": 6150 }, { "epoch": 1.8295218295218296, "grad_norm": 0.6233227849006653, "learning_rate": 1.902286902286902e-05, "loss": 0.0765, "step": 6160 }, { "epoch": 1.8324918324918325, "grad_norm": 1.4189893007278442, "learning_rate": 1.9005049005049003e-05, "loss": 0.0683, "step": 6170 }, { "epoch": 1.8354618354618355, "grad_norm": 0.5655115246772766, "learning_rate": 1.8987228987228986e-05, "loss": 0.067, "step": 6180 }, { "epoch": 1.8384318384318385, "grad_norm": 0.7144678235054016, "learning_rate": 1.896940896940897e-05, "loss": 0.0537, "step": 6190 }, { "epoch": 1.8414018414018414, "grad_norm": 1.3380944728851318, "learning_rate": 1.8951588951588953e-05, "loss": 0.0697, "step": 6200 }, { "epoch": 1.8443718443718444, "grad_norm": 0.6233878135681152, "learning_rate": 1.8933768933768935e-05, "loss": 0.0702, "step": 6210 }, { "epoch": 1.8473418473418475, "grad_norm": 0.7405056357383728, "learning_rate": 1.8915948915948918e-05, "loss": 0.0654, "step": 6220 }, { "epoch": 1.8503118503118503, "grad_norm": 1.0644950866699219, "learning_rate": 1.8898128898128896e-05, "loss": 0.0654, "step": 6230 }, { "epoch": 1.8532818532818531, "grad_norm": 1.2101881504058838, "learning_rate": 1.888030888030888e-05, "loss": 0.0612, "step": 6240 }, { "epoch": 1.8562518562518564, "grad_norm": 0.7648733258247375, "learning_rate": 1.886248886248886e-05, "loss": 0.0492, "step": 6250 }, { "epoch": 1.8592218592218592, "grad_norm": 0.764968991279602, "learning_rate": 1.8844668844668846e-05, "loss": 0.0582, "step": 6260 }, { "epoch": 1.862191862191862, "grad_norm": 0.4919446110725403, "learning_rate": 1.882684882684883e-05, "loss": 0.0593, "step": 6270 }, { "epoch": 1.865161865161865, "grad_norm": 0.784898579120636, "learning_rate": 1.880902880902881e-05, "loss": 0.0651, "step": 6280 }, { "epoch": 1.8681318681318682, "grad_norm": 0.4835127890110016, "learning_rate": 1.8791208791208793e-05, "loss": 0.0545, "step": 6290 }, { "epoch": 1.871101871101871, "grad_norm": 0.7325404286384583, "learning_rate": 1.8773388773388775e-05, "loss": 0.0622, "step": 6300 }, { "epoch": 1.874071874071874, "grad_norm": 0.9660061001777649, "learning_rate": 1.8755568755568754e-05, "loss": 0.0767, "step": 6310 }, { "epoch": 1.877041877041877, "grad_norm": 1.0770442485809326, "learning_rate": 1.8737748737748736e-05, "loss": 0.0657, "step": 6320 }, { "epoch": 1.88001188001188, "grad_norm": 1.3359770774841309, "learning_rate": 1.871992871992872e-05, "loss": 0.0686, "step": 6330 }, { "epoch": 1.882981882981883, "grad_norm": 0.6175670027732849, "learning_rate": 1.8702108702108704e-05, "loss": 0.0665, "step": 6340 }, { "epoch": 1.885951885951886, "grad_norm": 0.7021560072898865, "learning_rate": 1.8684288684288686e-05, "loss": 0.0621, "step": 6350 }, { "epoch": 1.8889218889218888, "grad_norm": 0.5976589918136597, "learning_rate": 1.8666468666468668e-05, "loss": 0.0612, "step": 6360 }, { "epoch": 1.8918918918918919, "grad_norm": 0.6338089108467102, "learning_rate": 1.864864864864865e-05, "loss": 0.0636, "step": 6370 }, { "epoch": 1.894861894861895, "grad_norm": 0.9274225234985352, "learning_rate": 1.863082863082863e-05, "loss": 0.057, "step": 6380 }, { "epoch": 1.8978318978318978, "grad_norm": 0.42748093605041504, "learning_rate": 1.861300861300861e-05, "loss": 0.0583, "step": 6390 }, { "epoch": 1.9008019008019008, "grad_norm": 0.7819433808326721, "learning_rate": 1.8595188595188597e-05, "loss": 0.0581, "step": 6400 }, { "epoch": 1.9037719037719039, "grad_norm": 0.5644189119338989, "learning_rate": 1.857736857736858e-05, "loss": 0.0554, "step": 6410 }, { "epoch": 1.9067419067419067, "grad_norm": 0.6357256770133972, "learning_rate": 1.855954855954856e-05, "loss": 0.0755, "step": 6420 }, { "epoch": 1.9097119097119097, "grad_norm": 0.737325131893158, "learning_rate": 1.8541728541728543e-05, "loss": 0.0742, "step": 6430 }, { "epoch": 1.9126819126819128, "grad_norm": 0.6249455809593201, "learning_rate": 1.8523908523908525e-05, "loss": 0.0624, "step": 6440 }, { "epoch": 1.9156519156519156, "grad_norm": 0.8265432119369507, "learning_rate": 1.8506088506088504e-05, "loss": 0.0535, "step": 6450 }, { "epoch": 1.9186219186219187, "grad_norm": 0.5451765656471252, "learning_rate": 1.8488268488268486e-05, "loss": 0.0646, "step": 6460 }, { "epoch": 1.9215919215919217, "grad_norm": 0.9091644883155823, "learning_rate": 1.8470448470448472e-05, "loss": 0.0579, "step": 6470 }, { "epoch": 1.9245619245619245, "grad_norm": 0.6501501798629761, "learning_rate": 1.8452628452628454e-05, "loss": 0.0518, "step": 6480 }, { "epoch": 1.9275319275319274, "grad_norm": 0.6668860912322998, "learning_rate": 1.8434808434808436e-05, "loss": 0.0484, "step": 6490 }, { "epoch": 1.9305019305019306, "grad_norm": 0.6235078573226929, "learning_rate": 1.841698841698842e-05, "loss": 0.0571, "step": 6500 }, { "epoch": 1.9334719334719335, "grad_norm": 0.5391401648521423, "learning_rate": 1.83991683991684e-05, "loss": 0.042, "step": 6510 }, { "epoch": 1.9364419364419363, "grad_norm": 0.4596608877182007, "learning_rate": 1.838134838134838e-05, "loss": 0.0666, "step": 6520 }, { "epoch": 1.9394119394119396, "grad_norm": 0.9191550612449646, "learning_rate": 1.836352836352836e-05, "loss": 0.0731, "step": 6530 }, { "epoch": 1.9423819423819424, "grad_norm": 0.8240092992782593, "learning_rate": 1.8345708345708347e-05, "loss": 0.0635, "step": 6540 }, { "epoch": 1.9453519453519452, "grad_norm": 0.8021003007888794, "learning_rate": 1.832788832788833e-05, "loss": 0.0675, "step": 6550 }, { "epoch": 1.9483219483219483, "grad_norm": 1.1347585916519165, "learning_rate": 1.831006831006831e-05, "loss": 0.0621, "step": 6560 }, { "epoch": 1.9512919512919513, "grad_norm": 0.6139947175979614, "learning_rate": 1.8292248292248294e-05, "loss": 0.0543, "step": 6570 }, { "epoch": 1.9542619542619541, "grad_norm": 0.7635927796363831, "learning_rate": 1.8274428274428276e-05, "loss": 0.0592, "step": 6580 }, { "epoch": 1.9572319572319572, "grad_norm": 0.43468376994132996, "learning_rate": 1.8256608256608254e-05, "loss": 0.0612, "step": 6590 }, { "epoch": 1.9602019602019602, "grad_norm": 0.7748926877975464, "learning_rate": 1.8238788238788237e-05, "loss": 0.0604, "step": 6600 }, { "epoch": 1.963171963171963, "grad_norm": 0.8012065887451172, "learning_rate": 1.8220968220968222e-05, "loss": 0.0611, "step": 6610 }, { "epoch": 1.9661419661419661, "grad_norm": 0.9480249285697937, "learning_rate": 1.8203148203148204e-05, "loss": 0.0557, "step": 6620 }, { "epoch": 1.9691119691119692, "grad_norm": 0.8358305096626282, "learning_rate": 1.8185328185328187e-05, "loss": 0.0598, "step": 6630 }, { "epoch": 1.972081972081972, "grad_norm": 0.5806179642677307, "learning_rate": 1.816750816750817e-05, "loss": 0.0646, "step": 6640 }, { "epoch": 1.975051975051975, "grad_norm": 1.0674052238464355, "learning_rate": 1.814968814968815e-05, "loss": 0.075, "step": 6650 }, { "epoch": 1.978021978021978, "grad_norm": 0.7299931645393372, "learning_rate": 1.813186813186813e-05, "loss": 0.0526, "step": 6660 }, { "epoch": 1.980991980991981, "grad_norm": 0.7215370535850525, "learning_rate": 1.8114048114048112e-05, "loss": 0.063, "step": 6670 }, { "epoch": 1.983961983961984, "grad_norm": 0.6642675995826721, "learning_rate": 1.8096228096228097e-05, "loss": 0.0534, "step": 6680 }, { "epoch": 1.986931986931987, "grad_norm": 1.6818110942840576, "learning_rate": 1.807840807840808e-05, "loss": 0.0663, "step": 6690 }, { "epoch": 1.9899019899019899, "grad_norm": 0.8374834060668945, "learning_rate": 1.8060588060588062e-05, "loss": 0.057, "step": 6700 }, { "epoch": 1.992871992871993, "grad_norm": 0.5147525668144226, "learning_rate": 1.8042768042768044e-05, "loss": 0.0629, "step": 6710 }, { "epoch": 1.995841995841996, "grad_norm": 0.8637810945510864, "learning_rate": 1.8024948024948026e-05, "loss": 0.068, "step": 6720 }, { "epoch": 1.9988119988119988, "grad_norm": 1.2453491687774658, "learning_rate": 1.8007128007128005e-05, "loss": 0.0679, "step": 6730 }, { "epoch": 2.0, "eval_f1": 0.33031292965957215, "eval_loss": 0.04602367803454399, "eval_runtime": 175.7476, "eval_samples_per_second": 216.327, "eval_steps_per_second": 3.386, "step": 6734 }, { "epoch": 2.0017820017820016, "grad_norm": 1.147544264793396, "learning_rate": 1.7989307989307987e-05, "loss": 0.0478, "step": 6740 }, { "epoch": 2.004752004752005, "grad_norm": 0.7158094644546509, "learning_rate": 1.7971487971487973e-05, "loss": 0.0597, "step": 6750 }, { "epoch": 2.0077220077220077, "grad_norm": 0.5224217176437378, "learning_rate": 1.7953667953667955e-05, "loss": 0.0655, "step": 6760 }, { "epoch": 2.0106920106920105, "grad_norm": 1.072407603263855, "learning_rate": 1.7935847935847937e-05, "loss": 0.058, "step": 6770 }, { "epoch": 2.013662013662014, "grad_norm": 1.3633906841278076, "learning_rate": 1.791802791802792e-05, "loss": 0.0601, "step": 6780 }, { "epoch": 2.0166320166320166, "grad_norm": 1.6582151651382446, "learning_rate": 1.79002079002079e-05, "loss": 0.0629, "step": 6790 }, { "epoch": 2.0196020196020195, "grad_norm": 0.9400825500488281, "learning_rate": 1.788238788238788e-05, "loss": 0.06, "step": 6800 }, { "epoch": 2.0225720225720227, "grad_norm": 0.9094600677490234, "learning_rate": 1.7864567864567862e-05, "loss": 0.0583, "step": 6810 }, { "epoch": 2.0255420255420256, "grad_norm": 0.7568134069442749, "learning_rate": 1.7846747846747848e-05, "loss": 0.0584, "step": 6820 }, { "epoch": 2.0285120285120284, "grad_norm": 0.4175792634487152, "learning_rate": 1.782892782892783e-05, "loss": 0.056, "step": 6830 }, { "epoch": 2.0314820314820317, "grad_norm": 0.7946698665618896, "learning_rate": 1.7811107811107812e-05, "loss": 0.0611, "step": 6840 }, { "epoch": 2.0344520344520345, "grad_norm": 0.5856500267982483, "learning_rate": 1.7793287793287794e-05, "loss": 0.056, "step": 6850 }, { "epoch": 2.0374220374220373, "grad_norm": 0.7179090976715088, "learning_rate": 1.7775467775467776e-05, "loss": 0.0602, "step": 6860 }, { "epoch": 2.0403920403920406, "grad_norm": 1.6203516721725464, "learning_rate": 1.7757647757647755e-05, "loss": 0.059, "step": 6870 }, { "epoch": 2.0433620433620434, "grad_norm": 0.522436797618866, "learning_rate": 1.7739827739827737e-05, "loss": 0.0616, "step": 6880 }, { "epoch": 2.0463320463320462, "grad_norm": 1.3147087097167969, "learning_rate": 1.7722007722007723e-05, "loss": 0.0658, "step": 6890 }, { "epoch": 2.0493020493020495, "grad_norm": 0.7292523980140686, "learning_rate": 1.7704187704187705e-05, "loss": 0.0559, "step": 6900 }, { "epoch": 2.0522720522720523, "grad_norm": 0.7819983959197998, "learning_rate": 1.7686367686367687e-05, "loss": 0.0581, "step": 6910 }, { "epoch": 2.055242055242055, "grad_norm": 0.7037408947944641, "learning_rate": 1.766854766854767e-05, "loss": 0.0653, "step": 6920 }, { "epoch": 2.0582120582120584, "grad_norm": 0.940587043762207, "learning_rate": 1.765072765072765e-05, "loss": 0.063, "step": 6930 }, { "epoch": 2.0611820611820613, "grad_norm": 0.7256175875663757, "learning_rate": 1.7632907632907634e-05, "loss": 0.0611, "step": 6940 }, { "epoch": 2.064152064152064, "grad_norm": 0.7367241978645325, "learning_rate": 1.7615087615087613e-05, "loss": 0.0548, "step": 6950 }, { "epoch": 2.067122067122067, "grad_norm": 0.9881260991096497, "learning_rate": 1.7597267597267598e-05, "loss": 0.054, "step": 6960 }, { "epoch": 2.07009207009207, "grad_norm": 0.8852285742759705, "learning_rate": 1.757944757944758e-05, "loss": 0.065, "step": 6970 }, { "epoch": 2.073062073062073, "grad_norm": 0.8763765692710876, "learning_rate": 1.7561627561627563e-05, "loss": 0.0714, "step": 6980 }, { "epoch": 2.076032076032076, "grad_norm": 0.9801186919212341, "learning_rate": 1.7543807543807545e-05, "loss": 0.0644, "step": 6990 }, { "epoch": 2.079002079002079, "grad_norm": 0.8674045205116272, "learning_rate": 1.7525987525987527e-05, "loss": 0.0597, "step": 7000 }, { "epoch": 2.081972081972082, "grad_norm": 0.8817034959793091, "learning_rate": 1.750816750816751e-05, "loss": 0.0558, "step": 7010 }, { "epoch": 2.0849420849420848, "grad_norm": 0.8658657073974609, "learning_rate": 1.7490347490347488e-05, "loss": 0.0673, "step": 7020 }, { "epoch": 2.087912087912088, "grad_norm": 0.5216271877288818, "learning_rate": 1.7472527472527473e-05, "loss": 0.057, "step": 7030 }, { "epoch": 2.090882090882091, "grad_norm": 1.3478199243545532, "learning_rate": 1.7454707454707456e-05, "loss": 0.0623, "step": 7040 }, { "epoch": 2.0938520938520937, "grad_norm": 1.1730366945266724, "learning_rate": 1.7436887436887438e-05, "loss": 0.0663, "step": 7050 }, { "epoch": 2.096822096822097, "grad_norm": 0.7896762490272522, "learning_rate": 1.741906741906742e-05, "loss": 0.0677, "step": 7060 }, { "epoch": 2.0997920997921, "grad_norm": 0.6003497838973999, "learning_rate": 1.7401247401247402e-05, "loss": 0.062, "step": 7070 }, { "epoch": 2.1027621027621026, "grad_norm": 1.0001591444015503, "learning_rate": 1.7383427383427384e-05, "loss": 0.0653, "step": 7080 }, { "epoch": 2.105732105732106, "grad_norm": 1.3104040622711182, "learning_rate": 1.7365607365607363e-05, "loss": 0.0653, "step": 7090 }, { "epoch": 2.1087021087021087, "grad_norm": 0.6923945546150208, "learning_rate": 1.734778734778735e-05, "loss": 0.0474, "step": 7100 }, { "epoch": 2.1116721116721116, "grad_norm": 0.6234051585197449, "learning_rate": 1.732996732996733e-05, "loss": 0.0617, "step": 7110 }, { "epoch": 2.114642114642115, "grad_norm": 0.9373548626899719, "learning_rate": 1.7312147312147313e-05, "loss": 0.055, "step": 7120 }, { "epoch": 2.1176121176121177, "grad_norm": 0.9294105768203735, "learning_rate": 1.7294327294327295e-05, "loss": 0.0573, "step": 7130 }, { "epoch": 2.1205821205821205, "grad_norm": 0.7299237847328186, "learning_rate": 1.7276507276507277e-05, "loss": 0.0528, "step": 7140 }, { "epoch": 2.1235521235521237, "grad_norm": 0.5744438767433167, "learning_rate": 1.725868725868726e-05, "loss": 0.0606, "step": 7150 }, { "epoch": 2.1265221265221266, "grad_norm": 0.7229251861572266, "learning_rate": 1.7240867240867238e-05, "loss": 0.0543, "step": 7160 }, { "epoch": 2.1294921294921294, "grad_norm": 1.3666146993637085, "learning_rate": 1.7223047223047224e-05, "loss": 0.0617, "step": 7170 }, { "epoch": 2.1324621324621322, "grad_norm": 0.6104298830032349, "learning_rate": 1.7205227205227206e-05, "loss": 0.0507, "step": 7180 }, { "epoch": 2.1354321354321355, "grad_norm": 0.5680792331695557, "learning_rate": 1.7187407187407188e-05, "loss": 0.052, "step": 7190 }, { "epoch": 2.1384021384021383, "grad_norm": 0.5435447692871094, "learning_rate": 1.716958716958717e-05, "loss": 0.0578, "step": 7200 }, { "epoch": 2.141372141372141, "grad_norm": 0.6102536916732788, "learning_rate": 1.7151767151767152e-05, "loss": 0.0717, "step": 7210 }, { "epoch": 2.1443421443421444, "grad_norm": 0.9239901304244995, "learning_rate": 1.7133947133947135e-05, "loss": 0.0595, "step": 7220 }, { "epoch": 2.1473121473121473, "grad_norm": 0.7742301225662231, "learning_rate": 1.7116127116127117e-05, "loss": 0.0613, "step": 7230 }, { "epoch": 2.15028215028215, "grad_norm": 0.7156841158866882, "learning_rate": 1.70983070983071e-05, "loss": 0.0673, "step": 7240 }, { "epoch": 2.1532521532521534, "grad_norm": 0.8358856439590454, "learning_rate": 1.708048708048708e-05, "loss": 0.0662, "step": 7250 }, { "epoch": 2.156222156222156, "grad_norm": 1.2078478336334229, "learning_rate": 1.7062667062667063e-05, "loss": 0.0612, "step": 7260 }, { "epoch": 2.159192159192159, "grad_norm": 0.7166781425476074, "learning_rate": 1.7044847044847045e-05, "loss": 0.0758, "step": 7270 }, { "epoch": 2.1621621621621623, "grad_norm": 0.5501638650894165, "learning_rate": 1.7027027027027028e-05, "loss": 0.0544, "step": 7280 }, { "epoch": 2.165132165132165, "grad_norm": 0.6720577478408813, "learning_rate": 1.700920700920701e-05, "loss": 0.0682, "step": 7290 }, { "epoch": 2.168102168102168, "grad_norm": 1.2910141944885254, "learning_rate": 1.6991386991386992e-05, "loss": 0.0563, "step": 7300 }, { "epoch": 2.171072171072171, "grad_norm": 0.834705114364624, "learning_rate": 1.6973566973566974e-05, "loss": 0.0562, "step": 7310 }, { "epoch": 2.174042174042174, "grad_norm": 0.9644896984100342, "learning_rate": 1.6955746955746956e-05, "loss": 0.0582, "step": 7320 }, { "epoch": 2.177012177012177, "grad_norm": 0.7104148864746094, "learning_rate": 1.693792693792694e-05, "loss": 0.0624, "step": 7330 }, { "epoch": 2.17998217998218, "grad_norm": 1.1333881616592407, "learning_rate": 1.692010692010692e-05, "loss": 0.0681, "step": 7340 }, { "epoch": 2.182952182952183, "grad_norm": 0.5454867482185364, "learning_rate": 1.6902286902286903e-05, "loss": 0.0504, "step": 7350 }, { "epoch": 2.185922185922186, "grad_norm": 0.7545517086982727, "learning_rate": 1.6884466884466885e-05, "loss": 0.0546, "step": 7360 }, { "epoch": 2.188892188892189, "grad_norm": 0.8815241456031799, "learning_rate": 1.6866646866646867e-05, "loss": 0.0645, "step": 7370 }, { "epoch": 2.191862191862192, "grad_norm": 0.8765430450439453, "learning_rate": 1.684882684882685e-05, "loss": 0.0658, "step": 7380 }, { "epoch": 2.1948321948321947, "grad_norm": 0.8555752038955688, "learning_rate": 1.683100683100683e-05, "loss": 0.06, "step": 7390 }, { "epoch": 2.197802197802198, "grad_norm": 0.7630652785301208, "learning_rate": 1.6813186813186814e-05, "loss": 0.0701, "step": 7400 }, { "epoch": 2.200772200772201, "grad_norm": 0.6194639205932617, "learning_rate": 1.6795366795366796e-05, "loss": 0.057, "step": 7410 }, { "epoch": 2.2037422037422036, "grad_norm": 0.4202905297279358, "learning_rate": 1.6777546777546778e-05, "loss": 0.0486, "step": 7420 }, { "epoch": 2.206712206712207, "grad_norm": 0.6983931660652161, "learning_rate": 1.675972675972676e-05, "loss": 0.0638, "step": 7430 }, { "epoch": 2.2096822096822097, "grad_norm": 0.521522045135498, "learning_rate": 1.6741906741906742e-05, "loss": 0.0659, "step": 7440 }, { "epoch": 2.2126522126522126, "grad_norm": 0.4995703101158142, "learning_rate": 1.6724086724086725e-05, "loss": 0.0631, "step": 7450 }, { "epoch": 2.215622215622216, "grad_norm": 1.18669593334198, "learning_rate": 1.6706266706266707e-05, "loss": 0.0589, "step": 7460 }, { "epoch": 2.2185922185922187, "grad_norm": 1.027181625366211, "learning_rate": 1.668844668844669e-05, "loss": 0.0664, "step": 7470 }, { "epoch": 2.2215622215622215, "grad_norm": 0.9086653590202332, "learning_rate": 1.667062667062667e-05, "loss": 0.0589, "step": 7480 }, { "epoch": 2.2245322245322248, "grad_norm": 0.6986064314842224, "learning_rate": 1.6652806652806653e-05, "loss": 0.061, "step": 7490 }, { "epoch": 2.2275022275022276, "grad_norm": 1.728353500366211, "learning_rate": 1.6634986634986635e-05, "loss": 0.0496, "step": 7500 }, { "epoch": 2.2304722304722304, "grad_norm": 0.635563313961029, "learning_rate": 1.6617166617166618e-05, "loss": 0.0711, "step": 7510 }, { "epoch": 2.2334422334422332, "grad_norm": 0.9441576600074768, "learning_rate": 1.65993465993466e-05, "loss": 0.0554, "step": 7520 }, { "epoch": 2.2364122364122365, "grad_norm": 0.984162449836731, "learning_rate": 1.6581526581526582e-05, "loss": 0.059, "step": 7530 }, { "epoch": 2.2393822393822393, "grad_norm": 0.7663610577583313, "learning_rate": 1.6563706563706564e-05, "loss": 0.0654, "step": 7540 }, { "epoch": 2.242352242352242, "grad_norm": 0.8404085636138916, "learning_rate": 1.6545886545886546e-05, "loss": 0.0723, "step": 7550 }, { "epoch": 2.2453222453222454, "grad_norm": 0.9056734442710876, "learning_rate": 1.652806652806653e-05, "loss": 0.0632, "step": 7560 }, { "epoch": 2.2482922482922483, "grad_norm": 1.249064326286316, "learning_rate": 1.651024651024651e-05, "loss": 0.0762, "step": 7570 }, { "epoch": 2.251262251262251, "grad_norm": 0.7174326181411743, "learning_rate": 1.6492426492426496e-05, "loss": 0.054, "step": 7580 }, { "epoch": 2.2542322542322544, "grad_norm": 0.9042089581489563, "learning_rate": 1.6474606474606475e-05, "loss": 0.0668, "step": 7590 }, { "epoch": 2.257202257202257, "grad_norm": 0.6631519794464111, "learning_rate": 1.6456786456786457e-05, "loss": 0.0599, "step": 7600 }, { "epoch": 2.26017226017226, "grad_norm": 0.6724472045898438, "learning_rate": 1.643896643896644e-05, "loss": 0.0675, "step": 7610 }, { "epoch": 2.2631422631422633, "grad_norm": 0.48432090878486633, "learning_rate": 1.642114642114642e-05, "loss": 0.0565, "step": 7620 }, { "epoch": 2.266112266112266, "grad_norm": 0.5798602104187012, "learning_rate": 1.6403326403326404e-05, "loss": 0.0584, "step": 7630 }, { "epoch": 2.269082269082269, "grad_norm": 0.676134467124939, "learning_rate": 1.6385506385506386e-05, "loss": 0.0651, "step": 7640 }, { "epoch": 2.2720522720522722, "grad_norm": 0.6854906678199768, "learning_rate": 1.636768636768637e-05, "loss": 0.0711, "step": 7650 }, { "epoch": 2.275022275022275, "grad_norm": 1.0705246925354004, "learning_rate": 1.634986634986635e-05, "loss": 0.0635, "step": 7660 }, { "epoch": 2.277992277992278, "grad_norm": 0.6841071248054504, "learning_rate": 1.6332046332046332e-05, "loss": 0.0534, "step": 7670 }, { "epoch": 2.280962280962281, "grad_norm": 1.0041640996932983, "learning_rate": 1.6314226314226314e-05, "loss": 0.053, "step": 7680 }, { "epoch": 2.283932283932284, "grad_norm": 0.7719654440879822, "learning_rate": 1.6296406296406297e-05, "loss": 0.0715, "step": 7690 }, { "epoch": 2.286902286902287, "grad_norm": 0.5428666472434998, "learning_rate": 1.627858627858628e-05, "loss": 0.0483, "step": 7700 }, { "epoch": 2.2898722898722896, "grad_norm": 0.9091588854789734, "learning_rate": 1.626076626076626e-05, "loss": 0.0525, "step": 7710 }, { "epoch": 2.292842292842293, "grad_norm": 0.8281648755073547, "learning_rate": 1.6242946242946247e-05, "loss": 0.0662, "step": 7720 }, { "epoch": 2.2958122958122957, "grad_norm": 1.2040926218032837, "learning_rate": 1.6225126225126225e-05, "loss": 0.0561, "step": 7730 }, { "epoch": 2.2987822987822986, "grad_norm": 1.1443513631820679, "learning_rate": 1.6207306207306207e-05, "loss": 0.0462, "step": 7740 }, { "epoch": 2.301752301752302, "grad_norm": 0.990245521068573, "learning_rate": 1.618948618948619e-05, "loss": 0.0647, "step": 7750 }, { "epoch": 2.3047223047223047, "grad_norm": 0.6413756608963013, "learning_rate": 1.6171666171666172e-05, "loss": 0.0713, "step": 7760 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5494916439056396, "learning_rate": 1.6153846153846154e-05, "loss": 0.0594, "step": 7770 }, { "epoch": 2.3106623106623108, "grad_norm": 0.6467922329902649, "learning_rate": 1.6136026136026136e-05, "loss": 0.0586, "step": 7780 }, { "epoch": 2.3136323136323136, "grad_norm": 0.5509008169174194, "learning_rate": 1.6118206118206122e-05, "loss": 0.0574, "step": 7790 }, { "epoch": 2.3166023166023164, "grad_norm": 0.7247257232666016, "learning_rate": 1.61003861003861e-05, "loss": 0.0676, "step": 7800 }, { "epoch": 2.3195723195723197, "grad_norm": 0.7993832230567932, "learning_rate": 1.6082566082566083e-05, "loss": 0.0666, "step": 7810 }, { "epoch": 2.3225423225423225, "grad_norm": 0.5373691916465759, "learning_rate": 1.6064746064746065e-05, "loss": 0.0551, "step": 7820 }, { "epoch": 2.3255123255123253, "grad_norm": 0.8234823942184448, "learning_rate": 1.6046926046926047e-05, "loss": 0.0696, "step": 7830 }, { "epoch": 2.3284823284823286, "grad_norm": 0.6193700432777405, "learning_rate": 1.602910602910603e-05, "loss": 0.0742, "step": 7840 }, { "epoch": 2.3314523314523314, "grad_norm": 0.9918487071990967, "learning_rate": 1.601128601128601e-05, "loss": 0.0636, "step": 7850 }, { "epoch": 2.3344223344223343, "grad_norm": 0.47731947898864746, "learning_rate": 1.5993465993465997e-05, "loss": 0.0605, "step": 7860 }, { "epoch": 2.3373923373923375, "grad_norm": 1.2060436010360718, "learning_rate": 1.5975645975645976e-05, "loss": 0.0532, "step": 7870 }, { "epoch": 2.3403623403623404, "grad_norm": 0.9285519123077393, "learning_rate": 1.5957825957825958e-05, "loss": 0.0638, "step": 7880 }, { "epoch": 2.343332343332343, "grad_norm": 1.4281196594238281, "learning_rate": 1.594000594000594e-05, "loss": 0.0678, "step": 7890 }, { "epoch": 2.3463023463023465, "grad_norm": 1.6847813129425049, "learning_rate": 1.5922185922185922e-05, "loss": 0.0629, "step": 7900 }, { "epoch": 2.3492723492723493, "grad_norm": 0.5582916140556335, "learning_rate": 1.5904365904365904e-05, "loss": 0.0614, "step": 7910 }, { "epoch": 2.352242352242352, "grad_norm": 0.5601425170898438, "learning_rate": 1.5886545886545887e-05, "loss": 0.0549, "step": 7920 }, { "epoch": 2.3552123552123554, "grad_norm": 0.8730356097221375, "learning_rate": 1.5868725868725872e-05, "loss": 0.051, "step": 7930 }, { "epoch": 2.358182358182358, "grad_norm": 0.9987317323684692, "learning_rate": 1.585090585090585e-05, "loss": 0.0598, "step": 7940 }, { "epoch": 2.361152361152361, "grad_norm": 0.689275860786438, "learning_rate": 1.5833085833085833e-05, "loss": 0.0477, "step": 7950 }, { "epoch": 2.3641223641223643, "grad_norm": 0.8642667531967163, "learning_rate": 1.5815265815265815e-05, "loss": 0.0619, "step": 7960 }, { "epoch": 2.367092367092367, "grad_norm": 0.675285279750824, "learning_rate": 1.5797445797445797e-05, "loss": 0.0671, "step": 7970 }, { "epoch": 2.37006237006237, "grad_norm": 0.9872873425483704, "learning_rate": 1.577962577962578e-05, "loss": 0.0523, "step": 7980 }, { "epoch": 2.3730323730323732, "grad_norm": 0.7550463676452637, "learning_rate": 1.5761805761805762e-05, "loss": 0.0616, "step": 7990 }, { "epoch": 2.376002376002376, "grad_norm": 1.0801265239715576, "learning_rate": 1.5743985743985747e-05, "loss": 0.0565, "step": 8000 }, { "epoch": 2.378972378972379, "grad_norm": 1.093015432357788, "learning_rate": 1.5726165726165726e-05, "loss": 0.0666, "step": 8010 }, { "epoch": 2.381942381942382, "grad_norm": 0.882056713104248, "learning_rate": 1.5708345708345708e-05, "loss": 0.0609, "step": 8020 }, { "epoch": 2.384912384912385, "grad_norm": 0.7684606909751892, "learning_rate": 1.569052569052569e-05, "loss": 0.063, "step": 8030 }, { "epoch": 2.387882387882388, "grad_norm": 0.9712556600570679, "learning_rate": 1.5672705672705673e-05, "loss": 0.0598, "step": 8040 }, { "epoch": 2.390852390852391, "grad_norm": 0.7231224179267883, "learning_rate": 1.5654885654885655e-05, "loss": 0.0671, "step": 8050 }, { "epoch": 2.393822393822394, "grad_norm": 0.6880158185958862, "learning_rate": 1.5637065637065637e-05, "loss": 0.0638, "step": 8060 }, { "epoch": 2.3967923967923968, "grad_norm": 0.8751171231269836, "learning_rate": 1.5619245619245622e-05, "loss": 0.0607, "step": 8070 }, { "epoch": 2.3997623997623996, "grad_norm": 1.0133845806121826, "learning_rate": 1.56014256014256e-05, "loss": 0.0663, "step": 8080 }, { "epoch": 2.402732402732403, "grad_norm": 0.4937121272087097, "learning_rate": 1.5583605583605583e-05, "loss": 0.0586, "step": 8090 }, { "epoch": 2.4057024057024057, "grad_norm": 0.6717655062675476, "learning_rate": 1.5565785565785566e-05, "loss": 0.0709, "step": 8100 }, { "epoch": 2.4086724086724085, "grad_norm": 0.9288782477378845, "learning_rate": 1.5547965547965548e-05, "loss": 0.0728, "step": 8110 }, { "epoch": 2.4116424116424118, "grad_norm": 0.9981745481491089, "learning_rate": 1.553014553014553e-05, "loss": 0.0518, "step": 8120 }, { "epoch": 2.4146124146124146, "grad_norm": 0.6858815550804138, "learning_rate": 1.5512325512325512e-05, "loss": 0.059, "step": 8130 }, { "epoch": 2.4175824175824174, "grad_norm": 1.0234894752502441, "learning_rate": 1.5494505494505498e-05, "loss": 0.0656, "step": 8140 }, { "epoch": 2.4205524205524207, "grad_norm": 1.0168789625167847, "learning_rate": 1.547668547668548e-05, "loss": 0.0661, "step": 8150 }, { "epoch": 2.4235224235224235, "grad_norm": 0.5430660247802734, "learning_rate": 1.545886545886546e-05, "loss": 0.0498, "step": 8160 }, { "epoch": 2.4264924264924264, "grad_norm": 0.6519795656204224, "learning_rate": 1.544104544104544e-05, "loss": 0.0565, "step": 8170 }, { "epoch": 2.4294624294624296, "grad_norm": 0.4860493540763855, "learning_rate": 1.5423225423225423e-05, "loss": 0.0502, "step": 8180 }, { "epoch": 2.4324324324324325, "grad_norm": 1.287544846534729, "learning_rate": 1.5405405405405405e-05, "loss": 0.0619, "step": 8190 }, { "epoch": 2.4354024354024353, "grad_norm": 0.7595537900924683, "learning_rate": 1.5387585387585387e-05, "loss": 0.062, "step": 8200 }, { "epoch": 2.4383724383724386, "grad_norm": 1.2238869667053223, "learning_rate": 1.5369765369765373e-05, "loss": 0.0602, "step": 8210 }, { "epoch": 2.4413424413424414, "grad_norm": 0.6044248342514038, "learning_rate": 1.5351945351945355e-05, "loss": 0.064, "step": 8220 }, { "epoch": 2.444312444312444, "grad_norm": 1.0515096187591553, "learning_rate": 1.5334125334125334e-05, "loss": 0.0623, "step": 8230 }, { "epoch": 2.447282447282447, "grad_norm": 0.3388879895210266, "learning_rate": 1.5316305316305316e-05, "loss": 0.0508, "step": 8240 }, { "epoch": 2.4502524502524503, "grad_norm": 1.4969494342803955, "learning_rate": 1.5298485298485298e-05, "loss": 0.0562, "step": 8250 }, { "epoch": 2.453222453222453, "grad_norm": 1.1669758558273315, "learning_rate": 1.528066528066528e-05, "loss": 0.0694, "step": 8260 }, { "epoch": 2.456192456192456, "grad_norm": 0.8401341438293457, "learning_rate": 1.5262845262845263e-05, "loss": 0.0596, "step": 8270 }, { "epoch": 2.4591624591624592, "grad_norm": 0.8268706798553467, "learning_rate": 1.5245025245025246e-05, "loss": 0.0595, "step": 8280 }, { "epoch": 2.462132462132462, "grad_norm": 1.1533724069595337, "learning_rate": 1.5227205227205229e-05, "loss": 0.0586, "step": 8290 }, { "epoch": 2.465102465102465, "grad_norm": 0.972823441028595, "learning_rate": 1.520938520938521e-05, "loss": 0.0431, "step": 8300 }, { "epoch": 2.468072468072468, "grad_norm": 0.6986684799194336, "learning_rate": 1.5191565191565193e-05, "loss": 0.0591, "step": 8310 }, { "epoch": 2.471042471042471, "grad_norm": 0.6188581585884094, "learning_rate": 1.5173745173745173e-05, "loss": 0.0557, "step": 8320 }, { "epoch": 2.474012474012474, "grad_norm": 0.7166512608528137, "learning_rate": 1.5155925155925156e-05, "loss": 0.0562, "step": 8330 }, { "epoch": 2.476982476982477, "grad_norm": 0.6496429443359375, "learning_rate": 1.5138105138105138e-05, "loss": 0.0555, "step": 8340 }, { "epoch": 2.47995247995248, "grad_norm": 0.9905614852905273, "learning_rate": 1.5120285120285122e-05, "loss": 0.0574, "step": 8350 }, { "epoch": 2.4829224829224827, "grad_norm": 0.5973777770996094, "learning_rate": 1.5102465102465104e-05, "loss": 0.0636, "step": 8360 }, { "epoch": 2.485892485892486, "grad_norm": 0.4405939280986786, "learning_rate": 1.5084645084645086e-05, "loss": 0.0795, "step": 8370 }, { "epoch": 2.488862488862489, "grad_norm": 0.6318250894546509, "learning_rate": 1.5066825066825068e-05, "loss": 0.0599, "step": 8380 }, { "epoch": 2.4918324918324917, "grad_norm": 0.5132951140403748, "learning_rate": 1.5049005049005049e-05, "loss": 0.0632, "step": 8390 }, { "epoch": 2.494802494802495, "grad_norm": 1.1621384620666504, "learning_rate": 1.503118503118503e-05, "loss": 0.0716, "step": 8400 }, { "epoch": 2.4977724977724978, "grad_norm": 0.7687697410583496, "learning_rate": 1.5013365013365013e-05, "loss": 0.0543, "step": 8410 }, { "epoch": 2.5007425007425006, "grad_norm": 1.1648590564727783, "learning_rate": 1.4995544995544995e-05, "loss": 0.0525, "step": 8420 }, { "epoch": 2.503712503712504, "grad_norm": 1.093809723854065, "learning_rate": 1.4977724977724977e-05, "loss": 0.0577, "step": 8430 }, { "epoch": 2.5066825066825067, "grad_norm": 0.5859802961349487, "learning_rate": 1.4959904959904961e-05, "loss": 0.0505, "step": 8440 }, { "epoch": 2.5096525096525095, "grad_norm": 0.818012535572052, "learning_rate": 1.4942084942084943e-05, "loss": 0.06, "step": 8450 }, { "epoch": 2.512622512622513, "grad_norm": 0.4179311990737915, "learning_rate": 1.4924264924264924e-05, "loss": 0.0538, "step": 8460 }, { "epoch": 2.5155925155925156, "grad_norm": 1.3955974578857422, "learning_rate": 1.4906444906444908e-05, "loss": 0.0721, "step": 8470 }, { "epoch": 2.5185625185625184, "grad_norm": 0.8818016052246094, "learning_rate": 1.488862488862489e-05, "loss": 0.0578, "step": 8480 }, { "epoch": 2.5215325215325217, "grad_norm": 0.7490390539169312, "learning_rate": 1.487080487080487e-05, "loss": 0.0528, "step": 8490 }, { "epoch": 2.5245025245025245, "grad_norm": 0.7568694949150085, "learning_rate": 1.4852984852984852e-05, "loss": 0.0649, "step": 8500 }, { "epoch": 2.5274725274725274, "grad_norm": 0.7171285152435303, "learning_rate": 1.4835164835164836e-05, "loss": 0.0474, "step": 8510 }, { "epoch": 2.5304425304425306, "grad_norm": 0.5041258931159973, "learning_rate": 1.4817344817344818e-05, "loss": 0.0562, "step": 8520 }, { "epoch": 2.5334125334125335, "grad_norm": 0.5090057253837585, "learning_rate": 1.4799524799524799e-05, "loss": 0.0709, "step": 8530 }, { "epoch": 2.5363825363825363, "grad_norm": 0.5413128137588501, "learning_rate": 1.4781704781704783e-05, "loss": 0.0619, "step": 8540 }, { "epoch": 2.5393525393525396, "grad_norm": 0.8878335952758789, "learning_rate": 1.4763884763884765e-05, "loss": 0.0625, "step": 8550 }, { "epoch": 2.5423225423225424, "grad_norm": 0.7137174606323242, "learning_rate": 1.4746064746064745e-05, "loss": 0.057, "step": 8560 }, { "epoch": 2.5452925452925452, "grad_norm": 0.7620881795883179, "learning_rate": 1.4728244728244728e-05, "loss": 0.0681, "step": 8570 }, { "epoch": 2.5482625482625485, "grad_norm": 1.1073065996170044, "learning_rate": 1.4710424710424711e-05, "loss": 0.0708, "step": 8580 }, { "epoch": 2.5512325512325513, "grad_norm": 0.7626820802688599, "learning_rate": 1.4692604692604694e-05, "loss": 0.0616, "step": 8590 }, { "epoch": 2.554202554202554, "grad_norm": 0.849026083946228, "learning_rate": 1.4674784674784674e-05, "loss": 0.0511, "step": 8600 }, { "epoch": 2.5571725571725574, "grad_norm": 0.7794767618179321, "learning_rate": 1.4656964656964658e-05, "loss": 0.0527, "step": 8610 }, { "epoch": 2.5601425601425603, "grad_norm": 0.6230559349060059, "learning_rate": 1.463914463914464e-05, "loss": 0.0615, "step": 8620 }, { "epoch": 2.563112563112563, "grad_norm": 0.517436146736145, "learning_rate": 1.4621324621324622e-05, "loss": 0.0562, "step": 8630 }, { "epoch": 2.5660825660825664, "grad_norm": 0.835220217704773, "learning_rate": 1.4603504603504603e-05, "loss": 0.0548, "step": 8640 }, { "epoch": 2.569052569052569, "grad_norm": 0.8735977411270142, "learning_rate": 1.4585684585684587e-05, "loss": 0.0728, "step": 8650 }, { "epoch": 2.572022572022572, "grad_norm": 0.27027377486228943, "learning_rate": 1.4567864567864569e-05, "loss": 0.0512, "step": 8660 }, { "epoch": 2.574992574992575, "grad_norm": 0.804161548614502, "learning_rate": 1.455004455004455e-05, "loss": 0.0598, "step": 8670 }, { "epoch": 2.577962577962578, "grad_norm": 0.9159836173057556, "learning_rate": 1.4532224532224533e-05, "loss": 0.0755, "step": 8680 }, { "epoch": 2.580932580932581, "grad_norm": 1.0621222257614136, "learning_rate": 1.4514404514404515e-05, "loss": 0.0599, "step": 8690 }, { "epoch": 2.5839025839025838, "grad_norm": 0.61686110496521, "learning_rate": 1.4496584496584498e-05, "loss": 0.0606, "step": 8700 }, { "epoch": 2.586872586872587, "grad_norm": 0.3363722860813141, "learning_rate": 1.4478764478764478e-05, "loss": 0.0451, "step": 8710 }, { "epoch": 2.58984258984259, "grad_norm": 0.5933698415756226, "learning_rate": 1.4460944460944462e-05, "loss": 0.0635, "step": 8720 }, { "epoch": 2.5928125928125927, "grad_norm": 0.6424260139465332, "learning_rate": 1.4443124443124444e-05, "loss": 0.0537, "step": 8730 }, { "epoch": 2.5957825957825955, "grad_norm": 0.981282651424408, "learning_rate": 1.4425304425304425e-05, "loss": 0.0635, "step": 8740 }, { "epoch": 2.598752598752599, "grad_norm": 1.4847664833068848, "learning_rate": 1.4407484407484408e-05, "loss": 0.0664, "step": 8750 }, { "epoch": 2.6017226017226016, "grad_norm": 0.5225690007209778, "learning_rate": 1.438966438966439e-05, "loss": 0.061, "step": 8760 }, { "epoch": 2.6046926046926044, "grad_norm": 0.6397286057472229, "learning_rate": 1.4371844371844373e-05, "loss": 0.0674, "step": 8770 }, { "epoch": 2.6076626076626077, "grad_norm": 0.8102043867111206, "learning_rate": 1.4354024354024353e-05, "loss": 0.067, "step": 8780 }, { "epoch": 2.6106326106326105, "grad_norm": 0.9729601144790649, "learning_rate": 1.4336204336204337e-05, "loss": 0.0569, "step": 8790 }, { "epoch": 2.6136026136026134, "grad_norm": 0.6307176947593689, "learning_rate": 1.431838431838432e-05, "loss": 0.056, "step": 8800 }, { "epoch": 2.6165726165726166, "grad_norm": 0.8650787472724915, "learning_rate": 1.43005643005643e-05, "loss": 0.0674, "step": 8810 }, { "epoch": 2.6195426195426195, "grad_norm": 0.9029502868652344, "learning_rate": 1.4282744282744284e-05, "loss": 0.0533, "step": 8820 }, { "epoch": 2.6225126225126223, "grad_norm": 0.817317008972168, "learning_rate": 1.4264924264924266e-05, "loss": 0.0596, "step": 8830 }, { "epoch": 2.6254826254826256, "grad_norm": 0.9638757705688477, "learning_rate": 1.4247104247104248e-05, "loss": 0.0603, "step": 8840 }, { "epoch": 2.6284526284526284, "grad_norm": 0.9621570110321045, "learning_rate": 1.4229284229284228e-05, "loss": 0.0612, "step": 8850 }, { "epoch": 2.631422631422631, "grad_norm": 0.504012942314148, "learning_rate": 1.4211464211464212e-05, "loss": 0.0747, "step": 8860 }, { "epoch": 2.6343926343926345, "grad_norm": 1.0219043493270874, "learning_rate": 1.4193644193644194e-05, "loss": 0.052, "step": 8870 }, { "epoch": 2.6373626373626373, "grad_norm": 0.7350105047225952, "learning_rate": 1.4175824175824177e-05, "loss": 0.0604, "step": 8880 }, { "epoch": 2.64033264033264, "grad_norm": 0.5058907270431519, "learning_rate": 1.4158004158004159e-05, "loss": 0.0558, "step": 8890 }, { "epoch": 2.6433026433026434, "grad_norm": 0.6319795846939087, "learning_rate": 1.4140184140184141e-05, "loss": 0.0505, "step": 8900 }, { "epoch": 2.6462726462726462, "grad_norm": 0.6916008591651917, "learning_rate": 1.4122364122364123e-05, "loss": 0.0686, "step": 8910 }, { "epoch": 2.649242649242649, "grad_norm": 1.57939612865448, "learning_rate": 1.4104544104544104e-05, "loss": 0.0534, "step": 8920 }, { "epoch": 2.6522126522126523, "grad_norm": 1.0057226419448853, "learning_rate": 1.4086724086724087e-05, "loss": 0.0573, "step": 8930 }, { "epoch": 2.655182655182655, "grad_norm": 0.6452613472938538, "learning_rate": 1.406890406890407e-05, "loss": 0.0678, "step": 8940 }, { "epoch": 2.658152658152658, "grad_norm": 0.45321300625801086, "learning_rate": 1.4051084051084052e-05, "loss": 0.0453, "step": 8950 }, { "epoch": 2.6611226611226613, "grad_norm": 0.46493178606033325, "learning_rate": 1.4033264033264034e-05, "loss": 0.0579, "step": 8960 }, { "epoch": 2.664092664092664, "grad_norm": 0.6382163763046265, "learning_rate": 1.4015444015444016e-05, "loss": 0.0537, "step": 8970 }, { "epoch": 2.667062667062667, "grad_norm": 0.5830327272415161, "learning_rate": 1.3997623997623998e-05, "loss": 0.0529, "step": 8980 }, { "epoch": 2.67003267003267, "grad_norm": 0.8241320252418518, "learning_rate": 1.3979803979803979e-05, "loss": 0.0501, "step": 8990 }, { "epoch": 2.673002673002673, "grad_norm": 1.3200924396514893, "learning_rate": 1.3961983961983963e-05, "loss": 0.0637, "step": 9000 }, { "epoch": 2.675972675972676, "grad_norm": 0.5963950157165527, "learning_rate": 1.3944163944163945e-05, "loss": 0.0581, "step": 9010 }, { "epoch": 2.678942678942679, "grad_norm": 0.5137681365013123, "learning_rate": 1.3926343926343927e-05, "loss": 0.0504, "step": 9020 }, { "epoch": 2.681912681912682, "grad_norm": 0.8717916011810303, "learning_rate": 1.390852390852391e-05, "loss": 0.0681, "step": 9030 }, { "epoch": 2.684882684882685, "grad_norm": 0.5380828976631165, "learning_rate": 1.3890703890703891e-05, "loss": 0.0568, "step": 9040 }, { "epoch": 2.687852687852688, "grad_norm": 0.8956130743026733, "learning_rate": 1.3872883872883874e-05, "loss": 0.0623, "step": 9050 }, { "epoch": 2.690822690822691, "grad_norm": 0.6086248159408569, "learning_rate": 1.3855063855063854e-05, "loss": 0.0643, "step": 9060 }, { "epoch": 2.6937926937926937, "grad_norm": 0.8992329835891724, "learning_rate": 1.3837243837243838e-05, "loss": 0.0649, "step": 9070 }, { "epoch": 2.696762696762697, "grad_norm": 0.3477851450443268, "learning_rate": 1.381942381942382e-05, "loss": 0.0479, "step": 9080 }, { "epoch": 2.6997326997327, "grad_norm": 1.4136226177215576, "learning_rate": 1.3801603801603802e-05, "loss": 0.0529, "step": 9090 }, { "epoch": 2.7027027027027026, "grad_norm": 1.031639814376831, "learning_rate": 1.3783783783783784e-05, "loss": 0.0607, "step": 9100 }, { "epoch": 2.705672705672706, "grad_norm": 0.9110945463180542, "learning_rate": 1.3765963765963767e-05, "loss": 0.054, "step": 9110 }, { "epoch": 2.7086427086427087, "grad_norm": 1.111244797706604, "learning_rate": 1.3748143748143749e-05, "loss": 0.065, "step": 9120 }, { "epoch": 2.7116127116127116, "grad_norm": 0.507455587387085, "learning_rate": 1.373032373032373e-05, "loss": 0.0605, "step": 9130 }, { "epoch": 2.714582714582715, "grad_norm": 1.2011653184890747, "learning_rate": 1.3712503712503713e-05, "loss": 0.0535, "step": 9140 }, { "epoch": 2.7175527175527177, "grad_norm": 0.48414701223373413, "learning_rate": 1.3694683694683695e-05, "loss": 0.0488, "step": 9150 }, { "epoch": 2.7205227205227205, "grad_norm": 1.0523313283920288, "learning_rate": 1.3676863676863677e-05, "loss": 0.0657, "step": 9160 }, { "epoch": 2.7234927234927238, "grad_norm": 0.8370203971862793, "learning_rate": 1.365904365904366e-05, "loss": 0.0563, "step": 9170 }, { "epoch": 2.7264627264627266, "grad_norm": 0.6961177587509155, "learning_rate": 1.3641223641223642e-05, "loss": 0.0587, "step": 9180 }, { "epoch": 2.7294327294327294, "grad_norm": 1.089735507965088, "learning_rate": 1.3623403623403624e-05, "loss": 0.0606, "step": 9190 }, { "epoch": 2.7324027324027322, "grad_norm": 0.8496239185333252, "learning_rate": 1.3605583605583606e-05, "loss": 0.0572, "step": 9200 }, { "epoch": 2.7353727353727355, "grad_norm": 0.8729379773139954, "learning_rate": 1.3587763587763588e-05, "loss": 0.0604, "step": 9210 }, { "epoch": 2.7383427383427383, "grad_norm": 0.8139004111289978, "learning_rate": 1.356994356994357e-05, "loss": 0.052, "step": 9220 }, { "epoch": 2.741312741312741, "grad_norm": 1.0032668113708496, "learning_rate": 1.3552123552123553e-05, "loss": 0.0578, "step": 9230 }, { "epoch": 2.7442827442827444, "grad_norm": 0.5807051062583923, "learning_rate": 1.3534303534303535e-05, "loss": 0.0581, "step": 9240 }, { "epoch": 2.7472527472527473, "grad_norm": 0.5372444987297058, "learning_rate": 1.3516483516483517e-05, "loss": 0.0606, "step": 9250 }, { "epoch": 2.75022275022275, "grad_norm": 2.1565425395965576, "learning_rate": 1.3498663498663499e-05, "loss": 0.0545, "step": 9260 }, { "epoch": 2.753192753192753, "grad_norm": 0.6508318185806274, "learning_rate": 1.3480843480843481e-05, "loss": 0.0591, "step": 9270 }, { "epoch": 2.756162756162756, "grad_norm": 0.36386728286743164, "learning_rate": 1.3463023463023463e-05, "loss": 0.049, "step": 9280 }, { "epoch": 2.759132759132759, "grad_norm": 1.4432202577590942, "learning_rate": 1.3445203445203446e-05, "loss": 0.067, "step": 9290 }, { "epoch": 2.762102762102762, "grad_norm": 0.7495296001434326, "learning_rate": 1.3427383427383428e-05, "loss": 0.0551, "step": 9300 }, { "epoch": 2.765072765072765, "grad_norm": 0.5541219711303711, "learning_rate": 1.340956340956341e-05, "loss": 0.0594, "step": 9310 }, { "epoch": 2.768042768042768, "grad_norm": 0.8535263538360596, "learning_rate": 1.3391743391743392e-05, "loss": 0.0583, "step": 9320 }, { "epoch": 2.7710127710127708, "grad_norm": 0.784618616104126, "learning_rate": 1.3373923373923374e-05, "loss": 0.0705, "step": 9330 }, { "epoch": 2.773982773982774, "grad_norm": 0.6920385360717773, "learning_rate": 1.3356103356103356e-05, "loss": 0.0686, "step": 9340 }, { "epoch": 2.776952776952777, "grad_norm": 0.7097575068473816, "learning_rate": 1.3338283338283339e-05, "loss": 0.0546, "step": 9350 }, { "epoch": 2.7799227799227797, "grad_norm": 0.9163897633552551, "learning_rate": 1.332046332046332e-05, "loss": 0.0604, "step": 9360 }, { "epoch": 2.782892782892783, "grad_norm": 0.5482726097106934, "learning_rate": 1.3302643302643303e-05, "loss": 0.0561, "step": 9370 }, { "epoch": 2.785862785862786, "grad_norm": 0.5546408891677856, "learning_rate": 1.3284823284823285e-05, "loss": 0.0664, "step": 9380 }, { "epoch": 2.7888327888327886, "grad_norm": 0.9671948552131653, "learning_rate": 1.3267003267003267e-05, "loss": 0.049, "step": 9390 }, { "epoch": 2.791802791802792, "grad_norm": 0.8189311027526855, "learning_rate": 1.324918324918325e-05, "loss": 0.0464, "step": 9400 }, { "epoch": 2.7947727947727947, "grad_norm": 0.5405160188674927, "learning_rate": 1.3231363231363232e-05, "loss": 0.0673, "step": 9410 }, { "epoch": 2.7977427977427975, "grad_norm": 0.8173415660858154, "learning_rate": 1.3213543213543214e-05, "loss": 0.0589, "step": 9420 }, { "epoch": 2.800712800712801, "grad_norm": 0.6421013474464417, "learning_rate": 1.3195723195723196e-05, "loss": 0.0591, "step": 9430 }, { "epoch": 2.8036828036828036, "grad_norm": 1.540049433708191, "learning_rate": 1.3177903177903178e-05, "loss": 0.072, "step": 9440 }, { "epoch": 2.8066528066528065, "grad_norm": 0.6752909421920776, "learning_rate": 1.316008316008316e-05, "loss": 0.049, "step": 9450 }, { "epoch": 2.8096228096228097, "grad_norm": 0.9367174506187439, "learning_rate": 1.3142263142263142e-05, "loss": 0.0738, "step": 9460 }, { "epoch": 2.8125928125928126, "grad_norm": 1.2231054306030273, "learning_rate": 1.3124443124443125e-05, "loss": 0.0625, "step": 9470 }, { "epoch": 2.8155628155628154, "grad_norm": 0.4123198091983795, "learning_rate": 1.3106623106623107e-05, "loss": 0.0518, "step": 9480 }, { "epoch": 2.8185328185328187, "grad_norm": 0.6436600089073181, "learning_rate": 1.3088803088803089e-05, "loss": 0.0725, "step": 9490 }, { "epoch": 2.8215028215028215, "grad_norm": 0.6609872579574585, "learning_rate": 1.3070983070983071e-05, "loss": 0.0594, "step": 9500 }, { "epoch": 2.8244728244728243, "grad_norm": 0.4559807777404785, "learning_rate": 1.3053163053163053e-05, "loss": 0.063, "step": 9510 }, { "epoch": 2.8274428274428276, "grad_norm": 1.0290307998657227, "learning_rate": 1.3035343035343037e-05, "loss": 0.0618, "step": 9520 }, { "epoch": 2.8304128304128304, "grad_norm": 0.7586894035339355, "learning_rate": 1.3017523017523018e-05, "loss": 0.0584, "step": 9530 }, { "epoch": 2.8333828333828333, "grad_norm": 0.714316725730896, "learning_rate": 1.2999702999703e-05, "loss": 0.0448, "step": 9540 }, { "epoch": 2.8363528363528365, "grad_norm": 0.6114319562911987, "learning_rate": 1.2981882981882982e-05, "loss": 0.0689, "step": 9550 }, { "epoch": 2.8393228393228394, "grad_norm": 0.7381054162979126, "learning_rate": 1.2964062964062964e-05, "loss": 0.0577, "step": 9560 }, { "epoch": 2.842292842292842, "grad_norm": 0.7892597317695618, "learning_rate": 1.2946242946242946e-05, "loss": 0.0697, "step": 9570 }, { "epoch": 2.8452628452628455, "grad_norm": 0.6848814487457275, "learning_rate": 1.2928422928422929e-05, "loss": 0.0521, "step": 9580 }, { "epoch": 2.8482328482328483, "grad_norm": 0.5784212946891785, "learning_rate": 1.2910602910602912e-05, "loss": 0.0647, "step": 9590 }, { "epoch": 2.851202851202851, "grad_norm": 0.6273770332336426, "learning_rate": 1.2892782892782893e-05, "loss": 0.0677, "step": 9600 }, { "epoch": 2.8541728541728544, "grad_norm": 0.7045747637748718, "learning_rate": 1.2874962874962875e-05, "loss": 0.0686, "step": 9610 }, { "epoch": 2.857142857142857, "grad_norm": 0.8756235241889954, "learning_rate": 1.2857142857142857e-05, "loss": 0.0551, "step": 9620 }, { "epoch": 2.86011286011286, "grad_norm": 1.0615090131759644, "learning_rate": 1.283932283932284e-05, "loss": 0.0676, "step": 9630 }, { "epoch": 2.8630828630828633, "grad_norm": 0.42012715339660645, "learning_rate": 1.2821502821502822e-05, "loss": 0.0492, "step": 9640 }, { "epoch": 2.866052866052866, "grad_norm": 0.8934495449066162, "learning_rate": 1.2803682803682804e-05, "loss": 0.0523, "step": 9650 }, { "epoch": 2.869022869022869, "grad_norm": 1.0009864568710327, "learning_rate": 1.2785862785862788e-05, "loss": 0.0662, "step": 9660 }, { "epoch": 2.8719928719928722, "grad_norm": 0.8075212836265564, "learning_rate": 1.2768042768042768e-05, "loss": 0.0553, "step": 9670 }, { "epoch": 2.874962874962875, "grad_norm": 0.7965303063392639, "learning_rate": 1.275022275022275e-05, "loss": 0.0573, "step": 9680 }, { "epoch": 2.877932877932878, "grad_norm": 0.6948648691177368, "learning_rate": 1.2732402732402732e-05, "loss": 0.0557, "step": 9690 }, { "epoch": 2.880902880902881, "grad_norm": 0.7285399436950684, "learning_rate": 1.2714582714582715e-05, "loss": 0.0624, "step": 9700 }, { "epoch": 2.883872883872884, "grad_norm": 0.37577903270721436, "learning_rate": 1.2696762696762697e-05, "loss": 0.0517, "step": 9710 }, { "epoch": 2.886842886842887, "grad_norm": 0.36712825298309326, "learning_rate": 1.2678942678942679e-05, "loss": 0.0567, "step": 9720 }, { "epoch": 2.88981288981289, "grad_norm": 1.1397478580474854, "learning_rate": 1.2661122661122663e-05, "loss": 0.0636, "step": 9730 }, { "epoch": 2.892782892782893, "grad_norm": 0.5818475484848022, "learning_rate": 1.2643302643302643e-05, "loss": 0.0535, "step": 9740 }, { "epoch": 2.8957528957528957, "grad_norm": 0.7430572509765625, "learning_rate": 1.2625482625482625e-05, "loss": 0.0484, "step": 9750 }, { "epoch": 2.8987228987228986, "grad_norm": 1.0265908241271973, "learning_rate": 1.2607662607662608e-05, "loss": 0.0566, "step": 9760 }, { "epoch": 2.901692901692902, "grad_norm": 1.191502332687378, "learning_rate": 1.258984258984259e-05, "loss": 0.0574, "step": 9770 }, { "epoch": 2.9046629046629047, "grad_norm": 0.7020824551582336, "learning_rate": 1.2572022572022572e-05, "loss": 0.0498, "step": 9780 }, { "epoch": 2.9076329076329075, "grad_norm": 1.4909850358963013, "learning_rate": 1.2554202554202554e-05, "loss": 0.0648, "step": 9790 }, { "epoch": 2.9106029106029108, "grad_norm": 0.9916106462478638, "learning_rate": 1.2536382536382538e-05, "loss": 0.068, "step": 9800 }, { "epoch": 2.9135729135729136, "grad_norm": 0.7778554558753967, "learning_rate": 1.2518562518562518e-05, "loss": 0.0518, "step": 9810 }, { "epoch": 2.9165429165429164, "grad_norm": 1.0887690782546997, "learning_rate": 1.25007425007425e-05, "loss": 0.0668, "step": 9820 }, { "epoch": 2.9195129195129192, "grad_norm": 0.5052692294120789, "learning_rate": 1.2482922482922483e-05, "loss": 0.0489, "step": 9830 }, { "epoch": 2.9224829224829225, "grad_norm": 0.9965596199035645, "learning_rate": 1.2465102465102467e-05, "loss": 0.0596, "step": 9840 }, { "epoch": 2.9254529254529253, "grad_norm": 0.6010634303092957, "learning_rate": 1.2447282447282447e-05, "loss": 0.0556, "step": 9850 }, { "epoch": 2.928422928422928, "grad_norm": 0.7271102070808411, "learning_rate": 1.242946242946243e-05, "loss": 0.0539, "step": 9860 }, { "epoch": 2.9313929313929314, "grad_norm": 0.5833537578582764, "learning_rate": 1.2411642411642413e-05, "loss": 0.0551, "step": 9870 }, { "epoch": 2.9343629343629343, "grad_norm": 1.5592166185379028, "learning_rate": 1.2393822393822394e-05, "loss": 0.0649, "step": 9880 }, { "epoch": 2.937332937332937, "grad_norm": 0.6563842296600342, "learning_rate": 1.2376002376002376e-05, "loss": 0.047, "step": 9890 }, { "epoch": 2.9403029403029404, "grad_norm": 0.8599936366081238, "learning_rate": 1.2358182358182358e-05, "loss": 0.0665, "step": 9900 }, { "epoch": 2.943272943272943, "grad_norm": 0.6941415667533875, "learning_rate": 1.2340362340362342e-05, "loss": 0.0603, "step": 9910 }, { "epoch": 2.946242946242946, "grad_norm": 0.6341265439987183, "learning_rate": 1.2322542322542322e-05, "loss": 0.0701, "step": 9920 }, { "epoch": 2.9492129492129493, "grad_norm": 0.8768404722213745, "learning_rate": 1.2304722304722305e-05, "loss": 0.0635, "step": 9930 }, { "epoch": 2.952182952182952, "grad_norm": 0.7118542194366455, "learning_rate": 1.2286902286902288e-05, "loss": 0.0592, "step": 9940 }, { "epoch": 2.955152955152955, "grad_norm": 1.1415071487426758, "learning_rate": 1.2269082269082269e-05, "loss": 0.0649, "step": 9950 }, { "epoch": 2.9581229581229582, "grad_norm": 1.042884349822998, "learning_rate": 1.2251262251262251e-05, "loss": 0.0522, "step": 9960 }, { "epoch": 2.961092961092961, "grad_norm": 0.5634980201721191, "learning_rate": 1.2233442233442233e-05, "loss": 0.0532, "step": 9970 }, { "epoch": 2.964062964062964, "grad_norm": 0.8798786401748657, "learning_rate": 1.2215622215622217e-05, "loss": 0.0575, "step": 9980 }, { "epoch": 2.967032967032967, "grad_norm": 0.651884913444519, "learning_rate": 1.2197802197802198e-05, "loss": 0.0469, "step": 9990 }, { "epoch": 2.97000297000297, "grad_norm": 0.9131767749786377, "learning_rate": 1.217998217998218e-05, "loss": 0.0576, "step": 10000 }, { "epoch": 2.972972972972973, "grad_norm": 1.2109713554382324, "learning_rate": 1.2162162162162164e-05, "loss": 0.0663, "step": 10010 }, { "epoch": 2.975942975942976, "grad_norm": 0.9525397419929504, "learning_rate": 1.2144342144342144e-05, "loss": 0.0531, "step": 10020 }, { "epoch": 2.978912978912979, "grad_norm": 0.7746742963790894, "learning_rate": 1.2126522126522126e-05, "loss": 0.0619, "step": 10030 }, { "epoch": 2.9818829818829817, "grad_norm": 0.526714026927948, "learning_rate": 1.2108702108702108e-05, "loss": 0.0639, "step": 10040 }, { "epoch": 2.984852984852985, "grad_norm": 0.42681199312210083, "learning_rate": 1.2090882090882092e-05, "loss": 0.059, "step": 10050 }, { "epoch": 2.987822987822988, "grad_norm": 1.26163911819458, "learning_rate": 1.2073062073062073e-05, "loss": 0.0534, "step": 10060 }, { "epoch": 2.9907929907929907, "grad_norm": 0.6416770815849304, "learning_rate": 1.2055242055242055e-05, "loss": 0.0668, "step": 10070 }, { "epoch": 2.993762993762994, "grad_norm": 0.7979917526245117, "learning_rate": 1.2037422037422039e-05, "loss": 0.0553, "step": 10080 }, { "epoch": 2.9967329967329968, "grad_norm": 0.6158313751220703, "learning_rate": 1.2019602019602021e-05, "loss": 0.0388, "step": 10090 }, { "epoch": 2.9997029997029996, "grad_norm": 0.5872673392295837, "learning_rate": 1.2001782001782001e-05, "loss": 0.0508, "step": 10100 }, { "epoch": 3.0, "eval_f1": 0.33031292965957215, "eval_loss": 0.04517492279410362, "eval_runtime": 166.1304, "eval_samples_per_second": 228.85, "eval_steps_per_second": 3.582, "step": 10101 }, { "epoch": 3.002673002673003, "grad_norm": 0.6985778212547302, "learning_rate": 1.1983961983961984e-05, "loss": 0.056, "step": 10110 }, { "epoch": 3.0056430056430057, "grad_norm": 0.88740074634552, "learning_rate": 1.1966141966141967e-05, "loss": 0.0639, "step": 10120 }, { "epoch": 3.0086130086130085, "grad_norm": 1.2202911376953125, "learning_rate": 1.1948321948321948e-05, "loss": 0.0697, "step": 10130 }, { "epoch": 3.011583011583012, "grad_norm": 0.9488741755485535, "learning_rate": 1.193050193050193e-05, "loss": 0.0473, "step": 10140 }, { "epoch": 3.0145530145530146, "grad_norm": 0.6430271863937378, "learning_rate": 1.1912681912681914e-05, "loss": 0.0654, "step": 10150 }, { "epoch": 3.0175230175230174, "grad_norm": 0.4088257849216461, "learning_rate": 1.1894861894861896e-05, "loss": 0.0608, "step": 10160 }, { "epoch": 3.0204930204930207, "grad_norm": 0.8268032670021057, "learning_rate": 1.1877041877041877e-05, "loss": 0.0543, "step": 10170 }, { "epoch": 3.0234630234630235, "grad_norm": 0.7411820888519287, "learning_rate": 1.1859221859221859e-05, "loss": 0.045, "step": 10180 }, { "epoch": 3.0264330264330264, "grad_norm": 1.4683622121810913, "learning_rate": 1.1841401841401843e-05, "loss": 0.0657, "step": 10190 }, { "epoch": 3.029403029403029, "grad_norm": 0.926177978515625, "learning_rate": 1.1823581823581823e-05, "loss": 0.0535, "step": 10200 }, { "epoch": 3.0323730323730325, "grad_norm": 0.816768229007721, "learning_rate": 1.1805761805761805e-05, "loss": 0.0591, "step": 10210 }, { "epoch": 3.0353430353430353, "grad_norm": 0.4738346040248871, "learning_rate": 1.1787941787941789e-05, "loss": 0.0697, "step": 10220 }, { "epoch": 3.038313038313038, "grad_norm": 0.748884379863739, "learning_rate": 1.1770121770121771e-05, "loss": 0.0489, "step": 10230 }, { "epoch": 3.0412830412830414, "grad_norm": 0.4384136199951172, "learning_rate": 1.1752301752301752e-05, "loss": 0.058, "step": 10240 }, { "epoch": 3.044253044253044, "grad_norm": 0.8452009558677673, "learning_rate": 1.1734481734481734e-05, "loss": 0.0579, "step": 10250 }, { "epoch": 3.047223047223047, "grad_norm": 1.2820180654525757, "learning_rate": 1.1716661716661718e-05, "loss": 0.0506, "step": 10260 }, { "epoch": 3.0501930501930503, "grad_norm": 0.6100145578384399, "learning_rate": 1.1698841698841698e-05, "loss": 0.0493, "step": 10270 }, { "epoch": 3.053163053163053, "grad_norm": 0.8073909282684326, "learning_rate": 1.168102168102168e-05, "loss": 0.059, "step": 10280 }, { "epoch": 3.056133056133056, "grad_norm": 0.6318356394767761, "learning_rate": 1.1663201663201664e-05, "loss": 0.0595, "step": 10290 }, { "epoch": 3.0591030591030592, "grad_norm": 0.7712961435317993, "learning_rate": 1.1645381645381647e-05, "loss": 0.0636, "step": 10300 }, { "epoch": 3.062073062073062, "grad_norm": 0.9575150012969971, "learning_rate": 1.1627561627561627e-05, "loss": 0.067, "step": 10310 }, { "epoch": 3.065043065043065, "grad_norm": 0.9202025532722473, "learning_rate": 1.160974160974161e-05, "loss": 0.0711, "step": 10320 }, { "epoch": 3.068013068013068, "grad_norm": 1.038837194442749, "learning_rate": 1.1591921591921593e-05, "loss": 0.0487, "step": 10330 }, { "epoch": 3.070983070983071, "grad_norm": 0.6200097799301147, "learning_rate": 1.1574101574101574e-05, "loss": 0.0578, "step": 10340 }, { "epoch": 3.073953073953074, "grad_norm": 0.6585675477981567, "learning_rate": 1.1556281556281556e-05, "loss": 0.0648, "step": 10350 }, { "epoch": 3.076923076923077, "grad_norm": 0.8432527184486389, "learning_rate": 1.153846153846154e-05, "loss": 0.0574, "step": 10360 }, { "epoch": 3.07989307989308, "grad_norm": 0.8519158959388733, "learning_rate": 1.1520641520641522e-05, "loss": 0.0435, "step": 10370 }, { "epoch": 3.0828630828630827, "grad_norm": 0.5639305710792542, "learning_rate": 1.1502821502821502e-05, "loss": 0.0548, "step": 10380 }, { "epoch": 3.085833085833086, "grad_norm": 1.1483186483383179, "learning_rate": 1.1485001485001484e-05, "loss": 0.067, "step": 10390 }, { "epoch": 3.088803088803089, "grad_norm": 0.7661743760108948, "learning_rate": 1.1467181467181468e-05, "loss": 0.0409, "step": 10400 }, { "epoch": 3.0917730917730917, "grad_norm": 0.42964890599250793, "learning_rate": 1.144936144936145e-05, "loss": 0.0656, "step": 10410 }, { "epoch": 3.094743094743095, "grad_norm": 0.4453743100166321, "learning_rate": 1.1431541431541431e-05, "loss": 0.0638, "step": 10420 }, { "epoch": 3.0977130977130978, "grad_norm": 1.3230763673782349, "learning_rate": 1.1413721413721415e-05, "loss": 0.0644, "step": 10430 }, { "epoch": 3.1006831006831006, "grad_norm": 0.6661444306373596, "learning_rate": 1.1395901395901397e-05, "loss": 0.0585, "step": 10440 }, { "epoch": 3.1036531036531034, "grad_norm": 0.6051294207572937, "learning_rate": 1.1378081378081377e-05, "loss": 0.0583, "step": 10450 }, { "epoch": 3.1066231066231067, "grad_norm": 0.9865986704826355, "learning_rate": 1.136026136026136e-05, "loss": 0.0547, "step": 10460 }, { "epoch": 3.1095931095931095, "grad_norm": 0.6704586148262024, "learning_rate": 1.1342441342441343e-05, "loss": 0.0582, "step": 10470 }, { "epoch": 3.1125631125631124, "grad_norm": 0.8656442761421204, "learning_rate": 1.1324621324621326e-05, "loss": 0.0574, "step": 10480 }, { "epoch": 3.1155331155331156, "grad_norm": 0.704118549823761, "learning_rate": 1.1306801306801306e-05, "loss": 0.0509, "step": 10490 }, { "epoch": 3.1185031185031185, "grad_norm": 0.5805778503417969, "learning_rate": 1.128898128898129e-05, "loss": 0.0587, "step": 10500 }, { "epoch": 3.1214731214731213, "grad_norm": 1.2092708349227905, "learning_rate": 1.1271161271161272e-05, "loss": 0.069, "step": 10510 }, { "epoch": 3.1244431244431246, "grad_norm": 0.7527390718460083, "learning_rate": 1.1253341253341253e-05, "loss": 0.059, "step": 10520 }, { "epoch": 3.1274131274131274, "grad_norm": 0.5099059343338013, "learning_rate": 1.1235521235521235e-05, "loss": 0.0616, "step": 10530 }, { "epoch": 3.13038313038313, "grad_norm": 0.6300451755523682, "learning_rate": 1.1217701217701219e-05, "loss": 0.0625, "step": 10540 }, { "epoch": 3.1333531333531335, "grad_norm": 0.9511438608169556, "learning_rate": 1.11998811998812e-05, "loss": 0.0734, "step": 10550 }, { "epoch": 3.1363231363231363, "grad_norm": 0.6538392901420593, "learning_rate": 1.1182061182061181e-05, "loss": 0.0429, "step": 10560 }, { "epoch": 3.139293139293139, "grad_norm": 0.983573317527771, "learning_rate": 1.1164241164241165e-05, "loss": 0.049, "step": 10570 }, { "epoch": 3.1422631422631424, "grad_norm": 0.8676197528839111, "learning_rate": 1.1146421146421147e-05, "loss": 0.0639, "step": 10580 }, { "epoch": 3.1452331452331452, "grad_norm": 0.4240647256374359, "learning_rate": 1.1128601128601128e-05, "loss": 0.063, "step": 10590 }, { "epoch": 3.148203148203148, "grad_norm": 0.9053698182106018, "learning_rate": 1.111078111078111e-05, "loss": 0.0409, "step": 10600 }, { "epoch": 3.1511731511731513, "grad_norm": 0.6372582316398621, "learning_rate": 1.1092961092961094e-05, "loss": 0.0648, "step": 10610 }, { "epoch": 3.154143154143154, "grad_norm": 0.7005014419555664, "learning_rate": 1.1075141075141076e-05, "loss": 0.0563, "step": 10620 }, { "epoch": 3.157113157113157, "grad_norm": 0.8473065495491028, "learning_rate": 1.1057321057321056e-05, "loss": 0.0729, "step": 10630 }, { "epoch": 3.1600831600831603, "grad_norm": 0.3668000102043152, "learning_rate": 1.103950103950104e-05, "loss": 0.0609, "step": 10640 }, { "epoch": 3.163053163053163, "grad_norm": 0.3873116374015808, "learning_rate": 1.1021681021681022e-05, "loss": 0.0427, "step": 10650 }, { "epoch": 3.166023166023166, "grad_norm": 0.6118089556694031, "learning_rate": 1.1003861003861003e-05, "loss": 0.0426, "step": 10660 }, { "epoch": 3.168993168993169, "grad_norm": 0.8138056993484497, "learning_rate": 1.0986040986040985e-05, "loss": 0.0626, "step": 10670 }, { "epoch": 3.171963171963172, "grad_norm": 0.45243504643440247, "learning_rate": 1.0968220968220969e-05, "loss": 0.0643, "step": 10680 }, { "epoch": 3.174933174933175, "grad_norm": 0.8468220233917236, "learning_rate": 1.0950400950400951e-05, "loss": 0.0628, "step": 10690 }, { "epoch": 3.177903177903178, "grad_norm": 0.3471866846084595, "learning_rate": 1.0932580932580932e-05, "loss": 0.056, "step": 10700 }, { "epoch": 3.180873180873181, "grad_norm": 0.6115337014198303, "learning_rate": 1.0914760914760916e-05, "loss": 0.0571, "step": 10710 }, { "epoch": 3.1838431838431838, "grad_norm": 0.7158623933792114, "learning_rate": 1.0896940896940898e-05, "loss": 0.0541, "step": 10720 }, { "epoch": 3.186813186813187, "grad_norm": 0.49794185161590576, "learning_rate": 1.087912087912088e-05, "loss": 0.0491, "step": 10730 }, { "epoch": 3.18978318978319, "grad_norm": 0.786274254322052, "learning_rate": 1.086130086130086e-05, "loss": 0.0675, "step": 10740 }, { "epoch": 3.1927531927531927, "grad_norm": 1.0238100290298462, "learning_rate": 1.0843480843480844e-05, "loss": 0.0413, "step": 10750 }, { "epoch": 3.1957231957231955, "grad_norm": 1.032472848892212, "learning_rate": 1.0825660825660826e-05, "loss": 0.0607, "step": 10760 }, { "epoch": 3.198693198693199, "grad_norm": 0.7296070456504822, "learning_rate": 1.0807840807840807e-05, "loss": 0.0532, "step": 10770 }, { "epoch": 3.2016632016632016, "grad_norm": 1.2393643856048584, "learning_rate": 1.079002079002079e-05, "loss": 0.0602, "step": 10780 }, { "epoch": 3.2046332046332044, "grad_norm": 0.6754175424575806, "learning_rate": 1.0772200772200773e-05, "loss": 0.0592, "step": 10790 }, { "epoch": 3.2076032076032077, "grad_norm": 0.5455211997032166, "learning_rate": 1.0754380754380755e-05, "loss": 0.0622, "step": 10800 }, { "epoch": 3.2105732105732105, "grad_norm": 0.449032187461853, "learning_rate": 1.0736560736560736e-05, "loss": 0.0497, "step": 10810 }, { "epoch": 3.2135432135432134, "grad_norm": 0.3489013612270355, "learning_rate": 1.071874071874072e-05, "loss": 0.0669, "step": 10820 }, { "epoch": 3.2165132165132166, "grad_norm": 1.101258397102356, "learning_rate": 1.0700920700920702e-05, "loss": 0.0718, "step": 10830 }, { "epoch": 3.2194832194832195, "grad_norm": 0.637738049030304, "learning_rate": 1.0683100683100682e-05, "loss": 0.0455, "step": 10840 }, { "epoch": 3.2224532224532223, "grad_norm": 0.8752096891403198, "learning_rate": 1.0665280665280666e-05, "loss": 0.0671, "step": 10850 }, { "epoch": 3.2254232254232256, "grad_norm": 0.5893465876579285, "learning_rate": 1.0647460647460648e-05, "loss": 0.0627, "step": 10860 }, { "epoch": 3.2283932283932284, "grad_norm": 1.188922643661499, "learning_rate": 1.062964062964063e-05, "loss": 0.066, "step": 10870 }, { "epoch": 3.2313632313632312, "grad_norm": 0.6569589376449585, "learning_rate": 1.0611820611820612e-05, "loss": 0.0481, "step": 10880 }, { "epoch": 3.2343332343332345, "grad_norm": 0.7973625063896179, "learning_rate": 1.0594000594000595e-05, "loss": 0.0532, "step": 10890 }, { "epoch": 3.2373032373032373, "grad_norm": 1.0945541858673096, "learning_rate": 1.0576180576180577e-05, "loss": 0.0557, "step": 10900 }, { "epoch": 3.24027324027324, "grad_norm": 0.6560423374176025, "learning_rate": 1.0558360558360557e-05, "loss": 0.048, "step": 10910 }, { "epoch": 3.2432432432432434, "grad_norm": 0.7343136072158813, "learning_rate": 1.0540540540540541e-05, "loss": 0.0588, "step": 10920 }, { "epoch": 3.2462132462132463, "grad_norm": 0.6441863775253296, "learning_rate": 1.0522720522720523e-05, "loss": 0.0501, "step": 10930 }, { "epoch": 3.249183249183249, "grad_norm": 0.5556747913360596, "learning_rate": 1.0504900504900505e-05, "loss": 0.0521, "step": 10940 }, { "epoch": 3.252153252153252, "grad_norm": 0.636785089969635, "learning_rate": 1.0487080487080488e-05, "loss": 0.0593, "step": 10950 }, { "epoch": 3.255123255123255, "grad_norm": 1.0058456659317017, "learning_rate": 1.046926046926047e-05, "loss": 0.0683, "step": 10960 }, { "epoch": 3.258093258093258, "grad_norm": 0.6078860759735107, "learning_rate": 1.0451440451440452e-05, "loss": 0.0525, "step": 10970 }, { "epoch": 3.261063261063261, "grad_norm": 0.675046980381012, "learning_rate": 1.0433620433620434e-05, "loss": 0.0538, "step": 10980 }, { "epoch": 3.264033264033264, "grad_norm": 0.4468725323677063, "learning_rate": 1.0415800415800416e-05, "loss": 0.0669, "step": 10990 }, { "epoch": 3.267003267003267, "grad_norm": 0.7147549986839294, "learning_rate": 1.0397980397980398e-05, "loss": 0.0454, "step": 11000 }, { "epoch": 3.2699732699732698, "grad_norm": 0.8929649591445923, "learning_rate": 1.038016038016038e-05, "loss": 0.0399, "step": 11010 }, { "epoch": 3.272943272943273, "grad_norm": 0.3554942011833191, "learning_rate": 1.0362340362340363e-05, "loss": 0.0501, "step": 11020 }, { "epoch": 3.275913275913276, "grad_norm": 0.4639175236225128, "learning_rate": 1.0344520344520345e-05, "loss": 0.0614, "step": 11030 }, { "epoch": 3.2788832788832787, "grad_norm": 0.8245081901550293, "learning_rate": 1.0326700326700327e-05, "loss": 0.0581, "step": 11040 }, { "epoch": 3.281853281853282, "grad_norm": 0.9380563497543335, "learning_rate": 1.030888030888031e-05, "loss": 0.0677, "step": 11050 }, { "epoch": 3.284823284823285, "grad_norm": 1.2146899700164795, "learning_rate": 1.0291060291060291e-05, "loss": 0.0511, "step": 11060 }, { "epoch": 3.2877932877932876, "grad_norm": 0.7667972445487976, "learning_rate": 1.0273240273240274e-05, "loss": 0.0475, "step": 11070 }, { "epoch": 3.290763290763291, "grad_norm": 0.6132957339286804, "learning_rate": 1.0255420255420256e-05, "loss": 0.0539, "step": 11080 }, { "epoch": 3.2937332937332937, "grad_norm": 1.2011501789093018, "learning_rate": 1.0237600237600238e-05, "loss": 0.0475, "step": 11090 }, { "epoch": 3.2967032967032965, "grad_norm": 0.6205746531486511, "learning_rate": 1.021978021978022e-05, "loss": 0.0593, "step": 11100 }, { "epoch": 3.2996732996733, "grad_norm": 1.1285995244979858, "learning_rate": 1.0201960201960202e-05, "loss": 0.0693, "step": 11110 }, { "epoch": 3.3026433026433026, "grad_norm": 1.0713976621627808, "learning_rate": 1.0184140184140184e-05, "loss": 0.062, "step": 11120 }, { "epoch": 3.3056133056133055, "grad_norm": 0.6657689213752747, "learning_rate": 1.0166320166320167e-05, "loss": 0.0544, "step": 11130 }, { "epoch": 3.3085833085833087, "grad_norm": 0.9671308994293213, "learning_rate": 1.0148500148500149e-05, "loss": 0.0586, "step": 11140 }, { "epoch": 3.3115533115533116, "grad_norm": 0.6092202663421631, "learning_rate": 1.0130680130680131e-05, "loss": 0.0646, "step": 11150 }, { "epoch": 3.3145233145233144, "grad_norm": 0.8114497065544128, "learning_rate": 1.0112860112860113e-05, "loss": 0.0723, "step": 11160 }, { "epoch": 3.3174933174933177, "grad_norm": 1.0398253202438354, "learning_rate": 1.0095040095040095e-05, "loss": 0.0516, "step": 11170 }, { "epoch": 3.3204633204633205, "grad_norm": 0.773544430732727, "learning_rate": 1.0077220077220078e-05, "loss": 0.0553, "step": 11180 }, { "epoch": 3.3234333234333233, "grad_norm": 1.1099355220794678, "learning_rate": 1.005940005940006e-05, "loss": 0.0662, "step": 11190 }, { "epoch": 3.3264033264033266, "grad_norm": 0.7854142189025879, "learning_rate": 1.0041580041580042e-05, "loss": 0.0578, "step": 11200 }, { "epoch": 3.3293733293733294, "grad_norm": 0.47787368297576904, "learning_rate": 1.0023760023760024e-05, "loss": 0.0634, "step": 11210 }, { "epoch": 3.3323433323433322, "grad_norm": 0.5865157246589661, "learning_rate": 1.0005940005940006e-05, "loss": 0.0425, "step": 11220 }, { "epoch": 3.3353133353133355, "grad_norm": 0.7458857297897339, "learning_rate": 9.988119988119988e-06, "loss": 0.0659, "step": 11230 }, { "epoch": 3.3382833382833383, "grad_norm": 0.73867267370224, "learning_rate": 9.97029997029997e-06, "loss": 0.0615, "step": 11240 }, { "epoch": 3.341253341253341, "grad_norm": 0.9785704016685486, "learning_rate": 9.952479952479953e-06, "loss": 0.0452, "step": 11250 }, { "epoch": 3.3442233442233444, "grad_norm": 0.7227851152420044, "learning_rate": 9.934659934659935e-06, "loss": 0.0572, "step": 11260 }, { "epoch": 3.3471933471933473, "grad_norm": 0.6161572933197021, "learning_rate": 9.916839916839917e-06, "loss": 0.0457, "step": 11270 }, { "epoch": 3.35016335016335, "grad_norm": 0.6703232526779175, "learning_rate": 9.8990198990199e-06, "loss": 0.0682, "step": 11280 }, { "epoch": 3.3531333531333534, "grad_norm": 0.6447526216506958, "learning_rate": 9.881199881199881e-06, "loss": 0.0596, "step": 11290 }, { "epoch": 3.356103356103356, "grad_norm": 0.46125978231430054, "learning_rate": 9.863379863379865e-06, "loss": 0.0659, "step": 11300 }, { "epoch": 3.359073359073359, "grad_norm": 1.2563014030456543, "learning_rate": 9.845559845559846e-06, "loss": 0.057, "step": 11310 }, { "epoch": 3.362043362043362, "grad_norm": 0.8137726187705994, "learning_rate": 9.827739827739828e-06, "loss": 0.0461, "step": 11320 }, { "epoch": 3.365013365013365, "grad_norm": 0.5662651658058167, "learning_rate": 9.80991980991981e-06, "loss": 0.0484, "step": 11330 }, { "epoch": 3.367983367983368, "grad_norm": 0.5154465436935425, "learning_rate": 9.792099792099792e-06, "loss": 0.0579, "step": 11340 }, { "epoch": 3.3709533709533708, "grad_norm": 0.7611321806907654, "learning_rate": 9.774279774279774e-06, "loss": 0.0613, "step": 11350 }, { "epoch": 3.373923373923374, "grad_norm": 0.8335089087486267, "learning_rate": 9.756459756459757e-06, "loss": 0.0493, "step": 11360 }, { "epoch": 3.376893376893377, "grad_norm": 1.004220724105835, "learning_rate": 9.73863973863974e-06, "loss": 0.0511, "step": 11370 }, { "epoch": 3.3798633798633797, "grad_norm": 0.5322721600532532, "learning_rate": 9.720819720819721e-06, "loss": 0.0551, "step": 11380 }, { "epoch": 3.382833382833383, "grad_norm": 0.7684707641601562, "learning_rate": 9.702999702999703e-06, "loss": 0.0658, "step": 11390 }, { "epoch": 3.385803385803386, "grad_norm": 0.7341310381889343, "learning_rate": 9.685179685179685e-06, "loss": 0.0626, "step": 11400 }, { "epoch": 3.3887733887733886, "grad_norm": 0.7289576530456543, "learning_rate": 9.667359667359667e-06, "loss": 0.0508, "step": 11410 }, { "epoch": 3.391743391743392, "grad_norm": 0.9594446420669556, "learning_rate": 9.64953964953965e-06, "loss": 0.0559, "step": 11420 }, { "epoch": 3.3947133947133947, "grad_norm": 0.7913696765899658, "learning_rate": 9.631719631719632e-06, "loss": 0.0648, "step": 11430 }, { "epoch": 3.3976833976833976, "grad_norm": 0.8502475619316101, "learning_rate": 9.613899613899616e-06, "loss": 0.0531, "step": 11440 }, { "epoch": 3.400653400653401, "grad_norm": 0.8771364092826843, "learning_rate": 9.596079596079596e-06, "loss": 0.0577, "step": 11450 }, { "epoch": 3.4036234036234037, "grad_norm": 0.6660274267196655, "learning_rate": 9.578259578259578e-06, "loss": 0.0496, "step": 11460 }, { "epoch": 3.4065934065934065, "grad_norm": 0.7120699286460876, "learning_rate": 9.56043956043956e-06, "loss": 0.0533, "step": 11470 }, { "epoch": 3.4095634095634098, "grad_norm": 0.8846875429153442, "learning_rate": 9.542619542619543e-06, "loss": 0.0557, "step": 11480 }, { "epoch": 3.4125334125334126, "grad_norm": 0.5555750131607056, "learning_rate": 9.524799524799525e-06, "loss": 0.049, "step": 11490 }, { "epoch": 3.4155034155034154, "grad_norm": 0.6334190964698792, "learning_rate": 9.506979506979507e-06, "loss": 0.0679, "step": 11500 }, { "epoch": 3.4184734184734182, "grad_norm": 0.462439626455307, "learning_rate": 9.48915948915949e-06, "loss": 0.0572, "step": 11510 }, { "epoch": 3.4214434214434215, "grad_norm": 0.6081441640853882, "learning_rate": 9.471339471339471e-06, "loss": 0.0487, "step": 11520 }, { "epoch": 3.4244134244134243, "grad_norm": 0.7960311770439148, "learning_rate": 9.453519453519453e-06, "loss": 0.0628, "step": 11530 }, { "epoch": 3.427383427383427, "grad_norm": 0.5583157539367676, "learning_rate": 9.435699435699436e-06, "loss": 0.0666, "step": 11540 }, { "epoch": 3.4303534303534304, "grad_norm": 0.8321641683578491, "learning_rate": 9.417879417879418e-06, "loss": 0.0527, "step": 11550 }, { "epoch": 3.4333234333234333, "grad_norm": 0.6398223042488098, "learning_rate": 9.4000594000594e-06, "loss": 0.0421, "step": 11560 }, { "epoch": 3.436293436293436, "grad_norm": 0.8610657453536987, "learning_rate": 9.382239382239382e-06, "loss": 0.0512, "step": 11570 }, { "epoch": 3.4392634392634394, "grad_norm": 0.8731980919837952, "learning_rate": 9.364419364419366e-06, "loss": 0.0529, "step": 11580 }, { "epoch": 3.442233442233442, "grad_norm": 0.6579751372337341, "learning_rate": 9.346599346599347e-06, "loss": 0.0437, "step": 11590 }, { "epoch": 3.445203445203445, "grad_norm": 1.1427984237670898, "learning_rate": 9.328779328779329e-06, "loss": 0.0565, "step": 11600 }, { "epoch": 3.4481734481734483, "grad_norm": 0.7703096866607666, "learning_rate": 9.31095931095931e-06, "loss": 0.0473, "step": 11610 }, { "epoch": 3.451143451143451, "grad_norm": 1.2040685415267944, "learning_rate": 9.293139293139295e-06, "loss": 0.0579, "step": 11620 }, { "epoch": 3.454113454113454, "grad_norm": 0.7224343419075012, "learning_rate": 9.275319275319275e-06, "loss": 0.0523, "step": 11630 }, { "epoch": 3.457083457083457, "grad_norm": 1.0151997804641724, "learning_rate": 9.257499257499257e-06, "loss": 0.0559, "step": 11640 }, { "epoch": 3.46005346005346, "grad_norm": 0.5172938704490662, "learning_rate": 9.239679239679241e-06, "loss": 0.0518, "step": 11650 }, { "epoch": 3.463023463023463, "grad_norm": 0.864036500453949, "learning_rate": 9.221859221859222e-06, "loss": 0.0407, "step": 11660 }, { "epoch": 3.465993465993466, "grad_norm": 0.603539228439331, "learning_rate": 9.204039204039204e-06, "loss": 0.0589, "step": 11670 }, { "epoch": 3.468963468963469, "grad_norm": 0.6137470006942749, "learning_rate": 9.186219186219186e-06, "loss": 0.0657, "step": 11680 }, { "epoch": 3.471933471933472, "grad_norm": 0.8344042897224426, "learning_rate": 9.16839916839917e-06, "loss": 0.0562, "step": 11690 }, { "epoch": 3.474903474903475, "grad_norm": 0.9684216976165771, "learning_rate": 9.15057915057915e-06, "loss": 0.0482, "step": 11700 }, { "epoch": 3.477873477873478, "grad_norm": 0.5925269722938538, "learning_rate": 9.132759132759133e-06, "loss": 0.0563, "step": 11710 }, { "epoch": 3.4808434808434807, "grad_norm": 1.1107386350631714, "learning_rate": 9.114939114939116e-06, "loss": 0.0585, "step": 11720 }, { "epoch": 3.483813483813484, "grad_norm": 0.7802149653434753, "learning_rate": 9.097119097119097e-06, "loss": 0.0639, "step": 11730 }, { "epoch": 3.486783486783487, "grad_norm": 0.9649807214736938, "learning_rate": 9.079299079299079e-06, "loss": 0.0687, "step": 11740 }, { "epoch": 3.4897534897534896, "grad_norm": 0.5262308716773987, "learning_rate": 9.061479061479061e-06, "loss": 0.0432, "step": 11750 }, { "epoch": 3.492723492723493, "grad_norm": 0.8198150396347046, "learning_rate": 9.043659043659045e-06, "loss": 0.0583, "step": 11760 }, { "epoch": 3.4956934956934957, "grad_norm": 1.3013529777526855, "learning_rate": 9.025839025839026e-06, "loss": 0.0562, "step": 11770 }, { "epoch": 3.4986634986634986, "grad_norm": 0.875752329826355, "learning_rate": 9.008019008019008e-06, "loss": 0.0499, "step": 11780 }, { "epoch": 3.501633501633502, "grad_norm": 0.8648601174354553, "learning_rate": 8.990198990198992e-06, "loss": 0.0502, "step": 11790 }, { "epoch": 3.5046035046035047, "grad_norm": 0.9301609992980957, "learning_rate": 8.972378972378972e-06, "loss": 0.0478, "step": 11800 }, { "epoch": 3.5075735075735075, "grad_norm": 0.9498510360717773, "learning_rate": 8.954558954558954e-06, "loss": 0.0503, "step": 11810 }, { "epoch": 3.5105435105435108, "grad_norm": 0.5462694764137268, "learning_rate": 8.936738936738936e-06, "loss": 0.0575, "step": 11820 }, { "epoch": 3.5135135135135136, "grad_norm": 0.8553217649459839, "learning_rate": 8.91891891891892e-06, "loss": 0.0655, "step": 11830 }, { "epoch": 3.5164835164835164, "grad_norm": 0.9228159189224243, "learning_rate": 8.9010989010989e-06, "loss": 0.0611, "step": 11840 }, { "epoch": 3.5194535194535197, "grad_norm": 0.6614230275154114, "learning_rate": 8.883278883278883e-06, "loss": 0.0465, "step": 11850 }, { "epoch": 3.5224235224235225, "grad_norm": 0.9171364307403564, "learning_rate": 8.865458865458867e-06, "loss": 0.0524, "step": 11860 }, { "epoch": 3.5253935253935254, "grad_norm": 0.4737289249897003, "learning_rate": 8.847638847638847e-06, "loss": 0.0688, "step": 11870 }, { "epoch": 3.5283635283635286, "grad_norm": 0.79871666431427, "learning_rate": 8.82981882981883e-06, "loss": 0.0556, "step": 11880 }, { "epoch": 3.5313335313335315, "grad_norm": 0.9557964205741882, "learning_rate": 8.811998811998812e-06, "loss": 0.0592, "step": 11890 }, { "epoch": 3.5343035343035343, "grad_norm": 0.9244213104248047, "learning_rate": 8.794178794178795e-06, "loss": 0.0513, "step": 11900 }, { "epoch": 3.5372735372735375, "grad_norm": 0.5867542624473572, "learning_rate": 8.776358776358776e-06, "loss": 0.0504, "step": 11910 }, { "epoch": 3.5402435402435404, "grad_norm": 0.47742247581481934, "learning_rate": 8.758538758538758e-06, "loss": 0.0529, "step": 11920 }, { "epoch": 3.543213543213543, "grad_norm": 1.0986418724060059, "learning_rate": 8.740718740718742e-06, "loss": 0.0546, "step": 11930 }, { "epoch": 3.546183546183546, "grad_norm": 0.7719411253929138, "learning_rate": 8.722898722898724e-06, "loss": 0.0564, "step": 11940 }, { "epoch": 3.5491535491535493, "grad_norm": 0.9818161129951477, "learning_rate": 8.705078705078705e-06, "loss": 0.0691, "step": 11950 }, { "epoch": 3.552123552123552, "grad_norm": 0.6486049294471741, "learning_rate": 8.687258687258687e-06, "loss": 0.0662, "step": 11960 }, { "epoch": 3.555093555093555, "grad_norm": 0.6224350929260254, "learning_rate": 8.66943866943867e-06, "loss": 0.0565, "step": 11970 }, { "epoch": 3.5580635580635582, "grad_norm": 1.0095922946929932, "learning_rate": 8.651618651618651e-06, "loss": 0.0609, "step": 11980 }, { "epoch": 3.561033561033561, "grad_norm": 0.5099046230316162, "learning_rate": 8.633798633798633e-06, "loss": 0.0578, "step": 11990 }, { "epoch": 3.564003564003564, "grad_norm": 0.9583487510681152, "learning_rate": 8.615978615978617e-06, "loss": 0.0751, "step": 12000 }, { "epoch": 3.5669735669735667, "grad_norm": 0.8922184109687805, "learning_rate": 8.5981585981586e-06, "loss": 0.0469, "step": 12010 }, { "epoch": 3.56994356994357, "grad_norm": 0.6828520894050598, "learning_rate": 8.58033858033858e-06, "loss": 0.0482, "step": 12020 }, { "epoch": 3.572913572913573, "grad_norm": 1.372278094291687, "learning_rate": 8.562518562518562e-06, "loss": 0.0606, "step": 12030 }, { "epoch": 3.5758835758835756, "grad_norm": 0.5054883360862732, "learning_rate": 8.544698544698546e-06, "loss": 0.0465, "step": 12040 }, { "epoch": 3.578853578853579, "grad_norm": 0.564597487449646, "learning_rate": 8.526878526878526e-06, "loss": 0.0516, "step": 12050 }, { "epoch": 3.5818235818235817, "grad_norm": 0.6574399471282959, "learning_rate": 8.509058509058509e-06, "loss": 0.0511, "step": 12060 }, { "epoch": 3.5847935847935846, "grad_norm": 0.4628748893737793, "learning_rate": 8.491238491238492e-06, "loss": 0.0494, "step": 12070 }, { "epoch": 3.587763587763588, "grad_norm": 0.957274854183197, "learning_rate": 8.473418473418475e-06, "loss": 0.0558, "step": 12080 }, { "epoch": 3.5907335907335907, "grad_norm": 0.9441186785697937, "learning_rate": 8.455598455598455e-06, "loss": 0.0572, "step": 12090 }, { "epoch": 3.5937035937035935, "grad_norm": 0.7726499438285828, "learning_rate": 8.437778437778437e-06, "loss": 0.058, "step": 12100 }, { "epoch": 3.5966735966735968, "grad_norm": 0.7733856439590454, "learning_rate": 8.419958419958421e-06, "loss": 0.0383, "step": 12110 }, { "epoch": 3.5996435996435996, "grad_norm": 0.8538611531257629, "learning_rate": 8.402138402138402e-06, "loss": 0.0647, "step": 12120 }, { "epoch": 3.6026136026136024, "grad_norm": 0.826570451259613, "learning_rate": 8.384318384318384e-06, "loss": 0.0521, "step": 12130 }, { "epoch": 3.6055836055836057, "grad_norm": 1.6991403102874756, "learning_rate": 8.366498366498368e-06, "loss": 0.058, "step": 12140 }, { "epoch": 3.6085536085536085, "grad_norm": 0.7238519191741943, "learning_rate": 8.34867834867835e-06, "loss": 0.0655, "step": 12150 }, { "epoch": 3.6115236115236113, "grad_norm": 0.7092881798744202, "learning_rate": 8.33085833085833e-06, "loss": 0.0623, "step": 12160 }, { "epoch": 3.6144936144936146, "grad_norm": 0.5333400964736938, "learning_rate": 8.313038313038312e-06, "loss": 0.0503, "step": 12170 }, { "epoch": 3.6174636174636174, "grad_norm": 0.9490695595741272, "learning_rate": 8.295218295218296e-06, "loss": 0.0495, "step": 12180 }, { "epoch": 3.6204336204336203, "grad_norm": 0.5925624370574951, "learning_rate": 8.277398277398278e-06, "loss": 0.0575, "step": 12190 }, { "epoch": 3.6234036234036235, "grad_norm": 0.8142616748809814, "learning_rate": 8.259578259578259e-06, "loss": 0.0561, "step": 12200 }, { "epoch": 3.6263736263736264, "grad_norm": 0.8163090348243713, "learning_rate": 8.241758241758243e-06, "loss": 0.0576, "step": 12210 }, { "epoch": 3.629343629343629, "grad_norm": 0.8602144122123718, "learning_rate": 8.223938223938225e-06, "loss": 0.0543, "step": 12220 }, { "epoch": 3.6323136323136325, "grad_norm": 0.5057087540626526, "learning_rate": 8.206118206118205e-06, "loss": 0.0658, "step": 12230 }, { "epoch": 3.6352836352836353, "grad_norm": 1.0046048164367676, "learning_rate": 8.188298188298188e-06, "loss": 0.0546, "step": 12240 }, { "epoch": 3.638253638253638, "grad_norm": 0.5388000011444092, "learning_rate": 8.170478170478171e-06, "loss": 0.0535, "step": 12250 }, { "epoch": 3.6412236412236414, "grad_norm": 0.31448933482170105, "learning_rate": 8.152658152658154e-06, "loss": 0.0561, "step": 12260 }, { "epoch": 3.644193644193644, "grad_norm": 0.7060285210609436, "learning_rate": 8.134838134838134e-06, "loss": 0.0628, "step": 12270 }, { "epoch": 3.647163647163647, "grad_norm": 0.44246843457221985, "learning_rate": 8.117018117018118e-06, "loss": 0.0549, "step": 12280 }, { "epoch": 3.6501336501336503, "grad_norm": 0.8844314813613892, "learning_rate": 8.0991980991981e-06, "loss": 0.0525, "step": 12290 }, { "epoch": 3.653103653103653, "grad_norm": 1.1188846826553345, "learning_rate": 8.08137808137808e-06, "loss": 0.0563, "step": 12300 }, { "epoch": 3.656073656073656, "grad_norm": 0.7898098230361938, "learning_rate": 8.063558063558063e-06, "loss": 0.0571, "step": 12310 }, { "epoch": 3.6590436590436592, "grad_norm": 0.5534541606903076, "learning_rate": 8.045738045738047e-06, "loss": 0.0648, "step": 12320 }, { "epoch": 3.662013662013662, "grad_norm": 0.5332062840461731, "learning_rate": 8.027918027918029e-06, "loss": 0.0674, "step": 12330 }, { "epoch": 3.664983664983665, "grad_norm": 1.169073224067688, "learning_rate": 8.01009801009801e-06, "loss": 0.0532, "step": 12340 }, { "epoch": 3.667953667953668, "grad_norm": 0.5271252393722534, "learning_rate": 7.992277992277993e-06, "loss": 0.0593, "step": 12350 }, { "epoch": 3.670923670923671, "grad_norm": 0.8392048478126526, "learning_rate": 7.974457974457975e-06, "loss": 0.0468, "step": 12360 }, { "epoch": 3.673893673893674, "grad_norm": 0.7559081315994263, "learning_rate": 7.956637956637956e-06, "loss": 0.0476, "step": 12370 }, { "epoch": 3.676863676863677, "grad_norm": 0.6734028458595276, "learning_rate": 7.938817938817938e-06, "loss": 0.0532, "step": 12380 }, { "epoch": 3.67983367983368, "grad_norm": 0.7153018712997437, "learning_rate": 7.920997920997922e-06, "loss": 0.0597, "step": 12390 }, { "epoch": 3.6828036828036828, "grad_norm": 0.7017358541488647, "learning_rate": 7.903177903177904e-06, "loss": 0.0642, "step": 12400 }, { "epoch": 3.685773685773686, "grad_norm": 0.8669632077217102, "learning_rate": 7.885357885357884e-06, "loss": 0.0521, "step": 12410 }, { "epoch": 3.688743688743689, "grad_norm": 0.5131626725196838, "learning_rate": 7.867537867537868e-06, "loss": 0.0578, "step": 12420 }, { "epoch": 3.6917136917136917, "grad_norm": 1.0312336683273315, "learning_rate": 7.84971784971785e-06, "loss": 0.0641, "step": 12430 }, { "epoch": 3.694683694683695, "grad_norm": 0.620993435382843, "learning_rate": 7.831897831897831e-06, "loss": 0.0577, "step": 12440 }, { "epoch": 3.697653697653698, "grad_norm": 0.9431599974632263, "learning_rate": 7.814077814077813e-06, "loss": 0.054, "step": 12450 }, { "epoch": 3.7006237006237006, "grad_norm": 0.5963833928108215, "learning_rate": 7.796257796257797e-06, "loss": 0.0617, "step": 12460 }, { "epoch": 3.7035937035937034, "grad_norm": 0.8309038877487183, "learning_rate": 7.77843777843778e-06, "loss": 0.064, "step": 12470 }, { "epoch": 3.7065637065637067, "grad_norm": 0.38153156638145447, "learning_rate": 7.76061776061776e-06, "loss": 0.0574, "step": 12480 }, { "epoch": 3.7095337095337095, "grad_norm": 0.6947796940803528, "learning_rate": 7.742797742797744e-06, "loss": 0.0582, "step": 12490 }, { "epoch": 3.7125037125037124, "grad_norm": 0.29392537474632263, "learning_rate": 7.724977724977726e-06, "loss": 0.0599, "step": 12500 }, { "epoch": 3.7154737154737156, "grad_norm": 0.7719232439994812, "learning_rate": 7.707157707157708e-06, "loss": 0.0454, "step": 12510 }, { "epoch": 3.7184437184437185, "grad_norm": 0.8208956718444824, "learning_rate": 7.689337689337688e-06, "loss": 0.0627, "step": 12520 }, { "epoch": 3.7214137214137213, "grad_norm": 0.7686058878898621, "learning_rate": 7.671517671517672e-06, "loss": 0.055, "step": 12530 }, { "epoch": 3.724383724383724, "grad_norm": 0.7790077328681946, "learning_rate": 7.653697653697654e-06, "loss": 0.0595, "step": 12540 }, { "epoch": 3.7273537273537274, "grad_norm": 0.5429967045783997, "learning_rate": 7.635877635877635e-06, "loss": 0.0596, "step": 12550 }, { "epoch": 3.73032373032373, "grad_norm": 0.4277282655239105, "learning_rate": 7.618057618057619e-06, "loss": 0.057, "step": 12560 }, { "epoch": 3.733293733293733, "grad_norm": 0.9140810966491699, "learning_rate": 7.600237600237601e-06, "loss": 0.0572, "step": 12570 }, { "epoch": 3.7362637362637363, "grad_norm": 0.9258958697319031, "learning_rate": 7.582417582417582e-06, "loss": 0.0467, "step": 12580 }, { "epoch": 3.739233739233739, "grad_norm": 0.4420478045940399, "learning_rate": 7.564597564597564e-06, "loss": 0.0633, "step": 12590 }, { "epoch": 3.742203742203742, "grad_norm": 0.642549455165863, "learning_rate": 7.546777546777547e-06, "loss": 0.059, "step": 12600 }, { "epoch": 3.7451737451737452, "grad_norm": 0.9195041060447693, "learning_rate": 7.528957528957529e-06, "loss": 0.059, "step": 12610 }, { "epoch": 3.748143748143748, "grad_norm": 0.7634923458099365, "learning_rate": 7.511137511137511e-06, "loss": 0.0476, "step": 12620 }, { "epoch": 3.751113751113751, "grad_norm": 0.7769595980644226, "learning_rate": 7.493317493317493e-06, "loss": 0.0573, "step": 12630 }, { "epoch": 3.754083754083754, "grad_norm": 0.8632429242134094, "learning_rate": 7.475497475497476e-06, "loss": 0.0769, "step": 12640 }, { "epoch": 3.757053757053757, "grad_norm": 0.7088262438774109, "learning_rate": 7.457677457677457e-06, "loss": 0.0542, "step": 12650 }, { "epoch": 3.76002376002376, "grad_norm": 0.7802964448928833, "learning_rate": 7.4398574398574404e-06, "loss": 0.0541, "step": 12660 }, { "epoch": 3.762993762993763, "grad_norm": 0.8181250095367432, "learning_rate": 7.422037422037423e-06, "loss": 0.0555, "step": 12670 }, { "epoch": 3.765963765963766, "grad_norm": 0.48934659361839294, "learning_rate": 7.404217404217404e-06, "loss": 0.0552, "step": 12680 }, { "epoch": 3.7689337689337687, "grad_norm": 0.7606164216995239, "learning_rate": 7.386397386397387e-06, "loss": 0.052, "step": 12690 }, { "epoch": 3.771903771903772, "grad_norm": 0.5713620185852051, "learning_rate": 7.368577368577368e-06, "loss": 0.0497, "step": 12700 }, { "epoch": 3.774873774873775, "grad_norm": 0.4329434037208557, "learning_rate": 7.350757350757351e-06, "loss": 0.0508, "step": 12710 }, { "epoch": 3.7778437778437777, "grad_norm": 0.5861251950263977, "learning_rate": 7.332937332937333e-06, "loss": 0.0574, "step": 12720 }, { "epoch": 3.780813780813781, "grad_norm": 0.9137320518493652, "learning_rate": 7.315117315117316e-06, "loss": 0.0473, "step": 12730 }, { "epoch": 3.7837837837837838, "grad_norm": 0.4197285771369934, "learning_rate": 7.297297297297298e-06, "loss": 0.0557, "step": 12740 }, { "epoch": 3.7867537867537866, "grad_norm": 0.8672818541526794, "learning_rate": 7.27947727947728e-06, "loss": 0.0547, "step": 12750 }, { "epoch": 3.78972378972379, "grad_norm": 1.1099815368652344, "learning_rate": 7.261657261657262e-06, "loss": 0.0547, "step": 12760 }, { "epoch": 3.7926937926937927, "grad_norm": 0.6481065154075623, "learning_rate": 7.2438372438372435e-06, "loss": 0.0552, "step": 12770 }, { "epoch": 3.7956637956637955, "grad_norm": 0.6549276113510132, "learning_rate": 7.2260172260172265e-06, "loss": 0.052, "step": 12780 }, { "epoch": 3.798633798633799, "grad_norm": 1.2700341939926147, "learning_rate": 7.208197208197208e-06, "loss": 0.052, "step": 12790 }, { "epoch": 3.8016038016038016, "grad_norm": 0.7553129196166992, "learning_rate": 7.190377190377191e-06, "loss": 0.0506, "step": 12800 }, { "epoch": 3.8045738045738045, "grad_norm": 0.6122118830680847, "learning_rate": 7.172557172557173e-06, "loss": 0.0588, "step": 12810 }, { "epoch": 3.8075438075438077, "grad_norm": 1.0293030738830566, "learning_rate": 7.154737154737155e-06, "loss": 0.0626, "step": 12820 }, { "epoch": 3.8105138105138106, "grad_norm": 0.8084139227867126, "learning_rate": 7.136917136917137e-06, "loss": 0.0523, "step": 12830 }, { "epoch": 3.8134838134838134, "grad_norm": 0.83390212059021, "learning_rate": 7.119097119097119e-06, "loss": 0.0561, "step": 12840 }, { "epoch": 3.8164538164538166, "grad_norm": 0.8345874547958374, "learning_rate": 7.101277101277102e-06, "loss": 0.0578, "step": 12850 }, { "epoch": 3.8194238194238195, "grad_norm": 0.9062905311584473, "learning_rate": 7.083457083457083e-06, "loss": 0.0488, "step": 12860 }, { "epoch": 3.8223938223938223, "grad_norm": 0.855656623840332, "learning_rate": 7.065637065637066e-06, "loss": 0.0469, "step": 12870 }, { "epoch": 3.8253638253638256, "grad_norm": 0.7304750084877014, "learning_rate": 7.047817047817048e-06, "loss": 0.0612, "step": 12880 }, { "epoch": 3.8283338283338284, "grad_norm": 1.3436956405639648, "learning_rate": 7.02999702999703e-06, "loss": 0.0692, "step": 12890 }, { "epoch": 3.8313038313038312, "grad_norm": 0.6037256121635437, "learning_rate": 7.0121770121770125e-06, "loss": 0.0527, "step": 12900 }, { "epoch": 3.8342738342738345, "grad_norm": 0.9017062187194824, "learning_rate": 6.994356994356995e-06, "loss": 0.0544, "step": 12910 }, { "epoch": 3.8372438372438373, "grad_norm": 0.5321645736694336, "learning_rate": 6.976536976536977e-06, "loss": 0.0616, "step": 12920 }, { "epoch": 3.84021384021384, "grad_norm": 0.7365061044692993, "learning_rate": 6.958716958716958e-06, "loss": 0.0544, "step": 12930 }, { "epoch": 3.8431838431838434, "grad_norm": 0.6113480925559998, "learning_rate": 6.940896940896941e-06, "loss": 0.0547, "step": 12940 }, { "epoch": 3.8461538461538463, "grad_norm": 0.6009132266044617, "learning_rate": 6.923076923076923e-06, "loss": 0.0585, "step": 12950 }, { "epoch": 3.849123849123849, "grad_norm": 0.9174481630325317, "learning_rate": 6.9052569052569056e-06, "loss": 0.0622, "step": 12960 }, { "epoch": 3.8520938520938524, "grad_norm": 0.3774741590023041, "learning_rate": 6.887436887436888e-06, "loss": 0.0545, "step": 12970 }, { "epoch": 3.855063855063855, "grad_norm": 1.134770154953003, "learning_rate": 6.86961686961687e-06, "loss": 0.0471, "step": 12980 }, { "epoch": 3.858033858033858, "grad_norm": 0.8583316802978516, "learning_rate": 6.851796851796852e-06, "loss": 0.0535, "step": 12990 }, { "epoch": 3.861003861003861, "grad_norm": 0.6052166223526001, "learning_rate": 6.833976833976834e-06, "loss": 0.0557, "step": 13000 }, { "epoch": 3.863973863973864, "grad_norm": 1.0991127490997314, "learning_rate": 6.816156816156816e-06, "loss": 0.0528, "step": 13010 }, { "epoch": 3.866943866943867, "grad_norm": 0.7694152593612671, "learning_rate": 6.7983367983367986e-06, "loss": 0.0689, "step": 13020 }, { "epoch": 3.8699138699138698, "grad_norm": 0.9624953866004944, "learning_rate": 6.780516780516781e-06, "loss": 0.0585, "step": 13030 }, { "epoch": 3.872883872883873, "grad_norm": 0.613135814666748, "learning_rate": 6.762696762696763e-06, "loss": 0.0636, "step": 13040 }, { "epoch": 3.875853875853876, "grad_norm": 0.6408237814903259, "learning_rate": 6.744876744876745e-06, "loss": 0.0525, "step": 13050 }, { "epoch": 3.8788238788238787, "grad_norm": 0.5397794842720032, "learning_rate": 6.727056727056727e-06, "loss": 0.0538, "step": 13060 }, { "epoch": 3.8817938817938815, "grad_norm": 0.569254457950592, "learning_rate": 6.7092367092367094e-06, "loss": 0.0494, "step": 13070 }, { "epoch": 3.884763884763885, "grad_norm": 0.7283187508583069, "learning_rate": 6.691416691416692e-06, "loss": 0.0428, "step": 13080 }, { "epoch": 3.8877338877338876, "grad_norm": 0.5344458818435669, "learning_rate": 6.673596673596674e-06, "loss": 0.0487, "step": 13090 }, { "epoch": 3.8907038907038904, "grad_norm": 0.8869258165359497, "learning_rate": 6.655776655776656e-06, "loss": 0.0517, "step": 13100 }, { "epoch": 3.8936738936738937, "grad_norm": 0.6164690256118774, "learning_rate": 6.637956637956638e-06, "loss": 0.056, "step": 13110 }, { "epoch": 3.8966438966438965, "grad_norm": 1.2430282831192017, "learning_rate": 6.62013662013662e-06, "loss": 0.0546, "step": 13120 }, { "epoch": 3.8996138996138994, "grad_norm": 0.5532137751579285, "learning_rate": 6.6023166023166025e-06, "loss": 0.0545, "step": 13130 }, { "epoch": 3.9025839025839026, "grad_norm": 0.9797311425209045, "learning_rate": 6.584496584496585e-06, "loss": 0.0532, "step": 13140 }, { "epoch": 3.9055539055539055, "grad_norm": 1.118308186531067, "learning_rate": 6.566676566676567e-06, "loss": 0.0609, "step": 13150 }, { "epoch": 3.9085239085239083, "grad_norm": 0.8075883388519287, "learning_rate": 6.548856548856549e-06, "loss": 0.0623, "step": 13160 }, { "epoch": 3.9114939114939116, "grad_norm": 0.5950072407722473, "learning_rate": 6.531036531036531e-06, "loss": 0.0556, "step": 13170 }, { "epoch": 3.9144639144639144, "grad_norm": 0.7014681696891785, "learning_rate": 6.513216513216513e-06, "loss": 0.0685, "step": 13180 }, { "epoch": 3.9174339174339172, "grad_norm": 0.8518680334091187, "learning_rate": 6.4953964953964955e-06, "loss": 0.0561, "step": 13190 }, { "epoch": 3.9204039204039205, "grad_norm": 0.61479651927948, "learning_rate": 6.477576477576478e-06, "loss": 0.0512, "step": 13200 }, { "epoch": 3.9233739233739233, "grad_norm": 0.8155126571655273, "learning_rate": 6.45975645975646e-06, "loss": 0.0469, "step": 13210 }, { "epoch": 3.926343926343926, "grad_norm": 0.8962894082069397, "learning_rate": 6.441936441936442e-06, "loss": 0.0517, "step": 13220 }, { "epoch": 3.9293139293139294, "grad_norm": 0.8040223717689514, "learning_rate": 6.424116424116425e-06, "loss": 0.0546, "step": 13230 }, { "epoch": 3.9322839322839322, "grad_norm": 0.6404465436935425, "learning_rate": 6.406296406296406e-06, "loss": 0.057, "step": 13240 }, { "epoch": 3.935253935253935, "grad_norm": 0.6395756602287292, "learning_rate": 6.3884763884763885e-06, "loss": 0.0552, "step": 13250 }, { "epoch": 3.9382239382239383, "grad_norm": 0.8880889415740967, "learning_rate": 6.370656370656371e-06, "loss": 0.0529, "step": 13260 }, { "epoch": 3.941193941193941, "grad_norm": 0.8830543160438538, "learning_rate": 6.352836352836353e-06, "loss": 0.0532, "step": 13270 }, { "epoch": 3.944163944163944, "grad_norm": 1.0157254934310913, "learning_rate": 6.335016335016335e-06, "loss": 0.0504, "step": 13280 }, { "epoch": 3.9471339471339473, "grad_norm": 0.43136101961135864, "learning_rate": 6.317196317196317e-06, "loss": 0.0422, "step": 13290 }, { "epoch": 3.95010395010395, "grad_norm": 0.990959107875824, "learning_rate": 6.2993762993763e-06, "loss": 0.0685, "step": 13300 }, { "epoch": 3.953073953073953, "grad_norm": 0.7046756744384766, "learning_rate": 6.2815562815562815e-06, "loss": 0.0482, "step": 13310 }, { "epoch": 3.956043956043956, "grad_norm": 0.8372275829315186, "learning_rate": 6.2637362637362645e-06, "loss": 0.0458, "step": 13320 }, { "epoch": 3.959013959013959, "grad_norm": 0.7267201542854309, "learning_rate": 6.245916245916246e-06, "loss": 0.0461, "step": 13330 }, { "epoch": 3.961983961983962, "grad_norm": 1.0572090148925781, "learning_rate": 6.228096228096228e-06, "loss": 0.0502, "step": 13340 }, { "epoch": 3.964953964953965, "grad_norm": 1.4934335947036743, "learning_rate": 6.21027621027621e-06, "loss": 0.0622, "step": 13350 }, { "epoch": 3.967923967923968, "grad_norm": 0.9395650029182434, "learning_rate": 6.192456192456192e-06, "loss": 0.0438, "step": 13360 }, { "epoch": 3.970893970893971, "grad_norm": 0.7308318018913269, "learning_rate": 6.174636174636175e-06, "loss": 0.0547, "step": 13370 }, { "epoch": 3.973863973863974, "grad_norm": 1.219499111175537, "learning_rate": 6.156816156816157e-06, "loss": 0.0698, "step": 13380 }, { "epoch": 3.976833976833977, "grad_norm": 1.007973074913025, "learning_rate": 6.13899613899614e-06, "loss": 0.0598, "step": 13390 }, { "epoch": 3.9798039798039797, "grad_norm": 0.569019615650177, "learning_rate": 6.121176121176121e-06, "loss": 0.0567, "step": 13400 }, { "epoch": 3.982773982773983, "grad_norm": 1.0321439504623413, "learning_rate": 6.103356103356103e-06, "loss": 0.0512, "step": 13410 }, { "epoch": 3.985743985743986, "grad_norm": 0.6399171352386475, "learning_rate": 6.085536085536085e-06, "loss": 0.0551, "step": 13420 }, { "epoch": 3.9887139887139886, "grad_norm": 0.7629494071006775, "learning_rate": 6.0677160677160676e-06, "loss": 0.0577, "step": 13430 }, { "epoch": 3.991683991683992, "grad_norm": 0.564344584941864, "learning_rate": 6.049896049896051e-06, "loss": 0.0512, "step": 13440 }, { "epoch": 3.9946539946539947, "grad_norm": 0.5358871817588806, "learning_rate": 6.032076032076032e-06, "loss": 0.0552, "step": 13450 }, { "epoch": 3.9976239976239976, "grad_norm": 0.9634397625923157, "learning_rate": 6.014256014256015e-06, "loss": 0.0539, "step": 13460 }, { "epoch": 4.0, "eval_f1": 0.33031292965957215, "eval_loss": 0.043538980185985565, "eval_runtime": 164.5887, "eval_samples_per_second": 230.994, "eval_steps_per_second": 3.615, "step": 13468 }, { "epoch": 4.000594000594001, "grad_norm": 1.27556574344635, "learning_rate": 5.996435996435996e-06, "loss": 0.0485, "step": 13470 }, { "epoch": 4.003564003564003, "grad_norm": 1.009068489074707, "learning_rate": 5.978615978615979e-06, "loss": 0.0567, "step": 13480 }, { "epoch": 4.0065340065340065, "grad_norm": 0.7276560068130493, "learning_rate": 5.960795960795961e-06, "loss": 0.0475, "step": 13490 }, { "epoch": 4.00950400950401, "grad_norm": 1.1445473432540894, "learning_rate": 5.942975942975943e-06, "loss": 0.0555, "step": 13500 }, { "epoch": 4.012474012474012, "grad_norm": 0.8062208294868469, "learning_rate": 5.925155925155926e-06, "loss": 0.0482, "step": 13510 }, { "epoch": 4.015444015444015, "grad_norm": 0.8470934629440308, "learning_rate": 5.907335907335907e-06, "loss": 0.0553, "step": 13520 }, { "epoch": 4.018414018414019, "grad_norm": 0.5474221706390381, "learning_rate": 5.88951588951589e-06, "loss": 0.0534, "step": 13530 }, { "epoch": 4.021384021384021, "grad_norm": 0.6959888339042664, "learning_rate": 5.8716958716958714e-06, "loss": 0.0538, "step": 13540 }, { "epoch": 4.024354024354024, "grad_norm": 0.9099267721176147, "learning_rate": 5.8538758538758545e-06, "loss": 0.055, "step": 13550 }, { "epoch": 4.027324027324028, "grad_norm": 1.2985259294509888, "learning_rate": 5.836055836055836e-06, "loss": 0.0594, "step": 13560 }, { "epoch": 4.03029403029403, "grad_norm": 0.9521999359130859, "learning_rate": 5.818235818235818e-06, "loss": 0.0588, "step": 13570 }, { "epoch": 4.033264033264033, "grad_norm": 0.5641045570373535, "learning_rate": 5.800415800415801e-06, "loss": 0.0638, "step": 13580 }, { "epoch": 4.0362340362340365, "grad_norm": 1.3589891195297241, "learning_rate": 5.782595782595782e-06, "loss": 0.0558, "step": 13590 }, { "epoch": 4.039204039204039, "grad_norm": 1.1346710920333862, "learning_rate": 5.764775764775765e-06, "loss": 0.0548, "step": 13600 }, { "epoch": 4.042174042174042, "grad_norm": 1.2047168016433716, "learning_rate": 5.746955746955747e-06, "loss": 0.0485, "step": 13610 }, { "epoch": 4.0451440451440455, "grad_norm": 0.673353910446167, "learning_rate": 5.72913572913573e-06, "loss": 0.0518, "step": 13620 }, { "epoch": 4.048114048114048, "grad_norm": 0.6137884855270386, "learning_rate": 5.711315711315711e-06, "loss": 0.0594, "step": 13630 }, { "epoch": 4.051084051084051, "grad_norm": 0.7243009805679321, "learning_rate": 5.693495693495694e-06, "loss": 0.0528, "step": 13640 }, { "epoch": 4.054054054054054, "grad_norm": 0.5329965353012085, "learning_rate": 5.675675675675676e-06, "loss": 0.049, "step": 13650 }, { "epoch": 4.057024057024057, "grad_norm": 0.4849202334880829, "learning_rate": 5.6578556578556575e-06, "loss": 0.0541, "step": 13660 }, { "epoch": 4.05999405999406, "grad_norm": 0.6682773232460022, "learning_rate": 5.6400356400356405e-06, "loss": 0.0571, "step": 13670 }, { "epoch": 4.062964062964063, "grad_norm": 0.7048366069793701, "learning_rate": 5.622215622215622e-06, "loss": 0.0582, "step": 13680 }, { "epoch": 4.065934065934066, "grad_norm": 0.7056940793991089, "learning_rate": 5.604395604395605e-06, "loss": 0.0437, "step": 13690 }, { "epoch": 4.068904068904069, "grad_norm": 0.6416358351707458, "learning_rate": 5.586575586575586e-06, "loss": 0.0458, "step": 13700 }, { "epoch": 4.071874071874072, "grad_norm": 0.7514241933822632, "learning_rate": 5.568755568755569e-06, "loss": 0.0589, "step": 13710 }, { "epoch": 4.074844074844075, "grad_norm": 0.6604739427566528, "learning_rate": 5.550935550935551e-06, "loss": 0.0562, "step": 13720 }, { "epoch": 4.077814077814078, "grad_norm": 0.4404093325138092, "learning_rate": 5.533115533115533e-06, "loss": 0.0561, "step": 13730 }, { "epoch": 4.080784080784081, "grad_norm": 0.5778560042381287, "learning_rate": 5.515295515295516e-06, "loss": 0.0664, "step": 13740 }, { "epoch": 4.0837540837540836, "grad_norm": 0.8316758275032043, "learning_rate": 5.497475497475497e-06, "loss": 0.0623, "step": 13750 }, { "epoch": 4.086724086724087, "grad_norm": 1.186763882637024, "learning_rate": 5.47965547965548e-06, "loss": 0.0524, "step": 13760 }, { "epoch": 4.08969408969409, "grad_norm": 0.4923029839992523, "learning_rate": 5.461835461835461e-06, "loss": 0.0601, "step": 13770 }, { "epoch": 4.0926640926640925, "grad_norm": 0.8885074257850647, "learning_rate": 5.444015444015444e-06, "loss": 0.0549, "step": 13780 }, { "epoch": 4.095634095634096, "grad_norm": 1.2533655166625977, "learning_rate": 5.4261954261954265e-06, "loss": 0.0662, "step": 13790 }, { "epoch": 4.098604098604099, "grad_norm": 0.7636746168136597, "learning_rate": 5.408375408375409e-06, "loss": 0.056, "step": 13800 }, { "epoch": 4.101574101574101, "grad_norm": 0.5362575054168701, "learning_rate": 5.390555390555391e-06, "loss": 0.0492, "step": 13810 }, { "epoch": 4.104544104544105, "grad_norm": 0.8624958992004395, "learning_rate": 5.372735372735372e-06, "loss": 0.0472, "step": 13820 }, { "epoch": 4.107514107514108, "grad_norm": 0.8159658908843994, "learning_rate": 5.354915354915355e-06, "loss": 0.052, "step": 13830 }, { "epoch": 4.11048411048411, "grad_norm": 1.0133655071258545, "learning_rate": 5.337095337095337e-06, "loss": 0.0675, "step": 13840 }, { "epoch": 4.113454113454114, "grad_norm": 0.7075998783111572, "learning_rate": 5.3192753192753196e-06, "loss": 0.0521, "step": 13850 }, { "epoch": 4.116424116424117, "grad_norm": 0.7996563911437988, "learning_rate": 5.301455301455302e-06, "loss": 0.0569, "step": 13860 }, { "epoch": 4.119394119394119, "grad_norm": 0.7214605808258057, "learning_rate": 5.283635283635284e-06, "loss": 0.0444, "step": 13870 }, { "epoch": 4.1223641223641225, "grad_norm": 0.5179925560951233, "learning_rate": 5.265815265815266e-06, "loss": 0.0473, "step": 13880 }, { "epoch": 4.125334125334125, "grad_norm": 0.7396313548088074, "learning_rate": 5.247995247995247e-06, "loss": 0.0549, "step": 13890 }, { "epoch": 4.128304128304128, "grad_norm": 0.8080036640167236, "learning_rate": 5.2301752301752304e-06, "loss": 0.0415, "step": 13900 }, { "epoch": 4.1312741312741315, "grad_norm": 0.6321327090263367, "learning_rate": 5.212355212355213e-06, "loss": 0.0503, "step": 13910 }, { "epoch": 4.134244134244134, "grad_norm": 0.8221378922462463, "learning_rate": 5.194535194535195e-06, "loss": 0.0456, "step": 13920 }, { "epoch": 4.137214137214137, "grad_norm": 1.2352089881896973, "learning_rate": 5.176715176715177e-06, "loss": 0.0631, "step": 13930 }, { "epoch": 4.14018414018414, "grad_norm": 0.7337819933891296, "learning_rate": 5.158895158895159e-06, "loss": 0.0568, "step": 13940 }, { "epoch": 4.143154143154143, "grad_norm": 0.5924921035766602, "learning_rate": 5.141075141075141e-06, "loss": 0.0591, "step": 13950 }, { "epoch": 4.146124146124146, "grad_norm": 0.6949560642242432, "learning_rate": 5.1232551232551234e-06, "loss": 0.0448, "step": 13960 }, { "epoch": 4.149094149094149, "grad_norm": 0.686150074005127, "learning_rate": 5.105435105435106e-06, "loss": 0.0603, "step": 13970 }, { "epoch": 4.152064152064152, "grad_norm": 0.9512919783592224, "learning_rate": 5.087615087615088e-06, "loss": 0.0757, "step": 13980 }, { "epoch": 4.155034155034155, "grad_norm": 1.0221737623214722, "learning_rate": 5.06979506979507e-06, "loss": 0.0431, "step": 13990 }, { "epoch": 4.158004158004158, "grad_norm": 0.5151529312133789, "learning_rate": 5.051975051975052e-06, "loss": 0.0446, "step": 14000 }, { "epoch": 4.160974160974161, "grad_norm": 0.5607339143753052, "learning_rate": 5.034155034155034e-06, "loss": 0.0582, "step": 14010 }, { "epoch": 4.163944163944164, "grad_norm": 0.5247116088867188, "learning_rate": 5.0163350163350165e-06, "loss": 0.0601, "step": 14020 }, { "epoch": 4.166914166914167, "grad_norm": 0.5927850604057312, "learning_rate": 4.998514998514999e-06, "loss": 0.0519, "step": 14030 }, { "epoch": 4.1698841698841695, "grad_norm": 1.483964443206787, "learning_rate": 4.980694980694981e-06, "loss": 0.0551, "step": 14040 }, { "epoch": 4.172854172854173, "grad_norm": 0.7660327553749084, "learning_rate": 4.962874962874963e-06, "loss": 0.0511, "step": 14050 }, { "epoch": 4.175824175824176, "grad_norm": 0.7715831995010376, "learning_rate": 4.945054945054945e-06, "loss": 0.069, "step": 14060 }, { "epoch": 4.1787941787941785, "grad_norm": 0.8233699798583984, "learning_rate": 4.927234927234927e-06, "loss": 0.0532, "step": 14070 }, { "epoch": 4.181764181764182, "grad_norm": 1.1859806776046753, "learning_rate": 4.9094149094149095e-06, "loss": 0.0636, "step": 14080 }, { "epoch": 4.184734184734185, "grad_norm": 0.6633349061012268, "learning_rate": 4.891594891594892e-06, "loss": 0.0535, "step": 14090 }, { "epoch": 4.187704187704187, "grad_norm": 0.6353622674942017, "learning_rate": 4.873774873774874e-06, "loss": 0.0511, "step": 14100 }, { "epoch": 4.190674190674191, "grad_norm": 0.7265152335166931, "learning_rate": 4.855954855954856e-06, "loss": 0.0562, "step": 14110 }, { "epoch": 4.193644193644194, "grad_norm": 0.8203964233398438, "learning_rate": 4.838134838134839e-06, "loss": 0.062, "step": 14120 }, { "epoch": 4.196614196614196, "grad_norm": 0.6999956369400024, "learning_rate": 4.82031482031482e-06, "loss": 0.0468, "step": 14130 }, { "epoch": 4.1995841995842, "grad_norm": 0.7191495299339294, "learning_rate": 4.8024948024948025e-06, "loss": 0.0521, "step": 14140 }, { "epoch": 4.202554202554203, "grad_norm": 1.0970999002456665, "learning_rate": 4.784674784674785e-06, "loss": 0.0538, "step": 14150 }, { "epoch": 4.205524205524205, "grad_norm": 0.8598058223724365, "learning_rate": 4.766854766854767e-06, "loss": 0.0607, "step": 14160 }, { "epoch": 4.2084942084942085, "grad_norm": 0.6349924206733704, "learning_rate": 4.749034749034749e-06, "loss": 0.0684, "step": 14170 }, { "epoch": 4.211464211464212, "grad_norm": 0.836054265499115, "learning_rate": 4.731214731214731e-06, "loss": 0.0429, "step": 14180 }, { "epoch": 4.214434214434214, "grad_norm": 1.0408257246017456, "learning_rate": 4.713394713394714e-06, "loss": 0.0579, "step": 14190 }, { "epoch": 4.2174042174042174, "grad_norm": 0.69316166639328, "learning_rate": 4.6955746955746955e-06, "loss": 0.0506, "step": 14200 }, { "epoch": 4.220374220374221, "grad_norm": 0.9772447347640991, "learning_rate": 4.677754677754678e-06, "loss": 0.0597, "step": 14210 }, { "epoch": 4.223344223344223, "grad_norm": 0.7558261156082153, "learning_rate": 4.65993465993466e-06, "loss": 0.0596, "step": 14220 }, { "epoch": 4.226314226314226, "grad_norm": 1.0452693700790405, "learning_rate": 4.642114642114642e-06, "loss": 0.0593, "step": 14230 }, { "epoch": 4.22928422928423, "grad_norm": 0.8529985547065735, "learning_rate": 4.624294624294624e-06, "loss": 0.0565, "step": 14240 }, { "epoch": 4.232254232254232, "grad_norm": 0.7685134410858154, "learning_rate": 4.606474606474606e-06, "loss": 0.0399, "step": 14250 }, { "epoch": 4.235224235224235, "grad_norm": 0.6068404912948608, "learning_rate": 4.588654588654589e-06, "loss": 0.0574, "step": 14260 }, { "epoch": 4.238194238194239, "grad_norm": 0.9283270835876465, "learning_rate": 4.570834570834571e-06, "loss": 0.0578, "step": 14270 }, { "epoch": 4.241164241164241, "grad_norm": 1.1383968591690063, "learning_rate": 4.553014553014554e-06, "loss": 0.0581, "step": 14280 }, { "epoch": 4.244134244134244, "grad_norm": 1.4468823671340942, "learning_rate": 4.535194535194535e-06, "loss": 0.0472, "step": 14290 }, { "epoch": 4.2471042471042475, "grad_norm": 0.7596200108528137, "learning_rate": 4.517374517374517e-06, "loss": 0.0529, "step": 14300 }, { "epoch": 4.25007425007425, "grad_norm": 0.31641915440559387, "learning_rate": 4.499554499554499e-06, "loss": 0.0515, "step": 14310 }, { "epoch": 4.253044253044253, "grad_norm": 0.7532891035079956, "learning_rate": 4.481734481734482e-06, "loss": 0.0618, "step": 14320 }, { "epoch": 4.256014256014256, "grad_norm": 0.5696304440498352, "learning_rate": 4.463914463914465e-06, "loss": 0.0508, "step": 14330 }, { "epoch": 4.258984258984259, "grad_norm": 0.7461338639259338, "learning_rate": 4.446094446094446e-06, "loss": 0.0619, "step": 14340 }, { "epoch": 4.261954261954262, "grad_norm": 0.6547114849090576, "learning_rate": 4.428274428274429e-06, "loss": 0.054, "step": 14350 }, { "epoch": 4.2649242649242645, "grad_norm": 0.47888076305389404, "learning_rate": 4.41045441045441e-06, "loss": 0.0528, "step": 14360 }, { "epoch": 4.267894267894268, "grad_norm": 0.500554084777832, "learning_rate": 4.392634392634393e-06, "loss": 0.0508, "step": 14370 }, { "epoch": 4.270864270864271, "grad_norm": 0.8352609276771545, "learning_rate": 4.374814374814375e-06, "loss": 0.0517, "step": 14380 }, { "epoch": 4.273834273834273, "grad_norm": 0.8386610150337219, "learning_rate": 4.356994356994357e-06, "loss": 0.0537, "step": 14390 }, { "epoch": 4.276804276804277, "grad_norm": 0.6090155243873596, "learning_rate": 4.33917433917434e-06, "loss": 0.0624, "step": 14400 }, { "epoch": 4.27977427977428, "grad_norm": 0.5954572558403015, "learning_rate": 4.321354321354321e-06, "loss": 0.0562, "step": 14410 }, { "epoch": 4.282744282744282, "grad_norm": 1.196431279182434, "learning_rate": 4.303534303534304e-06, "loss": 0.0625, "step": 14420 }, { "epoch": 4.285714285714286, "grad_norm": 0.8351851105690002, "learning_rate": 4.2857142857142855e-06, "loss": 0.0473, "step": 14430 }, { "epoch": 4.288684288684289, "grad_norm": 1.5514023303985596, "learning_rate": 4.2678942678942685e-06, "loss": 0.0521, "step": 14440 }, { "epoch": 4.291654291654291, "grad_norm": 0.5265604257583618, "learning_rate": 4.25007425007425e-06, "loss": 0.0527, "step": 14450 }, { "epoch": 4.2946242946242945, "grad_norm": 0.746711015701294, "learning_rate": 4.232254232254232e-06, "loss": 0.0539, "step": 14460 }, { "epoch": 4.297594297594298, "grad_norm": 1.2712050676345825, "learning_rate": 4.214434214434215e-06, "loss": 0.061, "step": 14470 }, { "epoch": 4.3005643005643, "grad_norm": 0.7480892539024353, "learning_rate": 4.196614196614196e-06, "loss": 0.0546, "step": 14480 }, { "epoch": 4.303534303534303, "grad_norm": 0.9045110940933228, "learning_rate": 4.178794178794179e-06, "loss": 0.0528, "step": 14490 }, { "epoch": 4.306504306504307, "grad_norm": 0.7714174389839172, "learning_rate": 4.160974160974161e-06, "loss": 0.0498, "step": 14500 }, { "epoch": 4.309474309474309, "grad_norm": 0.9806069731712341, "learning_rate": 4.143154143154144e-06, "loss": 0.0642, "step": 14510 }, { "epoch": 4.312444312444312, "grad_norm": 0.8302409052848816, "learning_rate": 4.125334125334125e-06, "loss": 0.0682, "step": 14520 }, { "epoch": 4.315414315414316, "grad_norm": 1.0357773303985596, "learning_rate": 4.107514107514108e-06, "loss": 0.0543, "step": 14530 }, { "epoch": 4.318384318384318, "grad_norm": 0.711145281791687, "learning_rate": 4.08969408969409e-06, "loss": 0.0546, "step": 14540 }, { "epoch": 4.321354321354321, "grad_norm": 0.4952409863471985, "learning_rate": 4.0718740718740715e-06, "loss": 0.0483, "step": 14550 }, { "epoch": 4.324324324324325, "grad_norm": 0.7028197050094604, "learning_rate": 4.0540540540540545e-06, "loss": 0.0449, "step": 14560 }, { "epoch": 4.327294327294327, "grad_norm": 0.7618030905723572, "learning_rate": 4.036234036234036e-06, "loss": 0.0533, "step": 14570 }, { "epoch": 4.33026433026433, "grad_norm": 0.4399206340312958, "learning_rate": 4.018414018414019e-06, "loss": 0.0504, "step": 14580 }, { "epoch": 4.3332343332343335, "grad_norm": 0.4202236533164978, "learning_rate": 4.000594000594e-06, "loss": 0.0553, "step": 14590 }, { "epoch": 4.336204336204336, "grad_norm": 0.4028279185295105, "learning_rate": 3.982773982773983e-06, "loss": 0.0548, "step": 14600 }, { "epoch": 4.339174339174339, "grad_norm": 0.7991787195205688, "learning_rate": 3.964953964953965e-06, "loss": 0.0462, "step": 14610 }, { "epoch": 4.342144342144342, "grad_norm": 0.8941323757171631, "learning_rate": 3.947133947133947e-06, "loss": 0.0396, "step": 14620 }, { "epoch": 4.345114345114345, "grad_norm": 0.6208845973014832, "learning_rate": 3.92931392931393e-06, "loss": 0.0491, "step": 14630 }, { "epoch": 4.348084348084348, "grad_norm": 0.7600147724151611, "learning_rate": 3.911493911493911e-06, "loss": 0.0625, "step": 14640 }, { "epoch": 4.351054351054351, "grad_norm": 0.4878835678100586, "learning_rate": 3.893673893673894e-06, "loss": 0.0517, "step": 14650 }, { "epoch": 4.354024354024354, "grad_norm": 0.7598239183425903, "learning_rate": 3.875853875853875e-06, "loss": 0.0534, "step": 14660 }, { "epoch": 4.356994356994357, "grad_norm": 1.0485094785690308, "learning_rate": 3.858033858033858e-06, "loss": 0.0429, "step": 14670 }, { "epoch": 4.35996435996436, "grad_norm": 1.0490139722824097, "learning_rate": 3.8402138402138406e-06, "loss": 0.0608, "step": 14680 }, { "epoch": 4.362934362934363, "grad_norm": 0.6799785494804382, "learning_rate": 3.822393822393823e-06, "loss": 0.0582, "step": 14690 }, { "epoch": 4.365904365904366, "grad_norm": 0.6751995086669922, "learning_rate": 3.804573804573805e-06, "loss": 0.0436, "step": 14700 }, { "epoch": 4.368874368874369, "grad_norm": 0.8852332830429077, "learning_rate": 3.7867537867537867e-06, "loss": 0.0505, "step": 14710 }, { "epoch": 4.371844371844372, "grad_norm": 1.6373037099838257, "learning_rate": 3.7689337689337693e-06, "loss": 0.0592, "step": 14720 }, { "epoch": 4.374814374814375, "grad_norm": 0.8025867342948914, "learning_rate": 3.751113751113751e-06, "loss": 0.0429, "step": 14730 }, { "epoch": 4.377784377784378, "grad_norm": 1.4527711868286133, "learning_rate": 3.733293733293733e-06, "loss": 0.0479, "step": 14740 }, { "epoch": 4.3807543807543805, "grad_norm": 0.726428210735321, "learning_rate": 3.7154737154737153e-06, "loss": 0.0496, "step": 14750 }, { "epoch": 4.383724383724384, "grad_norm": 0.6071370840072632, "learning_rate": 3.6976536976536975e-06, "loss": 0.0461, "step": 14760 }, { "epoch": 4.386694386694387, "grad_norm": 0.7492642998695374, "learning_rate": 3.67983367983368e-06, "loss": 0.0476, "step": 14770 }, { "epoch": 4.389664389664389, "grad_norm": 0.6537764072418213, "learning_rate": 3.6620136620136623e-06, "loss": 0.0514, "step": 14780 }, { "epoch": 4.392634392634393, "grad_norm": 0.9229190349578857, "learning_rate": 3.6441936441936444e-06, "loss": 0.0522, "step": 14790 }, { "epoch": 4.395604395604396, "grad_norm": 0.7735339999198914, "learning_rate": 3.6263736263736266e-06, "loss": 0.0492, "step": 14800 }, { "epoch": 4.398574398574398, "grad_norm": 0.7553290128707886, "learning_rate": 3.6085536085536088e-06, "loss": 0.0399, "step": 14810 }, { "epoch": 4.401544401544402, "grad_norm": 0.988293468952179, "learning_rate": 3.5907335907335905e-06, "loss": 0.0616, "step": 14820 }, { "epoch": 4.404514404514405, "grad_norm": 0.6976864337921143, "learning_rate": 3.5729135729135727e-06, "loss": 0.045, "step": 14830 }, { "epoch": 4.407484407484407, "grad_norm": 1.1107743978500366, "learning_rate": 3.5550935550935553e-06, "loss": 0.0592, "step": 14840 }, { "epoch": 4.410454410454411, "grad_norm": 0.5012500286102295, "learning_rate": 3.5372735372735375e-06, "loss": 0.0691, "step": 14850 }, { "epoch": 4.413424413424414, "grad_norm": 0.6141711473464966, "learning_rate": 3.5194535194535196e-06, "loss": 0.0504, "step": 14860 }, { "epoch": 4.416394416394416, "grad_norm": 0.9581981897354126, "learning_rate": 3.501633501633502e-06, "loss": 0.0506, "step": 14870 }, { "epoch": 4.4193644193644195, "grad_norm": 0.5101330280303955, "learning_rate": 3.483813483813484e-06, "loss": 0.051, "step": 14880 }, { "epoch": 4.422334422334423, "grad_norm": 1.3231449127197266, "learning_rate": 3.465993465993466e-06, "loss": 0.0683, "step": 14890 }, { "epoch": 4.425304425304425, "grad_norm": 0.7032347917556763, "learning_rate": 3.448173448173448e-06, "loss": 0.0544, "step": 14900 }, { "epoch": 4.428274428274428, "grad_norm": 1.0103317499160767, "learning_rate": 3.4303534303534305e-06, "loss": 0.0686, "step": 14910 }, { "epoch": 4.431244431244432, "grad_norm": 0.8349617719650269, "learning_rate": 3.4125334125334127e-06, "loss": 0.0532, "step": 14920 }, { "epoch": 4.434214434214434, "grad_norm": 0.7502866983413696, "learning_rate": 3.394713394713395e-06, "loss": 0.0585, "step": 14930 }, { "epoch": 4.437184437184437, "grad_norm": 0.6169337630271912, "learning_rate": 3.376893376893377e-06, "loss": 0.0643, "step": 14940 }, { "epoch": 4.440154440154441, "grad_norm": 1.1160919666290283, "learning_rate": 3.359073359073359e-06, "loss": 0.0545, "step": 14950 }, { "epoch": 4.443124443124443, "grad_norm": 1.2266151905059814, "learning_rate": 3.3412533412533413e-06, "loss": 0.0514, "step": 14960 }, { "epoch": 4.446094446094446, "grad_norm": 0.4717876613140106, "learning_rate": 3.3234333234333235e-06, "loss": 0.0492, "step": 14970 }, { "epoch": 4.4490644490644495, "grad_norm": 0.7887519598007202, "learning_rate": 3.3056133056133057e-06, "loss": 0.053, "step": 14980 }, { "epoch": 4.452034452034452, "grad_norm": 0.6707944273948669, "learning_rate": 3.287793287793288e-06, "loss": 0.0636, "step": 14990 }, { "epoch": 4.455004455004455, "grad_norm": 0.632908821105957, "learning_rate": 3.26997326997327e-06, "loss": 0.0563, "step": 15000 }, { "epoch": 4.457974457974458, "grad_norm": 0.597366213798523, "learning_rate": 3.252153252153252e-06, "loss": 0.0605, "step": 15010 }, { "epoch": 4.460944460944461, "grad_norm": 0.9385949969291687, "learning_rate": 3.2343332343332344e-06, "loss": 0.0562, "step": 15020 }, { "epoch": 4.463914463914464, "grad_norm": 0.7558770179748535, "learning_rate": 3.2165132165132165e-06, "loss": 0.0629, "step": 15030 }, { "epoch": 4.4668844668844665, "grad_norm": 0.5903274416923523, "learning_rate": 3.1986931986931987e-06, "loss": 0.0642, "step": 15040 }, { "epoch": 4.46985446985447, "grad_norm": 0.564150869846344, "learning_rate": 3.1808731808731813e-06, "loss": 0.0541, "step": 15050 }, { "epoch": 4.472824472824473, "grad_norm": 0.41265928745269775, "learning_rate": 3.163053163053163e-06, "loss": 0.0532, "step": 15060 }, { "epoch": 4.475794475794475, "grad_norm": 0.43676692247390747, "learning_rate": 3.1452331452331452e-06, "loss": 0.0448, "step": 15070 }, { "epoch": 4.478764478764479, "grad_norm": 0.9410478472709656, "learning_rate": 3.1274131274131274e-06, "loss": 0.0521, "step": 15080 }, { "epoch": 4.481734481734482, "grad_norm": 0.7335745096206665, "learning_rate": 3.1095931095931096e-06, "loss": 0.0431, "step": 15090 }, { "epoch": 4.484704484704484, "grad_norm": 0.8039221167564392, "learning_rate": 3.0917730917730917e-06, "loss": 0.0561, "step": 15100 }, { "epoch": 4.487674487674488, "grad_norm": 0.7728500366210938, "learning_rate": 3.073953073953074e-06, "loss": 0.0487, "step": 15110 }, { "epoch": 4.490644490644491, "grad_norm": 0.8309100270271301, "learning_rate": 3.0561330561330565e-06, "loss": 0.0506, "step": 15120 }, { "epoch": 4.493614493614493, "grad_norm": 0.8935027718544006, "learning_rate": 3.0383130383130387e-06, "loss": 0.0479, "step": 15130 }, { "epoch": 4.4965844965844965, "grad_norm": 1.1319783926010132, "learning_rate": 3.0204930204930204e-06, "loss": 0.0542, "step": 15140 }, { "epoch": 4.4995544995545, "grad_norm": 0.41596898436546326, "learning_rate": 3.0026730026730026e-06, "loss": 0.0375, "step": 15150 }, { "epoch": 4.502524502524502, "grad_norm": 1.0437852144241333, "learning_rate": 2.9848529848529848e-06, "loss": 0.0566, "step": 15160 }, { "epoch": 4.5054945054945055, "grad_norm": 0.41360408067703247, "learning_rate": 2.967032967032967e-06, "loss": 0.0503, "step": 15170 }, { "epoch": 4.508464508464509, "grad_norm": 0.5282860994338989, "learning_rate": 2.949212949212949e-06, "loss": 0.0495, "step": 15180 }, { "epoch": 4.511434511434511, "grad_norm": 0.764944851398468, "learning_rate": 2.9313929313929317e-06, "loss": 0.0546, "step": 15190 }, { "epoch": 4.514404514404514, "grad_norm": 0.7760726809501648, "learning_rate": 2.913572913572914e-06, "loss": 0.0552, "step": 15200 }, { "epoch": 4.517374517374518, "grad_norm": 0.6407870650291443, "learning_rate": 2.895752895752896e-06, "loss": 0.0624, "step": 15210 }, { "epoch": 4.52034452034452, "grad_norm": 0.7598323822021484, "learning_rate": 2.877932877932878e-06, "loss": 0.0547, "step": 15220 }, { "epoch": 4.523314523314523, "grad_norm": 0.6085183024406433, "learning_rate": 2.86011286011286e-06, "loss": 0.047, "step": 15230 }, { "epoch": 4.526284526284527, "grad_norm": 0.5744462609291077, "learning_rate": 2.842292842292842e-06, "loss": 0.0476, "step": 15240 }, { "epoch": 4.529254529254529, "grad_norm": 0.6744916439056396, "learning_rate": 2.8244728244728243e-06, "loss": 0.0503, "step": 15250 }, { "epoch": 4.532224532224532, "grad_norm": 0.939346194267273, "learning_rate": 2.806652806652807e-06, "loss": 0.0602, "step": 15260 }, { "epoch": 4.5351945351945355, "grad_norm": 1.4114429950714111, "learning_rate": 2.788832788832789e-06, "loss": 0.0659, "step": 15270 }, { "epoch": 4.538164538164538, "grad_norm": 0.6457445025444031, "learning_rate": 2.7710127710127712e-06, "loss": 0.0554, "step": 15280 }, { "epoch": 4.541134541134541, "grad_norm": 0.6651527881622314, "learning_rate": 2.7531927531927534e-06, "loss": 0.0468, "step": 15290 }, { "epoch": 4.5441045441045445, "grad_norm": 0.45064395666122437, "learning_rate": 2.7353727353727356e-06, "loss": 0.0649, "step": 15300 }, { "epoch": 4.547074547074547, "grad_norm": 0.4725130498409271, "learning_rate": 2.7175527175527173e-06, "loss": 0.051, "step": 15310 }, { "epoch": 4.55004455004455, "grad_norm": 0.7358287572860718, "learning_rate": 2.6997326997326995e-06, "loss": 0.0481, "step": 15320 }, { "epoch": 4.553014553014553, "grad_norm": 0.6313060522079468, "learning_rate": 2.681912681912682e-06, "loss": 0.0463, "step": 15330 }, { "epoch": 4.555984555984556, "grad_norm": 1.0503184795379639, "learning_rate": 2.6640926640926642e-06, "loss": 0.0597, "step": 15340 }, { "epoch": 4.558954558954559, "grad_norm": 1.3355445861816406, "learning_rate": 2.6462726462726464e-06, "loss": 0.0545, "step": 15350 }, { "epoch": 4.561924561924562, "grad_norm": 0.9509602785110474, "learning_rate": 2.6284526284526286e-06, "loss": 0.0629, "step": 15360 }, { "epoch": 4.564894564894565, "grad_norm": 1.1099562644958496, "learning_rate": 2.6106326106326108e-06, "loss": 0.0707, "step": 15370 }, { "epoch": 4.567864567864568, "grad_norm": 1.1157077550888062, "learning_rate": 2.592812592812593e-06, "loss": 0.0609, "step": 15380 }, { "epoch": 4.57083457083457, "grad_norm": 0.5642300844192505, "learning_rate": 2.574992574992575e-06, "loss": 0.0572, "step": 15390 }, { "epoch": 4.573804573804574, "grad_norm": 1.1523653268814087, "learning_rate": 2.5571725571725573e-06, "loss": 0.0585, "step": 15400 }, { "epoch": 4.576774576774577, "grad_norm": 0.6620275378227234, "learning_rate": 2.5393525393525394e-06, "loss": 0.0428, "step": 15410 }, { "epoch": 4.579744579744579, "grad_norm": 0.8893596529960632, "learning_rate": 2.5215325215325216e-06, "loss": 0.0529, "step": 15420 }, { "epoch": 4.5827145827145825, "grad_norm": 0.3965216875076294, "learning_rate": 2.5037125037125038e-06, "loss": 0.0501, "step": 15430 }, { "epoch": 4.585684585684586, "grad_norm": 0.888308584690094, "learning_rate": 2.485892485892486e-06, "loss": 0.0411, "step": 15440 }, { "epoch": 4.588654588654588, "grad_norm": 0.7801947593688965, "learning_rate": 2.468072468072468e-06, "loss": 0.0465, "step": 15450 }, { "epoch": 4.5916245916245915, "grad_norm": 0.5786198377609253, "learning_rate": 2.4502524502524507e-06, "loss": 0.0546, "step": 15460 }, { "epoch": 4.594594594594595, "grad_norm": 0.537132978439331, "learning_rate": 2.4324324324324325e-06, "loss": 0.0602, "step": 15470 }, { "epoch": 4.597564597564597, "grad_norm": 0.9715235829353333, "learning_rate": 2.4146124146124146e-06, "loss": 0.058, "step": 15480 }, { "epoch": 4.6005346005346, "grad_norm": 1.2125133275985718, "learning_rate": 2.396792396792397e-06, "loss": 0.0663, "step": 15490 }, { "epoch": 4.603504603504604, "grad_norm": 0.955084502696991, "learning_rate": 2.378972378972379e-06, "loss": 0.058, "step": 15500 }, { "epoch": 4.606474606474606, "grad_norm": 1.2165902853012085, "learning_rate": 2.361152361152361e-06, "loss": 0.0537, "step": 15510 }, { "epoch": 4.609444609444609, "grad_norm": 0.5308700203895569, "learning_rate": 2.3433323433323433e-06, "loss": 0.0489, "step": 15520 }, { "epoch": 4.612414612414613, "grad_norm": 1.492642879486084, "learning_rate": 2.325512325512326e-06, "loss": 0.0535, "step": 15530 }, { "epoch": 4.615384615384615, "grad_norm": 0.8670569062232971, "learning_rate": 2.307692307692308e-06, "loss": 0.0635, "step": 15540 }, { "epoch": 4.618354618354618, "grad_norm": 0.5621850490570068, "learning_rate": 2.28987228987229e-06, "loss": 0.0563, "step": 15550 }, { "epoch": 4.6213246213246215, "grad_norm": 0.7570291757583618, "learning_rate": 2.272052272052272e-06, "loss": 0.0554, "step": 15560 }, { "epoch": 4.624294624294624, "grad_norm": 0.7704951763153076, "learning_rate": 2.254232254232254e-06, "loss": 0.0504, "step": 15570 }, { "epoch": 4.627264627264627, "grad_norm": 1.1347087621688843, "learning_rate": 2.2364122364122363e-06, "loss": 0.0586, "step": 15580 }, { "epoch": 4.63023463023463, "grad_norm": 0.9112380743026733, "learning_rate": 2.2185922185922185e-06, "loss": 0.0489, "step": 15590 }, { "epoch": 4.633204633204633, "grad_norm": 0.9339880347251892, "learning_rate": 2.200772200772201e-06, "loss": 0.0487, "step": 15600 }, { "epoch": 4.636174636174636, "grad_norm": 0.5124456286430359, "learning_rate": 2.1829521829521833e-06, "loss": 0.0682, "step": 15610 }, { "epoch": 4.639144639144639, "grad_norm": 0.742302417755127, "learning_rate": 2.1651321651321654e-06, "loss": 0.0579, "step": 15620 }, { "epoch": 4.642114642114642, "grad_norm": 0.7987191677093506, "learning_rate": 2.147312147312147e-06, "loss": 0.0501, "step": 15630 }, { "epoch": 4.645084645084645, "grad_norm": 1.0972774028778076, "learning_rate": 2.1294921294921294e-06, "loss": 0.0657, "step": 15640 }, { "epoch": 4.648054648054648, "grad_norm": 0.855499804019928, "learning_rate": 2.1116721116721115e-06, "loss": 0.0623, "step": 15650 }, { "epoch": 4.651024651024651, "grad_norm": 0.6510183215141296, "learning_rate": 2.0938520938520937e-06, "loss": 0.0556, "step": 15660 }, { "epoch": 4.653994653994654, "grad_norm": 0.7300544381141663, "learning_rate": 2.0760320760320763e-06, "loss": 0.047, "step": 15670 }, { "epoch": 4.656964656964657, "grad_norm": 1.2278099060058594, "learning_rate": 2.0582120582120585e-06, "loss": 0.0551, "step": 15680 }, { "epoch": 4.65993465993466, "grad_norm": 0.751706063747406, "learning_rate": 2.0403920403920406e-06, "loss": 0.0583, "step": 15690 }, { "epoch": 4.662904662904663, "grad_norm": 0.32578542828559875, "learning_rate": 2.022572022572023e-06, "loss": 0.0566, "step": 15700 }, { "epoch": 4.665874665874666, "grad_norm": 0.8979377746582031, "learning_rate": 2.0047520047520046e-06, "loss": 0.0574, "step": 15710 }, { "epoch": 4.6688446688446685, "grad_norm": 0.816284716129303, "learning_rate": 1.9869319869319867e-06, "loss": 0.0534, "step": 15720 }, { "epoch": 4.671814671814672, "grad_norm": 0.9114980101585388, "learning_rate": 1.969111969111969e-06, "loss": 0.0599, "step": 15730 }, { "epoch": 4.674784674784675, "grad_norm": 0.8003864884376526, "learning_rate": 1.9512919512919515e-06, "loss": 0.0571, "step": 15740 }, { "epoch": 4.6777546777546775, "grad_norm": 0.7641117572784424, "learning_rate": 1.9334719334719337e-06, "loss": 0.0523, "step": 15750 }, { "epoch": 4.680724680724681, "grad_norm": 0.795187771320343, "learning_rate": 1.915651915651916e-06, "loss": 0.0652, "step": 15760 }, { "epoch": 4.683694683694684, "grad_norm": 0.7843266129493713, "learning_rate": 1.8978318978318978e-06, "loss": 0.0541, "step": 15770 }, { "epoch": 4.686664686664686, "grad_norm": 1.0619968175888062, "learning_rate": 1.88001188001188e-06, "loss": 0.061, "step": 15780 }, { "epoch": 4.68963468963469, "grad_norm": 0.6766983270645142, "learning_rate": 1.8621918621918623e-06, "loss": 0.0479, "step": 15790 }, { "epoch": 4.692604692604693, "grad_norm": 1.0215040445327759, "learning_rate": 1.8443718443718445e-06, "loss": 0.0479, "step": 15800 }, { "epoch": 4.695574695574695, "grad_norm": 1.0211207866668701, "learning_rate": 1.8265518265518265e-06, "loss": 0.0595, "step": 15810 }, { "epoch": 4.698544698544699, "grad_norm": 0.677951455116272, "learning_rate": 1.8087318087318088e-06, "loss": 0.0389, "step": 15820 }, { "epoch": 4.701514701514702, "grad_norm": 0.9057896137237549, "learning_rate": 1.790911790911791e-06, "loss": 0.0694, "step": 15830 }, { "epoch": 4.704484704484704, "grad_norm": 0.49817752838134766, "learning_rate": 1.7730917730917732e-06, "loss": 0.0572, "step": 15840 }, { "epoch": 4.7074547074547075, "grad_norm": 0.554590106010437, "learning_rate": 1.7552717552717551e-06, "loss": 0.0381, "step": 15850 }, { "epoch": 4.710424710424711, "grad_norm": 0.6778869032859802, "learning_rate": 1.7374517374517375e-06, "loss": 0.0488, "step": 15860 }, { "epoch": 4.713394713394713, "grad_norm": 0.38889896869659424, "learning_rate": 1.7196317196317197e-06, "loss": 0.0454, "step": 15870 }, { "epoch": 4.716364716364716, "grad_norm": 0.6715492010116577, "learning_rate": 1.7018117018117019e-06, "loss": 0.0532, "step": 15880 }, { "epoch": 4.71933471933472, "grad_norm": 0.9232339262962341, "learning_rate": 1.683991683991684e-06, "loss": 0.0407, "step": 15890 }, { "epoch": 4.722304722304722, "grad_norm": 0.9645439386367798, "learning_rate": 1.6661716661716662e-06, "loss": 0.0553, "step": 15900 }, { "epoch": 4.725274725274725, "grad_norm": 0.6320471167564392, "learning_rate": 1.6483516483516484e-06, "loss": 0.0413, "step": 15910 }, { "epoch": 4.728244728244729, "grad_norm": 0.7938999533653259, "learning_rate": 1.6305316305316306e-06, "loss": 0.0497, "step": 15920 }, { "epoch": 4.731214731214731, "grad_norm": 0.5799199938774109, "learning_rate": 1.6127116127116127e-06, "loss": 0.0608, "step": 15930 }, { "epoch": 4.734184734184734, "grad_norm": 0.6402101516723633, "learning_rate": 1.594891594891595e-06, "loss": 0.0524, "step": 15940 }, { "epoch": 4.737154737154738, "grad_norm": 0.6040619015693665, "learning_rate": 1.577071577071577e-06, "loss": 0.0486, "step": 15950 }, { "epoch": 4.74012474012474, "grad_norm": 0.4295766055583954, "learning_rate": 1.5592515592515594e-06, "loss": 0.0481, "step": 15960 }, { "epoch": 4.743094743094743, "grad_norm": 0.9524640440940857, "learning_rate": 1.5414315414315414e-06, "loss": 0.0501, "step": 15970 }, { "epoch": 4.7460647460647465, "grad_norm": 0.6644571423530579, "learning_rate": 1.5236115236115236e-06, "loss": 0.069, "step": 15980 }, { "epoch": 4.749034749034749, "grad_norm": 0.9465572237968445, "learning_rate": 1.5057915057915057e-06, "loss": 0.0499, "step": 15990 }, { "epoch": 4.752004752004752, "grad_norm": 0.6358723640441895, "learning_rate": 1.4879714879714881e-06, "loss": 0.0516, "step": 16000 }, { "epoch": 4.754974754974755, "grad_norm": 0.6010512709617615, "learning_rate": 1.47015147015147e-06, "loss": 0.0378, "step": 16010 }, { "epoch": 4.757944757944758, "grad_norm": 0.5566834211349487, "learning_rate": 1.4523314523314523e-06, "loss": 0.046, "step": 16020 }, { "epoch": 4.760914760914761, "grad_norm": 0.8823090195655823, "learning_rate": 1.4345114345114346e-06, "loss": 0.0516, "step": 16030 }, { "epoch": 4.763884763884764, "grad_norm": 0.9315813183784485, "learning_rate": 1.4166914166914168e-06, "loss": 0.0585, "step": 16040 }, { "epoch": 4.766854766854767, "grad_norm": 0.848888635635376, "learning_rate": 1.3988713988713988e-06, "loss": 0.0544, "step": 16050 }, { "epoch": 4.76982476982477, "grad_norm": 0.7777525782585144, "learning_rate": 1.381051381051381e-06, "loss": 0.0606, "step": 16060 }, { "epoch": 4.772794772794773, "grad_norm": 0.7137773036956787, "learning_rate": 1.3632313632313633e-06, "loss": 0.0561, "step": 16070 }, { "epoch": 4.775764775764776, "grad_norm": 0.7126787900924683, "learning_rate": 1.3454113454113455e-06, "loss": 0.0597, "step": 16080 }, { "epoch": 4.778734778734779, "grad_norm": 1.0160871744155884, "learning_rate": 1.3275913275913275e-06, "loss": 0.0566, "step": 16090 }, { "epoch": 4.781704781704782, "grad_norm": 1.4480925798416138, "learning_rate": 1.3097713097713098e-06, "loss": 0.072, "step": 16100 }, { "epoch": 4.784674784674785, "grad_norm": 0.7537574172019958, "learning_rate": 1.291951291951292e-06, "loss": 0.0528, "step": 16110 }, { "epoch": 4.787644787644788, "grad_norm": 1.0016658306121826, "learning_rate": 1.2741312741312742e-06, "loss": 0.0501, "step": 16120 }, { "epoch": 4.79061479061479, "grad_norm": 0.7509961128234863, "learning_rate": 1.2563112563112563e-06, "loss": 0.0573, "step": 16130 }, { "epoch": 4.7935847935847935, "grad_norm": 0.7173300981521606, "learning_rate": 1.2384912384912385e-06, "loss": 0.0545, "step": 16140 }, { "epoch": 4.796554796554797, "grad_norm": 0.6532518267631531, "learning_rate": 1.2206712206712207e-06, "loss": 0.0711, "step": 16150 }, { "epoch": 4.799524799524799, "grad_norm": 0.7350220084190369, "learning_rate": 1.2028512028512029e-06, "loss": 0.0511, "step": 16160 }, { "epoch": 4.802494802494802, "grad_norm": 0.8470587730407715, "learning_rate": 1.185031185031185e-06, "loss": 0.045, "step": 16170 }, { "epoch": 4.805464805464806, "grad_norm": 0.7634150981903076, "learning_rate": 1.1672111672111672e-06, "loss": 0.0529, "step": 16180 }, { "epoch": 4.808434808434808, "grad_norm": 0.8620032668113708, "learning_rate": 1.1493911493911494e-06, "loss": 0.0527, "step": 16190 }, { "epoch": 4.811404811404811, "grad_norm": 0.770086944103241, "learning_rate": 1.1315711315711318e-06, "loss": 0.0521, "step": 16200 }, { "epoch": 4.814374814374815, "grad_norm": 0.6775233149528503, "learning_rate": 1.1137511137511137e-06, "loss": 0.0465, "step": 16210 }, { "epoch": 4.817344817344817, "grad_norm": 0.7453165054321289, "learning_rate": 1.0959310959310959e-06, "loss": 0.0547, "step": 16220 }, { "epoch": 4.82031482031482, "grad_norm": 1.0375672578811646, "learning_rate": 1.078111078111078e-06, "loss": 0.0583, "step": 16230 }, { "epoch": 4.8232848232848236, "grad_norm": 0.8512599468231201, "learning_rate": 1.0602910602910604e-06, "loss": 0.0587, "step": 16240 }, { "epoch": 4.826254826254826, "grad_norm": 0.7857792377471924, "learning_rate": 1.0424710424710424e-06, "loss": 0.0596, "step": 16250 }, { "epoch": 4.829224829224829, "grad_norm": 0.6470934152603149, "learning_rate": 1.0246510246510246e-06, "loss": 0.0538, "step": 16260 }, { "epoch": 4.8321948321948325, "grad_norm": 0.8165239095687866, "learning_rate": 1.006831006831007e-06, "loss": 0.0567, "step": 16270 }, { "epoch": 4.835164835164835, "grad_norm": 0.5124844908714294, "learning_rate": 9.890109890109891e-07, "loss": 0.0493, "step": 16280 }, { "epoch": 4.838134838134838, "grad_norm": 1.0751707553863525, "learning_rate": 9.711909711909713e-07, "loss": 0.0559, "step": 16290 }, { "epoch": 4.841104841104841, "grad_norm": 0.9530063271522522, "learning_rate": 9.533709533709534e-07, "loss": 0.0474, "step": 16300 }, { "epoch": 4.844074844074844, "grad_norm": 1.0915278196334839, "learning_rate": 9.355509355509356e-07, "loss": 0.0565, "step": 16310 }, { "epoch": 4.847044847044847, "grad_norm": 1.2590452432632446, "learning_rate": 9.177309177309178e-07, "loss": 0.043, "step": 16320 }, { "epoch": 4.85001485001485, "grad_norm": 0.6076927781105042, "learning_rate": 8.999108999109e-07, "loss": 0.056, "step": 16330 }, { "epoch": 4.852984852984853, "grad_norm": 0.8309280276298523, "learning_rate": 8.820908820908821e-07, "loss": 0.0486, "step": 16340 }, { "epoch": 4.855954855954856, "grad_norm": 0.8125685453414917, "learning_rate": 8.642708642708643e-07, "loss": 0.0499, "step": 16350 }, { "epoch": 4.858924858924859, "grad_norm": 0.6055704951286316, "learning_rate": 8.464508464508465e-07, "loss": 0.0454, "step": 16360 }, { "epoch": 4.861894861894862, "grad_norm": 0.6113276481628418, "learning_rate": 8.286308286308286e-07, "loss": 0.0446, "step": 16370 }, { "epoch": 4.864864864864865, "grad_norm": 0.6283566951751709, "learning_rate": 8.108108108108109e-07, "loss": 0.0643, "step": 16380 }, { "epoch": 4.867834867834868, "grad_norm": 0.6934186220169067, "learning_rate": 7.92990792990793e-07, "loss": 0.0506, "step": 16390 }, { "epoch": 4.870804870804871, "grad_norm": 0.7844398021697998, "learning_rate": 7.751707751707753e-07, "loss": 0.0551, "step": 16400 }, { "epoch": 4.873774873774874, "grad_norm": 0.7214897274971008, "learning_rate": 7.573507573507573e-07, "loss": 0.0648, "step": 16410 }, { "epoch": 4.876744876744877, "grad_norm": 1.6125606298446655, "learning_rate": 7.395307395307396e-07, "loss": 0.0462, "step": 16420 }, { "epoch": 4.8797148797148795, "grad_norm": 1.0362389087677002, "learning_rate": 7.217107217107217e-07, "loss": 0.0579, "step": 16430 }, { "epoch": 4.882684882684883, "grad_norm": 1.2571192979812622, "learning_rate": 7.03890703890704e-07, "loss": 0.0526, "step": 16440 }, { "epoch": 4.885654885654886, "grad_norm": 0.7572282552719116, "learning_rate": 6.860706860706861e-07, "loss": 0.0551, "step": 16450 }, { "epoch": 4.888624888624888, "grad_norm": 0.7616530060768127, "learning_rate": 6.682506682506683e-07, "loss": 0.0557, "step": 16460 }, { "epoch": 4.891594891594892, "grad_norm": 1.0284298658370972, "learning_rate": 6.504306504306505e-07, "loss": 0.0501, "step": 16470 }, { "epoch": 4.894564894564894, "grad_norm": 0.9173740148544312, "learning_rate": 6.326106326106326e-07, "loss": 0.0554, "step": 16480 }, { "epoch": 4.897534897534897, "grad_norm": 0.5375432968139648, "learning_rate": 6.147906147906148e-07, "loss": 0.0534, "step": 16490 }, { "epoch": 4.900504900504901, "grad_norm": 0.7436822056770325, "learning_rate": 5.969705969705971e-07, "loss": 0.0528, "step": 16500 }, { "epoch": 4.903474903474903, "grad_norm": 0.3775230348110199, "learning_rate": 5.791505791505791e-07, "loss": 0.0584, "step": 16510 }, { "epoch": 4.906444906444906, "grad_norm": 0.792444109916687, "learning_rate": 5.613305613305614e-07, "loss": 0.0578, "step": 16520 }, { "epoch": 4.9094149094149095, "grad_norm": 0.527422308921814, "learning_rate": 5.435105435105435e-07, "loss": 0.0581, "step": 16530 }, { "epoch": 4.912384912384912, "grad_norm": 0.6384781002998352, "learning_rate": 5.256905256905258e-07, "loss": 0.0492, "step": 16540 }, { "epoch": 4.915354915354915, "grad_norm": 0.8578511476516724, "learning_rate": 5.078705078705078e-07, "loss": 0.069, "step": 16550 }, { "epoch": 4.9183249183249185, "grad_norm": 0.7936479449272156, "learning_rate": 4.900504900504901e-07, "loss": 0.0533, "step": 16560 }, { "epoch": 4.921294921294921, "grad_norm": 0.6151940226554871, "learning_rate": 4.7223047223047227e-07, "loss": 0.0473, "step": 16570 }, { "epoch": 4.924264924264924, "grad_norm": 0.8337469696998596, "learning_rate": 4.544104544104544e-07, "loss": 0.0434, "step": 16580 }, { "epoch": 4.927234927234927, "grad_norm": 0.7319076657295227, "learning_rate": 4.365904365904366e-07, "loss": 0.057, "step": 16590 }, { "epoch": 4.93020493020493, "grad_norm": 0.8618406653404236, "learning_rate": 4.187704187704188e-07, "loss": 0.0613, "step": 16600 }, { "epoch": 4.933174933174933, "grad_norm": 1.1733934879302979, "learning_rate": 4.0095040095040095e-07, "loss": 0.0491, "step": 16610 }, { "epoch": 4.936144936144936, "grad_norm": 0.41288742423057556, "learning_rate": 3.831303831303831e-07, "loss": 0.0443, "step": 16620 }, { "epoch": 4.939114939114939, "grad_norm": 0.854500412940979, "learning_rate": 3.653103653103653e-07, "loss": 0.0625, "step": 16630 }, { "epoch": 4.942084942084942, "grad_norm": 0.610560953617096, "learning_rate": 3.4749034749034746e-07, "loss": 0.0492, "step": 16640 }, { "epoch": 4.945054945054945, "grad_norm": 0.6326998472213745, "learning_rate": 3.296703296703297e-07, "loss": 0.0608, "step": 16650 }, { "epoch": 4.948024948024948, "grad_norm": 0.4535106122493744, "learning_rate": 3.1185031185031186e-07, "loss": 0.0415, "step": 16660 }, { "epoch": 4.950994950994951, "grad_norm": 0.8340437412261963, "learning_rate": 2.9403029403029403e-07, "loss": 0.0505, "step": 16670 }, { "epoch": 4.953964953964954, "grad_norm": 0.5445932149887085, "learning_rate": 2.762102762102762e-07, "loss": 0.0482, "step": 16680 }, { "epoch": 4.956934956934957, "grad_norm": 0.6914054155349731, "learning_rate": 2.5839025839025837e-07, "loss": 0.0588, "step": 16690 }, { "epoch": 4.95990495990496, "grad_norm": 0.9241282343864441, "learning_rate": 2.4057024057024054e-07, "loss": 0.0549, "step": 16700 }, { "epoch": 4.962874962874963, "grad_norm": 0.5624737739562988, "learning_rate": 2.2275022275022276e-07, "loss": 0.0588, "step": 16710 }, { "epoch": 4.9658449658449655, "grad_norm": 1.0242942571640015, "learning_rate": 2.0493020493020493e-07, "loss": 0.0457, "step": 16720 }, { "epoch": 4.968814968814969, "grad_norm": 1.025058388710022, "learning_rate": 1.8711018711018713e-07, "loss": 0.0524, "step": 16730 }, { "epoch": 4.971784971784972, "grad_norm": 1.143563985824585, "learning_rate": 1.692901692901693e-07, "loss": 0.0391, "step": 16740 }, { "epoch": 4.974754974754974, "grad_norm": 0.7179445028305054, "learning_rate": 1.5147015147015147e-07, "loss": 0.054, "step": 16750 }, { "epoch": 4.977724977724978, "grad_norm": 1.321466088294983, "learning_rate": 1.3365013365013367e-07, "loss": 0.0569, "step": 16760 }, { "epoch": 4.980694980694981, "grad_norm": 0.7663488388061523, "learning_rate": 1.1583011583011584e-07, "loss": 0.0542, "step": 16770 }, { "epoch": 4.983664983664983, "grad_norm": 0.7234964966773987, "learning_rate": 9.801009801009801e-08, "loss": 0.0414, "step": 16780 }, { "epoch": 4.986634986634987, "grad_norm": 0.8341914415359497, "learning_rate": 8.019008019008019e-08, "loss": 0.0647, "step": 16790 }, { "epoch": 4.98960498960499, "grad_norm": 0.9533319473266602, "learning_rate": 6.237006237006238e-08, "loss": 0.0513, "step": 16800 }, { "epoch": 4.992574992574992, "grad_norm": 1.162724494934082, "learning_rate": 4.4550044550044554e-08, "loss": 0.0543, "step": 16810 }, { "epoch": 4.9955449955449955, "grad_norm": 1.0774705410003662, "learning_rate": 2.673002673002673e-08, "loss": 0.0559, "step": 16820 }, { "epoch": 4.998514998514999, "grad_norm": 0.7029628157615662, "learning_rate": 8.91000891000891e-09, "loss": 0.0568, "step": 16830 }, { "epoch": 5.0, "eval_f1": 0.33031292965957215, "eval_loss": 0.041273970156908035, "eval_runtime": 1003.0923, "eval_samples_per_second": 37.902, "eval_steps_per_second": 0.593, "step": 16835 }, { "epoch": 5.0, "step": 16835, "total_flos": 8.348297673446728e+19, "train_loss": 0.062463992788064436, "train_runtime": 18625.883, "train_samples_per_second": 57.834, "train_steps_per_second": 0.904 } ], "logging_steps": 10, "max_steps": 16835, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.348297673446728e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }