{ "best_metric": 0.05167969688773155, "best_model_checkpoint": "./microsoft_beit-base-patch16-224-pt22k-ft22k_epoch_5/checkpoint-67330", "epoch": 5.0, "eval_steps": 500, "global_step": 67330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007426110203475419, "grad_norm": 19.504955291748047, "learning_rate": 2.9995544333877916e-05, "loss": 0.4424, "step": 10 }, { "epoch": 0.0014852220406950838, "grad_norm": 5.601377964019775, "learning_rate": 2.999108866775583e-05, "loss": 0.238, "step": 20 }, { "epoch": 0.002227833061042626, "grad_norm": 8.971532821655273, "learning_rate": 2.9986633001633746e-05, "loss": 0.2289, "step": 30 }, { "epoch": 0.0029704440813901676, "grad_norm": 3.3992252349853516, "learning_rate": 2.998217733551166e-05, "loss": 0.2005, "step": 40 }, { "epoch": 0.0037130551017377097, "grad_norm": 13.932947158813477, "learning_rate": 2.9977721669389572e-05, "loss": 0.2329, "step": 50 }, { "epoch": 0.004455666122085252, "grad_norm": 2.9520821571350098, "learning_rate": 2.997326600326749e-05, "loss": 0.2114, "step": 60 }, { "epoch": 0.005198277142432793, "grad_norm": 4.807623386383057, "learning_rate": 2.9968810337145402e-05, "loss": 0.1468, "step": 70 }, { "epoch": 0.005940888162780335, "grad_norm": 4.0346903800964355, "learning_rate": 2.9964354671023317e-05, "loss": 0.2179, "step": 80 }, { "epoch": 0.006683499183127877, "grad_norm": 4.7647199630737305, "learning_rate": 2.9959899004901236e-05, "loss": 0.1647, "step": 90 }, { "epoch": 0.007426110203475419, "grad_norm": 9.930302619934082, "learning_rate": 2.9955443338779147e-05, "loss": 0.1436, "step": 100 }, { "epoch": 0.008168721223822962, "grad_norm": 2.7145912647247314, "learning_rate": 2.9950987672657062e-05, "loss": 0.1408, "step": 110 }, { "epoch": 0.008911332244170504, "grad_norm": 4.7143940925598145, "learning_rate": 2.994653200653498e-05, "loss": 0.1411, "step": 120 }, { "epoch": 0.009653943264518046, "grad_norm": 5.331055641174316, "learning_rate": 2.9942076340412892e-05, "loss": 0.1392, "step": 130 }, { "epoch": 0.010396554284865587, "grad_norm": 4.96783447265625, "learning_rate": 2.9937620674290807e-05, "loss": 0.1772, "step": 140 }, { "epoch": 0.011139165305213129, "grad_norm": 3.5116193294525146, "learning_rate": 2.9933165008168722e-05, "loss": 0.111, "step": 150 }, { "epoch": 0.01188177632556067, "grad_norm": 3.521408796310425, "learning_rate": 2.9928709342046637e-05, "loss": 0.1418, "step": 160 }, { "epoch": 0.012624387345908213, "grad_norm": 2.7895073890686035, "learning_rate": 2.9924253675924552e-05, "loss": 0.1338, "step": 170 }, { "epoch": 0.013366998366255755, "grad_norm": 5.244328022003174, "learning_rate": 2.9919798009802464e-05, "loss": 0.1252, "step": 180 }, { "epoch": 0.014109609386603297, "grad_norm": 1.7632620334625244, "learning_rate": 2.9915342343680382e-05, "loss": 0.1122, "step": 190 }, { "epoch": 0.014852220406950839, "grad_norm": 3.5791783332824707, "learning_rate": 2.9910886677558297e-05, "loss": 0.1217, "step": 200 }, { "epoch": 0.01559483142729838, "grad_norm": 5.1304450035095215, "learning_rate": 2.990643101143621e-05, "loss": 0.1361, "step": 210 }, { "epoch": 0.016337442447645924, "grad_norm": 5.422621250152588, "learning_rate": 2.9901975345314124e-05, "loss": 0.1404, "step": 220 }, { "epoch": 0.017080053467993465, "grad_norm": 7.065879821777344, "learning_rate": 2.9897519679192042e-05, "loss": 0.1371, "step": 230 }, { "epoch": 0.01782266448834101, "grad_norm": 2.3558316230773926, "learning_rate": 2.9893064013069954e-05, "loss": 0.1386, "step": 240 }, { "epoch": 0.01856527550868855, "grad_norm": 1.8473409414291382, "learning_rate": 2.988860834694787e-05, "loss": 0.166, "step": 250 }, { "epoch": 0.019307886529036093, "grad_norm": 4.502384662628174, "learning_rate": 2.9884152680825784e-05, "loss": 0.1241, "step": 260 }, { "epoch": 0.020050497549383633, "grad_norm": 1.857735514640808, "learning_rate": 2.98796970147037e-05, "loss": 0.1369, "step": 270 }, { "epoch": 0.020793108569731173, "grad_norm": 6.109134674072266, "learning_rate": 2.9875241348581614e-05, "loss": 0.1139, "step": 280 }, { "epoch": 0.021535719590078717, "grad_norm": 3.0833802223205566, "learning_rate": 2.987078568245953e-05, "loss": 0.1203, "step": 290 }, { "epoch": 0.022278330610426257, "grad_norm": 2.37730073928833, "learning_rate": 2.9866330016337444e-05, "loss": 0.12, "step": 300 }, { "epoch": 0.0230209416307738, "grad_norm": 2.950272798538208, "learning_rate": 2.986187435021536e-05, "loss": 0.1139, "step": 310 }, { "epoch": 0.02376355265112134, "grad_norm": 4.4759521484375, "learning_rate": 2.985741868409327e-05, "loss": 0.146, "step": 320 }, { "epoch": 0.024506163671468885, "grad_norm": 1.08765709400177, "learning_rate": 2.985296301797119e-05, "loss": 0.1132, "step": 330 }, { "epoch": 0.025248774691816425, "grad_norm": 2.5022499561309814, "learning_rate": 2.9848507351849104e-05, "loss": 0.0975, "step": 340 }, { "epoch": 0.02599138571216397, "grad_norm": 6.558846950531006, "learning_rate": 2.9844051685727015e-05, "loss": 0.1128, "step": 350 }, { "epoch": 0.02673399673251151, "grad_norm": 2.4449667930603027, "learning_rate": 2.9839596019604934e-05, "loss": 0.146, "step": 360 }, { "epoch": 0.027476607752859053, "grad_norm": 3.488586902618408, "learning_rate": 2.9835140353482845e-05, "loss": 0.1066, "step": 370 }, { "epoch": 0.028219218773206593, "grad_norm": 3.4595577716827393, "learning_rate": 2.983068468736076e-05, "loss": 0.1295, "step": 380 }, { "epoch": 0.028961829793554137, "grad_norm": 5.9359259605407715, "learning_rate": 2.9826229021238675e-05, "loss": 0.1566, "step": 390 }, { "epoch": 0.029704440813901677, "grad_norm": 2.771470546722412, "learning_rate": 2.982177335511659e-05, "loss": 0.0896, "step": 400 }, { "epoch": 0.03044705183424922, "grad_norm": 5.8616862297058105, "learning_rate": 2.9817317688994505e-05, "loss": 0.1435, "step": 410 }, { "epoch": 0.03118966285459676, "grad_norm": 3.5416555404663086, "learning_rate": 2.981286202287242e-05, "loss": 0.133, "step": 420 }, { "epoch": 0.0319322738749443, "grad_norm": 2.3944127559661865, "learning_rate": 2.9808406356750335e-05, "loss": 0.1329, "step": 430 }, { "epoch": 0.03267488489529185, "grad_norm": 2.402383804321289, "learning_rate": 2.980395069062825e-05, "loss": 0.1159, "step": 440 }, { "epoch": 0.03341749591563939, "grad_norm": 8.119304656982422, "learning_rate": 2.9799495024506165e-05, "loss": 0.0956, "step": 450 }, { "epoch": 0.03416010693598693, "grad_norm": 7.150251865386963, "learning_rate": 2.9795039358384077e-05, "loss": 0.1152, "step": 460 }, { "epoch": 0.03490271795633447, "grad_norm": 3.084035634994507, "learning_rate": 2.9790583692261995e-05, "loss": 0.0913, "step": 470 }, { "epoch": 0.03564532897668202, "grad_norm": 4.946183681488037, "learning_rate": 2.9786128026139907e-05, "loss": 0.1228, "step": 480 }, { "epoch": 0.03638793999702956, "grad_norm": 1.6884194612503052, "learning_rate": 2.9781672360017822e-05, "loss": 0.1226, "step": 490 }, { "epoch": 0.0371305510173771, "grad_norm": 3.5960988998413086, "learning_rate": 2.977721669389574e-05, "loss": 0.1086, "step": 500 }, { "epoch": 0.03787316203772464, "grad_norm": 1.143905758857727, "learning_rate": 2.9772761027773652e-05, "loss": 0.1043, "step": 510 }, { "epoch": 0.038615773058072185, "grad_norm": 5.153063774108887, "learning_rate": 2.9768305361651567e-05, "loss": 0.1263, "step": 520 }, { "epoch": 0.039358384078419725, "grad_norm": 3.656158924102783, "learning_rate": 2.9763849695529485e-05, "loss": 0.1135, "step": 530 }, { "epoch": 0.040100995098767266, "grad_norm": 3.6122283935546875, "learning_rate": 2.9759394029407397e-05, "loss": 0.1087, "step": 540 }, { "epoch": 0.040843606119114806, "grad_norm": 2.1035525798797607, "learning_rate": 2.9754938363285312e-05, "loss": 0.1246, "step": 550 }, { "epoch": 0.041586217139462346, "grad_norm": 4.143787384033203, "learning_rate": 2.9750482697163227e-05, "loss": 0.1212, "step": 560 }, { "epoch": 0.04232882815980989, "grad_norm": 3.2385432720184326, "learning_rate": 2.9746027031041142e-05, "loss": 0.1181, "step": 570 }, { "epoch": 0.043071439180157434, "grad_norm": 2.036144733428955, "learning_rate": 2.9741571364919057e-05, "loss": 0.1131, "step": 580 }, { "epoch": 0.043814050200504974, "grad_norm": 8.29155158996582, "learning_rate": 2.973711569879697e-05, "loss": 0.1433, "step": 590 }, { "epoch": 0.044556661220852514, "grad_norm": 4.493346691131592, "learning_rate": 2.9732660032674887e-05, "loss": 0.1083, "step": 600 }, { "epoch": 0.04529927224120006, "grad_norm": 2.184577465057373, "learning_rate": 2.9728204366552802e-05, "loss": 0.076, "step": 610 }, { "epoch": 0.0460418832615476, "grad_norm": 5.727611064910889, "learning_rate": 2.9723748700430713e-05, "loss": 0.1295, "step": 620 }, { "epoch": 0.04678449428189514, "grad_norm": 2.224817991256714, "learning_rate": 2.971929303430863e-05, "loss": 0.1438, "step": 630 }, { "epoch": 0.04752710530224268, "grad_norm": 2.196302652359009, "learning_rate": 2.9714837368186547e-05, "loss": 0.1176, "step": 640 }, { "epoch": 0.04826971632259023, "grad_norm": 2.8282594680786133, "learning_rate": 2.971038170206446e-05, "loss": 0.1177, "step": 650 }, { "epoch": 0.04901232734293777, "grad_norm": 3.0465710163116455, "learning_rate": 2.9705926035942373e-05, "loss": 0.1386, "step": 660 }, { "epoch": 0.04975493836328531, "grad_norm": 2.1503002643585205, "learning_rate": 2.9701470369820292e-05, "loss": 0.1032, "step": 670 }, { "epoch": 0.05049754938363285, "grad_norm": 3.2530624866485596, "learning_rate": 2.9697014703698203e-05, "loss": 0.1137, "step": 680 }, { "epoch": 0.0512401604039804, "grad_norm": 4.812537670135498, "learning_rate": 2.969255903757612e-05, "loss": 0.1326, "step": 690 }, { "epoch": 0.05198277142432794, "grad_norm": 3.6558496952056885, "learning_rate": 2.9688103371454033e-05, "loss": 0.1269, "step": 700 }, { "epoch": 0.05272538244467548, "grad_norm": 2.3057334423065186, "learning_rate": 2.968364770533195e-05, "loss": 0.1299, "step": 710 }, { "epoch": 0.05346799346502302, "grad_norm": 3.2395918369293213, "learning_rate": 2.9679192039209863e-05, "loss": 0.0952, "step": 720 }, { "epoch": 0.054210604485370566, "grad_norm": 2.045339584350586, "learning_rate": 2.9674736373087775e-05, "loss": 0.1311, "step": 730 }, { "epoch": 0.054953215505718106, "grad_norm": 5.249059677124023, "learning_rate": 2.9670280706965693e-05, "loss": 0.1554, "step": 740 }, { "epoch": 0.055695826526065646, "grad_norm": 2.609159469604492, "learning_rate": 2.966582504084361e-05, "loss": 0.094, "step": 750 }, { "epoch": 0.05643843754641319, "grad_norm": 6.062933444976807, "learning_rate": 2.966136937472152e-05, "loss": 0.1746, "step": 760 }, { "epoch": 0.057181048566760734, "grad_norm": 1.9558234214782715, "learning_rate": 2.965691370859944e-05, "loss": 0.1119, "step": 770 }, { "epoch": 0.057923659587108274, "grad_norm": 3.7589683532714844, "learning_rate": 2.965245804247735e-05, "loss": 0.1309, "step": 780 }, { "epoch": 0.058666270607455814, "grad_norm": 5.349353790283203, "learning_rate": 2.9648002376355265e-05, "loss": 0.1092, "step": 790 }, { "epoch": 0.059408881627803355, "grad_norm": 2.1258533000946045, "learning_rate": 2.964354671023318e-05, "loss": 0.0964, "step": 800 }, { "epoch": 0.0601514926481509, "grad_norm": 2.0149474143981934, "learning_rate": 2.9639091044111095e-05, "loss": 0.1101, "step": 810 }, { "epoch": 0.06089410366849844, "grad_norm": 4.236600875854492, "learning_rate": 2.963463537798901e-05, "loss": 0.091, "step": 820 }, { "epoch": 0.06163671468884598, "grad_norm": 1.9681212902069092, "learning_rate": 2.9630179711866925e-05, "loss": 0.1065, "step": 830 }, { "epoch": 0.06237932570919352, "grad_norm": 3.0421035289764404, "learning_rate": 2.962572404574484e-05, "loss": 0.1165, "step": 840 }, { "epoch": 0.06312193672954107, "grad_norm": 4.69833517074585, "learning_rate": 2.9621268379622755e-05, "loss": 0.1158, "step": 850 }, { "epoch": 0.0638645477498886, "grad_norm": 1.8385733366012573, "learning_rate": 2.961681271350067e-05, "loss": 0.1073, "step": 860 }, { "epoch": 0.06460715877023615, "grad_norm": 4.566705703735352, "learning_rate": 2.961235704737858e-05, "loss": 0.0994, "step": 870 }, { "epoch": 0.0653497697905837, "grad_norm": 5.810744285583496, "learning_rate": 2.96079013812565e-05, "loss": 0.101, "step": 880 }, { "epoch": 0.06609238081093123, "grad_norm": 2.1213254928588867, "learning_rate": 2.960344571513441e-05, "loss": 0.1196, "step": 890 }, { "epoch": 0.06683499183127878, "grad_norm": 2.281233787536621, "learning_rate": 2.9598990049012327e-05, "loss": 0.0955, "step": 900 }, { "epoch": 0.06757760285162631, "grad_norm": 1.4217268228530884, "learning_rate": 2.9594534382890245e-05, "loss": 0.1013, "step": 910 }, { "epoch": 0.06832021387197386, "grad_norm": 2.6054506301879883, "learning_rate": 2.9590078716768157e-05, "loss": 0.0866, "step": 920 }, { "epoch": 0.0690628248923214, "grad_norm": 3.899290084838867, "learning_rate": 2.958562305064607e-05, "loss": 0.128, "step": 930 }, { "epoch": 0.06980543591266894, "grad_norm": 1.7163687944412231, "learning_rate": 2.958116738452399e-05, "loss": 0.14, "step": 940 }, { "epoch": 0.07054804693301649, "grad_norm": 2.1250195503234863, "learning_rate": 2.95767117184019e-05, "loss": 0.1174, "step": 950 }, { "epoch": 0.07129065795336403, "grad_norm": 3.302210569381714, "learning_rate": 2.9572256052279816e-05, "loss": 0.0919, "step": 960 }, { "epoch": 0.07203326897371157, "grad_norm": 2.169238328933716, "learning_rate": 2.956780038615773e-05, "loss": 0.1573, "step": 970 }, { "epoch": 0.07277587999405911, "grad_norm": 5.784306049346924, "learning_rate": 2.9563344720035646e-05, "loss": 0.1171, "step": 980 }, { "epoch": 0.07351849101440665, "grad_norm": 1.108161449432373, "learning_rate": 2.955888905391356e-05, "loss": 0.1081, "step": 990 }, { "epoch": 0.0742611020347542, "grad_norm": 2.4193813800811768, "learning_rate": 2.9554433387791473e-05, "loss": 0.1057, "step": 1000 }, { "epoch": 0.07500371305510174, "grad_norm": 0.32207611203193665, "learning_rate": 2.954997772166939e-05, "loss": 0.1128, "step": 1010 }, { "epoch": 0.07574632407544928, "grad_norm": 2.084397554397583, "learning_rate": 2.9545522055547306e-05, "loss": 0.0976, "step": 1020 }, { "epoch": 0.07648893509579682, "grad_norm": 4.024555683135986, "learning_rate": 2.9541066389425218e-05, "loss": 0.1344, "step": 1030 }, { "epoch": 0.07723154611614437, "grad_norm": 1.9376616477966309, "learning_rate": 2.9536610723303133e-05, "loss": 0.0779, "step": 1040 }, { "epoch": 0.0779741571364919, "grad_norm": 4.285502910614014, "learning_rate": 2.953215505718105e-05, "loss": 0.1116, "step": 1050 }, { "epoch": 0.07871676815683945, "grad_norm": 2.065674066543579, "learning_rate": 2.9527699391058963e-05, "loss": 0.1212, "step": 1060 }, { "epoch": 0.07945937917718698, "grad_norm": 3.7805116176605225, "learning_rate": 2.9523243724936878e-05, "loss": 0.1089, "step": 1070 }, { "epoch": 0.08020199019753453, "grad_norm": 2.0384578704833984, "learning_rate": 2.9518788058814796e-05, "loss": 0.0872, "step": 1080 }, { "epoch": 0.08094460121788208, "grad_norm": 1.7026249170303345, "learning_rate": 2.9514332392692708e-05, "loss": 0.0844, "step": 1090 }, { "epoch": 0.08168721223822961, "grad_norm": 2.1008737087249756, "learning_rate": 2.9509876726570623e-05, "loss": 0.0968, "step": 1100 }, { "epoch": 0.08242982325857716, "grad_norm": 2.490581750869751, "learning_rate": 2.9505421060448538e-05, "loss": 0.0936, "step": 1110 }, { "epoch": 0.08317243427892469, "grad_norm": 3.1089231967926025, "learning_rate": 2.9500965394326453e-05, "loss": 0.1072, "step": 1120 }, { "epoch": 0.08391504529927224, "grad_norm": 5.446791172027588, "learning_rate": 2.9496509728204368e-05, "loss": 0.1201, "step": 1130 }, { "epoch": 0.08465765631961979, "grad_norm": 2.201861619949341, "learning_rate": 2.949205406208228e-05, "loss": 0.1447, "step": 1140 }, { "epoch": 0.08540026733996732, "grad_norm": 1.191215991973877, "learning_rate": 2.9487598395960198e-05, "loss": 0.1118, "step": 1150 }, { "epoch": 0.08614287836031487, "grad_norm": 1.263275146484375, "learning_rate": 2.9483142729838113e-05, "loss": 0.0975, "step": 1160 }, { "epoch": 0.08688548938066241, "grad_norm": 4.553534507751465, "learning_rate": 2.9478687063716025e-05, "loss": 0.112, "step": 1170 }, { "epoch": 0.08762810040100995, "grad_norm": 2.4747018814086914, "learning_rate": 2.9474231397593943e-05, "loss": 0.1136, "step": 1180 }, { "epoch": 0.0883707114213575, "grad_norm": 1.9060287475585938, "learning_rate": 2.9469775731471858e-05, "loss": 0.1184, "step": 1190 }, { "epoch": 0.08911332244170503, "grad_norm": 2.2394371032714844, "learning_rate": 2.946532006534977e-05, "loss": 0.1119, "step": 1200 }, { "epoch": 0.08985593346205258, "grad_norm": 4.413462162017822, "learning_rate": 2.9460864399227685e-05, "loss": 0.1005, "step": 1210 }, { "epoch": 0.09059854448240012, "grad_norm": 2.7055234909057617, "learning_rate": 2.94564087331056e-05, "loss": 0.1211, "step": 1220 }, { "epoch": 0.09134115550274766, "grad_norm": 4.200916290283203, "learning_rate": 2.9451953066983515e-05, "loss": 0.1064, "step": 1230 }, { "epoch": 0.0920837665230952, "grad_norm": 4.629003047943115, "learning_rate": 2.944749740086143e-05, "loss": 0.1465, "step": 1240 }, { "epoch": 0.09282637754344275, "grad_norm": 2.249943494796753, "learning_rate": 2.9443041734739345e-05, "loss": 0.1019, "step": 1250 }, { "epoch": 0.09356898856379028, "grad_norm": 2.5523922443389893, "learning_rate": 2.943858606861726e-05, "loss": 0.1458, "step": 1260 }, { "epoch": 0.09431159958413783, "grad_norm": 1.662513017654419, "learning_rate": 2.9434130402495175e-05, "loss": 0.1219, "step": 1270 }, { "epoch": 0.09505421060448536, "grad_norm": 1.973795771598816, "learning_rate": 2.9429674736373086e-05, "loss": 0.1208, "step": 1280 }, { "epoch": 0.09579682162483291, "grad_norm": 2.5172524452209473, "learning_rate": 2.9425219070251005e-05, "loss": 0.0764, "step": 1290 }, { "epoch": 0.09653943264518046, "grad_norm": 2.8811981678009033, "learning_rate": 2.9420763404128916e-05, "loss": 0.1142, "step": 1300 }, { "epoch": 0.09728204366552799, "grad_norm": 2.752640962600708, "learning_rate": 2.941630773800683e-05, "loss": 0.0713, "step": 1310 }, { "epoch": 0.09802465468587554, "grad_norm": 3.1258955001831055, "learning_rate": 2.941185207188475e-05, "loss": 0.1004, "step": 1320 }, { "epoch": 0.09876726570622309, "grad_norm": 2.652444839477539, "learning_rate": 2.940739640576266e-05, "loss": 0.1075, "step": 1330 }, { "epoch": 0.09950987672657062, "grad_norm": 1.6719880104064941, "learning_rate": 2.9402940739640576e-05, "loss": 0.1315, "step": 1340 }, { "epoch": 0.10025248774691817, "grad_norm": 1.6493836641311646, "learning_rate": 2.9398485073518494e-05, "loss": 0.1261, "step": 1350 }, { "epoch": 0.1009950987672657, "grad_norm": 3.3327760696411133, "learning_rate": 2.9394029407396406e-05, "loss": 0.1226, "step": 1360 }, { "epoch": 0.10173770978761325, "grad_norm": 1.6937384605407715, "learning_rate": 2.938957374127432e-05, "loss": 0.1263, "step": 1370 }, { "epoch": 0.1024803208079608, "grad_norm": 1.859004259109497, "learning_rate": 2.9385118075152236e-05, "loss": 0.0915, "step": 1380 }, { "epoch": 0.10322293182830833, "grad_norm": 2.384235382080078, "learning_rate": 2.938066240903015e-05, "loss": 0.1396, "step": 1390 }, { "epoch": 0.10396554284865588, "grad_norm": 1.8067560195922852, "learning_rate": 2.9376206742908066e-05, "loss": 0.0948, "step": 1400 }, { "epoch": 0.10470815386900341, "grad_norm": 0.9995975494384766, "learning_rate": 2.9371751076785978e-05, "loss": 0.0838, "step": 1410 }, { "epoch": 0.10545076488935096, "grad_norm": 1.3114655017852783, "learning_rate": 2.9367295410663896e-05, "loss": 0.1145, "step": 1420 }, { "epoch": 0.1061933759096985, "grad_norm": 2.0225460529327393, "learning_rate": 2.936283974454181e-05, "loss": 0.1128, "step": 1430 }, { "epoch": 0.10693598693004604, "grad_norm": 2.146571159362793, "learning_rate": 2.9358384078419723e-05, "loss": 0.0799, "step": 1440 }, { "epoch": 0.10767859795039358, "grad_norm": 4.288600921630859, "learning_rate": 2.9353928412297638e-05, "loss": 0.1249, "step": 1450 }, { "epoch": 0.10842120897074113, "grad_norm": 1.9718669652938843, "learning_rate": 2.9349472746175556e-05, "loss": 0.1119, "step": 1460 }, { "epoch": 0.10916381999108866, "grad_norm": 2.543238639831543, "learning_rate": 2.9345017080053468e-05, "loss": 0.113, "step": 1470 }, { "epoch": 0.10990643101143621, "grad_norm": 1.8163429498672485, "learning_rate": 2.9340561413931383e-05, "loss": 0.0969, "step": 1480 }, { "epoch": 0.11064904203178375, "grad_norm": 1.0760383605957031, "learning_rate": 2.93361057478093e-05, "loss": 0.0848, "step": 1490 }, { "epoch": 0.11139165305213129, "grad_norm": 0.9687877297401428, "learning_rate": 2.9331650081687213e-05, "loss": 0.0766, "step": 1500 }, { "epoch": 0.11213426407247884, "grad_norm": 3.9466569423675537, "learning_rate": 2.9327194415565128e-05, "loss": 0.1187, "step": 1510 }, { "epoch": 0.11287687509282637, "grad_norm": 4.158041477203369, "learning_rate": 2.9322738749443043e-05, "loss": 0.1327, "step": 1520 }, { "epoch": 0.11361948611317392, "grad_norm": 4.5801591873168945, "learning_rate": 2.9318283083320958e-05, "loss": 0.0884, "step": 1530 }, { "epoch": 0.11436209713352147, "grad_norm": 4.984243392944336, "learning_rate": 2.9313827417198873e-05, "loss": 0.1085, "step": 1540 }, { "epoch": 0.115104708153869, "grad_norm": 3.010652780532837, "learning_rate": 2.9309371751076784e-05, "loss": 0.0696, "step": 1550 }, { "epoch": 0.11584731917421655, "grad_norm": 2.6272575855255127, "learning_rate": 2.9304916084954703e-05, "loss": 0.0979, "step": 1560 }, { "epoch": 0.11658993019456408, "grad_norm": 5.034517765045166, "learning_rate": 2.9300460418832618e-05, "loss": 0.0946, "step": 1570 }, { "epoch": 0.11733254121491163, "grad_norm": 2.4360742568969727, "learning_rate": 2.929600475271053e-05, "loss": 0.1243, "step": 1580 }, { "epoch": 0.11807515223525918, "grad_norm": 1.762876033782959, "learning_rate": 2.9291549086588448e-05, "loss": 0.0878, "step": 1590 }, { "epoch": 0.11881776325560671, "grad_norm": 4.341997146606445, "learning_rate": 2.9287093420466363e-05, "loss": 0.143, "step": 1600 }, { "epoch": 0.11956037427595426, "grad_norm": 2.6205196380615234, "learning_rate": 2.9282637754344274e-05, "loss": 0.1234, "step": 1610 }, { "epoch": 0.1203029852963018, "grad_norm": 1.5635881423950195, "learning_rate": 2.927818208822219e-05, "loss": 0.0918, "step": 1620 }, { "epoch": 0.12104559631664934, "grad_norm": 4.154393672943115, "learning_rate": 2.9273726422100104e-05, "loss": 0.1227, "step": 1630 }, { "epoch": 0.12178820733699688, "grad_norm": 3.161184549331665, "learning_rate": 2.926927075597802e-05, "loss": 0.1084, "step": 1640 }, { "epoch": 0.12253081835734442, "grad_norm": 1.4087167978286743, "learning_rate": 2.9264815089855934e-05, "loss": 0.0725, "step": 1650 }, { "epoch": 0.12327342937769196, "grad_norm": 2.6927695274353027, "learning_rate": 2.926035942373385e-05, "loss": 0.0761, "step": 1660 }, { "epoch": 0.12401604039803951, "grad_norm": 2.0278165340423584, "learning_rate": 2.9255903757611764e-05, "loss": 0.0915, "step": 1670 }, { "epoch": 0.12475865141838705, "grad_norm": 4.448098659515381, "learning_rate": 2.925144809148968e-05, "loss": 0.1183, "step": 1680 }, { "epoch": 0.12550126243873458, "grad_norm": 1.2473807334899902, "learning_rate": 2.9246992425367594e-05, "loss": 0.0999, "step": 1690 }, { "epoch": 0.12624387345908214, "grad_norm": 1.8847259283065796, "learning_rate": 2.924253675924551e-05, "loss": 0.0896, "step": 1700 }, { "epoch": 0.12698648447942967, "grad_norm": 2.6261157989501953, "learning_rate": 2.923808109312342e-05, "loss": 0.1043, "step": 1710 }, { "epoch": 0.1277290954997772, "grad_norm": 4.396406650543213, "learning_rate": 2.9233625427001336e-05, "loss": 0.1166, "step": 1720 }, { "epoch": 0.12847170652012477, "grad_norm": 1.8150869607925415, "learning_rate": 2.9229169760879254e-05, "loss": 0.0932, "step": 1730 }, { "epoch": 0.1292143175404723, "grad_norm": 1.0094959735870361, "learning_rate": 2.9224714094757166e-05, "loss": 0.107, "step": 1740 }, { "epoch": 0.12995692856081983, "grad_norm": 1.1417855024337769, "learning_rate": 2.922025842863508e-05, "loss": 0.1165, "step": 1750 }, { "epoch": 0.1306995395811674, "grad_norm": 2.269012689590454, "learning_rate": 2.9215802762513e-05, "loss": 0.124, "step": 1760 }, { "epoch": 0.13144215060151493, "grad_norm": 2.0236096382141113, "learning_rate": 2.921134709639091e-05, "loss": 0.0817, "step": 1770 }, { "epoch": 0.13218476162186246, "grad_norm": 3.055938482284546, "learning_rate": 2.9206891430268826e-05, "loss": 0.1123, "step": 1780 }, { "epoch": 0.13292737264221002, "grad_norm": 3.183199882507324, "learning_rate": 2.920243576414674e-05, "loss": 0.1264, "step": 1790 }, { "epoch": 0.13366998366255756, "grad_norm": 1.401501178741455, "learning_rate": 2.9197980098024656e-05, "loss": 0.1142, "step": 1800 }, { "epoch": 0.1344125946829051, "grad_norm": 0.9348109364509583, "learning_rate": 2.919352443190257e-05, "loss": 0.0885, "step": 1810 }, { "epoch": 0.13515520570325262, "grad_norm": 2.85656476020813, "learning_rate": 2.9189068765780482e-05, "loss": 0.1249, "step": 1820 }, { "epoch": 0.13589781672360018, "grad_norm": 2.1008095741271973, "learning_rate": 2.91846130996584e-05, "loss": 0.092, "step": 1830 }, { "epoch": 0.13664042774394772, "grad_norm": 3.1172657012939453, "learning_rate": 2.9180157433536316e-05, "loss": 0.0867, "step": 1840 }, { "epoch": 0.13738303876429525, "grad_norm": 1.8529694080352783, "learning_rate": 2.9175701767414227e-05, "loss": 0.0946, "step": 1850 }, { "epoch": 0.1381256497846428, "grad_norm": 2.7626330852508545, "learning_rate": 2.9171246101292142e-05, "loss": 0.0952, "step": 1860 }, { "epoch": 0.13886826080499035, "grad_norm": 7.8472089767456055, "learning_rate": 2.916679043517006e-05, "loss": 0.1124, "step": 1870 }, { "epoch": 0.13961087182533788, "grad_norm": 2.6485297679901123, "learning_rate": 2.9162334769047972e-05, "loss": 0.1109, "step": 1880 }, { "epoch": 0.14035348284568544, "grad_norm": 4.575742721557617, "learning_rate": 2.9157879102925887e-05, "loss": 0.0989, "step": 1890 }, { "epoch": 0.14109609386603297, "grad_norm": 4.842647552490234, "learning_rate": 2.9153423436803806e-05, "loss": 0.1074, "step": 1900 }, { "epoch": 0.1418387048863805, "grad_norm": 3.04119873046875, "learning_rate": 2.9148967770681717e-05, "loss": 0.0972, "step": 1910 }, { "epoch": 0.14258131590672807, "grad_norm": 2.0396547317504883, "learning_rate": 2.9144512104559632e-05, "loss": 0.1181, "step": 1920 }, { "epoch": 0.1433239269270756, "grad_norm": 2.2266111373901367, "learning_rate": 2.9140056438437547e-05, "loss": 0.0901, "step": 1930 }, { "epoch": 0.14406653794742313, "grad_norm": 3.9434754848480225, "learning_rate": 2.9135600772315462e-05, "loss": 0.1078, "step": 1940 }, { "epoch": 0.1448091489677707, "grad_norm": 1.5583536624908447, "learning_rate": 2.9131145106193377e-05, "loss": 0.0767, "step": 1950 }, { "epoch": 0.14555175998811823, "grad_norm": 2.2595632076263428, "learning_rate": 2.912668944007129e-05, "loss": 0.0906, "step": 1960 }, { "epoch": 0.14629437100846576, "grad_norm": 1.268849492073059, "learning_rate": 2.9122233773949207e-05, "loss": 0.0847, "step": 1970 }, { "epoch": 0.1470369820288133, "grad_norm": 2.6412172317504883, "learning_rate": 2.9117778107827122e-05, "loss": 0.1297, "step": 1980 }, { "epoch": 0.14777959304916086, "grad_norm": 3.151843547821045, "learning_rate": 2.9113322441705034e-05, "loss": 0.0975, "step": 1990 }, { "epoch": 0.1485222040695084, "grad_norm": 2.6987667083740234, "learning_rate": 2.9108866775582952e-05, "loss": 0.0992, "step": 2000 }, { "epoch": 0.14926481508985592, "grad_norm": 3.049734115600586, "learning_rate": 2.9104411109460867e-05, "loss": 0.1189, "step": 2010 }, { "epoch": 0.15000742611020348, "grad_norm": 2.4125332832336426, "learning_rate": 2.909995544333878e-05, "loss": 0.059, "step": 2020 }, { "epoch": 0.15075003713055102, "grad_norm": 2.5139408111572266, "learning_rate": 2.9095499777216694e-05, "loss": 0.1085, "step": 2030 }, { "epoch": 0.15149264815089855, "grad_norm": 2.7138638496398926, "learning_rate": 2.909104411109461e-05, "loss": 0.1312, "step": 2040 }, { "epoch": 0.1522352591712461, "grad_norm": 2.654601812362671, "learning_rate": 2.9086588444972524e-05, "loss": 0.1116, "step": 2050 }, { "epoch": 0.15297787019159365, "grad_norm": 2.8384549617767334, "learning_rate": 2.908213277885044e-05, "loss": 0.0842, "step": 2060 }, { "epoch": 0.15372048121194118, "grad_norm": 2.3352036476135254, "learning_rate": 2.9077677112728354e-05, "loss": 0.1261, "step": 2070 }, { "epoch": 0.15446309223228874, "grad_norm": 4.299140453338623, "learning_rate": 2.907322144660627e-05, "loss": 0.1066, "step": 2080 }, { "epoch": 0.15520570325263627, "grad_norm": 3.3088362216949463, "learning_rate": 2.9068765780484184e-05, "loss": 0.0954, "step": 2090 }, { "epoch": 0.1559483142729838, "grad_norm": 1.6698198318481445, "learning_rate": 2.90643101143621e-05, "loss": 0.1086, "step": 2100 }, { "epoch": 0.15669092529333134, "grad_norm": 2.066899061203003, "learning_rate": 2.9059854448240014e-05, "loss": 0.1169, "step": 2110 }, { "epoch": 0.1574335363136789, "grad_norm": 1.2918481826782227, "learning_rate": 2.905539878211793e-05, "loss": 0.1248, "step": 2120 }, { "epoch": 0.15817614733402643, "grad_norm": 3.492408037185669, "learning_rate": 2.905094311599584e-05, "loss": 0.0907, "step": 2130 }, { "epoch": 0.15891875835437397, "grad_norm": 4.03883695602417, "learning_rate": 2.904648744987376e-05, "loss": 0.1044, "step": 2140 }, { "epoch": 0.15966136937472153, "grad_norm": 2.541898250579834, "learning_rate": 2.904203178375167e-05, "loss": 0.1244, "step": 2150 }, { "epoch": 0.16040398039506906, "grad_norm": 1.1779425144195557, "learning_rate": 2.9037576117629585e-05, "loss": 0.096, "step": 2160 }, { "epoch": 0.1611465914154166, "grad_norm": 2.521737575531006, "learning_rate": 2.9033120451507504e-05, "loss": 0.0854, "step": 2170 }, { "epoch": 0.16188920243576416, "grad_norm": 2.2708542346954346, "learning_rate": 2.9028664785385415e-05, "loss": 0.1053, "step": 2180 }, { "epoch": 0.1626318134561117, "grad_norm": 1.611698865890503, "learning_rate": 2.902420911926333e-05, "loss": 0.1094, "step": 2190 }, { "epoch": 0.16337442447645922, "grad_norm": 2.962660074234009, "learning_rate": 2.9019753453141245e-05, "loss": 0.1022, "step": 2200 }, { "epoch": 0.16411703549680678, "grad_norm": 1.720831036567688, "learning_rate": 2.901529778701916e-05, "loss": 0.1207, "step": 2210 }, { "epoch": 0.16485964651715432, "grad_norm": 2.2500967979431152, "learning_rate": 2.9010842120897075e-05, "loss": 0.1023, "step": 2220 }, { "epoch": 0.16560225753750185, "grad_norm": 2.8786466121673584, "learning_rate": 2.9006386454774987e-05, "loss": 0.133, "step": 2230 }, { "epoch": 0.16634486855784938, "grad_norm": 2.339738607406616, "learning_rate": 2.9001930788652905e-05, "loss": 0.1436, "step": 2240 }, { "epoch": 0.16708747957819695, "grad_norm": 2.527097702026367, "learning_rate": 2.899747512253082e-05, "loss": 0.0817, "step": 2250 }, { "epoch": 0.16783009059854448, "grad_norm": 2.750969171524048, "learning_rate": 2.8993019456408732e-05, "loss": 0.1492, "step": 2260 }, { "epoch": 0.168572701618892, "grad_norm": 2.195770740509033, "learning_rate": 2.8988563790286647e-05, "loss": 0.115, "step": 2270 }, { "epoch": 0.16931531263923957, "grad_norm": 3.0774083137512207, "learning_rate": 2.8984108124164565e-05, "loss": 0.1062, "step": 2280 }, { "epoch": 0.1700579236595871, "grad_norm": 2.673882484436035, "learning_rate": 2.8979652458042477e-05, "loss": 0.1063, "step": 2290 }, { "epoch": 0.17080053467993464, "grad_norm": 3.152207612991333, "learning_rate": 2.8975196791920392e-05, "loss": 0.1098, "step": 2300 }, { "epoch": 0.1715431457002822, "grad_norm": 4.860641956329346, "learning_rate": 2.897074112579831e-05, "loss": 0.1073, "step": 2310 }, { "epoch": 0.17228575672062973, "grad_norm": 2.261838436126709, "learning_rate": 2.8966285459676222e-05, "loss": 0.1035, "step": 2320 }, { "epoch": 0.17302836774097727, "grad_norm": 1.3627759218215942, "learning_rate": 2.8961829793554137e-05, "loss": 0.0873, "step": 2330 }, { "epoch": 0.17377097876132483, "grad_norm": 2.009950637817383, "learning_rate": 2.8957374127432052e-05, "loss": 0.1033, "step": 2340 }, { "epoch": 0.17451358978167236, "grad_norm": 1.0061966180801392, "learning_rate": 2.8952918461309967e-05, "loss": 0.0995, "step": 2350 }, { "epoch": 0.1752562008020199, "grad_norm": 4.665594100952148, "learning_rate": 2.8948462795187882e-05, "loss": 0.1148, "step": 2360 }, { "epoch": 0.17599881182236746, "grad_norm": 2.1051509380340576, "learning_rate": 2.8944007129065793e-05, "loss": 0.0832, "step": 2370 }, { "epoch": 0.176741422842715, "grad_norm": 4.589431285858154, "learning_rate": 2.8939551462943712e-05, "loss": 0.072, "step": 2380 }, { "epoch": 0.17748403386306252, "grad_norm": 5.031434059143066, "learning_rate": 2.8935095796821627e-05, "loss": 0.1275, "step": 2390 }, { "epoch": 0.17822664488341006, "grad_norm": 1.6660507917404175, "learning_rate": 2.893064013069954e-05, "loss": 0.1189, "step": 2400 }, { "epoch": 0.17896925590375762, "grad_norm": 2.1114559173583984, "learning_rate": 2.8926184464577457e-05, "loss": 0.09, "step": 2410 }, { "epoch": 0.17971186692410515, "grad_norm": 1.1121262311935425, "learning_rate": 2.8921728798455372e-05, "loss": 0.0799, "step": 2420 }, { "epoch": 0.18045447794445268, "grad_norm": 2.8174080848693848, "learning_rate": 2.8917273132333283e-05, "loss": 0.0926, "step": 2430 }, { "epoch": 0.18119708896480025, "grad_norm": 3.2218480110168457, "learning_rate": 2.89128174662112e-05, "loss": 0.0758, "step": 2440 }, { "epoch": 0.18193969998514778, "grad_norm": 1.7610548734664917, "learning_rate": 2.8908361800089113e-05, "loss": 0.1052, "step": 2450 }, { "epoch": 0.1826823110054953, "grad_norm": 2.7015151977539062, "learning_rate": 2.890390613396703e-05, "loss": 0.0857, "step": 2460 }, { "epoch": 0.18342492202584287, "grad_norm": 1.8576743602752686, "learning_rate": 2.8899450467844943e-05, "loss": 0.0653, "step": 2470 }, { "epoch": 0.1841675330461904, "grad_norm": 5.928577423095703, "learning_rate": 2.889499480172286e-05, "loss": 0.1243, "step": 2480 }, { "epoch": 0.18491014406653794, "grad_norm": 2.597346544265747, "learning_rate": 2.8890539135600773e-05, "loss": 0.1309, "step": 2490 }, { "epoch": 0.1856527550868855, "grad_norm": 3.324141263961792, "learning_rate": 2.888608346947869e-05, "loss": 0.0892, "step": 2500 }, { "epoch": 0.18639536610723303, "grad_norm": 2.4857001304626465, "learning_rate": 2.8881627803356603e-05, "loss": 0.0984, "step": 2510 }, { "epoch": 0.18713797712758057, "grad_norm": 2.5961930751800537, "learning_rate": 2.887717213723452e-05, "loss": 0.091, "step": 2520 }, { "epoch": 0.1878805881479281, "grad_norm": 0.8424578309059143, "learning_rate": 2.8872716471112433e-05, "loss": 0.1048, "step": 2530 }, { "epoch": 0.18862319916827566, "grad_norm": 1.7092845439910889, "learning_rate": 2.8868260804990345e-05, "loss": 0.0916, "step": 2540 }, { "epoch": 0.1893658101886232, "grad_norm": 1.8642319440841675, "learning_rate": 2.8863805138868263e-05, "loss": 0.0953, "step": 2550 }, { "epoch": 0.19010842120897073, "grad_norm": 3.4981400966644287, "learning_rate": 2.8859349472746175e-05, "loss": 0.088, "step": 2560 }, { "epoch": 0.1908510322293183, "grad_norm": 4.905360221862793, "learning_rate": 2.885489380662409e-05, "loss": 0.102, "step": 2570 }, { "epoch": 0.19159364324966582, "grad_norm": 3.5098886489868164, "learning_rate": 2.8850438140502008e-05, "loss": 0.116, "step": 2580 }, { "epoch": 0.19233625427001336, "grad_norm": 2.462068557739258, "learning_rate": 2.884598247437992e-05, "loss": 0.0878, "step": 2590 }, { "epoch": 0.19307886529036092, "grad_norm": 1.3594739437103271, "learning_rate": 2.8841526808257835e-05, "loss": 0.0978, "step": 2600 }, { "epoch": 0.19382147631070845, "grad_norm": 0.8977119326591492, "learning_rate": 2.883707114213575e-05, "loss": 0.1115, "step": 2610 }, { "epoch": 0.19456408733105598, "grad_norm": 4.278836727142334, "learning_rate": 2.8832615476013665e-05, "loss": 0.0873, "step": 2620 }, { "epoch": 0.19530669835140355, "grad_norm": 2.4040420055389404, "learning_rate": 2.882815980989158e-05, "loss": 0.1154, "step": 2630 }, { "epoch": 0.19604930937175108, "grad_norm": 3.2387709617614746, "learning_rate": 2.8823704143769495e-05, "loss": 0.1202, "step": 2640 }, { "epoch": 0.1967919203920986, "grad_norm": 1.2771217823028564, "learning_rate": 2.881924847764741e-05, "loss": 0.1064, "step": 2650 }, { "epoch": 0.19753453141244617, "grad_norm": 4.477030277252197, "learning_rate": 2.8814792811525325e-05, "loss": 0.0813, "step": 2660 }, { "epoch": 0.1982771424327937, "grad_norm": 2.6116533279418945, "learning_rate": 2.8810337145403236e-05, "loss": 0.1133, "step": 2670 }, { "epoch": 0.19901975345314124, "grad_norm": 2.1124253273010254, "learning_rate": 2.880588147928115e-05, "loss": 0.1134, "step": 2680 }, { "epoch": 0.19976236447348877, "grad_norm": 2.3649754524230957, "learning_rate": 2.880142581315907e-05, "loss": 0.1062, "step": 2690 }, { "epoch": 0.20050497549383633, "grad_norm": 2.6647801399230957, "learning_rate": 2.879697014703698e-05, "loss": 0.1092, "step": 2700 }, { "epoch": 0.20124758651418387, "grad_norm": 3.3392791748046875, "learning_rate": 2.8792514480914896e-05, "loss": 0.1107, "step": 2710 }, { "epoch": 0.2019901975345314, "grad_norm": 2.0530688762664795, "learning_rate": 2.8788058814792815e-05, "loss": 0.1148, "step": 2720 }, { "epoch": 0.20273280855487896, "grad_norm": 2.3883824348449707, "learning_rate": 2.8783603148670726e-05, "loss": 0.0924, "step": 2730 }, { "epoch": 0.2034754195752265, "grad_norm": 1.486218810081482, "learning_rate": 2.877914748254864e-05, "loss": 0.0943, "step": 2740 }, { "epoch": 0.20421803059557403, "grad_norm": 2.0853097438812256, "learning_rate": 2.8774691816426556e-05, "loss": 0.1223, "step": 2750 }, { "epoch": 0.2049606416159216, "grad_norm": 3.1080849170684814, "learning_rate": 2.877023615030447e-05, "loss": 0.0905, "step": 2760 }, { "epoch": 0.20570325263626912, "grad_norm": 1.9018908739089966, "learning_rate": 2.8765780484182386e-05, "loss": 0.0928, "step": 2770 }, { "epoch": 0.20644586365661666, "grad_norm": 2.179426908493042, "learning_rate": 2.8761324818060298e-05, "loss": 0.1011, "step": 2780 }, { "epoch": 0.20718847467696422, "grad_norm": 1.9516263008117676, "learning_rate": 2.8756869151938216e-05, "loss": 0.1033, "step": 2790 }, { "epoch": 0.20793108569731175, "grad_norm": 2.347296953201294, "learning_rate": 2.875241348581613e-05, "loss": 0.104, "step": 2800 }, { "epoch": 0.20867369671765928, "grad_norm": 2.022731304168701, "learning_rate": 2.8747957819694043e-05, "loss": 0.0947, "step": 2810 }, { "epoch": 0.20941630773800682, "grad_norm": 1.8994909524917603, "learning_rate": 2.874350215357196e-05, "loss": 0.0827, "step": 2820 }, { "epoch": 0.21015891875835438, "grad_norm": 1.9812676906585693, "learning_rate": 2.8739046487449876e-05, "loss": 0.0721, "step": 2830 }, { "epoch": 0.2109015297787019, "grad_norm": 0.4040673077106476, "learning_rate": 2.8734590821327788e-05, "loss": 0.0885, "step": 2840 }, { "epoch": 0.21164414079904945, "grad_norm": 3.147190570831299, "learning_rate": 2.8730135155205703e-05, "loss": 0.1106, "step": 2850 }, { "epoch": 0.212386751819397, "grad_norm": 2.5040011405944824, "learning_rate": 2.8725679489083618e-05, "loss": 0.1102, "step": 2860 }, { "epoch": 0.21312936283974454, "grad_norm": 2.1592671871185303, "learning_rate": 2.8721223822961533e-05, "loss": 0.0697, "step": 2870 }, { "epoch": 0.21387197386009207, "grad_norm": 2.1262803077697754, "learning_rate": 2.8716768156839448e-05, "loss": 0.1251, "step": 2880 }, { "epoch": 0.21461458488043963, "grad_norm": 6.860218524932861, "learning_rate": 2.8712312490717363e-05, "loss": 0.0852, "step": 2890 }, { "epoch": 0.21535719590078717, "grad_norm": 3.187988758087158, "learning_rate": 2.8707856824595278e-05, "loss": 0.112, "step": 2900 }, { "epoch": 0.2160998069211347, "grad_norm": 2.9651613235473633, "learning_rate": 2.8703401158473193e-05, "loss": 0.0847, "step": 2910 }, { "epoch": 0.21684241794148226, "grad_norm": 1.7240506410598755, "learning_rate": 2.8698945492351108e-05, "loss": 0.0983, "step": 2920 }, { "epoch": 0.2175850289618298, "grad_norm": 3.074819803237915, "learning_rate": 2.8694489826229023e-05, "loss": 0.0944, "step": 2930 }, { "epoch": 0.21832763998217733, "grad_norm": 4.255871772766113, "learning_rate": 2.8690034160106938e-05, "loss": 0.1251, "step": 2940 }, { "epoch": 0.2190702510025249, "grad_norm": 2.6262733936309814, "learning_rate": 2.868557849398485e-05, "loss": 0.0804, "step": 2950 }, { "epoch": 0.21981286202287242, "grad_norm": 1.9793500900268555, "learning_rate": 2.8681122827862768e-05, "loss": 0.1058, "step": 2960 }, { "epoch": 0.22055547304321996, "grad_norm": 1.1691769361495972, "learning_rate": 2.867666716174068e-05, "loss": 0.0838, "step": 2970 }, { "epoch": 0.2212980840635675, "grad_norm": 2.1811420917510986, "learning_rate": 2.8672211495618594e-05, "loss": 0.1095, "step": 2980 }, { "epoch": 0.22204069508391505, "grad_norm": 2.311396360397339, "learning_rate": 2.8667755829496513e-05, "loss": 0.1164, "step": 2990 }, { "epoch": 0.22278330610426259, "grad_norm": 5.444539546966553, "learning_rate": 2.8663300163374424e-05, "loss": 0.1294, "step": 3000 }, { "epoch": 0.22352591712461012, "grad_norm": 1.1934783458709717, "learning_rate": 2.865884449725234e-05, "loss": 0.1179, "step": 3010 }, { "epoch": 0.22426852814495768, "grad_norm": 1.7925602197647095, "learning_rate": 2.8654388831130254e-05, "loss": 0.144, "step": 3020 }, { "epoch": 0.2250111391653052, "grad_norm": 4.332716941833496, "learning_rate": 2.864993316500817e-05, "loss": 0.111, "step": 3030 }, { "epoch": 0.22575375018565275, "grad_norm": 3.0859615802764893, "learning_rate": 2.8645477498886084e-05, "loss": 0.0978, "step": 3040 }, { "epoch": 0.2264963612060003, "grad_norm": 2.46098256111145, "learning_rate": 2.8641021832764e-05, "loss": 0.0836, "step": 3050 }, { "epoch": 0.22723897222634784, "grad_norm": 1.820902705192566, "learning_rate": 2.8636566166641914e-05, "loss": 0.0985, "step": 3060 }, { "epoch": 0.22798158324669537, "grad_norm": 2.86248517036438, "learning_rate": 2.863211050051983e-05, "loss": 0.1093, "step": 3070 }, { "epoch": 0.22872419426704294, "grad_norm": 2.933708429336548, "learning_rate": 2.862765483439774e-05, "loss": 0.0901, "step": 3080 }, { "epoch": 0.22946680528739047, "grad_norm": 2.0867459774017334, "learning_rate": 2.862319916827566e-05, "loss": 0.0686, "step": 3090 }, { "epoch": 0.230209416307738, "grad_norm": 2.3671841621398926, "learning_rate": 2.8618743502153574e-05, "loss": 0.118, "step": 3100 }, { "epoch": 0.23095202732808554, "grad_norm": 1.118376612663269, "learning_rate": 2.8614287836031486e-05, "loss": 0.0853, "step": 3110 }, { "epoch": 0.2316946383484331, "grad_norm": 3.297832727432251, "learning_rate": 2.86098321699094e-05, "loss": 0.0996, "step": 3120 }, { "epoch": 0.23243724936878063, "grad_norm": 2.1501147747039795, "learning_rate": 2.860537650378732e-05, "loss": 0.1381, "step": 3130 }, { "epoch": 0.23317986038912816, "grad_norm": 0.9489710927009583, "learning_rate": 2.860092083766523e-05, "loss": 0.0692, "step": 3140 }, { "epoch": 0.23392247140947572, "grad_norm": 2.0320940017700195, "learning_rate": 2.8596465171543146e-05, "loss": 0.0855, "step": 3150 }, { "epoch": 0.23466508242982326, "grad_norm": 2.169110059738159, "learning_rate": 2.859200950542106e-05, "loss": 0.0809, "step": 3160 }, { "epoch": 0.2354076934501708, "grad_norm": 3.284989595413208, "learning_rate": 2.8587553839298976e-05, "loss": 0.1028, "step": 3170 }, { "epoch": 0.23615030447051835, "grad_norm": 2.6544220447540283, "learning_rate": 2.858309817317689e-05, "loss": 0.115, "step": 3180 }, { "epoch": 0.23689291549086589, "grad_norm": 1.7478609085083008, "learning_rate": 2.8578642507054803e-05, "loss": 0.083, "step": 3190 }, { "epoch": 0.23763552651121342, "grad_norm": 2.0759472846984863, "learning_rate": 2.857418684093272e-05, "loss": 0.1224, "step": 3200 }, { "epoch": 0.23837813753156098, "grad_norm": 2.7815895080566406, "learning_rate": 2.8569731174810636e-05, "loss": 0.1166, "step": 3210 }, { "epoch": 0.2391207485519085, "grad_norm": 3.542616367340088, "learning_rate": 2.8565275508688548e-05, "loss": 0.1009, "step": 3220 }, { "epoch": 0.23986335957225605, "grad_norm": 1.8111937046051025, "learning_rate": 2.8560819842566466e-05, "loss": 0.099, "step": 3230 }, { "epoch": 0.2406059705926036, "grad_norm": 1.9494497776031494, "learning_rate": 2.855636417644438e-05, "loss": 0.0787, "step": 3240 }, { "epoch": 0.24134858161295114, "grad_norm": 1.57643461227417, "learning_rate": 2.8551908510322293e-05, "loss": 0.0731, "step": 3250 }, { "epoch": 0.24209119263329867, "grad_norm": 1.140007495880127, "learning_rate": 2.8547452844200208e-05, "loss": 0.0824, "step": 3260 }, { "epoch": 0.2428338036536462, "grad_norm": 4.138311386108398, "learning_rate": 2.8542997178078123e-05, "loss": 0.1063, "step": 3270 }, { "epoch": 0.24357641467399377, "grad_norm": 3.1349868774414062, "learning_rate": 2.8538541511956038e-05, "loss": 0.1078, "step": 3280 }, { "epoch": 0.2443190256943413, "grad_norm": 1.922900676727295, "learning_rate": 2.8534085845833953e-05, "loss": 0.0801, "step": 3290 }, { "epoch": 0.24506163671468884, "grad_norm": 3.57891583442688, "learning_rate": 2.8529630179711868e-05, "loss": 0.0956, "step": 3300 }, { "epoch": 0.2458042477350364, "grad_norm": 1.5893707275390625, "learning_rate": 2.8525174513589783e-05, "loss": 0.0762, "step": 3310 }, { "epoch": 0.24654685875538393, "grad_norm": 4.745431423187256, "learning_rate": 2.8520718847467698e-05, "loss": 0.0812, "step": 3320 }, { "epoch": 0.24728946977573146, "grad_norm": 1.915309190750122, "learning_rate": 2.8516263181345613e-05, "loss": 0.0879, "step": 3330 }, { "epoch": 0.24803208079607902, "grad_norm": 2.0146467685699463, "learning_rate": 2.8511807515223528e-05, "loss": 0.1033, "step": 3340 }, { "epoch": 0.24877469181642656, "grad_norm": 1.623887300491333, "learning_rate": 2.8507351849101443e-05, "loss": 0.0813, "step": 3350 }, { "epoch": 0.2495173028367741, "grad_norm": 0.7771584987640381, "learning_rate": 2.8502896182979354e-05, "loss": 0.0881, "step": 3360 }, { "epoch": 0.25025991385712165, "grad_norm": 2.778308868408203, "learning_rate": 2.8498440516857272e-05, "loss": 0.0786, "step": 3370 }, { "epoch": 0.25100252487746916, "grad_norm": 1.1731817722320557, "learning_rate": 2.8493984850735184e-05, "loss": 0.1235, "step": 3380 }, { "epoch": 0.2517451358978167, "grad_norm": 2.099097967147827, "learning_rate": 2.84895291846131e-05, "loss": 0.11, "step": 3390 }, { "epoch": 0.2524877469181643, "grad_norm": 1.712109923362732, "learning_rate": 2.8485073518491017e-05, "loss": 0.0873, "step": 3400 }, { "epoch": 0.2532303579385118, "grad_norm": 1.978943943977356, "learning_rate": 2.848061785236893e-05, "loss": 0.0906, "step": 3410 }, { "epoch": 0.25397296895885935, "grad_norm": 2.0468902587890625, "learning_rate": 2.8476162186246844e-05, "loss": 0.1062, "step": 3420 }, { "epoch": 0.2547155799792069, "grad_norm": 1.208884358406067, "learning_rate": 2.847170652012476e-05, "loss": 0.1014, "step": 3430 }, { "epoch": 0.2554581909995544, "grad_norm": 2.65171217918396, "learning_rate": 2.8467250854002674e-05, "loss": 0.1386, "step": 3440 }, { "epoch": 0.256200802019902, "grad_norm": 1.2456876039505005, "learning_rate": 2.846279518788059e-05, "loss": 0.0859, "step": 3450 }, { "epoch": 0.25694341304024954, "grad_norm": 1.7401740550994873, "learning_rate": 2.8458339521758504e-05, "loss": 0.1033, "step": 3460 }, { "epoch": 0.25768602406059704, "grad_norm": 5.769093990325928, "learning_rate": 2.845388385563642e-05, "loss": 0.1029, "step": 3470 }, { "epoch": 0.2584286350809446, "grad_norm": 2.1862595081329346, "learning_rate": 2.8449428189514334e-05, "loss": 0.082, "step": 3480 }, { "epoch": 0.25917124610129216, "grad_norm": 4.852025985717773, "learning_rate": 2.8444972523392246e-05, "loss": 0.0956, "step": 3490 }, { "epoch": 0.25991385712163967, "grad_norm": 2.4434781074523926, "learning_rate": 2.8440516857270164e-05, "loss": 0.108, "step": 3500 }, { "epoch": 0.26065646814198723, "grad_norm": 2.209559679031372, "learning_rate": 2.843606119114808e-05, "loss": 0.1083, "step": 3510 }, { "epoch": 0.2613990791623348, "grad_norm": 3.44124698638916, "learning_rate": 2.843160552502599e-05, "loss": 0.0981, "step": 3520 }, { "epoch": 0.2621416901826823, "grad_norm": 3.689404249191284, "learning_rate": 2.8427149858903906e-05, "loss": 0.0863, "step": 3530 }, { "epoch": 0.26288430120302986, "grad_norm": 1.4514044523239136, "learning_rate": 2.8422694192781824e-05, "loss": 0.0854, "step": 3540 }, { "epoch": 0.2636269122233774, "grad_norm": 1.8752799034118652, "learning_rate": 2.8418238526659736e-05, "loss": 0.0775, "step": 3550 }, { "epoch": 0.2643695232437249, "grad_norm": 2.1504430770874023, "learning_rate": 2.841378286053765e-05, "loss": 0.0839, "step": 3560 }, { "epoch": 0.2651121342640725, "grad_norm": 3.2270238399505615, "learning_rate": 2.840932719441557e-05, "loss": 0.1367, "step": 3570 }, { "epoch": 0.26585474528442005, "grad_norm": 2.0077528953552246, "learning_rate": 2.840487152829348e-05, "loss": 0.0887, "step": 3580 }, { "epoch": 0.26659735630476755, "grad_norm": 1.6168723106384277, "learning_rate": 2.8400415862171396e-05, "loss": 0.1324, "step": 3590 }, { "epoch": 0.2673399673251151, "grad_norm": 1.800391674041748, "learning_rate": 2.8395960196049307e-05, "loss": 0.1256, "step": 3600 }, { "epoch": 0.2680825783454627, "grad_norm": 1.1540509462356567, "learning_rate": 2.8391504529927226e-05, "loss": 0.0888, "step": 3610 }, { "epoch": 0.2688251893658102, "grad_norm": 1.3013066053390503, "learning_rate": 2.838704886380514e-05, "loss": 0.096, "step": 3620 }, { "epoch": 0.26956780038615774, "grad_norm": 1.9634844064712524, "learning_rate": 2.8382593197683052e-05, "loss": 0.0817, "step": 3630 }, { "epoch": 0.27031041140650525, "grad_norm": 2.515450954437256, "learning_rate": 2.837813753156097e-05, "loss": 0.0909, "step": 3640 }, { "epoch": 0.2710530224268528, "grad_norm": 2.612504482269287, "learning_rate": 2.8373681865438886e-05, "loss": 0.1121, "step": 3650 }, { "epoch": 0.27179563344720037, "grad_norm": 1.4512356519699097, "learning_rate": 2.8369226199316797e-05, "loss": 0.0939, "step": 3660 }, { "epoch": 0.2725382444675479, "grad_norm": 2.2824881076812744, "learning_rate": 2.8364770533194712e-05, "loss": 0.1103, "step": 3670 }, { "epoch": 0.27328085548789544, "grad_norm": 1.5197831392288208, "learning_rate": 2.8360314867072627e-05, "loss": 0.1198, "step": 3680 }, { "epoch": 0.274023466508243, "grad_norm": 1.559735894203186, "learning_rate": 2.8355859200950542e-05, "loss": 0.1381, "step": 3690 }, { "epoch": 0.2747660775285905, "grad_norm": 1.8886692523956299, "learning_rate": 2.8351403534828457e-05, "loss": 0.0942, "step": 3700 }, { "epoch": 0.27550868854893806, "grad_norm": 1.677405834197998, "learning_rate": 2.8346947868706372e-05, "loss": 0.0857, "step": 3710 }, { "epoch": 0.2762512995692856, "grad_norm": 1.0931998491287231, "learning_rate": 2.8342492202584287e-05, "loss": 0.1002, "step": 3720 }, { "epoch": 0.27699391058963313, "grad_norm": 2.140795946121216, "learning_rate": 2.8338036536462202e-05, "loss": 0.1144, "step": 3730 }, { "epoch": 0.2777365216099807, "grad_norm": 1.8325400352478027, "learning_rate": 2.8333580870340117e-05, "loss": 0.0829, "step": 3740 }, { "epoch": 0.27847913263032825, "grad_norm": 2.1785285472869873, "learning_rate": 2.8329125204218032e-05, "loss": 0.0643, "step": 3750 }, { "epoch": 0.27922174365067576, "grad_norm": 2.3438045978546143, "learning_rate": 2.8324669538095947e-05, "loss": 0.0871, "step": 3760 }, { "epoch": 0.2799643546710233, "grad_norm": 2.866464853286743, "learning_rate": 2.832021387197386e-05, "loss": 0.0718, "step": 3770 }, { "epoch": 0.2807069656913709, "grad_norm": 1.4197877645492554, "learning_rate": 2.8315758205851777e-05, "loss": 0.1264, "step": 3780 }, { "epoch": 0.2814495767117184, "grad_norm": 4.769101142883301, "learning_rate": 2.831130253972969e-05, "loss": 0.0845, "step": 3790 }, { "epoch": 0.28219218773206595, "grad_norm": 1.278130292892456, "learning_rate": 2.8306846873607604e-05, "loss": 0.0915, "step": 3800 }, { "epoch": 0.2829347987524135, "grad_norm": 2.7825405597686768, "learning_rate": 2.8302391207485522e-05, "loss": 0.1035, "step": 3810 }, { "epoch": 0.283677409772761, "grad_norm": 3.6590402126312256, "learning_rate": 2.8297935541363434e-05, "loss": 0.0821, "step": 3820 }, { "epoch": 0.2844200207931086, "grad_norm": 3.2565736770629883, "learning_rate": 2.829347987524135e-05, "loss": 0.097, "step": 3830 }, { "epoch": 0.28516263181345614, "grad_norm": 1.7409720420837402, "learning_rate": 2.8289024209119264e-05, "loss": 0.0929, "step": 3840 }, { "epoch": 0.28590524283380364, "grad_norm": 2.9615607261657715, "learning_rate": 2.828456854299718e-05, "loss": 0.0771, "step": 3850 }, { "epoch": 0.2866478538541512, "grad_norm": 2.6329636573791504, "learning_rate": 2.8280112876875094e-05, "loss": 0.1215, "step": 3860 }, { "epoch": 0.28739046487449876, "grad_norm": 1.5111801624298096, "learning_rate": 2.827565721075301e-05, "loss": 0.0735, "step": 3870 }, { "epoch": 0.28813307589484627, "grad_norm": 2.780776262283325, "learning_rate": 2.8271201544630924e-05, "loss": 0.0969, "step": 3880 }, { "epoch": 0.28887568691519383, "grad_norm": 1.8121346235275269, "learning_rate": 2.826674587850884e-05, "loss": 0.1015, "step": 3890 }, { "epoch": 0.2896182979355414, "grad_norm": 1.4083514213562012, "learning_rate": 2.826229021238675e-05, "loss": 0.066, "step": 3900 }, { "epoch": 0.2903609089558889, "grad_norm": 2.5285115242004395, "learning_rate": 2.825783454626467e-05, "loss": 0.0923, "step": 3910 }, { "epoch": 0.29110351997623646, "grad_norm": 0.7836059927940369, "learning_rate": 2.8253378880142584e-05, "loss": 0.0506, "step": 3920 }, { "epoch": 0.29184613099658396, "grad_norm": 1.5895808935165405, "learning_rate": 2.8248923214020495e-05, "loss": 0.0886, "step": 3930 }, { "epoch": 0.2925887420169315, "grad_norm": 1.657165288925171, "learning_rate": 2.824446754789841e-05, "loss": 0.0845, "step": 3940 }, { "epoch": 0.2933313530372791, "grad_norm": 1.5813052654266357, "learning_rate": 2.824001188177633e-05, "loss": 0.1015, "step": 3950 }, { "epoch": 0.2940739640576266, "grad_norm": 2.2893810272216797, "learning_rate": 2.823555621565424e-05, "loss": 0.1201, "step": 3960 }, { "epoch": 0.29481657507797415, "grad_norm": 3.8998055458068848, "learning_rate": 2.8231100549532155e-05, "loss": 0.1317, "step": 3970 }, { "epoch": 0.2955591860983217, "grad_norm": 1.5163902044296265, "learning_rate": 2.8226644883410074e-05, "loss": 0.1226, "step": 3980 }, { "epoch": 0.2963017971186692, "grad_norm": 2.5356316566467285, "learning_rate": 2.8222189217287985e-05, "loss": 0.1257, "step": 3990 }, { "epoch": 0.2970444081390168, "grad_norm": 0.5978565216064453, "learning_rate": 2.82177335511659e-05, "loss": 0.0741, "step": 4000 }, { "epoch": 0.29778701915936434, "grad_norm": 3.2044990062713623, "learning_rate": 2.8213277885043812e-05, "loss": 0.0927, "step": 4010 }, { "epoch": 0.29852963017971185, "grad_norm": 2.5349199771881104, "learning_rate": 2.820882221892173e-05, "loss": 0.1082, "step": 4020 }, { "epoch": 0.2992722412000594, "grad_norm": 2.297657012939453, "learning_rate": 2.8204366552799645e-05, "loss": 0.1034, "step": 4030 }, { "epoch": 0.30001485222040697, "grad_norm": 2.956207036972046, "learning_rate": 2.8199910886677557e-05, "loss": 0.0724, "step": 4040 }, { "epoch": 0.3007574632407545, "grad_norm": 2.382066488265991, "learning_rate": 2.8195455220555475e-05, "loss": 0.0734, "step": 4050 }, { "epoch": 0.30150007426110204, "grad_norm": 2.7788658142089844, "learning_rate": 2.819099955443339e-05, "loss": 0.1129, "step": 4060 }, { "epoch": 0.3022426852814496, "grad_norm": 1.6891690492630005, "learning_rate": 2.8186543888311302e-05, "loss": 0.1168, "step": 4070 }, { "epoch": 0.3029852963017971, "grad_norm": 2.3250083923339844, "learning_rate": 2.8182088222189217e-05, "loss": 0.0998, "step": 4080 }, { "epoch": 0.30372790732214466, "grad_norm": 2.700108766555786, "learning_rate": 2.8177632556067135e-05, "loss": 0.1129, "step": 4090 }, { "epoch": 0.3044705183424922, "grad_norm": 2.239126443862915, "learning_rate": 2.8173176889945047e-05, "loss": 0.1041, "step": 4100 }, { "epoch": 0.30521312936283973, "grad_norm": 2.046869993209839, "learning_rate": 2.8168721223822962e-05, "loss": 0.1011, "step": 4110 }, { "epoch": 0.3059557403831873, "grad_norm": 2.6533050537109375, "learning_rate": 2.8164265557700877e-05, "loss": 0.0722, "step": 4120 }, { "epoch": 0.30669835140353485, "grad_norm": 1.3280346393585205, "learning_rate": 2.8159809891578792e-05, "loss": 0.0925, "step": 4130 }, { "epoch": 0.30744096242388236, "grad_norm": 1.894659161567688, "learning_rate": 2.8155354225456707e-05, "loss": 0.0688, "step": 4140 }, { "epoch": 0.3081835734442299, "grad_norm": 1.138370394706726, "learning_rate": 2.8150898559334622e-05, "loss": 0.085, "step": 4150 }, { "epoch": 0.3089261844645775, "grad_norm": 2.353771686553955, "learning_rate": 2.8146442893212537e-05, "loss": 0.1051, "step": 4160 }, { "epoch": 0.309668795484925, "grad_norm": 1.1877645254135132, "learning_rate": 2.8141987227090452e-05, "loss": 0.1023, "step": 4170 }, { "epoch": 0.31041140650527255, "grad_norm": 1.905053973197937, "learning_rate": 2.8137531560968363e-05, "loss": 0.1233, "step": 4180 }, { "epoch": 0.3111540175256201, "grad_norm": 2.760115385055542, "learning_rate": 2.813307589484628e-05, "loss": 0.1237, "step": 4190 }, { "epoch": 0.3118966285459676, "grad_norm": 2.511549711227417, "learning_rate": 2.8128620228724193e-05, "loss": 0.0922, "step": 4200 }, { "epoch": 0.3126392395663152, "grad_norm": 1.948473572731018, "learning_rate": 2.8124164562602108e-05, "loss": 0.0985, "step": 4210 }, { "epoch": 0.3133818505866627, "grad_norm": 3.190645456314087, "learning_rate": 2.8119708896480027e-05, "loss": 0.106, "step": 4220 }, { "epoch": 0.31412446160701024, "grad_norm": 2.2379205226898193, "learning_rate": 2.8115253230357938e-05, "loss": 0.0896, "step": 4230 }, { "epoch": 0.3148670726273578, "grad_norm": 1.1914069652557373, "learning_rate": 2.8110797564235853e-05, "loss": 0.0659, "step": 4240 }, { "epoch": 0.3156096836477053, "grad_norm": 1.5502461194992065, "learning_rate": 2.8106341898113768e-05, "loss": 0.0995, "step": 4250 }, { "epoch": 0.31635229466805287, "grad_norm": 2.563169240951538, "learning_rate": 2.8101886231991683e-05, "loss": 0.0967, "step": 4260 }, { "epoch": 0.31709490568840043, "grad_norm": 4.562102317810059, "learning_rate": 2.8097430565869598e-05, "loss": 0.107, "step": 4270 }, { "epoch": 0.31783751670874794, "grad_norm": 0.7943652868270874, "learning_rate": 2.8092974899747513e-05, "loss": 0.0811, "step": 4280 }, { "epoch": 0.3185801277290955, "grad_norm": 2.5280022621154785, "learning_rate": 2.8088519233625428e-05, "loss": 0.0628, "step": 4290 }, { "epoch": 0.31932273874944306, "grad_norm": 1.1994893550872803, "learning_rate": 2.8084063567503343e-05, "loss": 0.0747, "step": 4300 }, { "epoch": 0.32006534976979056, "grad_norm": 2.5964338779449463, "learning_rate": 2.8079607901381255e-05, "loss": 0.0978, "step": 4310 }, { "epoch": 0.3208079607901381, "grad_norm": 1.0539716482162476, "learning_rate": 2.8075152235259173e-05, "loss": 0.1243, "step": 4320 }, { "epoch": 0.3215505718104857, "grad_norm": 3.5578460693359375, "learning_rate": 2.8070696569137088e-05, "loss": 0.1073, "step": 4330 }, { "epoch": 0.3222931828308332, "grad_norm": 3.5634069442749023, "learning_rate": 2.8066240903015e-05, "loss": 0.1102, "step": 4340 }, { "epoch": 0.32303579385118075, "grad_norm": 1.1170202493667603, "learning_rate": 2.8061785236892915e-05, "loss": 0.0718, "step": 4350 }, { "epoch": 0.3237784048715283, "grad_norm": 2.6861186027526855, "learning_rate": 2.8057329570770833e-05, "loss": 0.0986, "step": 4360 }, { "epoch": 0.3245210158918758, "grad_norm": 2.0378482341766357, "learning_rate": 2.8052873904648745e-05, "loss": 0.1087, "step": 4370 }, { "epoch": 0.3252636269122234, "grad_norm": 2.456540822982788, "learning_rate": 2.804841823852666e-05, "loss": 0.0763, "step": 4380 }, { "epoch": 0.32600623793257094, "grad_norm": 1.6984671354293823, "learning_rate": 2.8043962572404578e-05, "loss": 0.1253, "step": 4390 }, { "epoch": 0.32674884895291845, "grad_norm": 3.025683641433716, "learning_rate": 2.803950690628249e-05, "loss": 0.1072, "step": 4400 }, { "epoch": 0.327491459973266, "grad_norm": 2.3869524002075195, "learning_rate": 2.8035051240160405e-05, "loss": 0.0733, "step": 4410 }, { "epoch": 0.32823407099361357, "grad_norm": 1.5265862941741943, "learning_rate": 2.8030595574038316e-05, "loss": 0.0541, "step": 4420 }, { "epoch": 0.3289766820139611, "grad_norm": 2.0215351581573486, "learning_rate": 2.8026139907916235e-05, "loss": 0.0865, "step": 4430 }, { "epoch": 0.32971929303430864, "grad_norm": 1.381551742553711, "learning_rate": 2.802168424179415e-05, "loss": 0.1084, "step": 4440 }, { "epoch": 0.3304619040546562, "grad_norm": 1.7766149044036865, "learning_rate": 2.801722857567206e-05, "loss": 0.0987, "step": 4450 }, { "epoch": 0.3312045150750037, "grad_norm": 2.317441701889038, "learning_rate": 2.801277290954998e-05, "loss": 0.0809, "step": 4460 }, { "epoch": 0.33194712609535126, "grad_norm": 2.322162389755249, "learning_rate": 2.8008317243427895e-05, "loss": 0.114, "step": 4470 }, { "epoch": 0.33268973711569877, "grad_norm": 2.353233575820923, "learning_rate": 2.8003861577305806e-05, "loss": 0.098, "step": 4480 }, { "epoch": 0.33343234813604633, "grad_norm": 0.9074286818504333, "learning_rate": 2.7999405911183725e-05, "loss": 0.1093, "step": 4490 }, { "epoch": 0.3341749591563939, "grad_norm": 4.220743656158447, "learning_rate": 2.799495024506164e-05, "loss": 0.0983, "step": 4500 }, { "epoch": 0.3349175701767414, "grad_norm": 2.652031898498535, "learning_rate": 2.799049457893955e-05, "loss": 0.0873, "step": 4510 }, { "epoch": 0.33566018119708896, "grad_norm": 1.0324969291687012, "learning_rate": 2.7986038912817466e-05, "loss": 0.0755, "step": 4520 }, { "epoch": 0.3364027922174365, "grad_norm": 0.8681501746177673, "learning_rate": 2.798158324669538e-05, "loss": 0.1043, "step": 4530 }, { "epoch": 0.337145403237784, "grad_norm": 1.413583755493164, "learning_rate": 2.7977127580573296e-05, "loss": 0.0682, "step": 4540 }, { "epoch": 0.3378880142581316, "grad_norm": 1.2596721649169922, "learning_rate": 2.797267191445121e-05, "loss": 0.1021, "step": 4550 }, { "epoch": 0.33863062527847915, "grad_norm": 2.051772117614746, "learning_rate": 2.7968216248329126e-05, "loss": 0.0646, "step": 4560 }, { "epoch": 0.33937323629882665, "grad_norm": 2.39245343208313, "learning_rate": 2.796376058220704e-05, "loss": 0.0913, "step": 4570 }, { "epoch": 0.3401158473191742, "grad_norm": 1.1950043439865112, "learning_rate": 2.7959304916084956e-05, "loss": 0.0772, "step": 4580 }, { "epoch": 0.3408584583395218, "grad_norm": 1.7713611125946045, "learning_rate": 2.7954849249962868e-05, "loss": 0.1027, "step": 4590 }, { "epoch": 0.3416010693598693, "grad_norm": 1.3670064210891724, "learning_rate": 2.7950393583840786e-05, "loss": 0.0623, "step": 4600 }, { "epoch": 0.34234368038021684, "grad_norm": 1.9665565490722656, "learning_rate": 2.7945937917718698e-05, "loss": 0.0808, "step": 4610 }, { "epoch": 0.3430862914005644, "grad_norm": 3.5627613067626953, "learning_rate": 2.7941482251596613e-05, "loss": 0.0861, "step": 4620 }, { "epoch": 0.3438289024209119, "grad_norm": 1.8066272735595703, "learning_rate": 2.793702658547453e-05, "loss": 0.0831, "step": 4630 }, { "epoch": 0.34457151344125947, "grad_norm": 2.1542608737945557, "learning_rate": 2.7932570919352443e-05, "loss": 0.1111, "step": 4640 }, { "epoch": 0.34531412446160703, "grad_norm": 2.243263006210327, "learning_rate": 2.7928115253230358e-05, "loss": 0.0756, "step": 4650 }, { "epoch": 0.34605673548195454, "grad_norm": 1.6739652156829834, "learning_rate": 2.7923659587108273e-05, "loss": 0.1328, "step": 4660 }, { "epoch": 0.3467993465023021, "grad_norm": 2.321486711502075, "learning_rate": 2.7919203920986188e-05, "loss": 0.0903, "step": 4670 }, { "epoch": 0.34754195752264966, "grad_norm": 2.773947238922119, "learning_rate": 2.7914748254864103e-05, "loss": 0.0967, "step": 4680 }, { "epoch": 0.34828456854299716, "grad_norm": 1.9256445169448853, "learning_rate": 2.7910292588742018e-05, "loss": 0.1307, "step": 4690 }, { "epoch": 0.3490271795633447, "grad_norm": 2.0387189388275146, "learning_rate": 2.7905836922619933e-05, "loss": 0.0758, "step": 4700 }, { "epoch": 0.3497697905836923, "grad_norm": 0.6718337535858154, "learning_rate": 2.7901381256497848e-05, "loss": 0.0923, "step": 4710 }, { "epoch": 0.3505124016040398, "grad_norm": 2.4144012928009033, "learning_rate": 2.789692559037576e-05, "loss": 0.0815, "step": 4720 }, { "epoch": 0.35125501262438735, "grad_norm": 0.7492033839225769, "learning_rate": 2.7892469924253678e-05, "loss": 0.1039, "step": 4730 }, { "epoch": 0.3519976236447349, "grad_norm": 3.2652149200439453, "learning_rate": 2.7888014258131593e-05, "loss": 0.1027, "step": 4740 }, { "epoch": 0.3527402346650824, "grad_norm": 1.8765047788619995, "learning_rate": 2.7883558592009504e-05, "loss": 0.1082, "step": 4750 }, { "epoch": 0.35348284568543, "grad_norm": 2.7471463680267334, "learning_rate": 2.787910292588742e-05, "loss": 0.0829, "step": 4760 }, { "epoch": 0.3542254567057775, "grad_norm": 4.803821563720703, "learning_rate": 2.7874647259765338e-05, "loss": 0.0897, "step": 4770 }, { "epoch": 0.35496806772612505, "grad_norm": 1.495339035987854, "learning_rate": 2.787019159364325e-05, "loss": 0.0898, "step": 4780 }, { "epoch": 0.3557106787464726, "grad_norm": 2.4038844108581543, "learning_rate": 2.7865735927521164e-05, "loss": 0.0976, "step": 4790 }, { "epoch": 0.3564532897668201, "grad_norm": 1.810927152633667, "learning_rate": 2.7861280261399083e-05, "loss": 0.0859, "step": 4800 }, { "epoch": 0.3571959007871677, "grad_norm": 3.185044527053833, "learning_rate": 2.7856824595276994e-05, "loss": 0.0832, "step": 4810 }, { "epoch": 0.35793851180751524, "grad_norm": 4.21889066696167, "learning_rate": 2.785236892915491e-05, "loss": 0.0992, "step": 4820 }, { "epoch": 0.35868112282786274, "grad_norm": 1.788333773612976, "learning_rate": 2.784791326303282e-05, "loss": 0.0538, "step": 4830 }, { "epoch": 0.3594237338482103, "grad_norm": 3.176811933517456, "learning_rate": 2.784345759691074e-05, "loss": 0.107, "step": 4840 }, { "epoch": 0.36016634486855786, "grad_norm": 3.0961802005767822, "learning_rate": 2.7839001930788654e-05, "loss": 0.0717, "step": 4850 }, { "epoch": 0.36090895588890537, "grad_norm": 1.6400991678237915, "learning_rate": 2.7834546264666566e-05, "loss": 0.0998, "step": 4860 }, { "epoch": 0.36165156690925293, "grad_norm": 1.0916283130645752, "learning_rate": 2.7830090598544484e-05, "loss": 0.0795, "step": 4870 }, { "epoch": 0.3623941779296005, "grad_norm": 0.8684899210929871, "learning_rate": 2.78256349324224e-05, "loss": 0.1083, "step": 4880 }, { "epoch": 0.363136788949948, "grad_norm": 6.465219497680664, "learning_rate": 2.782117926630031e-05, "loss": 0.1498, "step": 4890 }, { "epoch": 0.36387939997029556, "grad_norm": 1.2663229703903198, "learning_rate": 2.781672360017823e-05, "loss": 0.0654, "step": 4900 }, { "epoch": 0.3646220109906431, "grad_norm": 3.739539861679077, "learning_rate": 2.7812267934056144e-05, "loss": 0.1353, "step": 4910 }, { "epoch": 0.3653646220109906, "grad_norm": 3.384850025177002, "learning_rate": 2.7807812267934056e-05, "loss": 0.111, "step": 4920 }, { "epoch": 0.3661072330313382, "grad_norm": 2.7936530113220215, "learning_rate": 2.780335660181197e-05, "loss": 0.1048, "step": 4930 }, { "epoch": 0.36684984405168575, "grad_norm": 1.8607102632522583, "learning_rate": 2.7798900935689886e-05, "loss": 0.0717, "step": 4940 }, { "epoch": 0.36759245507203325, "grad_norm": 2.1067261695861816, "learning_rate": 2.77944452695678e-05, "loss": 0.0881, "step": 4950 }, { "epoch": 0.3683350660923808, "grad_norm": 1.7310969829559326, "learning_rate": 2.7789989603445716e-05, "loss": 0.0873, "step": 4960 }, { "epoch": 0.3690776771127284, "grad_norm": 1.5683966875076294, "learning_rate": 2.778553393732363e-05, "loss": 0.1008, "step": 4970 }, { "epoch": 0.3698202881330759, "grad_norm": 3.5258140563964844, "learning_rate": 2.7781078271201546e-05, "loss": 0.0738, "step": 4980 }, { "epoch": 0.37056289915342344, "grad_norm": 1.4318699836730957, "learning_rate": 2.777662260507946e-05, "loss": 0.0773, "step": 4990 }, { "epoch": 0.371305510173771, "grad_norm": 2.4203314781188965, "learning_rate": 2.7772166938957373e-05, "loss": 0.1097, "step": 5000 }, { "epoch": 0.3720481211941185, "grad_norm": 1.1299662590026855, "learning_rate": 2.776771127283529e-05, "loss": 0.0858, "step": 5010 }, { "epoch": 0.37279073221446607, "grad_norm": 4.186913013458252, "learning_rate": 2.7763255606713206e-05, "loss": 0.1212, "step": 5020 }, { "epoch": 0.37353334323481363, "grad_norm": 3.082172393798828, "learning_rate": 2.7758799940591117e-05, "loss": 0.1027, "step": 5030 }, { "epoch": 0.37427595425516114, "grad_norm": 3.5075833797454834, "learning_rate": 2.7754344274469036e-05, "loss": 0.0974, "step": 5040 }, { "epoch": 0.3750185652755087, "grad_norm": 4.949690818786621, "learning_rate": 2.7749888608346947e-05, "loss": 0.0912, "step": 5050 }, { "epoch": 0.3757611762958562, "grad_norm": 2.1641194820404053, "learning_rate": 2.7745432942224862e-05, "loss": 0.11, "step": 5060 }, { "epoch": 0.37650378731620376, "grad_norm": 2.0834977626800537, "learning_rate": 2.7740977276102777e-05, "loss": 0.055, "step": 5070 }, { "epoch": 0.3772463983365513, "grad_norm": 1.6411371231079102, "learning_rate": 2.7736521609980692e-05, "loss": 0.0844, "step": 5080 }, { "epoch": 0.37798900935689883, "grad_norm": 2.455378770828247, "learning_rate": 2.7732065943858607e-05, "loss": 0.086, "step": 5090 }, { "epoch": 0.3787316203772464, "grad_norm": 0.9530849456787109, "learning_rate": 2.7727610277736522e-05, "loss": 0.0879, "step": 5100 }, { "epoch": 0.37947423139759395, "grad_norm": 0.6833879947662354, "learning_rate": 2.7723154611614437e-05, "loss": 0.1207, "step": 5110 }, { "epoch": 0.38021684241794146, "grad_norm": 2.2070958614349365, "learning_rate": 2.7718698945492352e-05, "loss": 0.0749, "step": 5120 }, { "epoch": 0.380959453438289, "grad_norm": 4.673049449920654, "learning_rate": 2.7714243279370264e-05, "loss": 0.0855, "step": 5130 }, { "epoch": 0.3817020644586366, "grad_norm": 2.2408103942871094, "learning_rate": 2.7709787613248182e-05, "loss": 0.0863, "step": 5140 }, { "epoch": 0.3824446754789841, "grad_norm": 1.7068830728530884, "learning_rate": 2.7705331947126097e-05, "loss": 0.0949, "step": 5150 }, { "epoch": 0.38318728649933165, "grad_norm": 1.8522627353668213, "learning_rate": 2.770087628100401e-05, "loss": 0.0875, "step": 5160 }, { "epoch": 0.3839298975196792, "grad_norm": 2.645232915878296, "learning_rate": 2.7696420614881924e-05, "loss": 0.0823, "step": 5170 }, { "epoch": 0.3846725085400267, "grad_norm": 3.677633047103882, "learning_rate": 2.7691964948759842e-05, "loss": 0.0726, "step": 5180 }, { "epoch": 0.3854151195603743, "grad_norm": 2.5653793811798096, "learning_rate": 2.7687509282637754e-05, "loss": 0.087, "step": 5190 }, { "epoch": 0.38615773058072184, "grad_norm": 3.1218738555908203, "learning_rate": 2.768305361651567e-05, "loss": 0.09, "step": 5200 }, { "epoch": 0.38690034160106934, "grad_norm": 1.5911304950714111, "learning_rate": 2.7678597950393587e-05, "loss": 0.0867, "step": 5210 }, { "epoch": 0.3876429526214169, "grad_norm": 1.051086187362671, "learning_rate": 2.76741422842715e-05, "loss": 0.1087, "step": 5220 }, { "epoch": 0.38838556364176446, "grad_norm": 2.593616247177124, "learning_rate": 2.7669686618149414e-05, "loss": 0.0842, "step": 5230 }, { "epoch": 0.38912817466211197, "grad_norm": 2.5163533687591553, "learning_rate": 2.7665230952027326e-05, "loss": 0.1173, "step": 5240 }, { "epoch": 0.38987078568245953, "grad_norm": 4.386409759521484, "learning_rate": 2.7660775285905244e-05, "loss": 0.1035, "step": 5250 }, { "epoch": 0.3906133967028071, "grad_norm": 2.9560604095458984, "learning_rate": 2.765631961978316e-05, "loss": 0.0936, "step": 5260 }, { "epoch": 0.3913560077231546, "grad_norm": 2.026900291442871, "learning_rate": 2.765186395366107e-05, "loss": 0.1084, "step": 5270 }, { "epoch": 0.39209861874350216, "grad_norm": 2.574880361557007, "learning_rate": 2.764740828753899e-05, "loss": 0.0733, "step": 5280 }, { "epoch": 0.3928412297638497, "grad_norm": 1.350338339805603, "learning_rate": 2.7642952621416904e-05, "loss": 0.092, "step": 5290 }, { "epoch": 0.3935838407841972, "grad_norm": 1.7275868654251099, "learning_rate": 2.7638496955294816e-05, "loss": 0.0534, "step": 5300 }, { "epoch": 0.3943264518045448, "grad_norm": 1.1320747137069702, "learning_rate": 2.7634041289172734e-05, "loss": 0.1091, "step": 5310 }, { "epoch": 0.39506906282489235, "grad_norm": 1.5764305591583252, "learning_rate": 2.762958562305065e-05, "loss": 0.0697, "step": 5320 }, { "epoch": 0.39581167384523985, "grad_norm": 1.4530662298202515, "learning_rate": 2.762512995692856e-05, "loss": 0.0823, "step": 5330 }, { "epoch": 0.3965542848655874, "grad_norm": 3.964816093444824, "learning_rate": 2.7620674290806476e-05, "loss": 0.0677, "step": 5340 }, { "epoch": 0.3972968958859349, "grad_norm": 2.6048128604888916, "learning_rate": 2.761621862468439e-05, "loss": 0.0656, "step": 5350 }, { "epoch": 0.3980395069062825, "grad_norm": 1.2549293041229248, "learning_rate": 2.7611762958562306e-05, "loss": 0.0803, "step": 5360 }, { "epoch": 0.39878211792663004, "grad_norm": 2.1924233436584473, "learning_rate": 2.760730729244022e-05, "loss": 0.0775, "step": 5370 }, { "epoch": 0.39952472894697755, "grad_norm": 1.1957290172576904, "learning_rate": 2.7602851626318136e-05, "loss": 0.0526, "step": 5380 }, { "epoch": 0.4002673399673251, "grad_norm": 4.39811897277832, "learning_rate": 2.759839596019605e-05, "loss": 0.0843, "step": 5390 }, { "epoch": 0.40100995098767267, "grad_norm": 2.887032985687256, "learning_rate": 2.7593940294073965e-05, "loss": 0.105, "step": 5400 }, { "epoch": 0.4017525620080202, "grad_norm": 2.1287643909454346, "learning_rate": 2.7589484627951877e-05, "loss": 0.0919, "step": 5410 }, { "epoch": 0.40249517302836774, "grad_norm": 2.559832811355591, "learning_rate": 2.7585028961829795e-05, "loss": 0.0941, "step": 5420 }, { "epoch": 0.4032377840487153, "grad_norm": 3.4506430625915527, "learning_rate": 2.758057329570771e-05, "loss": 0.0959, "step": 5430 }, { "epoch": 0.4039803950690628, "grad_norm": 0.877765953540802, "learning_rate": 2.7576117629585622e-05, "loss": 0.1001, "step": 5440 }, { "epoch": 0.40472300608941036, "grad_norm": 2.6283414363861084, "learning_rate": 2.757166196346354e-05, "loss": 0.0734, "step": 5450 }, { "epoch": 0.4054656171097579, "grad_norm": 2.917095899581909, "learning_rate": 2.7567206297341452e-05, "loss": 0.1153, "step": 5460 }, { "epoch": 0.40620822813010543, "grad_norm": 1.10123872756958, "learning_rate": 2.7562750631219367e-05, "loss": 0.0765, "step": 5470 }, { "epoch": 0.406950839150453, "grad_norm": 4.8916096687316895, "learning_rate": 2.7558294965097282e-05, "loss": 0.0609, "step": 5480 }, { "epoch": 0.40769345017080055, "grad_norm": 1.0813095569610596, "learning_rate": 2.7553839298975197e-05, "loss": 0.0926, "step": 5490 }, { "epoch": 0.40843606119114806, "grad_norm": 2.3865935802459717, "learning_rate": 2.7549383632853112e-05, "loss": 0.0769, "step": 5500 }, { "epoch": 0.4091786722114956, "grad_norm": 2.4773435592651367, "learning_rate": 2.7544927966731027e-05, "loss": 0.0767, "step": 5510 }, { "epoch": 0.4099212832318432, "grad_norm": 1.5695173740386963, "learning_rate": 2.7540472300608942e-05, "loss": 0.0639, "step": 5520 }, { "epoch": 0.4106638942521907, "grad_norm": 3.533438205718994, "learning_rate": 2.7536016634486857e-05, "loss": 0.0692, "step": 5530 }, { "epoch": 0.41140650527253825, "grad_norm": 1.1190873384475708, "learning_rate": 2.7531560968364772e-05, "loss": 0.0645, "step": 5540 }, { "epoch": 0.4121491162928858, "grad_norm": 2.1660842895507812, "learning_rate": 2.7527105302242687e-05, "loss": 0.0883, "step": 5550 }, { "epoch": 0.4128917273132333, "grad_norm": 1.7716519832611084, "learning_rate": 2.7522649636120602e-05, "loss": 0.0858, "step": 5560 }, { "epoch": 0.4136343383335809, "grad_norm": 1.537878155708313, "learning_rate": 2.7518193969998514e-05, "loss": 0.0814, "step": 5570 }, { "epoch": 0.41437694935392844, "grad_norm": 2.6977486610412598, "learning_rate": 2.751373830387643e-05, "loss": 0.0803, "step": 5580 }, { "epoch": 0.41511956037427594, "grad_norm": 2.5686473846435547, "learning_rate": 2.7509282637754347e-05, "loss": 0.1299, "step": 5590 }, { "epoch": 0.4158621713946235, "grad_norm": 3.5624582767486572, "learning_rate": 2.750482697163226e-05, "loss": 0.0667, "step": 5600 }, { "epoch": 0.41660478241497106, "grad_norm": 1.4908101558685303, "learning_rate": 2.7500371305510174e-05, "loss": 0.0869, "step": 5610 }, { "epoch": 0.41734739343531857, "grad_norm": 1.9675188064575195, "learning_rate": 2.7495915639388092e-05, "loss": 0.0645, "step": 5620 }, { "epoch": 0.41809000445566613, "grad_norm": 3.775062322616577, "learning_rate": 2.7491459973266004e-05, "loss": 0.0985, "step": 5630 }, { "epoch": 0.41883261547601364, "grad_norm": 5.706444263458252, "learning_rate": 2.748700430714392e-05, "loss": 0.0996, "step": 5640 }, { "epoch": 0.4195752264963612, "grad_norm": 2.382413625717163, "learning_rate": 2.748254864102183e-05, "loss": 0.079, "step": 5650 }, { "epoch": 0.42031783751670876, "grad_norm": 2.5608088970184326, "learning_rate": 2.747809297489975e-05, "loss": 0.0893, "step": 5660 }, { "epoch": 0.42106044853705626, "grad_norm": 2.507960796356201, "learning_rate": 2.7473637308777664e-05, "loss": 0.0689, "step": 5670 }, { "epoch": 0.4218030595574038, "grad_norm": 2.9068281650543213, "learning_rate": 2.7469181642655575e-05, "loss": 0.0848, "step": 5680 }, { "epoch": 0.4225456705777514, "grad_norm": 3.1836397647857666, "learning_rate": 2.7464725976533494e-05, "loss": 0.0851, "step": 5690 }, { "epoch": 0.4232882815980989, "grad_norm": 3.9612765312194824, "learning_rate": 2.746027031041141e-05, "loss": 0.0785, "step": 5700 }, { "epoch": 0.42403089261844645, "grad_norm": 5.2058210372924805, "learning_rate": 2.745581464428932e-05, "loss": 0.0883, "step": 5710 }, { "epoch": 0.424773503638794, "grad_norm": 2.7457072734832764, "learning_rate": 2.745135897816724e-05, "loss": 0.0654, "step": 5720 }, { "epoch": 0.4255161146591415, "grad_norm": 1.1056705713272095, "learning_rate": 2.7446903312045154e-05, "loss": 0.0996, "step": 5730 }, { "epoch": 0.4262587256794891, "grad_norm": 2.1076269149780273, "learning_rate": 2.7442447645923065e-05, "loss": 0.057, "step": 5740 }, { "epoch": 0.42700133669983664, "grad_norm": 2.5549466609954834, "learning_rate": 2.743799197980098e-05, "loss": 0.1045, "step": 5750 }, { "epoch": 0.42774394772018415, "grad_norm": 1.2105517387390137, "learning_rate": 2.7433536313678895e-05, "loss": 0.0826, "step": 5760 }, { "epoch": 0.4284865587405317, "grad_norm": 1.5219643115997314, "learning_rate": 2.742908064755681e-05, "loss": 0.0729, "step": 5770 }, { "epoch": 0.42922916976087927, "grad_norm": 2.4484918117523193, "learning_rate": 2.7424624981434725e-05, "loss": 0.1096, "step": 5780 }, { "epoch": 0.4299717807812268, "grad_norm": 2.4884450435638428, "learning_rate": 2.742016931531264e-05, "loss": 0.0927, "step": 5790 }, { "epoch": 0.43071439180157434, "grad_norm": 2.647526502609253, "learning_rate": 2.7415713649190555e-05, "loss": 0.0918, "step": 5800 }, { "epoch": 0.4314570028219219, "grad_norm": 3.45865535736084, "learning_rate": 2.741125798306847e-05, "loss": 0.1012, "step": 5810 }, { "epoch": 0.4321996138422694, "grad_norm": 1.7236058712005615, "learning_rate": 2.7406802316946382e-05, "loss": 0.1037, "step": 5820 }, { "epoch": 0.43294222486261696, "grad_norm": 4.1282572746276855, "learning_rate": 2.74023466508243e-05, "loss": 0.0613, "step": 5830 }, { "epoch": 0.4336848358829645, "grad_norm": 2.4355249404907227, "learning_rate": 2.7397890984702215e-05, "loss": 0.0899, "step": 5840 }, { "epoch": 0.43442744690331203, "grad_norm": 1.723847508430481, "learning_rate": 2.7393435318580127e-05, "loss": 0.101, "step": 5850 }, { "epoch": 0.4351700579236596, "grad_norm": 2.700627088546753, "learning_rate": 2.7388979652458045e-05, "loss": 0.0907, "step": 5860 }, { "epoch": 0.43591266894400715, "grad_norm": 1.1109275817871094, "learning_rate": 2.7384523986335957e-05, "loss": 0.1118, "step": 5870 }, { "epoch": 0.43665527996435466, "grad_norm": 1.6550132036209106, "learning_rate": 2.738006832021387e-05, "loss": 0.1243, "step": 5880 }, { "epoch": 0.4373978909847022, "grad_norm": 1.3361659049987793, "learning_rate": 2.737561265409179e-05, "loss": 0.1412, "step": 5890 }, { "epoch": 0.4381405020050498, "grad_norm": 2.4262852668762207, "learning_rate": 2.73711569879697e-05, "loss": 0.0814, "step": 5900 }, { "epoch": 0.4388831130253973, "grad_norm": 3.202860116958618, "learning_rate": 2.7366701321847617e-05, "loss": 0.0694, "step": 5910 }, { "epoch": 0.43962572404574485, "grad_norm": 1.6271086931228638, "learning_rate": 2.736224565572553e-05, "loss": 0.0871, "step": 5920 }, { "epoch": 0.44036833506609235, "grad_norm": 1.3334532976150513, "learning_rate": 2.7357789989603447e-05, "loss": 0.1081, "step": 5930 }, { "epoch": 0.4411109460864399, "grad_norm": 1.230716586112976, "learning_rate": 2.735333432348136e-05, "loss": 0.067, "step": 5940 }, { "epoch": 0.4418535571067875, "grad_norm": 1.8485809564590454, "learning_rate": 2.7348878657359277e-05, "loss": 0.069, "step": 5950 }, { "epoch": 0.442596168127135, "grad_norm": 1.9252151250839233, "learning_rate": 2.734442299123719e-05, "loss": 0.0703, "step": 5960 }, { "epoch": 0.44333877914748254, "grad_norm": 0.5167465209960938, "learning_rate": 2.7339967325115107e-05, "loss": 0.0766, "step": 5970 }, { "epoch": 0.4440813901678301, "grad_norm": 1.259781002998352, "learning_rate": 2.7335511658993018e-05, "loss": 0.0752, "step": 5980 }, { "epoch": 0.4448240011881776, "grad_norm": 0.9502667188644409, "learning_rate": 2.7331055992870933e-05, "loss": 0.0699, "step": 5990 }, { "epoch": 0.44556661220852517, "grad_norm": 2.211690902709961, "learning_rate": 2.732660032674885e-05, "loss": 0.0806, "step": 6000 }, { "epoch": 0.44630922322887273, "grad_norm": 4.159378528594971, "learning_rate": 2.7322144660626763e-05, "loss": 0.0766, "step": 6010 }, { "epoch": 0.44705183424922024, "grad_norm": 2.38044810295105, "learning_rate": 2.7317688994504678e-05, "loss": 0.1117, "step": 6020 }, { "epoch": 0.4477944452695678, "grad_norm": 3.320197105407715, "learning_rate": 2.7313233328382597e-05, "loss": 0.0819, "step": 6030 }, { "epoch": 0.44853705628991536, "grad_norm": 2.641312599182129, "learning_rate": 2.7308777662260508e-05, "loss": 0.0753, "step": 6040 }, { "epoch": 0.44927966731026286, "grad_norm": 3.1988885402679443, "learning_rate": 2.7304321996138423e-05, "loss": 0.1066, "step": 6050 }, { "epoch": 0.4500222783306104, "grad_norm": 0.6954814195632935, "learning_rate": 2.7299866330016335e-05, "loss": 0.0885, "step": 6060 }, { "epoch": 0.450764889350958, "grad_norm": 3.5615670680999756, "learning_rate": 2.7295410663894253e-05, "loss": 0.0942, "step": 6070 }, { "epoch": 0.4515075003713055, "grad_norm": 0.6206175088882446, "learning_rate": 2.7290954997772168e-05, "loss": 0.0688, "step": 6080 }, { "epoch": 0.45225011139165305, "grad_norm": 1.3338674306869507, "learning_rate": 2.728649933165008e-05, "loss": 0.0969, "step": 6090 }, { "epoch": 0.4529927224120006, "grad_norm": 2.5011191368103027, "learning_rate": 2.7282043665527998e-05, "loss": 0.0852, "step": 6100 }, { "epoch": 0.4537353334323481, "grad_norm": 4.930363655090332, "learning_rate": 2.7277587999405913e-05, "loss": 0.1074, "step": 6110 }, { "epoch": 0.4544779444526957, "grad_norm": 2.0421066284179688, "learning_rate": 2.7273132333283825e-05, "loss": 0.0821, "step": 6120 }, { "epoch": 0.45522055547304324, "grad_norm": 1.314985752105713, "learning_rate": 2.7268676667161743e-05, "loss": 0.0847, "step": 6130 }, { "epoch": 0.45596316649339075, "grad_norm": 2.257136583328247, "learning_rate": 2.7264221001039658e-05, "loss": 0.0911, "step": 6140 }, { "epoch": 0.4567057775137383, "grad_norm": 2.229437828063965, "learning_rate": 2.725976533491757e-05, "loss": 0.081, "step": 6150 }, { "epoch": 0.45744838853408587, "grad_norm": 1.778793215751648, "learning_rate": 2.7255309668795485e-05, "loss": 0.1086, "step": 6160 }, { "epoch": 0.4581909995544334, "grad_norm": 1.5188746452331543, "learning_rate": 2.72508540026734e-05, "loss": 0.0933, "step": 6170 }, { "epoch": 0.45893361057478094, "grad_norm": 1.7076901197433472, "learning_rate": 2.7246398336551315e-05, "loss": 0.0746, "step": 6180 }, { "epoch": 0.4596762215951285, "grad_norm": 1.1018537282943726, "learning_rate": 2.724194267042923e-05, "loss": 0.0773, "step": 6190 }, { "epoch": 0.460418832615476, "grad_norm": 1.8429148197174072, "learning_rate": 2.7237487004307145e-05, "loss": 0.0822, "step": 6200 }, { "epoch": 0.46116144363582356, "grad_norm": 4.57528829574585, "learning_rate": 2.723303133818506e-05, "loss": 0.1028, "step": 6210 }, { "epoch": 0.46190405465617107, "grad_norm": 2.2696962356567383, "learning_rate": 2.7228575672062975e-05, "loss": 0.0925, "step": 6220 }, { "epoch": 0.46264666567651863, "grad_norm": 1.2681903839111328, "learning_rate": 2.7224120005940886e-05, "loss": 0.1078, "step": 6230 }, { "epoch": 0.4633892766968662, "grad_norm": 0.9987069964408875, "learning_rate": 2.7219664339818805e-05, "loss": 0.0875, "step": 6240 }, { "epoch": 0.4641318877172137, "grad_norm": 1.8749423027038574, "learning_rate": 2.721520867369672e-05, "loss": 0.1022, "step": 6250 }, { "epoch": 0.46487449873756126, "grad_norm": 1.0351048707962036, "learning_rate": 2.721075300757463e-05, "loss": 0.0946, "step": 6260 }, { "epoch": 0.4656171097579088, "grad_norm": 0.9065452814102173, "learning_rate": 2.720629734145255e-05, "loss": 0.0955, "step": 6270 }, { "epoch": 0.4663597207782563, "grad_norm": 0.9384631514549255, "learning_rate": 2.720184167533046e-05, "loss": 0.0789, "step": 6280 }, { "epoch": 0.4671023317986039, "grad_norm": 0.5935912132263184, "learning_rate": 2.7197386009208376e-05, "loss": 0.0711, "step": 6290 }, { "epoch": 0.46784494281895145, "grad_norm": 2.0923197269439697, "learning_rate": 2.7192930343086295e-05, "loss": 0.0668, "step": 6300 }, { "epoch": 0.46858755383929895, "grad_norm": 0.9946518540382385, "learning_rate": 2.7188474676964206e-05, "loss": 0.07, "step": 6310 }, { "epoch": 0.4693301648596465, "grad_norm": 4.637099742889404, "learning_rate": 2.718401901084212e-05, "loss": 0.0938, "step": 6320 }, { "epoch": 0.4700727758799941, "grad_norm": 3.6027259826660156, "learning_rate": 2.7179563344720036e-05, "loss": 0.0707, "step": 6330 }, { "epoch": 0.4708153869003416, "grad_norm": 2.179995059967041, "learning_rate": 2.717510767859795e-05, "loss": 0.0795, "step": 6340 }, { "epoch": 0.47155799792068914, "grad_norm": 1.5892603397369385, "learning_rate": 2.7170652012475866e-05, "loss": 0.085, "step": 6350 }, { "epoch": 0.4723006089410367, "grad_norm": 2.146799087524414, "learning_rate": 2.716619634635378e-05, "loss": 0.0842, "step": 6360 }, { "epoch": 0.4730432199613842, "grad_norm": 2.106539249420166, "learning_rate": 2.7161740680231696e-05, "loss": 0.0918, "step": 6370 }, { "epoch": 0.47378583098173177, "grad_norm": 1.503983497619629, "learning_rate": 2.715728501410961e-05, "loss": 0.1038, "step": 6380 }, { "epoch": 0.47452844200207933, "grad_norm": 1.7388819456100464, "learning_rate": 2.7152829347987523e-05, "loss": 0.1041, "step": 6390 }, { "epoch": 0.47527105302242684, "grad_norm": 3.1437437534332275, "learning_rate": 2.7148373681865438e-05, "loss": 0.0824, "step": 6400 }, { "epoch": 0.4760136640427744, "grad_norm": 5.636854648590088, "learning_rate": 2.7143918015743356e-05, "loss": 0.0835, "step": 6410 }, { "epoch": 0.47675627506312196, "grad_norm": 1.5559483766555786, "learning_rate": 2.7139462349621268e-05, "loss": 0.1108, "step": 6420 }, { "epoch": 0.47749888608346946, "grad_norm": 2.2242963314056396, "learning_rate": 2.7135006683499183e-05, "loss": 0.133, "step": 6430 }, { "epoch": 0.478241497103817, "grad_norm": 3.61586332321167, "learning_rate": 2.71305510173771e-05, "loss": 0.0996, "step": 6440 }, { "epoch": 0.4789841081241646, "grad_norm": 3.0987701416015625, "learning_rate": 2.7126095351255013e-05, "loss": 0.0924, "step": 6450 }, { "epoch": 0.4797267191445121, "grad_norm": 1.8545348644256592, "learning_rate": 2.7121639685132928e-05, "loss": 0.082, "step": 6460 }, { "epoch": 0.48046933016485965, "grad_norm": 0.8686532974243164, "learning_rate": 2.7117184019010843e-05, "loss": 0.068, "step": 6470 }, { "epoch": 0.4812119411852072, "grad_norm": 1.4402474164962769, "learning_rate": 2.7112728352888758e-05, "loss": 0.0888, "step": 6480 }, { "epoch": 0.4819545522055547, "grad_norm": 1.4960230588912964, "learning_rate": 2.7108272686766673e-05, "loss": 0.0626, "step": 6490 }, { "epoch": 0.4826971632259023, "grad_norm": 0.6626843214035034, "learning_rate": 2.7103817020644584e-05, "loss": 0.0459, "step": 6500 }, { "epoch": 0.4834397742462498, "grad_norm": 2.2946035861968994, "learning_rate": 2.7099361354522503e-05, "loss": 0.0836, "step": 6510 }, { "epoch": 0.48418238526659735, "grad_norm": 3.0957255363464355, "learning_rate": 2.7094905688400418e-05, "loss": 0.0909, "step": 6520 }, { "epoch": 0.4849249962869449, "grad_norm": 4.055625915527344, "learning_rate": 2.709045002227833e-05, "loss": 0.1138, "step": 6530 }, { "epoch": 0.4856676073072924, "grad_norm": 1.6780099868774414, "learning_rate": 2.7085994356156248e-05, "loss": 0.0946, "step": 6540 }, { "epoch": 0.48641021832764, "grad_norm": 1.8075953722000122, "learning_rate": 2.7081538690034163e-05, "loss": 0.0697, "step": 6550 }, { "epoch": 0.48715282934798754, "grad_norm": 1.8275692462921143, "learning_rate": 2.7077083023912074e-05, "loss": 0.0973, "step": 6560 }, { "epoch": 0.48789544036833504, "grad_norm": 2.2628328800201416, "learning_rate": 2.707262735778999e-05, "loss": 0.1178, "step": 6570 }, { "epoch": 0.4886380513886826, "grad_norm": 1.0537023544311523, "learning_rate": 2.7068171691667904e-05, "loss": 0.0633, "step": 6580 }, { "epoch": 0.48938066240903016, "grad_norm": 0.39916807413101196, "learning_rate": 2.706371602554582e-05, "loss": 0.061, "step": 6590 }, { "epoch": 0.49012327342937767, "grad_norm": 2.486980438232422, "learning_rate": 2.7059260359423734e-05, "loss": 0.0502, "step": 6600 }, { "epoch": 0.49086588444972523, "grad_norm": 1.5549534559249878, "learning_rate": 2.705480469330165e-05, "loss": 0.096, "step": 6610 }, { "epoch": 0.4916084954700728, "grad_norm": 1.8436572551727295, "learning_rate": 2.7050349027179564e-05, "loss": 0.0435, "step": 6620 }, { "epoch": 0.4923511064904203, "grad_norm": 2.9370453357696533, "learning_rate": 2.704589336105748e-05, "loss": 0.1154, "step": 6630 }, { "epoch": 0.49309371751076786, "grad_norm": 1.7184120416641235, "learning_rate": 2.704143769493539e-05, "loss": 0.1227, "step": 6640 }, { "epoch": 0.4938363285311154, "grad_norm": 1.9661284685134888, "learning_rate": 2.703698202881331e-05, "loss": 0.0861, "step": 6650 }, { "epoch": 0.4945789395514629, "grad_norm": 5.240865230560303, "learning_rate": 2.7032526362691224e-05, "loss": 0.0973, "step": 6660 }, { "epoch": 0.4953215505718105, "grad_norm": 0.6822782158851624, "learning_rate": 2.7028070696569136e-05, "loss": 0.0736, "step": 6670 }, { "epoch": 0.49606416159215805, "grad_norm": 2.3852436542510986, "learning_rate": 2.7023615030447054e-05, "loss": 0.1097, "step": 6680 }, { "epoch": 0.49680677261250555, "grad_norm": 2.0364935398101807, "learning_rate": 2.7019159364324966e-05, "loss": 0.0749, "step": 6690 }, { "epoch": 0.4975493836328531, "grad_norm": 1.774452805519104, "learning_rate": 2.701470369820288e-05, "loss": 0.0769, "step": 6700 }, { "epoch": 0.4982919946532007, "grad_norm": 1.5295710563659668, "learning_rate": 2.70102480320808e-05, "loss": 0.0939, "step": 6710 }, { "epoch": 0.4990346056735482, "grad_norm": 3.159693956375122, "learning_rate": 2.700579236595871e-05, "loss": 0.0719, "step": 6720 }, { "epoch": 0.49977721669389574, "grad_norm": 1.0851925611495972, "learning_rate": 2.7001336699836626e-05, "loss": 0.0661, "step": 6730 }, { "epoch": 0.5005198277142433, "grad_norm": 1.9622503519058228, "learning_rate": 2.699688103371454e-05, "loss": 0.0874, "step": 6740 }, { "epoch": 0.5012624387345909, "grad_norm": 1.293068528175354, "learning_rate": 2.6992425367592456e-05, "loss": 0.1071, "step": 6750 }, { "epoch": 0.5020050497549383, "grad_norm": 2.094120740890503, "learning_rate": 2.698796970147037e-05, "loss": 0.0936, "step": 6760 }, { "epoch": 0.5027476607752859, "grad_norm": 2.2570743560791016, "learning_rate": 2.6983514035348286e-05, "loss": 0.0788, "step": 6770 }, { "epoch": 0.5034902717956334, "grad_norm": 2.327422857284546, "learning_rate": 2.69790583692262e-05, "loss": 0.0765, "step": 6780 }, { "epoch": 0.504232882815981, "grad_norm": 2.4832825660705566, "learning_rate": 2.6974602703104116e-05, "loss": 0.0789, "step": 6790 }, { "epoch": 0.5049754938363286, "grad_norm": 2.0305325984954834, "learning_rate": 2.6970147036982027e-05, "loss": 0.1098, "step": 6800 }, { "epoch": 0.5057181048566761, "grad_norm": 1.299988865852356, "learning_rate": 2.6965691370859942e-05, "loss": 0.1285, "step": 6810 }, { "epoch": 0.5064607158770236, "grad_norm": 2.6327931880950928, "learning_rate": 2.696123570473786e-05, "loss": 0.0914, "step": 6820 }, { "epoch": 0.5072033268973711, "grad_norm": 2.3139307498931885, "learning_rate": 2.6956780038615772e-05, "loss": 0.0831, "step": 6830 }, { "epoch": 0.5079459379177187, "grad_norm": 1.7000758647918701, "learning_rate": 2.6952324372493687e-05, "loss": 0.0996, "step": 6840 }, { "epoch": 0.5086885489380663, "grad_norm": 2.333949089050293, "learning_rate": 2.6947868706371606e-05, "loss": 0.1131, "step": 6850 }, { "epoch": 0.5094311599584138, "grad_norm": 0.8475568294525146, "learning_rate": 2.6943413040249517e-05, "loss": 0.0658, "step": 6860 }, { "epoch": 0.5101737709787614, "grad_norm": 2.65226411819458, "learning_rate": 2.6938957374127432e-05, "loss": 0.0892, "step": 6870 }, { "epoch": 0.5109163819991088, "grad_norm": 2.692626953125, "learning_rate": 2.6934501708005347e-05, "loss": 0.1133, "step": 6880 }, { "epoch": 0.5116589930194564, "grad_norm": 1.8590973615646362, "learning_rate": 2.6930046041883262e-05, "loss": 0.0935, "step": 6890 }, { "epoch": 0.512401604039804, "grad_norm": 1.9496850967407227, "learning_rate": 2.6925590375761177e-05, "loss": 0.0604, "step": 6900 }, { "epoch": 0.5131442150601515, "grad_norm": 3.5495307445526123, "learning_rate": 2.692113470963909e-05, "loss": 0.0863, "step": 6910 }, { "epoch": 0.5138868260804991, "grad_norm": 5.097157001495361, "learning_rate": 2.6916679043517007e-05, "loss": 0.108, "step": 6920 }, { "epoch": 0.5146294371008466, "grad_norm": 3.4224822521209717, "learning_rate": 2.6912223377394922e-05, "loss": 0.1177, "step": 6930 }, { "epoch": 0.5153720481211941, "grad_norm": 2.013091564178467, "learning_rate": 2.6907767711272834e-05, "loss": 0.1062, "step": 6940 }, { "epoch": 0.5161146591415416, "grad_norm": 1.1934363842010498, "learning_rate": 2.6903312045150752e-05, "loss": 0.0608, "step": 6950 }, { "epoch": 0.5168572701618892, "grad_norm": 3.092979669570923, "learning_rate": 2.6898856379028667e-05, "loss": 0.1076, "step": 6960 }, { "epoch": 0.5175998811822368, "grad_norm": 1.059200644493103, "learning_rate": 2.689440071290658e-05, "loss": 0.0554, "step": 6970 }, { "epoch": 0.5183424922025843, "grad_norm": 4.116779804229736, "learning_rate": 2.6889945046784494e-05, "loss": 0.0815, "step": 6980 }, { "epoch": 0.5190851032229318, "grad_norm": 1.1455808877944946, "learning_rate": 2.6885489380662412e-05, "loss": 0.0623, "step": 6990 }, { "epoch": 0.5198277142432793, "grad_norm": 0.5762424468994141, "learning_rate": 2.6881033714540324e-05, "loss": 0.0587, "step": 7000 }, { "epoch": 0.5205703252636269, "grad_norm": 2.309788942337036, "learning_rate": 2.687657804841824e-05, "loss": 0.0826, "step": 7010 }, { "epoch": 0.5213129362839745, "grad_norm": 3.214755058288574, "learning_rate": 2.6872122382296154e-05, "loss": 0.0845, "step": 7020 }, { "epoch": 0.522055547304322, "grad_norm": 1.238411545753479, "learning_rate": 2.686766671617407e-05, "loss": 0.046, "step": 7030 }, { "epoch": 0.5227981583246696, "grad_norm": 0.6192235946655273, "learning_rate": 2.6863211050051984e-05, "loss": 0.0523, "step": 7040 }, { "epoch": 0.523540769345017, "grad_norm": 1.7982896566390991, "learning_rate": 2.6858755383929895e-05, "loss": 0.0863, "step": 7050 }, { "epoch": 0.5242833803653646, "grad_norm": 1.1628367900848389, "learning_rate": 2.6854299717807814e-05, "loss": 0.094, "step": 7060 }, { "epoch": 0.5250259913857122, "grad_norm": 0.698805034160614, "learning_rate": 2.684984405168573e-05, "loss": 0.0845, "step": 7070 }, { "epoch": 0.5257686024060597, "grad_norm": 0.9980218410491943, "learning_rate": 2.684538838556364e-05, "loss": 0.0718, "step": 7080 }, { "epoch": 0.5265112134264073, "grad_norm": 2.250861883163452, "learning_rate": 2.684093271944156e-05, "loss": 0.0761, "step": 7090 }, { "epoch": 0.5272538244467548, "grad_norm": 0.3348923921585083, "learning_rate": 2.683647705331947e-05, "loss": 0.0697, "step": 7100 }, { "epoch": 0.5279964354671023, "grad_norm": 1.0973154306411743, "learning_rate": 2.6832021387197385e-05, "loss": 0.0842, "step": 7110 }, { "epoch": 0.5287390464874498, "grad_norm": 1.2300523519515991, "learning_rate": 2.6827565721075304e-05, "loss": 0.0847, "step": 7120 }, { "epoch": 0.5294816575077974, "grad_norm": 1.7506873607635498, "learning_rate": 2.6823110054953215e-05, "loss": 0.0802, "step": 7130 }, { "epoch": 0.530224268528145, "grad_norm": 2.3556385040283203, "learning_rate": 2.681865438883113e-05, "loss": 0.0903, "step": 7140 }, { "epoch": 0.5309668795484925, "grad_norm": 8.310062408447266, "learning_rate": 2.6814198722709045e-05, "loss": 0.0855, "step": 7150 }, { "epoch": 0.5317094905688401, "grad_norm": 3.2438154220581055, "learning_rate": 2.680974305658696e-05, "loss": 0.1041, "step": 7160 }, { "epoch": 0.5324521015891875, "grad_norm": 1.3913241624832153, "learning_rate": 2.6805287390464875e-05, "loss": 0.1044, "step": 7170 }, { "epoch": 0.5331947126095351, "grad_norm": 2.0883166790008545, "learning_rate": 2.680083172434279e-05, "loss": 0.0913, "step": 7180 }, { "epoch": 0.5339373236298827, "grad_norm": 1.3850668668746948, "learning_rate": 2.6796376058220705e-05, "loss": 0.0936, "step": 7190 }, { "epoch": 0.5346799346502302, "grad_norm": 2.546489953994751, "learning_rate": 2.679192039209862e-05, "loss": 0.0746, "step": 7200 }, { "epoch": 0.5354225456705778, "grad_norm": 2.2672359943389893, "learning_rate": 2.6787464725976532e-05, "loss": 0.0837, "step": 7210 }, { "epoch": 0.5361651566909253, "grad_norm": 1.1645616292953491, "learning_rate": 2.6783009059854447e-05, "loss": 0.0967, "step": 7220 }, { "epoch": 0.5369077677112728, "grad_norm": 0.8145351409912109, "learning_rate": 2.6778553393732365e-05, "loss": 0.0901, "step": 7230 }, { "epoch": 0.5376503787316204, "grad_norm": 1.619238257408142, "learning_rate": 2.6774097727610277e-05, "loss": 0.0894, "step": 7240 }, { "epoch": 0.5383929897519679, "grad_norm": 2.810974597930908, "learning_rate": 2.6769642061488192e-05, "loss": 0.0848, "step": 7250 }, { "epoch": 0.5391356007723155, "grad_norm": 1.5741685628890991, "learning_rate": 2.676518639536611e-05, "loss": 0.0748, "step": 7260 }, { "epoch": 0.539878211792663, "grad_norm": 0.8893304467201233, "learning_rate": 2.6760730729244022e-05, "loss": 0.0754, "step": 7270 }, { "epoch": 0.5406208228130105, "grad_norm": 1.7500449419021606, "learning_rate": 2.6756275063121937e-05, "loss": 0.0704, "step": 7280 }, { "epoch": 0.541363433833358, "grad_norm": 1.3175913095474243, "learning_rate": 2.6751819396999855e-05, "loss": 0.103, "step": 7290 }, { "epoch": 0.5421060448537056, "grad_norm": 4.421326160430908, "learning_rate": 2.6747363730877767e-05, "loss": 0.1161, "step": 7300 }, { "epoch": 0.5428486558740532, "grad_norm": 3.222348213195801, "learning_rate": 2.6742908064755682e-05, "loss": 0.1346, "step": 7310 }, { "epoch": 0.5435912668944007, "grad_norm": 2.507253885269165, "learning_rate": 2.6738452398633594e-05, "loss": 0.0985, "step": 7320 }, { "epoch": 0.5443338779147483, "grad_norm": 1.2689093351364136, "learning_rate": 2.6733996732511512e-05, "loss": 0.087, "step": 7330 }, { "epoch": 0.5450764889350957, "grad_norm": 2.3243536949157715, "learning_rate": 2.6729541066389427e-05, "loss": 0.0801, "step": 7340 }, { "epoch": 0.5458190999554433, "grad_norm": 1.2682521343231201, "learning_rate": 2.672508540026734e-05, "loss": 0.0924, "step": 7350 }, { "epoch": 0.5465617109757909, "grad_norm": 3.67933988571167, "learning_rate": 2.6720629734145257e-05, "loss": 0.0958, "step": 7360 }, { "epoch": 0.5473043219961384, "grad_norm": 1.3130589723587036, "learning_rate": 2.6716174068023172e-05, "loss": 0.0962, "step": 7370 }, { "epoch": 0.548046933016486, "grad_norm": 2.8078672885894775, "learning_rate": 2.6711718401901084e-05, "loss": 0.0919, "step": 7380 }, { "epoch": 0.5487895440368336, "grad_norm": 0.7422177195549011, "learning_rate": 2.6707262735779e-05, "loss": 0.0666, "step": 7390 }, { "epoch": 0.549532155057181, "grad_norm": 1.9167085886001587, "learning_rate": 2.6702807069656917e-05, "loss": 0.1218, "step": 7400 }, { "epoch": 0.5502747660775286, "grad_norm": 2.299405336380005, "learning_rate": 2.669835140353483e-05, "loss": 0.0889, "step": 7410 }, { "epoch": 0.5510173770978761, "grad_norm": 1.3493014574050903, "learning_rate": 2.6693895737412744e-05, "loss": 0.0697, "step": 7420 }, { "epoch": 0.5517599881182237, "grad_norm": 1.426650881767273, "learning_rate": 2.668944007129066e-05, "loss": 0.0771, "step": 7430 }, { "epoch": 0.5525025991385712, "grad_norm": 1.6747151613235474, "learning_rate": 2.6684984405168573e-05, "loss": 0.0956, "step": 7440 }, { "epoch": 0.5532452101589188, "grad_norm": 2.746018171310425, "learning_rate": 2.668052873904649e-05, "loss": 0.1107, "step": 7450 }, { "epoch": 0.5539878211792663, "grad_norm": 2.0050714015960693, "learning_rate": 2.66760730729244e-05, "loss": 0.0637, "step": 7460 }, { "epoch": 0.5547304321996138, "grad_norm": 1.4880417585372925, "learning_rate": 2.667161740680232e-05, "loss": 0.13, "step": 7470 }, { "epoch": 0.5554730432199614, "grad_norm": 0.5780320167541504, "learning_rate": 2.6667161740680233e-05, "loss": 0.0815, "step": 7480 }, { "epoch": 0.5562156542403089, "grad_norm": 4.6798882484436035, "learning_rate": 2.6662706074558145e-05, "loss": 0.1212, "step": 7490 }, { "epoch": 0.5569582652606565, "grad_norm": 1.5042418241500854, "learning_rate": 2.6658250408436063e-05, "loss": 0.0902, "step": 7500 }, { "epoch": 0.5577008762810041, "grad_norm": 1.637436032295227, "learning_rate": 2.6653794742313975e-05, "loss": 0.0608, "step": 7510 }, { "epoch": 0.5584434873013515, "grad_norm": 0.7876498699188232, "learning_rate": 2.664933907619189e-05, "loss": 0.0649, "step": 7520 }, { "epoch": 0.5591860983216991, "grad_norm": 1.2821192741394043, "learning_rate": 2.664488341006981e-05, "loss": 0.1155, "step": 7530 }, { "epoch": 0.5599287093420466, "grad_norm": 1.3642898797988892, "learning_rate": 2.664042774394772e-05, "loss": 0.0802, "step": 7540 }, { "epoch": 0.5606713203623942, "grad_norm": 1.7505029439926147, "learning_rate": 2.6635972077825635e-05, "loss": 0.1007, "step": 7550 }, { "epoch": 0.5614139313827418, "grad_norm": 0.7114013433456421, "learning_rate": 2.663151641170355e-05, "loss": 0.0941, "step": 7560 }, { "epoch": 0.5621565424030892, "grad_norm": 1.6477638483047485, "learning_rate": 2.6627060745581465e-05, "loss": 0.1173, "step": 7570 }, { "epoch": 0.5628991534234368, "grad_norm": 2.2498269081115723, "learning_rate": 2.662260507945938e-05, "loss": 0.1056, "step": 7580 }, { "epoch": 0.5636417644437843, "grad_norm": 1.7520428895950317, "learning_rate": 2.6618149413337295e-05, "loss": 0.0584, "step": 7590 }, { "epoch": 0.5643843754641319, "grad_norm": 1.958605170249939, "learning_rate": 2.661369374721521e-05, "loss": 0.0787, "step": 7600 }, { "epoch": 0.5651269864844795, "grad_norm": 4.134826183319092, "learning_rate": 2.6609238081093125e-05, "loss": 0.1431, "step": 7610 }, { "epoch": 0.565869597504827, "grad_norm": 1.3306151628494263, "learning_rate": 2.6604782414971037e-05, "loss": 0.083, "step": 7620 }, { "epoch": 0.5666122085251745, "grad_norm": 2.795405626296997, "learning_rate": 2.660032674884895e-05, "loss": 0.1119, "step": 7630 }, { "epoch": 0.567354819545522, "grad_norm": 1.0165034532546997, "learning_rate": 2.659587108272687e-05, "loss": 0.0823, "step": 7640 }, { "epoch": 0.5680974305658696, "grad_norm": 4.020211219787598, "learning_rate": 2.659141541660478e-05, "loss": 0.0867, "step": 7650 }, { "epoch": 0.5688400415862171, "grad_norm": 1.6462618112564087, "learning_rate": 2.6586959750482697e-05, "loss": 0.0917, "step": 7660 }, { "epoch": 0.5695826526065647, "grad_norm": 1.3625034093856812, "learning_rate": 2.6582504084360615e-05, "loss": 0.0833, "step": 7670 }, { "epoch": 0.5703252636269123, "grad_norm": 4.22099494934082, "learning_rate": 2.6578048418238527e-05, "loss": 0.065, "step": 7680 }, { "epoch": 0.5710678746472597, "grad_norm": 2.1432032585144043, "learning_rate": 2.657359275211644e-05, "loss": 0.1104, "step": 7690 }, { "epoch": 0.5718104856676073, "grad_norm": 1.7687879800796509, "learning_rate": 2.656913708599436e-05, "loss": 0.0829, "step": 7700 }, { "epoch": 0.5725530966879548, "grad_norm": 2.1364059448242188, "learning_rate": 2.656468141987227e-05, "loss": 0.0949, "step": 7710 }, { "epoch": 0.5732957077083024, "grad_norm": 1.6391818523406982, "learning_rate": 2.6560225753750187e-05, "loss": 0.0945, "step": 7720 }, { "epoch": 0.57403831872865, "grad_norm": 1.3026098012924194, "learning_rate": 2.6555770087628098e-05, "loss": 0.0608, "step": 7730 }, { "epoch": 0.5747809297489975, "grad_norm": 3.263688325881958, "learning_rate": 2.6551314421506017e-05, "loss": 0.0749, "step": 7740 }, { "epoch": 0.575523540769345, "grad_norm": 1.6053320169448853, "learning_rate": 2.654685875538393e-05, "loss": 0.0739, "step": 7750 }, { "epoch": 0.5762661517896925, "grad_norm": 2.198606491088867, "learning_rate": 2.6542403089261843e-05, "loss": 0.1319, "step": 7760 }, { "epoch": 0.5770087628100401, "grad_norm": 2.7301955223083496, "learning_rate": 2.653794742313976e-05, "loss": 0.1023, "step": 7770 }, { "epoch": 0.5777513738303877, "grad_norm": 1.7216728925704956, "learning_rate": 2.6533491757017677e-05, "loss": 0.0893, "step": 7780 }, { "epoch": 0.5784939848507352, "grad_norm": 2.964611053466797, "learning_rate": 2.6529036090895588e-05, "loss": 0.0759, "step": 7790 }, { "epoch": 0.5792365958710828, "grad_norm": 1.5537538528442383, "learning_rate": 2.6524580424773503e-05, "loss": 0.0987, "step": 7800 }, { "epoch": 0.5799792068914302, "grad_norm": 5.19601583480835, "learning_rate": 2.652012475865142e-05, "loss": 0.0846, "step": 7810 }, { "epoch": 0.5807218179117778, "grad_norm": 1.2774734497070312, "learning_rate": 2.6515669092529333e-05, "loss": 0.0721, "step": 7820 }, { "epoch": 0.5814644289321254, "grad_norm": 1.1469454765319824, "learning_rate": 2.6511213426407248e-05, "loss": 0.114, "step": 7830 }, { "epoch": 0.5822070399524729, "grad_norm": 2.6085078716278076, "learning_rate": 2.6506757760285163e-05, "loss": 0.0769, "step": 7840 }, { "epoch": 0.5829496509728205, "grad_norm": 0.7333324551582336, "learning_rate": 2.6502302094163078e-05, "loss": 0.0583, "step": 7850 }, { "epoch": 0.5836922619931679, "grad_norm": 3.896169900894165, "learning_rate": 2.6497846428040993e-05, "loss": 0.0971, "step": 7860 }, { "epoch": 0.5844348730135155, "grad_norm": 0.8232213854789734, "learning_rate": 2.6493390761918908e-05, "loss": 0.0795, "step": 7870 }, { "epoch": 0.585177484033863, "grad_norm": 2.7149336338043213, "learning_rate": 2.6488935095796823e-05, "loss": 0.0847, "step": 7880 }, { "epoch": 0.5859200950542106, "grad_norm": 2.990295886993408, "learning_rate": 2.6484479429674738e-05, "loss": 0.0692, "step": 7890 }, { "epoch": 0.5866627060745582, "grad_norm": 3.7285399436950684, "learning_rate": 2.648002376355265e-05, "loss": 0.0734, "step": 7900 }, { "epoch": 0.5874053170949057, "grad_norm": 1.7510877847671509, "learning_rate": 2.6475568097430568e-05, "loss": 0.0944, "step": 7910 }, { "epoch": 0.5881479281152532, "grad_norm": 2.186464786529541, "learning_rate": 2.6471112431308483e-05, "loss": 0.1009, "step": 7920 }, { "epoch": 0.5888905391356007, "grad_norm": 3.2270450592041016, "learning_rate": 2.6466656765186395e-05, "loss": 0.0815, "step": 7930 }, { "epoch": 0.5896331501559483, "grad_norm": 2.993773937225342, "learning_rate": 2.6462201099064313e-05, "loss": 0.0764, "step": 7940 }, { "epoch": 0.5903757611762959, "grad_norm": 0.9298529624938965, "learning_rate": 2.6457745432942225e-05, "loss": 0.0933, "step": 7950 }, { "epoch": 0.5911183721966434, "grad_norm": 3.5018069744110107, "learning_rate": 2.645328976682014e-05, "loss": 0.1001, "step": 7960 }, { "epoch": 0.591860983216991, "grad_norm": 1.2015966176986694, "learning_rate": 2.6448834100698055e-05, "loss": 0.0839, "step": 7970 }, { "epoch": 0.5926035942373384, "grad_norm": 1.9032807350158691, "learning_rate": 2.644437843457597e-05, "loss": 0.1019, "step": 7980 }, { "epoch": 0.593346205257686, "grad_norm": 2.2178187370300293, "learning_rate": 2.6439922768453885e-05, "loss": 0.0525, "step": 7990 }, { "epoch": 0.5940888162780336, "grad_norm": 2.323493719100952, "learning_rate": 2.64354671023318e-05, "loss": 0.1142, "step": 8000 }, { "epoch": 0.5948314272983811, "grad_norm": 5.971455097198486, "learning_rate": 2.6431011436209715e-05, "loss": 0.1032, "step": 8010 }, { "epoch": 0.5955740383187287, "grad_norm": 1.563317060470581, "learning_rate": 2.642655577008763e-05, "loss": 0.0701, "step": 8020 }, { "epoch": 0.5963166493390762, "grad_norm": 0.8707819581031799, "learning_rate": 2.642210010396554e-05, "loss": 0.1017, "step": 8030 }, { "epoch": 0.5970592603594237, "grad_norm": 1.7506704330444336, "learning_rate": 2.6417644437843456e-05, "loss": 0.0926, "step": 8040 }, { "epoch": 0.5978018713797713, "grad_norm": 1.1731964349746704, "learning_rate": 2.6413188771721375e-05, "loss": 0.0853, "step": 8050 }, { "epoch": 0.5985444824001188, "grad_norm": 2.8358826637268066, "learning_rate": 2.6408733105599286e-05, "loss": 0.0797, "step": 8060 }, { "epoch": 0.5992870934204664, "grad_norm": 2.624128818511963, "learning_rate": 2.64042774394772e-05, "loss": 0.1109, "step": 8070 }, { "epoch": 0.6000297044408139, "grad_norm": 1.6009690761566162, "learning_rate": 2.639982177335512e-05, "loss": 0.0783, "step": 8080 }, { "epoch": 0.6007723154611615, "grad_norm": 1.9895691871643066, "learning_rate": 2.639536610723303e-05, "loss": 0.1047, "step": 8090 }, { "epoch": 0.601514926481509, "grad_norm": 0.5253069996833801, "learning_rate": 2.6390910441110946e-05, "loss": 0.0767, "step": 8100 }, { "epoch": 0.6022575375018565, "grad_norm": 2.7466979026794434, "learning_rate": 2.6386454774988865e-05, "loss": 0.1141, "step": 8110 }, { "epoch": 0.6030001485222041, "grad_norm": 0.8582619428634644, "learning_rate": 2.6381999108866776e-05, "loss": 0.0747, "step": 8120 }, { "epoch": 0.6037427595425516, "grad_norm": 1.5232957601547241, "learning_rate": 2.637754344274469e-05, "loss": 0.0756, "step": 8130 }, { "epoch": 0.6044853705628992, "grad_norm": 1.6779173612594604, "learning_rate": 2.6373087776622603e-05, "loss": 0.06, "step": 8140 }, { "epoch": 0.6052279815832466, "grad_norm": 1.4858782291412354, "learning_rate": 2.636863211050052e-05, "loss": 0.0934, "step": 8150 }, { "epoch": 0.6059705926035942, "grad_norm": 1.1481568813323975, "learning_rate": 2.6364176444378436e-05, "loss": 0.1104, "step": 8160 }, { "epoch": 0.6067132036239418, "grad_norm": 2.9699254035949707, "learning_rate": 2.6359720778256348e-05, "loss": 0.0974, "step": 8170 }, { "epoch": 0.6074558146442893, "grad_norm": 2.6445741653442383, "learning_rate": 2.6355265112134266e-05, "loss": 0.0873, "step": 8180 }, { "epoch": 0.6081984256646369, "grad_norm": 0.4445909559726715, "learning_rate": 2.635080944601218e-05, "loss": 0.0713, "step": 8190 }, { "epoch": 0.6089410366849844, "grad_norm": 2.068956136703491, "learning_rate": 2.6346353779890093e-05, "loss": 0.0632, "step": 8200 }, { "epoch": 0.6096836477053319, "grad_norm": 2.9205379486083984, "learning_rate": 2.6341898113768008e-05, "loss": 0.061, "step": 8210 }, { "epoch": 0.6104262587256795, "grad_norm": 0.8605203032493591, "learning_rate": 2.6337442447645926e-05, "loss": 0.1332, "step": 8220 }, { "epoch": 0.611168869746027, "grad_norm": 1.654402256011963, "learning_rate": 2.6332986781523838e-05, "loss": 0.0808, "step": 8230 }, { "epoch": 0.6119114807663746, "grad_norm": 3.046501636505127, "learning_rate": 2.6328531115401753e-05, "loss": 0.1068, "step": 8240 }, { "epoch": 0.6126540917867221, "grad_norm": 2.973254680633545, "learning_rate": 2.6324075449279668e-05, "loss": 0.0951, "step": 8250 }, { "epoch": 0.6133967028070697, "grad_norm": 0.9132028222084045, "learning_rate": 2.6319619783157583e-05, "loss": 0.0798, "step": 8260 }, { "epoch": 0.6141393138274172, "grad_norm": 1.8893526792526245, "learning_rate": 2.6315164117035498e-05, "loss": 0.0725, "step": 8270 }, { "epoch": 0.6148819248477647, "grad_norm": 2.337425708770752, "learning_rate": 2.6310708450913413e-05, "loss": 0.0709, "step": 8280 }, { "epoch": 0.6156245358681123, "grad_norm": 1.1997939348220825, "learning_rate": 2.6306252784791328e-05, "loss": 0.0892, "step": 8290 }, { "epoch": 0.6163671468884598, "grad_norm": 2.1006369590759277, "learning_rate": 2.6301797118669243e-05, "loss": 0.0877, "step": 8300 }, { "epoch": 0.6171097579088074, "grad_norm": 1.0404902696609497, "learning_rate": 2.6297341452547154e-05, "loss": 0.0733, "step": 8310 }, { "epoch": 0.617852368929155, "grad_norm": 1.4689126014709473, "learning_rate": 2.6292885786425073e-05, "loss": 0.0738, "step": 8320 }, { "epoch": 0.6185949799495024, "grad_norm": 1.669219970703125, "learning_rate": 2.6288430120302988e-05, "loss": 0.0795, "step": 8330 }, { "epoch": 0.61933759096985, "grad_norm": 1.8779352903366089, "learning_rate": 2.62839744541809e-05, "loss": 0.0967, "step": 8340 }, { "epoch": 0.6200802019901975, "grad_norm": 2.112928867340088, "learning_rate": 2.6279518788058818e-05, "loss": 0.0695, "step": 8350 }, { "epoch": 0.6208228130105451, "grad_norm": 1.240665078163147, "learning_rate": 2.627506312193673e-05, "loss": 0.1107, "step": 8360 }, { "epoch": 0.6215654240308927, "grad_norm": 1.4730993509292603, "learning_rate": 2.6270607455814644e-05, "loss": 0.0955, "step": 8370 }, { "epoch": 0.6223080350512402, "grad_norm": 1.134709119796753, "learning_rate": 2.626615178969256e-05, "loss": 0.1074, "step": 8380 }, { "epoch": 0.6230506460715877, "grad_norm": 2.589599132537842, "learning_rate": 2.6261696123570474e-05, "loss": 0.0796, "step": 8390 }, { "epoch": 0.6237932570919352, "grad_norm": 2.0985918045043945, "learning_rate": 2.625724045744839e-05, "loss": 0.0877, "step": 8400 }, { "epoch": 0.6245358681122828, "grad_norm": 4.56246280670166, "learning_rate": 2.6252784791326304e-05, "loss": 0.0804, "step": 8410 }, { "epoch": 0.6252784791326303, "grad_norm": 2.3391928672790527, "learning_rate": 2.624832912520422e-05, "loss": 0.0987, "step": 8420 }, { "epoch": 0.6260210901529779, "grad_norm": 2.231879472732544, "learning_rate": 2.6243873459082134e-05, "loss": 0.0951, "step": 8430 }, { "epoch": 0.6267637011733254, "grad_norm": 0.44721463322639465, "learning_rate": 2.623941779296005e-05, "loss": 0.0713, "step": 8440 }, { "epoch": 0.6275063121936729, "grad_norm": 1.6443843841552734, "learning_rate": 2.623496212683796e-05, "loss": 0.1187, "step": 8450 }, { "epoch": 0.6282489232140205, "grad_norm": 3.586520195007324, "learning_rate": 2.623050646071588e-05, "loss": 0.0957, "step": 8460 }, { "epoch": 0.628991534234368, "grad_norm": 1.5607584714889526, "learning_rate": 2.622605079459379e-05, "loss": 0.0959, "step": 8470 }, { "epoch": 0.6297341452547156, "grad_norm": 2.6211838722229004, "learning_rate": 2.6221595128471706e-05, "loss": 0.1139, "step": 8480 }, { "epoch": 0.6304767562750632, "grad_norm": 2.3073689937591553, "learning_rate": 2.6217139462349624e-05, "loss": 0.0912, "step": 8490 }, { "epoch": 0.6312193672954106, "grad_norm": 1.4929405450820923, "learning_rate": 2.6212683796227536e-05, "loss": 0.0652, "step": 8500 }, { "epoch": 0.6319619783157582, "grad_norm": 4.748650074005127, "learning_rate": 2.620822813010545e-05, "loss": 0.0861, "step": 8510 }, { "epoch": 0.6327045893361057, "grad_norm": 1.6058298349380493, "learning_rate": 2.620377246398337e-05, "loss": 0.094, "step": 8520 }, { "epoch": 0.6334472003564533, "grad_norm": 0.9887398481369019, "learning_rate": 2.619931679786128e-05, "loss": 0.0531, "step": 8530 }, { "epoch": 0.6341898113768009, "grad_norm": 1.5614607334136963, "learning_rate": 2.6194861131739196e-05, "loss": 0.0575, "step": 8540 }, { "epoch": 0.6349324223971484, "grad_norm": 0.485836923122406, "learning_rate": 2.6190405465617107e-05, "loss": 0.0738, "step": 8550 }, { "epoch": 0.6356750334174959, "grad_norm": 2.3071041107177734, "learning_rate": 2.6185949799495026e-05, "loss": 0.0965, "step": 8560 }, { "epoch": 0.6364176444378434, "grad_norm": 1.3034030199050903, "learning_rate": 2.618149413337294e-05, "loss": 0.0766, "step": 8570 }, { "epoch": 0.637160255458191, "grad_norm": 2.9087538719177246, "learning_rate": 2.6177038467250852e-05, "loss": 0.0808, "step": 8580 }, { "epoch": 0.6379028664785386, "grad_norm": 1.7364327907562256, "learning_rate": 2.617258280112877e-05, "loss": 0.0727, "step": 8590 }, { "epoch": 0.6386454774988861, "grad_norm": 1.3691768646240234, "learning_rate": 2.6168127135006686e-05, "loss": 0.1037, "step": 8600 }, { "epoch": 0.6393880885192337, "grad_norm": 3.924298048019409, "learning_rate": 2.6163671468884597e-05, "loss": 0.1055, "step": 8610 }, { "epoch": 0.6401306995395811, "grad_norm": 4.720126152038574, "learning_rate": 2.6159215802762512e-05, "loss": 0.0634, "step": 8620 }, { "epoch": 0.6408733105599287, "grad_norm": 0.719524621963501, "learning_rate": 2.615476013664043e-05, "loss": 0.105, "step": 8630 }, { "epoch": 0.6416159215802762, "grad_norm": 2.0264840126037598, "learning_rate": 2.6150304470518342e-05, "loss": 0.0827, "step": 8640 }, { "epoch": 0.6423585326006238, "grad_norm": 2.5915403366088867, "learning_rate": 2.6145848804396257e-05, "loss": 0.0989, "step": 8650 }, { "epoch": 0.6431011436209714, "grad_norm": 1.406114935874939, "learning_rate": 2.6141393138274172e-05, "loss": 0.0776, "step": 8660 }, { "epoch": 0.6438437546413188, "grad_norm": 1.9596368074417114, "learning_rate": 2.6136937472152087e-05, "loss": 0.1162, "step": 8670 }, { "epoch": 0.6445863656616664, "grad_norm": 1.9101582765579224, "learning_rate": 2.6132481806030002e-05, "loss": 0.1058, "step": 8680 }, { "epoch": 0.6453289766820139, "grad_norm": 3.665165424346924, "learning_rate": 2.6128026139907917e-05, "loss": 0.0834, "step": 8690 }, { "epoch": 0.6460715877023615, "grad_norm": 1.8130497932434082, "learning_rate": 2.6123570473785832e-05, "loss": 0.0572, "step": 8700 }, { "epoch": 0.6468141987227091, "grad_norm": 2.0652337074279785, "learning_rate": 2.6119114807663747e-05, "loss": 0.1246, "step": 8710 }, { "epoch": 0.6475568097430566, "grad_norm": 1.8479968309402466, "learning_rate": 2.611465914154166e-05, "loss": 0.0879, "step": 8720 }, { "epoch": 0.6482994207634041, "grad_norm": 1.4413061141967773, "learning_rate": 2.6110203475419577e-05, "loss": 0.0743, "step": 8730 }, { "epoch": 0.6490420317837516, "grad_norm": 2.89367413520813, "learning_rate": 2.6105747809297492e-05, "loss": 0.0707, "step": 8740 }, { "epoch": 0.6497846428040992, "grad_norm": 3.1368815898895264, "learning_rate": 2.6101292143175404e-05, "loss": 0.0773, "step": 8750 }, { "epoch": 0.6505272538244468, "grad_norm": 2.096843719482422, "learning_rate": 2.6096836477053322e-05, "loss": 0.0732, "step": 8760 }, { "epoch": 0.6512698648447943, "grad_norm": 2.454930067062378, "learning_rate": 2.6092380810931234e-05, "loss": 0.0926, "step": 8770 }, { "epoch": 0.6520124758651419, "grad_norm": 2.2393689155578613, "learning_rate": 2.608792514480915e-05, "loss": 0.113, "step": 8780 }, { "epoch": 0.6527550868854893, "grad_norm": 1.3184117078781128, "learning_rate": 2.6083469478687064e-05, "loss": 0.0803, "step": 8790 }, { "epoch": 0.6534976979058369, "grad_norm": 1.2592401504516602, "learning_rate": 2.607901381256498e-05, "loss": 0.06, "step": 8800 }, { "epoch": 0.6542403089261845, "grad_norm": 1.8193804025650024, "learning_rate": 2.6074558146442894e-05, "loss": 0.0818, "step": 8810 }, { "epoch": 0.654982919946532, "grad_norm": 0.5750879049301147, "learning_rate": 2.607010248032081e-05, "loss": 0.0704, "step": 8820 }, { "epoch": 0.6557255309668796, "grad_norm": 2.028292655944824, "learning_rate": 2.6065646814198724e-05, "loss": 0.0577, "step": 8830 }, { "epoch": 0.6564681419872271, "grad_norm": 2.086024522781372, "learning_rate": 2.606119114807664e-05, "loss": 0.0675, "step": 8840 }, { "epoch": 0.6572107530075746, "grad_norm": 3.66861891746521, "learning_rate": 2.6056735481954554e-05, "loss": 0.066, "step": 8850 }, { "epoch": 0.6579533640279221, "grad_norm": 1.3219988346099854, "learning_rate": 2.6052279815832465e-05, "loss": 0.0709, "step": 8860 }, { "epoch": 0.6586959750482697, "grad_norm": 1.395115852355957, "learning_rate": 2.6047824149710384e-05, "loss": 0.1049, "step": 8870 }, { "epoch": 0.6594385860686173, "grad_norm": 2.2025349140167236, "learning_rate": 2.6043368483588295e-05, "loss": 0.0716, "step": 8880 }, { "epoch": 0.6601811970889648, "grad_norm": 0.7800239324569702, "learning_rate": 2.603891281746621e-05, "loss": 0.0774, "step": 8890 }, { "epoch": 0.6609238081093124, "grad_norm": 1.6750237941741943, "learning_rate": 2.603445715134413e-05, "loss": 0.0824, "step": 8900 }, { "epoch": 0.6616664191296598, "grad_norm": 3.150371789932251, "learning_rate": 2.603000148522204e-05, "loss": 0.1005, "step": 8910 }, { "epoch": 0.6624090301500074, "grad_norm": 2.6133267879486084, "learning_rate": 2.6025545819099955e-05, "loss": 0.0906, "step": 8920 }, { "epoch": 0.663151641170355, "grad_norm": 2.1227505207061768, "learning_rate": 2.6021090152977874e-05, "loss": 0.094, "step": 8930 }, { "epoch": 0.6638942521907025, "grad_norm": 3.7070045471191406, "learning_rate": 2.6016634486855785e-05, "loss": 0.1054, "step": 8940 }, { "epoch": 0.6646368632110501, "grad_norm": 2.8598554134368896, "learning_rate": 2.60121788207337e-05, "loss": 0.101, "step": 8950 }, { "epoch": 0.6653794742313975, "grad_norm": 1.542912483215332, "learning_rate": 2.6007723154611615e-05, "loss": 0.0896, "step": 8960 }, { "epoch": 0.6661220852517451, "grad_norm": 2.263106346130371, "learning_rate": 2.600326748848953e-05, "loss": 0.0893, "step": 8970 }, { "epoch": 0.6668646962720927, "grad_norm": 1.0385371446609497, "learning_rate": 2.5998811822367445e-05, "loss": 0.1132, "step": 8980 }, { "epoch": 0.6676073072924402, "grad_norm": 3.194511890411377, "learning_rate": 2.5994356156245357e-05, "loss": 0.0824, "step": 8990 }, { "epoch": 0.6683499183127878, "grad_norm": 1.4233129024505615, "learning_rate": 2.5989900490123275e-05, "loss": 0.0689, "step": 9000 }, { "epoch": 0.6690925293331353, "grad_norm": 1.2096024751663208, "learning_rate": 2.598544482400119e-05, "loss": 0.0736, "step": 9010 }, { "epoch": 0.6698351403534828, "grad_norm": 2.155372381210327, "learning_rate": 2.5980989157879102e-05, "loss": 0.077, "step": 9020 }, { "epoch": 0.6705777513738304, "grad_norm": 1.661603331565857, "learning_rate": 2.5976533491757017e-05, "loss": 0.097, "step": 9030 }, { "epoch": 0.6713203623941779, "grad_norm": 2.2005343437194824, "learning_rate": 2.5972077825634935e-05, "loss": 0.0742, "step": 9040 }, { "epoch": 0.6720629734145255, "grad_norm": 1.2867567539215088, "learning_rate": 2.5967622159512847e-05, "loss": 0.0691, "step": 9050 }, { "epoch": 0.672805584434873, "grad_norm": 2.7160210609436035, "learning_rate": 2.5963166493390762e-05, "loss": 0.1008, "step": 9060 }, { "epoch": 0.6735481954552206, "grad_norm": 2.456948757171631, "learning_rate": 2.5958710827268677e-05, "loss": 0.0676, "step": 9070 }, { "epoch": 0.674290806475568, "grad_norm": 1.7581907510757446, "learning_rate": 2.5954255161146592e-05, "loss": 0.0949, "step": 9080 }, { "epoch": 0.6750334174959156, "grad_norm": 1.2283096313476562, "learning_rate": 2.5949799495024507e-05, "loss": 0.0792, "step": 9090 }, { "epoch": 0.6757760285162632, "grad_norm": 1.879252552986145, "learning_rate": 2.5945343828902422e-05, "loss": 0.0732, "step": 9100 }, { "epoch": 0.6765186395366107, "grad_norm": 2.652205228805542, "learning_rate": 2.5940888162780337e-05, "loss": 0.0759, "step": 9110 }, { "epoch": 0.6772612505569583, "grad_norm": 4.162420749664307, "learning_rate": 2.5936432496658252e-05, "loss": 0.0737, "step": 9120 }, { "epoch": 0.6780038615773059, "grad_norm": 1.89590585231781, "learning_rate": 2.5931976830536163e-05, "loss": 0.0805, "step": 9130 }, { "epoch": 0.6787464725976533, "grad_norm": 1.6626734733581543, "learning_rate": 2.5927521164414082e-05, "loss": 0.1003, "step": 9140 }, { "epoch": 0.6794890836180009, "grad_norm": 1.87484610080719, "learning_rate": 2.5923065498291997e-05, "loss": 0.0795, "step": 9150 }, { "epoch": 0.6802316946383484, "grad_norm": 1.9725035429000854, "learning_rate": 2.591860983216991e-05, "loss": 0.0936, "step": 9160 }, { "epoch": 0.680974305658696, "grad_norm": 2.27907395362854, "learning_rate": 2.5914154166047827e-05, "loss": 0.0865, "step": 9170 }, { "epoch": 0.6817169166790435, "grad_norm": 1.4247010946273804, "learning_rate": 2.590969849992574e-05, "loss": 0.0751, "step": 9180 }, { "epoch": 0.6824595276993911, "grad_norm": 2.569737195968628, "learning_rate": 2.5905242833803653e-05, "loss": 0.1007, "step": 9190 }, { "epoch": 0.6832021387197386, "grad_norm": 3.3012797832489014, "learning_rate": 2.590078716768157e-05, "loss": 0.0889, "step": 9200 }, { "epoch": 0.6839447497400861, "grad_norm": 2.0903170108795166, "learning_rate": 2.5896331501559483e-05, "loss": 0.082, "step": 9210 }, { "epoch": 0.6846873607604337, "grad_norm": 1.6836172342300415, "learning_rate": 2.58918758354374e-05, "loss": 0.0873, "step": 9220 }, { "epoch": 0.6854299717807812, "grad_norm": 3.3756263256073, "learning_rate": 2.5887420169315313e-05, "loss": 0.0769, "step": 9230 }, { "epoch": 0.6861725828011288, "grad_norm": 1.1910730600357056, "learning_rate": 2.588296450319323e-05, "loss": 0.0894, "step": 9240 }, { "epoch": 0.6869151938214763, "grad_norm": 1.0612378120422363, "learning_rate": 2.5878508837071143e-05, "loss": 0.062, "step": 9250 }, { "epoch": 0.6876578048418238, "grad_norm": 1.0237765312194824, "learning_rate": 2.587405317094906e-05, "loss": 0.0904, "step": 9260 }, { "epoch": 0.6884004158621714, "grad_norm": 2.666456460952759, "learning_rate": 2.5869597504826973e-05, "loss": 0.0861, "step": 9270 }, { "epoch": 0.6891430268825189, "grad_norm": 1.1967474222183228, "learning_rate": 2.586514183870489e-05, "loss": 0.086, "step": 9280 }, { "epoch": 0.6898856379028665, "grad_norm": 3.264155626296997, "learning_rate": 2.58606861725828e-05, "loss": 0.0903, "step": 9290 }, { "epoch": 0.6906282489232141, "grad_norm": 2.126134157180786, "learning_rate": 2.5856230506460715e-05, "loss": 0.1036, "step": 9300 }, { "epoch": 0.6913708599435615, "grad_norm": 1.6895121335983276, "learning_rate": 2.5851774840338633e-05, "loss": 0.067, "step": 9310 }, { "epoch": 0.6921134709639091, "grad_norm": 2.2356975078582764, "learning_rate": 2.5847319174216545e-05, "loss": 0.0838, "step": 9320 }, { "epoch": 0.6928560819842566, "grad_norm": 1.7429089546203613, "learning_rate": 2.584286350809446e-05, "loss": 0.0731, "step": 9330 }, { "epoch": 0.6935986930046042, "grad_norm": 1.1210354566574097, "learning_rate": 2.583840784197238e-05, "loss": 0.0932, "step": 9340 }, { "epoch": 0.6943413040249518, "grad_norm": 1.4460147619247437, "learning_rate": 2.583395217585029e-05, "loss": 0.0817, "step": 9350 }, { "epoch": 0.6950839150452993, "grad_norm": 1.1217153072357178, "learning_rate": 2.5829496509728205e-05, "loss": 0.0671, "step": 9360 }, { "epoch": 0.6958265260656468, "grad_norm": 2.2373554706573486, "learning_rate": 2.582504084360612e-05, "loss": 0.0681, "step": 9370 }, { "epoch": 0.6965691370859943, "grad_norm": 2.8909049034118652, "learning_rate": 2.5820585177484035e-05, "loss": 0.1121, "step": 9380 }, { "epoch": 0.6973117481063419, "grad_norm": 0.4152112603187561, "learning_rate": 2.581612951136195e-05, "loss": 0.0894, "step": 9390 }, { "epoch": 0.6980543591266894, "grad_norm": 3.5851147174835205, "learning_rate": 2.581167384523986e-05, "loss": 0.0797, "step": 9400 }, { "epoch": 0.698796970147037, "grad_norm": 1.1283321380615234, "learning_rate": 2.580721817911778e-05, "loss": 0.0966, "step": 9410 }, { "epoch": 0.6995395811673846, "grad_norm": 2.237506151199341, "learning_rate": 2.5802762512995695e-05, "loss": 0.0554, "step": 9420 }, { "epoch": 0.700282192187732, "grad_norm": 2.4891796112060547, "learning_rate": 2.5798306846873607e-05, "loss": 0.0808, "step": 9430 }, { "epoch": 0.7010248032080796, "grad_norm": 1.4225846529006958, "learning_rate": 2.579385118075152e-05, "loss": 0.0733, "step": 9440 }, { "epoch": 0.7017674142284271, "grad_norm": 3.312795400619507, "learning_rate": 2.578939551462944e-05, "loss": 0.0979, "step": 9450 }, { "epoch": 0.7025100252487747, "grad_norm": 1.2239809036254883, "learning_rate": 2.578493984850735e-05, "loss": 0.0735, "step": 9460 }, { "epoch": 0.7032526362691223, "grad_norm": 3.1901540756225586, "learning_rate": 2.5780484182385266e-05, "loss": 0.0929, "step": 9470 }, { "epoch": 0.7039952472894698, "grad_norm": 1.4800280332565308, "learning_rate": 2.577602851626318e-05, "loss": 0.0693, "step": 9480 }, { "epoch": 0.7047378583098173, "grad_norm": 3.378511667251587, "learning_rate": 2.5771572850141096e-05, "loss": 0.0794, "step": 9490 }, { "epoch": 0.7054804693301648, "grad_norm": 2.557231903076172, "learning_rate": 2.576711718401901e-05, "loss": 0.0825, "step": 9500 }, { "epoch": 0.7062230803505124, "grad_norm": 1.7998268604278564, "learning_rate": 2.5762661517896926e-05, "loss": 0.0679, "step": 9510 }, { "epoch": 0.70696569137086, "grad_norm": 2.5356063842773438, "learning_rate": 2.575820585177484e-05, "loss": 0.0851, "step": 9520 }, { "epoch": 0.7077083023912075, "grad_norm": 3.3451857566833496, "learning_rate": 2.5753750185652756e-05, "loss": 0.0934, "step": 9530 }, { "epoch": 0.708450913411555, "grad_norm": 2.2727510929107666, "learning_rate": 2.5749294519530668e-05, "loss": 0.065, "step": 9540 }, { "epoch": 0.7091935244319025, "grad_norm": 3.0308828353881836, "learning_rate": 2.5744838853408586e-05, "loss": 0.1067, "step": 9550 }, { "epoch": 0.7099361354522501, "grad_norm": 0.393522173166275, "learning_rate": 2.57403831872865e-05, "loss": 0.0824, "step": 9560 }, { "epoch": 0.7106787464725977, "grad_norm": 1.6205034255981445, "learning_rate": 2.5735927521164413e-05, "loss": 0.0474, "step": 9570 }, { "epoch": 0.7114213574929452, "grad_norm": 1.4009572267532349, "learning_rate": 2.573147185504233e-05, "loss": 0.095, "step": 9580 }, { "epoch": 0.7121639685132928, "grad_norm": 1.9968441724777222, "learning_rate": 2.5727016188920243e-05, "loss": 0.0957, "step": 9590 }, { "epoch": 0.7129065795336402, "grad_norm": 1.6015273332595825, "learning_rate": 2.5722560522798158e-05, "loss": 0.0857, "step": 9600 }, { "epoch": 0.7136491905539878, "grad_norm": 1.4251041412353516, "learning_rate": 2.5718104856676073e-05, "loss": 0.0932, "step": 9610 }, { "epoch": 0.7143918015743354, "grad_norm": 5.090855598449707, "learning_rate": 2.5713649190553988e-05, "loss": 0.1032, "step": 9620 }, { "epoch": 0.7151344125946829, "grad_norm": 2.4273598194122314, "learning_rate": 2.5709193524431903e-05, "loss": 0.0827, "step": 9630 }, { "epoch": 0.7158770236150305, "grad_norm": 1.8204997777938843, "learning_rate": 2.5704737858309818e-05, "loss": 0.1167, "step": 9640 }, { "epoch": 0.716619634635378, "grad_norm": 1.7066177129745483, "learning_rate": 2.5700282192187733e-05, "loss": 0.1037, "step": 9650 }, { "epoch": 0.7173622456557255, "grad_norm": 2.3941705226898193, "learning_rate": 2.5695826526065648e-05, "loss": 0.1003, "step": 9660 }, { "epoch": 0.718104856676073, "grad_norm": 2.3168444633483887, "learning_rate": 2.5691370859943563e-05, "loss": 0.1067, "step": 9670 }, { "epoch": 0.7188474676964206, "grad_norm": 1.6166632175445557, "learning_rate": 2.5686915193821478e-05, "loss": 0.0904, "step": 9680 }, { "epoch": 0.7195900787167682, "grad_norm": 0.9666265845298767, "learning_rate": 2.5682459527699393e-05, "loss": 0.0967, "step": 9690 }, { "epoch": 0.7203326897371157, "grad_norm": 0.7397652864456177, "learning_rate": 2.5678003861577305e-05, "loss": 0.0604, "step": 9700 }, { "epoch": 0.7210753007574633, "grad_norm": 3.255927324295044, "learning_rate": 2.567354819545522e-05, "loss": 0.0784, "step": 9710 }, { "epoch": 0.7218179117778107, "grad_norm": 2.8680319786071777, "learning_rate": 2.5669092529333138e-05, "loss": 0.1131, "step": 9720 }, { "epoch": 0.7225605227981583, "grad_norm": 1.343375325202942, "learning_rate": 2.566463686321105e-05, "loss": 0.1042, "step": 9730 }, { "epoch": 0.7233031338185059, "grad_norm": 2.072066307067871, "learning_rate": 2.5660181197088965e-05, "loss": 0.1135, "step": 9740 }, { "epoch": 0.7240457448388534, "grad_norm": 2.82025408744812, "learning_rate": 2.5655725530966883e-05, "loss": 0.0621, "step": 9750 }, { "epoch": 0.724788355859201, "grad_norm": 4.173225402832031, "learning_rate": 2.5651269864844795e-05, "loss": 0.0756, "step": 9760 }, { "epoch": 0.7255309668795485, "grad_norm": 0.6784592866897583, "learning_rate": 2.564681419872271e-05, "loss": 0.0859, "step": 9770 }, { "epoch": 0.726273577899896, "grad_norm": 2.3363256454467773, "learning_rate": 2.5642358532600625e-05, "loss": 0.0596, "step": 9780 }, { "epoch": 0.7270161889202436, "grad_norm": 1.6436067819595337, "learning_rate": 2.563790286647854e-05, "loss": 0.0875, "step": 9790 }, { "epoch": 0.7277587999405911, "grad_norm": 2.9929933547973633, "learning_rate": 2.5633447200356455e-05, "loss": 0.1146, "step": 9800 }, { "epoch": 0.7285014109609387, "grad_norm": 2.5027360916137695, "learning_rate": 2.5628991534234366e-05, "loss": 0.0916, "step": 9810 }, { "epoch": 0.7292440219812862, "grad_norm": 0.6115292310714722, "learning_rate": 2.5624535868112285e-05, "loss": 0.0381, "step": 9820 }, { "epoch": 0.7299866330016337, "grad_norm": 3.5652284622192383, "learning_rate": 2.56200802019902e-05, "loss": 0.0678, "step": 9830 }, { "epoch": 0.7307292440219813, "grad_norm": 2.814704179763794, "learning_rate": 2.561562453586811e-05, "loss": 0.1019, "step": 9840 }, { "epoch": 0.7314718550423288, "grad_norm": 2.0167160034179688, "learning_rate": 2.5611168869746026e-05, "loss": 0.0719, "step": 9850 }, { "epoch": 0.7322144660626764, "grad_norm": 1.6718881130218506, "learning_rate": 2.5606713203623944e-05, "loss": 0.0965, "step": 9860 }, { "epoch": 0.7329570770830239, "grad_norm": 1.5811102390289307, "learning_rate": 2.5602257537501856e-05, "loss": 0.1263, "step": 9870 }, { "epoch": 0.7336996881033715, "grad_norm": 3.2773425579071045, "learning_rate": 2.559780187137977e-05, "loss": 0.0771, "step": 9880 }, { "epoch": 0.734442299123719, "grad_norm": 1.7898057699203491, "learning_rate": 2.559334620525769e-05, "loss": 0.0871, "step": 9890 }, { "epoch": 0.7351849101440665, "grad_norm": 2.309032917022705, "learning_rate": 2.55888905391356e-05, "loss": 0.0703, "step": 9900 }, { "epoch": 0.7359275211644141, "grad_norm": 1.4760417938232422, "learning_rate": 2.5584434873013516e-05, "loss": 0.0877, "step": 9910 }, { "epoch": 0.7366701321847616, "grad_norm": 1.4691712856292725, "learning_rate": 2.557997920689143e-05, "loss": 0.0909, "step": 9920 }, { "epoch": 0.7374127432051092, "grad_norm": 1.479776382446289, "learning_rate": 2.5575523540769346e-05, "loss": 0.0983, "step": 9930 }, { "epoch": 0.7381553542254568, "grad_norm": 2.160743474960327, "learning_rate": 2.557106787464726e-05, "loss": 0.0964, "step": 9940 }, { "epoch": 0.7388979652458042, "grad_norm": 1.2513461112976074, "learning_rate": 2.5566612208525173e-05, "loss": 0.0811, "step": 9950 }, { "epoch": 0.7396405762661518, "grad_norm": 1.080775499343872, "learning_rate": 2.556215654240309e-05, "loss": 0.0922, "step": 9960 }, { "epoch": 0.7403831872864993, "grad_norm": 2.420680284500122, "learning_rate": 2.5557700876281006e-05, "loss": 0.0645, "step": 9970 }, { "epoch": 0.7411257983068469, "grad_norm": 2.0995841026306152, "learning_rate": 2.5553245210158918e-05, "loss": 0.106, "step": 9980 }, { "epoch": 0.7418684093271944, "grad_norm": 3.2964303493499756, "learning_rate": 2.5548789544036836e-05, "loss": 0.0891, "step": 9990 }, { "epoch": 0.742611020347542, "grad_norm": 1.5144083499908447, "learning_rate": 2.5544333877914748e-05, "loss": 0.0698, "step": 10000 }, { "epoch": 0.7433536313678895, "grad_norm": 3.1648800373077393, "learning_rate": 2.5539878211792663e-05, "loss": 0.1023, "step": 10010 }, { "epoch": 0.744096242388237, "grad_norm": 2.7684147357940674, "learning_rate": 2.5535422545670578e-05, "loss": 0.0946, "step": 10020 }, { "epoch": 0.7448388534085846, "grad_norm": 2.4703927040100098, "learning_rate": 2.5530966879548493e-05, "loss": 0.0917, "step": 10030 }, { "epoch": 0.7455814644289321, "grad_norm": 4.016003131866455, "learning_rate": 2.5526511213426408e-05, "loss": 0.0888, "step": 10040 }, { "epoch": 0.7463240754492797, "grad_norm": 0.4022844135761261, "learning_rate": 2.5522055547304323e-05, "loss": 0.059, "step": 10050 }, { "epoch": 0.7470666864696273, "grad_norm": 0.8048895597457886, "learning_rate": 2.5517599881182238e-05, "loss": 0.0885, "step": 10060 }, { "epoch": 0.7478092974899747, "grad_norm": 3.6403074264526367, "learning_rate": 2.5513144215060153e-05, "loss": 0.0704, "step": 10070 }, { "epoch": 0.7485519085103223, "grad_norm": 1.1787481307983398, "learning_rate": 2.5508688548938068e-05, "loss": 0.0869, "step": 10080 }, { "epoch": 0.7492945195306698, "grad_norm": 2.7455785274505615, "learning_rate": 2.5504232882815983e-05, "loss": 0.0837, "step": 10090 }, { "epoch": 0.7500371305510174, "grad_norm": 1.82301664352417, "learning_rate": 2.5499777216693898e-05, "loss": 0.0581, "step": 10100 }, { "epoch": 0.750779741571365, "grad_norm": 1.8503745794296265, "learning_rate": 2.549532155057181e-05, "loss": 0.0519, "step": 10110 }, { "epoch": 0.7515223525917124, "grad_norm": 1.0572456121444702, "learning_rate": 2.5490865884449724e-05, "loss": 0.055, "step": 10120 }, { "epoch": 0.75226496361206, "grad_norm": 1.1795002222061157, "learning_rate": 2.5486410218327643e-05, "loss": 0.1019, "step": 10130 }, { "epoch": 0.7530075746324075, "grad_norm": 2.340430736541748, "learning_rate": 2.5481954552205554e-05, "loss": 0.0892, "step": 10140 }, { "epoch": 0.7537501856527551, "grad_norm": 2.2384378910064697, "learning_rate": 2.547749888608347e-05, "loss": 0.0461, "step": 10150 }, { "epoch": 0.7544927966731027, "grad_norm": 3.9596447944641113, "learning_rate": 2.5473043219961388e-05, "loss": 0.0699, "step": 10160 }, { "epoch": 0.7552354076934502, "grad_norm": 2.694197654724121, "learning_rate": 2.54685875538393e-05, "loss": 0.087, "step": 10170 }, { "epoch": 0.7559780187137977, "grad_norm": 1.5229603052139282, "learning_rate": 2.5464131887717214e-05, "loss": 0.0611, "step": 10180 }, { "epoch": 0.7567206297341452, "grad_norm": 1.1745027303695679, "learning_rate": 2.545967622159513e-05, "loss": 0.0937, "step": 10190 }, { "epoch": 0.7574632407544928, "grad_norm": 2.827160120010376, "learning_rate": 2.5455220555473044e-05, "loss": 0.0918, "step": 10200 }, { "epoch": 0.7582058517748403, "grad_norm": 0.49699798226356506, "learning_rate": 2.545076488935096e-05, "loss": 0.039, "step": 10210 }, { "epoch": 0.7589484627951879, "grad_norm": 0.5466452240943909, "learning_rate": 2.544630922322887e-05, "loss": 0.0774, "step": 10220 }, { "epoch": 0.7596910738155355, "grad_norm": 1.8753949403762817, "learning_rate": 2.544185355710679e-05, "loss": 0.0982, "step": 10230 }, { "epoch": 0.7604336848358829, "grad_norm": 2.802274465560913, "learning_rate": 2.5437397890984704e-05, "loss": 0.114, "step": 10240 }, { "epoch": 0.7611762958562305, "grad_norm": 2.2179017066955566, "learning_rate": 2.5432942224862616e-05, "loss": 0.0404, "step": 10250 }, { "epoch": 0.761918906876578, "grad_norm": 1.2496877908706665, "learning_rate": 2.5428486558740534e-05, "loss": 0.0764, "step": 10260 }, { "epoch": 0.7626615178969256, "grad_norm": 1.20204496383667, "learning_rate": 2.542403089261845e-05, "loss": 0.0817, "step": 10270 }, { "epoch": 0.7634041289172732, "grad_norm": 2.656388521194458, "learning_rate": 2.541957522649636e-05, "loss": 0.0801, "step": 10280 }, { "epoch": 0.7641467399376207, "grad_norm": 0.9805976748466492, "learning_rate": 2.5415119560374276e-05, "loss": 0.0655, "step": 10290 }, { "epoch": 0.7648893509579682, "grad_norm": 1.0946846008300781, "learning_rate": 2.5410663894252194e-05, "loss": 0.0897, "step": 10300 }, { "epoch": 0.7656319619783157, "grad_norm": 1.9143744707107544, "learning_rate": 2.5406208228130106e-05, "loss": 0.1053, "step": 10310 }, { "epoch": 0.7663745729986633, "grad_norm": 2.236309766769409, "learning_rate": 2.540175256200802e-05, "loss": 0.0781, "step": 10320 }, { "epoch": 0.7671171840190109, "grad_norm": 0.839529275894165, "learning_rate": 2.5397296895885936e-05, "loss": 0.0727, "step": 10330 }, { "epoch": 0.7678597950393584, "grad_norm": 1.2142996788024902, "learning_rate": 2.539284122976385e-05, "loss": 0.069, "step": 10340 }, { "epoch": 0.768602406059706, "grad_norm": 3.3854808807373047, "learning_rate": 2.5388385563641766e-05, "loss": 0.086, "step": 10350 }, { "epoch": 0.7693450170800534, "grad_norm": 1.9810289144515991, "learning_rate": 2.5383929897519677e-05, "loss": 0.0621, "step": 10360 }, { "epoch": 0.770087628100401, "grad_norm": 1.3424344062805176, "learning_rate": 2.5379474231397596e-05, "loss": 0.0884, "step": 10370 }, { "epoch": 0.7708302391207486, "grad_norm": 1.7278804779052734, "learning_rate": 2.537501856527551e-05, "loss": 0.0618, "step": 10380 }, { "epoch": 0.7715728501410961, "grad_norm": 2.9425151348114014, "learning_rate": 2.5370562899153422e-05, "loss": 0.1162, "step": 10390 }, { "epoch": 0.7723154611614437, "grad_norm": 0.7557898759841919, "learning_rate": 2.536610723303134e-05, "loss": 0.1009, "step": 10400 }, { "epoch": 0.7730580721817911, "grad_norm": 0.9816102981567383, "learning_rate": 2.5361651566909256e-05, "loss": 0.0806, "step": 10410 }, { "epoch": 0.7738006832021387, "grad_norm": 0.9218798875808716, "learning_rate": 2.5357195900787167e-05, "loss": 0.0424, "step": 10420 }, { "epoch": 0.7745432942224862, "grad_norm": 1.2472357749938965, "learning_rate": 2.5352740234665082e-05, "loss": 0.0706, "step": 10430 }, { "epoch": 0.7752859052428338, "grad_norm": 3.426825523376465, "learning_rate": 2.5348284568542997e-05, "loss": 0.0776, "step": 10440 }, { "epoch": 0.7760285162631814, "grad_norm": 4.194761753082275, "learning_rate": 2.5343828902420912e-05, "loss": 0.0697, "step": 10450 }, { "epoch": 0.7767711272835289, "grad_norm": 0.678124189376831, "learning_rate": 2.5339373236298827e-05, "loss": 0.1128, "step": 10460 }, { "epoch": 0.7775137383038764, "grad_norm": 2.648623466491699, "learning_rate": 2.5334917570176742e-05, "loss": 0.0876, "step": 10470 }, { "epoch": 0.7782563493242239, "grad_norm": 1.699841856956482, "learning_rate": 2.5330461904054657e-05, "loss": 0.074, "step": 10480 }, { "epoch": 0.7789989603445715, "grad_norm": 3.2049789428710938, "learning_rate": 2.5326006237932572e-05, "loss": 0.0872, "step": 10490 }, { "epoch": 0.7797415713649191, "grad_norm": 3.888385057449341, "learning_rate": 2.5321550571810487e-05, "loss": 0.0717, "step": 10500 }, { "epoch": 0.7804841823852666, "grad_norm": 2.0463638305664062, "learning_rate": 2.5317094905688402e-05, "loss": 0.1034, "step": 10510 }, { "epoch": 0.7812267934056142, "grad_norm": 0.6997508406639099, "learning_rate": 2.5312639239566314e-05, "loss": 0.0717, "step": 10520 }, { "epoch": 0.7819694044259616, "grad_norm": 1.8925009965896606, "learning_rate": 2.530818357344423e-05, "loss": 0.0693, "step": 10530 }, { "epoch": 0.7827120154463092, "grad_norm": 1.6464449167251587, "learning_rate": 2.5303727907322147e-05, "loss": 0.0822, "step": 10540 }, { "epoch": 0.7834546264666568, "grad_norm": 0.8865845203399658, "learning_rate": 2.529927224120006e-05, "loss": 0.0847, "step": 10550 }, { "epoch": 0.7841972374870043, "grad_norm": 2.912022113800049, "learning_rate": 2.5294816575077974e-05, "loss": 0.0784, "step": 10560 }, { "epoch": 0.7849398485073519, "grad_norm": 2.305199146270752, "learning_rate": 2.5290360908955892e-05, "loss": 0.0715, "step": 10570 }, { "epoch": 0.7856824595276994, "grad_norm": 3.301766872406006, "learning_rate": 2.5285905242833804e-05, "loss": 0.0818, "step": 10580 }, { "epoch": 0.7864250705480469, "grad_norm": 0.7540196180343628, "learning_rate": 2.528144957671172e-05, "loss": 0.0804, "step": 10590 }, { "epoch": 0.7871676815683945, "grad_norm": 4.044961452484131, "learning_rate": 2.5276993910589634e-05, "loss": 0.0827, "step": 10600 }, { "epoch": 0.787910292588742, "grad_norm": 1.4841824769973755, "learning_rate": 2.527253824446755e-05, "loss": 0.1031, "step": 10610 }, { "epoch": 0.7886529036090896, "grad_norm": 1.3933384418487549, "learning_rate": 2.5268082578345464e-05, "loss": 0.057, "step": 10620 }, { "epoch": 0.7893955146294371, "grad_norm": 2.6198787689208984, "learning_rate": 2.5263626912223375e-05, "loss": 0.0856, "step": 10630 }, { "epoch": 0.7901381256497847, "grad_norm": 1.5979726314544678, "learning_rate": 2.5259171246101294e-05, "loss": 0.0774, "step": 10640 }, { "epoch": 0.7908807366701321, "grad_norm": 1.575772762298584, "learning_rate": 2.525471557997921e-05, "loss": 0.0775, "step": 10650 }, { "epoch": 0.7916233476904797, "grad_norm": 2.2343573570251465, "learning_rate": 2.525025991385712e-05, "loss": 0.109, "step": 10660 }, { "epoch": 0.7923659587108273, "grad_norm": 0.6971462368965149, "learning_rate": 2.524580424773504e-05, "loss": 0.072, "step": 10670 }, { "epoch": 0.7931085697311748, "grad_norm": 1.1593713760375977, "learning_rate": 2.5241348581612954e-05, "loss": 0.1163, "step": 10680 }, { "epoch": 0.7938511807515224, "grad_norm": 2.554516077041626, "learning_rate": 2.5236892915490865e-05, "loss": 0.096, "step": 10690 }, { "epoch": 0.7945937917718698, "grad_norm": 1.123022198677063, "learning_rate": 2.523243724936878e-05, "loss": 0.0701, "step": 10700 }, { "epoch": 0.7953364027922174, "grad_norm": 2.6108126640319824, "learning_rate": 2.52279815832467e-05, "loss": 0.0747, "step": 10710 }, { "epoch": 0.796079013812565, "grad_norm": 0.9052862524986267, "learning_rate": 2.522352591712461e-05, "loss": 0.0864, "step": 10720 }, { "epoch": 0.7968216248329125, "grad_norm": 1.4516713619232178, "learning_rate": 2.5219070251002525e-05, "loss": 0.0646, "step": 10730 }, { "epoch": 0.7975642358532601, "grad_norm": 4.304675579071045, "learning_rate": 2.521461458488044e-05, "loss": 0.0857, "step": 10740 }, { "epoch": 0.7983068468736076, "grad_norm": 1.783659815788269, "learning_rate": 2.5210158918758355e-05, "loss": 0.0866, "step": 10750 }, { "epoch": 0.7990494578939551, "grad_norm": 1.544155240058899, "learning_rate": 2.520570325263627e-05, "loss": 0.0547, "step": 10760 }, { "epoch": 0.7997920689143027, "grad_norm": 3.2248337268829346, "learning_rate": 2.5201247586514182e-05, "loss": 0.0715, "step": 10770 }, { "epoch": 0.8005346799346502, "grad_norm": 1.370150089263916, "learning_rate": 2.51967919203921e-05, "loss": 0.0913, "step": 10780 }, { "epoch": 0.8012772909549978, "grad_norm": 1.1197993755340576, "learning_rate": 2.5192336254270015e-05, "loss": 0.0771, "step": 10790 }, { "epoch": 0.8020199019753453, "grad_norm": 1.1327694654464722, "learning_rate": 2.5187880588147927e-05, "loss": 0.0875, "step": 10800 }, { "epoch": 0.8027625129956929, "grad_norm": 1.8613241910934448, "learning_rate": 2.5183424922025845e-05, "loss": 0.0681, "step": 10810 }, { "epoch": 0.8035051240160404, "grad_norm": 2.5763204097747803, "learning_rate": 2.517896925590376e-05, "loss": 0.0884, "step": 10820 }, { "epoch": 0.8042477350363879, "grad_norm": 2.63012433052063, "learning_rate": 2.5174513589781672e-05, "loss": 0.0685, "step": 10830 }, { "epoch": 0.8049903460567355, "grad_norm": 0.752113401889801, "learning_rate": 2.5170057923659587e-05, "loss": 0.1005, "step": 10840 }, { "epoch": 0.805732957077083, "grad_norm": 1.2157506942749023, "learning_rate": 2.5165602257537502e-05, "loss": 0.0859, "step": 10850 }, { "epoch": 0.8064755680974306, "grad_norm": 2.4420969486236572, "learning_rate": 2.5161146591415417e-05, "loss": 0.0972, "step": 10860 }, { "epoch": 0.8072181791177782, "grad_norm": 2.374080181121826, "learning_rate": 2.5156690925293332e-05, "loss": 0.0523, "step": 10870 }, { "epoch": 0.8079607901381256, "grad_norm": 0.5466364622116089, "learning_rate": 2.5152235259171247e-05, "loss": 0.0482, "step": 10880 }, { "epoch": 0.8087034011584732, "grad_norm": 0.722277045249939, "learning_rate": 2.5147779593049162e-05, "loss": 0.0616, "step": 10890 }, { "epoch": 0.8094460121788207, "grad_norm": 3.784972906112671, "learning_rate": 2.5143323926927077e-05, "loss": 0.0835, "step": 10900 }, { "epoch": 0.8101886231991683, "grad_norm": 1.7221379280090332, "learning_rate": 2.5138868260804992e-05, "loss": 0.0891, "step": 10910 }, { "epoch": 0.8109312342195159, "grad_norm": 1.2730120420455933, "learning_rate": 2.5134412594682907e-05, "loss": 0.078, "step": 10920 }, { "epoch": 0.8116738452398634, "grad_norm": 1.8988823890686035, "learning_rate": 2.512995692856082e-05, "loss": 0.0918, "step": 10930 }, { "epoch": 0.8124164562602109, "grad_norm": 0.7136462926864624, "learning_rate": 2.5125501262438733e-05, "loss": 0.047, "step": 10940 }, { "epoch": 0.8131590672805584, "grad_norm": 1.3096719980239868, "learning_rate": 2.5121045596316652e-05, "loss": 0.09, "step": 10950 }, { "epoch": 0.813901678300906, "grad_norm": 1.3436990976333618, "learning_rate": 2.5116589930194563e-05, "loss": 0.0795, "step": 10960 }, { "epoch": 0.8146442893212535, "grad_norm": 1.0467826128005981, "learning_rate": 2.511213426407248e-05, "loss": 0.0906, "step": 10970 }, { "epoch": 0.8153869003416011, "grad_norm": 0.924268364906311, "learning_rate": 2.5107678597950397e-05, "loss": 0.094, "step": 10980 }, { "epoch": 0.8161295113619486, "grad_norm": 1.8887720108032227, "learning_rate": 2.510322293182831e-05, "loss": 0.0878, "step": 10990 }, { "epoch": 0.8168721223822961, "grad_norm": 3.255546808242798, "learning_rate": 2.5098767265706223e-05, "loss": 0.0729, "step": 11000 }, { "epoch": 0.8176147334026437, "grad_norm": 1.3691035509109497, "learning_rate": 2.509431159958414e-05, "loss": 0.0854, "step": 11010 }, { "epoch": 0.8183573444229912, "grad_norm": 0.6990775465965271, "learning_rate": 2.5089855933462053e-05, "loss": 0.046, "step": 11020 }, { "epoch": 0.8190999554433388, "grad_norm": 2.0553324222564697, "learning_rate": 2.508540026733997e-05, "loss": 0.0902, "step": 11030 }, { "epoch": 0.8198425664636864, "grad_norm": 2.257805109024048, "learning_rate": 2.508094460121788e-05, "loss": 0.0885, "step": 11040 }, { "epoch": 0.8205851774840338, "grad_norm": 1.704160451889038, "learning_rate": 2.50764889350958e-05, "loss": 0.0632, "step": 11050 }, { "epoch": 0.8213277885043814, "grad_norm": 0.8274914622306824, "learning_rate": 2.5072033268973713e-05, "loss": 0.0925, "step": 11060 }, { "epoch": 0.8220703995247289, "grad_norm": 0.8776381015777588, "learning_rate": 2.5067577602851625e-05, "loss": 0.0756, "step": 11070 }, { "epoch": 0.8228130105450765, "grad_norm": 1.621468424797058, "learning_rate": 2.5063121936729543e-05, "loss": 0.0895, "step": 11080 }, { "epoch": 0.8235556215654241, "grad_norm": 0.5569895505905151, "learning_rate": 2.5058666270607458e-05, "loss": 0.0859, "step": 11090 }, { "epoch": 0.8242982325857716, "grad_norm": 1.4502453804016113, "learning_rate": 2.505421060448537e-05, "loss": 0.113, "step": 11100 }, { "epoch": 0.8250408436061191, "grad_norm": 2.805652141571045, "learning_rate": 2.5049754938363285e-05, "loss": 0.0803, "step": 11110 }, { "epoch": 0.8257834546264666, "grad_norm": 1.2207911014556885, "learning_rate": 2.5045299272241203e-05, "loss": 0.0753, "step": 11120 }, { "epoch": 0.8265260656468142, "grad_norm": 1.3321232795715332, "learning_rate": 2.5040843606119115e-05, "loss": 0.0789, "step": 11130 }, { "epoch": 0.8272686766671618, "grad_norm": 0.8445536494255066, "learning_rate": 2.503638793999703e-05, "loss": 0.1171, "step": 11140 }, { "epoch": 0.8280112876875093, "grad_norm": 1.156607985496521, "learning_rate": 2.5031932273874945e-05, "loss": 0.0651, "step": 11150 }, { "epoch": 0.8287538987078569, "grad_norm": 2.5844602584838867, "learning_rate": 2.502747660775286e-05, "loss": 0.0992, "step": 11160 }, { "epoch": 0.8294965097282043, "grad_norm": 2.682854413986206, "learning_rate": 2.5023020941630775e-05, "loss": 0.0615, "step": 11170 }, { "epoch": 0.8302391207485519, "grad_norm": 1.1782902479171753, "learning_rate": 2.5018565275508686e-05, "loss": 0.075, "step": 11180 }, { "epoch": 0.8309817317688994, "grad_norm": 3.394202709197998, "learning_rate": 2.5014109609386605e-05, "loss": 0.1351, "step": 11190 }, { "epoch": 0.831724342789247, "grad_norm": 2.0263335704803467, "learning_rate": 2.500965394326452e-05, "loss": 0.086, "step": 11200 }, { "epoch": 0.8324669538095946, "grad_norm": 2.133747100830078, "learning_rate": 2.500519827714243e-05, "loss": 0.0788, "step": 11210 }, { "epoch": 0.8332095648299421, "grad_norm": 3.7382562160491943, "learning_rate": 2.500074261102035e-05, "loss": 0.067, "step": 11220 }, { "epoch": 0.8339521758502896, "grad_norm": 1.4864078760147095, "learning_rate": 2.4996286944898265e-05, "loss": 0.0822, "step": 11230 }, { "epoch": 0.8346947868706371, "grad_norm": 2.0430474281311035, "learning_rate": 2.4991831278776176e-05, "loss": 0.0774, "step": 11240 }, { "epoch": 0.8354373978909847, "grad_norm": 3.536273956298828, "learning_rate": 2.498737561265409e-05, "loss": 0.1103, "step": 11250 }, { "epoch": 0.8361800089113323, "grad_norm": 0.7639611959457397, "learning_rate": 2.4982919946532006e-05, "loss": 0.0826, "step": 11260 }, { "epoch": 0.8369226199316798, "grad_norm": 1.1882314682006836, "learning_rate": 2.497846428040992e-05, "loss": 0.0952, "step": 11270 }, { "epoch": 0.8376652309520273, "grad_norm": 1.9526349306106567, "learning_rate": 2.4974008614287836e-05, "loss": 0.0446, "step": 11280 }, { "epoch": 0.8384078419723748, "grad_norm": 2.2650139331817627, "learning_rate": 2.496955294816575e-05, "loss": 0.0896, "step": 11290 }, { "epoch": 0.8391504529927224, "grad_norm": 1.7543269395828247, "learning_rate": 2.4965097282043666e-05, "loss": 0.0582, "step": 11300 }, { "epoch": 0.83989306401307, "grad_norm": 1.1936362981796265, "learning_rate": 2.496064161592158e-05, "loss": 0.0939, "step": 11310 }, { "epoch": 0.8406356750334175, "grad_norm": 2.0941545963287354, "learning_rate": 2.4956185949799496e-05, "loss": 0.0691, "step": 11320 }, { "epoch": 0.8413782860537651, "grad_norm": 3.267097234725952, "learning_rate": 2.495173028367741e-05, "loss": 0.0826, "step": 11330 }, { "epoch": 0.8421208970741125, "grad_norm": 2.769155263900757, "learning_rate": 2.4947274617555326e-05, "loss": 0.0572, "step": 11340 }, { "epoch": 0.8428635080944601, "grad_norm": 0.9428232312202454, "learning_rate": 2.4942818951433238e-05, "loss": 0.081, "step": 11350 }, { "epoch": 0.8436061191148077, "grad_norm": 1.1093528270721436, "learning_rate": 2.4938363285311156e-05, "loss": 0.0688, "step": 11360 }, { "epoch": 0.8443487301351552, "grad_norm": 1.8220789432525635, "learning_rate": 2.4933907619189068e-05, "loss": 0.0744, "step": 11370 }, { "epoch": 0.8450913411555028, "grad_norm": 3.5718438625335693, "learning_rate": 2.4929451953066983e-05, "loss": 0.0902, "step": 11380 }, { "epoch": 0.8458339521758503, "grad_norm": 1.5545248985290527, "learning_rate": 2.49249962869449e-05, "loss": 0.1087, "step": 11390 }, { "epoch": 0.8465765631961978, "grad_norm": 1.3270010948181152, "learning_rate": 2.4920540620822813e-05, "loss": 0.1128, "step": 11400 }, { "epoch": 0.8473191742165453, "grad_norm": 1.9811359643936157, "learning_rate": 2.4916084954700728e-05, "loss": 0.0802, "step": 11410 }, { "epoch": 0.8480617852368929, "grad_norm": 2.4535109996795654, "learning_rate": 2.4911629288578643e-05, "loss": 0.076, "step": 11420 }, { "epoch": 0.8488043962572405, "grad_norm": 2.042264223098755, "learning_rate": 2.4907173622456558e-05, "loss": 0.0889, "step": 11430 }, { "epoch": 0.849547007277588, "grad_norm": 1.1611895561218262, "learning_rate": 2.4902717956334473e-05, "loss": 0.0782, "step": 11440 }, { "epoch": 0.8502896182979356, "grad_norm": 1.567514181137085, "learning_rate": 2.4898262290212385e-05, "loss": 0.1065, "step": 11450 }, { "epoch": 0.851032229318283, "grad_norm": 1.7414668798446655, "learning_rate": 2.4893806624090303e-05, "loss": 0.0752, "step": 11460 }, { "epoch": 0.8517748403386306, "grad_norm": 1.3344578742980957, "learning_rate": 2.4889350957968218e-05, "loss": 0.0794, "step": 11470 }, { "epoch": 0.8525174513589782, "grad_norm": 1.340126395225525, "learning_rate": 2.488489529184613e-05, "loss": 0.0936, "step": 11480 }, { "epoch": 0.8532600623793257, "grad_norm": 2.9865872859954834, "learning_rate": 2.4880439625724048e-05, "loss": 0.0619, "step": 11490 }, { "epoch": 0.8540026733996733, "grad_norm": 2.3079800605773926, "learning_rate": 2.4875983959601963e-05, "loss": 0.0819, "step": 11500 }, { "epoch": 0.8547452844200208, "grad_norm": 2.029001474380493, "learning_rate": 2.4871528293479874e-05, "loss": 0.0964, "step": 11510 }, { "epoch": 0.8554878954403683, "grad_norm": 0.8514242768287659, "learning_rate": 2.486707262735779e-05, "loss": 0.1067, "step": 11520 }, { "epoch": 0.8562305064607159, "grad_norm": 3.7588460445404053, "learning_rate": 2.4862616961235708e-05, "loss": 0.0691, "step": 11530 }, { "epoch": 0.8569731174810634, "grad_norm": 1.4834811687469482, "learning_rate": 2.485816129511362e-05, "loss": 0.0713, "step": 11540 }, { "epoch": 0.857715728501411, "grad_norm": 1.4009684324264526, "learning_rate": 2.4853705628991534e-05, "loss": 0.1032, "step": 11550 }, { "epoch": 0.8584583395217585, "grad_norm": 4.322129249572754, "learning_rate": 2.484924996286945e-05, "loss": 0.1125, "step": 11560 }, { "epoch": 0.859200950542106, "grad_norm": 2.336434841156006, "learning_rate": 2.4844794296747364e-05, "loss": 0.1049, "step": 11570 }, { "epoch": 0.8599435615624536, "grad_norm": 1.3329766988754272, "learning_rate": 2.484033863062528e-05, "loss": 0.0499, "step": 11580 }, { "epoch": 0.8606861725828011, "grad_norm": 2.4188973903656006, "learning_rate": 2.483588296450319e-05, "loss": 0.0634, "step": 11590 }, { "epoch": 0.8614287836031487, "grad_norm": 0.7930353283882141, "learning_rate": 2.483142729838111e-05, "loss": 0.0704, "step": 11600 }, { "epoch": 0.8621713946234962, "grad_norm": 1.0637152194976807, "learning_rate": 2.4826971632259024e-05, "loss": 0.0958, "step": 11610 }, { "epoch": 0.8629140056438438, "grad_norm": 1.6092619895935059, "learning_rate": 2.4822515966136936e-05, "loss": 0.0967, "step": 11620 }, { "epoch": 0.8636566166641912, "grad_norm": 1.6927438974380493, "learning_rate": 2.4818060300014854e-05, "loss": 0.0814, "step": 11630 }, { "epoch": 0.8643992276845388, "grad_norm": 2.1163792610168457, "learning_rate": 2.481360463389277e-05, "loss": 0.0845, "step": 11640 }, { "epoch": 0.8651418387048864, "grad_norm": 3.7081539630889893, "learning_rate": 2.480914896777068e-05, "loss": 0.0802, "step": 11650 }, { "epoch": 0.8658844497252339, "grad_norm": 0.5612799525260925, "learning_rate": 2.48046933016486e-05, "loss": 0.0845, "step": 11660 }, { "epoch": 0.8666270607455815, "grad_norm": 3.5866827964782715, "learning_rate": 2.480023763552651e-05, "loss": 0.0662, "step": 11670 }, { "epoch": 0.867369671765929, "grad_norm": 2.168499231338501, "learning_rate": 2.4795781969404426e-05, "loss": 0.077, "step": 11680 }, { "epoch": 0.8681122827862765, "grad_norm": 1.5439636707305908, "learning_rate": 2.479132630328234e-05, "loss": 0.0681, "step": 11690 }, { "epoch": 0.8688548938066241, "grad_norm": 5.759429931640625, "learning_rate": 2.4786870637160256e-05, "loss": 0.0536, "step": 11700 }, { "epoch": 0.8695975048269716, "grad_norm": 3.6019375324249268, "learning_rate": 2.478241497103817e-05, "loss": 0.0706, "step": 11710 }, { "epoch": 0.8703401158473192, "grad_norm": 2.023331880569458, "learning_rate": 2.4777959304916086e-05, "loss": 0.1183, "step": 11720 }, { "epoch": 0.8710827268676667, "grad_norm": 2.2907047271728516, "learning_rate": 2.4773503638794e-05, "loss": 0.1194, "step": 11730 }, { "epoch": 0.8718253378880143, "grad_norm": 0.9772320985794067, "learning_rate": 2.4769047972671916e-05, "loss": 0.0775, "step": 11740 }, { "epoch": 0.8725679489083618, "grad_norm": 2.4488956928253174, "learning_rate": 2.476459230654983e-05, "loss": 0.0775, "step": 11750 }, { "epoch": 0.8733105599287093, "grad_norm": 1.9681178331375122, "learning_rate": 2.4760136640427743e-05, "loss": 0.0738, "step": 11760 }, { "epoch": 0.8740531709490569, "grad_norm": 1.475229024887085, "learning_rate": 2.475568097430566e-05, "loss": 0.0754, "step": 11770 }, { "epoch": 0.8747957819694044, "grad_norm": 2.42449951171875, "learning_rate": 2.4751225308183573e-05, "loss": 0.0802, "step": 11780 }, { "epoch": 0.875538392989752, "grad_norm": 1.4891407489776611, "learning_rate": 2.4746769642061488e-05, "loss": 0.0627, "step": 11790 }, { "epoch": 0.8762810040100996, "grad_norm": 2.7915236949920654, "learning_rate": 2.4742313975939406e-05, "loss": 0.0902, "step": 11800 }, { "epoch": 0.877023615030447, "grad_norm": 0.7196487188339233, "learning_rate": 2.4737858309817318e-05, "loss": 0.0803, "step": 11810 }, { "epoch": 0.8777662260507946, "grad_norm": 2.8779609203338623, "learning_rate": 2.4733402643695233e-05, "loss": 0.0804, "step": 11820 }, { "epoch": 0.8785088370711421, "grad_norm": 1.198697566986084, "learning_rate": 2.4728946977573148e-05, "loss": 0.0507, "step": 11830 }, { "epoch": 0.8792514480914897, "grad_norm": 2.312344789505005, "learning_rate": 2.4724491311451063e-05, "loss": 0.0869, "step": 11840 }, { "epoch": 0.8799940591118373, "grad_norm": 1.2055100202560425, "learning_rate": 2.4720035645328978e-05, "loss": 0.0791, "step": 11850 }, { "epoch": 0.8807366701321847, "grad_norm": 1.9583430290222168, "learning_rate": 2.4715579979206893e-05, "loss": 0.0704, "step": 11860 }, { "epoch": 0.8814792811525323, "grad_norm": 3.8078420162200928, "learning_rate": 2.4711124313084808e-05, "loss": 0.0853, "step": 11870 }, { "epoch": 0.8822218921728798, "grad_norm": 1.3811652660369873, "learning_rate": 2.4706668646962723e-05, "loss": 0.0972, "step": 11880 }, { "epoch": 0.8829645031932274, "grad_norm": 0.9326895475387573, "learning_rate": 2.4702212980840634e-05, "loss": 0.0777, "step": 11890 }, { "epoch": 0.883707114213575, "grad_norm": 1.280218243598938, "learning_rate": 2.4697757314718552e-05, "loss": 0.0856, "step": 11900 }, { "epoch": 0.8844497252339225, "grad_norm": 3.6391515731811523, "learning_rate": 2.4693301648596467e-05, "loss": 0.0919, "step": 11910 }, { "epoch": 0.88519233625427, "grad_norm": 1.1083297729492188, "learning_rate": 2.468884598247438e-05, "loss": 0.0576, "step": 11920 }, { "epoch": 0.8859349472746175, "grad_norm": 1.3229732513427734, "learning_rate": 2.4684390316352294e-05, "loss": 0.0891, "step": 11930 }, { "epoch": 0.8866775582949651, "grad_norm": 1.0628166198730469, "learning_rate": 2.4679934650230212e-05, "loss": 0.1007, "step": 11940 }, { "epoch": 0.8874201693153126, "grad_norm": 1.2441374063491821, "learning_rate": 2.4675478984108124e-05, "loss": 0.0989, "step": 11950 }, { "epoch": 0.8881627803356602, "grad_norm": 0.5451275110244751, "learning_rate": 2.467102331798604e-05, "loss": 0.0576, "step": 11960 }, { "epoch": 0.8889053913560078, "grad_norm": 0.9148317575454712, "learning_rate": 2.4666567651863954e-05, "loss": 0.0591, "step": 11970 }, { "epoch": 0.8896480023763552, "grad_norm": 0.7988538146018982, "learning_rate": 2.466211198574187e-05, "loss": 0.0683, "step": 11980 }, { "epoch": 0.8903906133967028, "grad_norm": 2.098226547241211, "learning_rate": 2.4657656319619784e-05, "loss": 0.0964, "step": 11990 }, { "epoch": 0.8911332244170503, "grad_norm": 2.0330681800842285, "learning_rate": 2.4653200653497696e-05, "loss": 0.0865, "step": 12000 }, { "epoch": 0.8918758354373979, "grad_norm": 2.2921535968780518, "learning_rate": 2.4648744987375614e-05, "loss": 0.077, "step": 12010 }, { "epoch": 0.8926184464577455, "grad_norm": 3.7081544399261475, "learning_rate": 2.464428932125353e-05, "loss": 0.099, "step": 12020 }, { "epoch": 0.893361057478093, "grad_norm": 0.6023477911949158, "learning_rate": 2.463983365513144e-05, "loss": 0.0599, "step": 12030 }, { "epoch": 0.8941036684984405, "grad_norm": 1.6796938180923462, "learning_rate": 2.463537798900936e-05, "loss": 0.0984, "step": 12040 }, { "epoch": 0.894846279518788, "grad_norm": 4.132201194763184, "learning_rate": 2.4630922322887274e-05, "loss": 0.0771, "step": 12050 }, { "epoch": 0.8955888905391356, "grad_norm": 2.146115303039551, "learning_rate": 2.4626466656765186e-05, "loss": 0.0808, "step": 12060 }, { "epoch": 0.8963315015594832, "grad_norm": 0.9783619046211243, "learning_rate": 2.4622010990643104e-05, "loss": 0.0676, "step": 12070 }, { "epoch": 0.8970741125798307, "grad_norm": 0.4546336829662323, "learning_rate": 2.4617555324521016e-05, "loss": 0.092, "step": 12080 }, { "epoch": 0.8978167236001783, "grad_norm": 1.2638888359069824, "learning_rate": 2.461309965839893e-05, "loss": 0.0672, "step": 12090 }, { "epoch": 0.8985593346205257, "grad_norm": 1.3266007900238037, "learning_rate": 2.4608643992276846e-05, "loss": 0.0567, "step": 12100 }, { "epoch": 0.8993019456408733, "grad_norm": 2.1246678829193115, "learning_rate": 2.460418832615476e-05, "loss": 0.0795, "step": 12110 }, { "epoch": 0.9000445566612209, "grad_norm": 2.1990578174591064, "learning_rate": 2.4599732660032676e-05, "loss": 0.0881, "step": 12120 }, { "epoch": 0.9007871676815684, "grad_norm": 0.5446377992630005, "learning_rate": 2.459527699391059e-05, "loss": 0.0697, "step": 12130 }, { "epoch": 0.901529778701916, "grad_norm": 2.7443840503692627, "learning_rate": 2.4590821327788506e-05, "loss": 0.0912, "step": 12140 }, { "epoch": 0.9022723897222634, "grad_norm": 1.4836909770965576, "learning_rate": 2.458636566166642e-05, "loss": 0.0904, "step": 12150 }, { "epoch": 0.903015000742611, "grad_norm": 0.3852311968803406, "learning_rate": 2.4581909995544336e-05, "loss": 0.0695, "step": 12160 }, { "epoch": 0.9037576117629585, "grad_norm": 1.652395248413086, "learning_rate": 2.4577454329422247e-05, "loss": 0.0836, "step": 12170 }, { "epoch": 0.9045002227833061, "grad_norm": 3.4490652084350586, "learning_rate": 2.4572998663300166e-05, "loss": 0.0974, "step": 12180 }, { "epoch": 0.9052428338036537, "grad_norm": 1.1233237981796265, "learning_rate": 2.4568542997178077e-05, "loss": 0.0968, "step": 12190 }, { "epoch": 0.9059854448240012, "grad_norm": 1.1226853132247925, "learning_rate": 2.4564087331055992e-05, "loss": 0.0837, "step": 12200 }, { "epoch": 0.9067280558443487, "grad_norm": 6.846561908721924, "learning_rate": 2.455963166493391e-05, "loss": 0.0965, "step": 12210 }, { "epoch": 0.9074706668646962, "grad_norm": 2.6397814750671387, "learning_rate": 2.4555175998811822e-05, "loss": 0.074, "step": 12220 }, { "epoch": 0.9082132778850438, "grad_norm": 1.7175049781799316, "learning_rate": 2.4550720332689737e-05, "loss": 0.076, "step": 12230 }, { "epoch": 0.9089558889053914, "grad_norm": 1.3105518817901611, "learning_rate": 2.4546264666567652e-05, "loss": 0.0952, "step": 12240 }, { "epoch": 0.9096984999257389, "grad_norm": 3.014943838119507, "learning_rate": 2.4541809000445567e-05, "loss": 0.089, "step": 12250 }, { "epoch": 0.9104411109460865, "grad_norm": 1.1072237491607666, "learning_rate": 2.4537353334323482e-05, "loss": 0.0823, "step": 12260 }, { "epoch": 0.9111837219664339, "grad_norm": 1.2074459791183472, "learning_rate": 2.4532897668201397e-05, "loss": 0.0787, "step": 12270 }, { "epoch": 0.9119263329867815, "grad_norm": 0.8429141640663147, "learning_rate": 2.4528442002079312e-05, "loss": 0.1024, "step": 12280 }, { "epoch": 0.9126689440071291, "grad_norm": 2.6227517127990723, "learning_rate": 2.4523986335957227e-05, "loss": 0.0799, "step": 12290 }, { "epoch": 0.9134115550274766, "grad_norm": 0.7948519587516785, "learning_rate": 2.451953066983514e-05, "loss": 0.0722, "step": 12300 }, { "epoch": 0.9141541660478242, "grad_norm": 1.5158371925354004, "learning_rate": 2.4515075003713057e-05, "loss": 0.0938, "step": 12310 }, { "epoch": 0.9148967770681717, "grad_norm": 1.2404049634933472, "learning_rate": 2.4510619337590972e-05, "loss": 0.1068, "step": 12320 }, { "epoch": 0.9156393880885192, "grad_norm": 1.7605699300765991, "learning_rate": 2.4506163671468884e-05, "loss": 0.0894, "step": 12330 }, { "epoch": 0.9163819991088668, "grad_norm": 1.6435573101043701, "learning_rate": 2.45017080053468e-05, "loss": 0.0742, "step": 12340 }, { "epoch": 0.9171246101292143, "grad_norm": 0.6193175911903381, "learning_rate": 2.4497252339224717e-05, "loss": 0.0551, "step": 12350 }, { "epoch": 0.9178672211495619, "grad_norm": 2.4157028198242188, "learning_rate": 2.449279667310263e-05, "loss": 0.0889, "step": 12360 }, { "epoch": 0.9186098321699094, "grad_norm": 3.2014384269714355, "learning_rate": 2.4488341006980544e-05, "loss": 0.0777, "step": 12370 }, { "epoch": 0.919352443190257, "grad_norm": 2.560277223587036, "learning_rate": 2.448388534085846e-05, "loss": 0.0892, "step": 12380 }, { "epoch": 0.9200950542106044, "grad_norm": 1.1629691123962402, "learning_rate": 2.4479429674736374e-05, "loss": 0.0693, "step": 12390 }, { "epoch": 0.920837665230952, "grad_norm": 0.7739498019218445, "learning_rate": 2.447497400861429e-05, "loss": 0.0751, "step": 12400 }, { "epoch": 0.9215802762512996, "grad_norm": 3.0681796073913574, "learning_rate": 2.44705183424922e-05, "loss": 0.1, "step": 12410 }, { "epoch": 0.9223228872716471, "grad_norm": 1.9392578601837158, "learning_rate": 2.446606267637012e-05, "loss": 0.0949, "step": 12420 }, { "epoch": 0.9230654982919947, "grad_norm": 1.269616723060608, "learning_rate": 2.4461607010248034e-05, "loss": 0.0821, "step": 12430 }, { "epoch": 0.9238081093123421, "grad_norm": 0.9152816534042358, "learning_rate": 2.4457151344125945e-05, "loss": 0.0905, "step": 12440 }, { "epoch": 0.9245507203326897, "grad_norm": 1.6232317686080933, "learning_rate": 2.4452695678003864e-05, "loss": 0.0869, "step": 12450 }, { "epoch": 0.9252933313530373, "grad_norm": 3.454188585281372, "learning_rate": 2.444824001188178e-05, "loss": 0.0908, "step": 12460 }, { "epoch": 0.9260359423733848, "grad_norm": 1.880387783050537, "learning_rate": 2.444378434575969e-05, "loss": 0.0705, "step": 12470 }, { "epoch": 0.9267785533937324, "grad_norm": 2.0626840591430664, "learning_rate": 2.443932867963761e-05, "loss": 0.0897, "step": 12480 }, { "epoch": 0.92752116441408, "grad_norm": 1.5957422256469727, "learning_rate": 2.443487301351552e-05, "loss": 0.0902, "step": 12490 }, { "epoch": 0.9282637754344274, "grad_norm": 3.2366816997528076, "learning_rate": 2.4430417347393435e-05, "loss": 0.0757, "step": 12500 }, { "epoch": 0.929006386454775, "grad_norm": 0.9479996562004089, "learning_rate": 2.442596168127135e-05, "loss": 0.0631, "step": 12510 }, { "epoch": 0.9297489974751225, "grad_norm": 1.3636139631271362, "learning_rate": 2.4421506015149265e-05, "loss": 0.0845, "step": 12520 }, { "epoch": 0.9304916084954701, "grad_norm": 1.1171748638153076, "learning_rate": 2.441705034902718e-05, "loss": 0.0786, "step": 12530 }, { "epoch": 0.9312342195158176, "grad_norm": 1.6914044618606567, "learning_rate": 2.4412594682905095e-05, "loss": 0.0919, "step": 12540 }, { "epoch": 0.9319768305361652, "grad_norm": 1.3057868480682373, "learning_rate": 2.440813901678301e-05, "loss": 0.079, "step": 12550 }, { "epoch": 0.9327194415565127, "grad_norm": 4.414134979248047, "learning_rate": 2.4403683350660925e-05, "loss": 0.0893, "step": 12560 }, { "epoch": 0.9334620525768602, "grad_norm": 0.9300063848495483, "learning_rate": 2.439922768453884e-05, "loss": 0.1079, "step": 12570 }, { "epoch": 0.9342046635972078, "grad_norm": 0.956235408782959, "learning_rate": 2.4394772018416752e-05, "loss": 0.1299, "step": 12580 }, { "epoch": 0.9349472746175553, "grad_norm": 1.3673025369644165, "learning_rate": 2.439031635229467e-05, "loss": 0.0704, "step": 12590 }, { "epoch": 0.9356898856379029, "grad_norm": 0.966641366481781, "learning_rate": 2.4385860686172582e-05, "loss": 0.0806, "step": 12600 }, { "epoch": 0.9364324966582505, "grad_norm": 1.7886812686920166, "learning_rate": 2.4381405020050497e-05, "loss": 0.0621, "step": 12610 }, { "epoch": 0.9371751076785979, "grad_norm": 1.5795032978057861, "learning_rate": 2.4376949353928415e-05, "loss": 0.0645, "step": 12620 }, { "epoch": 0.9379177186989455, "grad_norm": 0.8962666392326355, "learning_rate": 2.4372493687806327e-05, "loss": 0.0825, "step": 12630 }, { "epoch": 0.938660329719293, "grad_norm": 1.669140100479126, "learning_rate": 2.4368038021684242e-05, "loss": 0.1027, "step": 12640 }, { "epoch": 0.9394029407396406, "grad_norm": 0.6645317077636719, "learning_rate": 2.4363582355562157e-05, "loss": 0.0672, "step": 12650 }, { "epoch": 0.9401455517599882, "grad_norm": 1.4275974035263062, "learning_rate": 2.4359126689440072e-05, "loss": 0.094, "step": 12660 }, { "epoch": 0.9408881627803357, "grad_norm": 1.8758856058120728, "learning_rate": 2.4354671023317987e-05, "loss": 0.0911, "step": 12670 }, { "epoch": 0.9416307738006832, "grad_norm": 1.4419819116592407, "learning_rate": 2.4350215357195902e-05, "loss": 0.0613, "step": 12680 }, { "epoch": 0.9423733848210307, "grad_norm": 1.668791651725769, "learning_rate": 2.4345759691073817e-05, "loss": 0.067, "step": 12690 }, { "epoch": 0.9431159958413783, "grad_norm": 1.9676769971847534, "learning_rate": 2.4341304024951732e-05, "loss": 0.0881, "step": 12700 }, { "epoch": 0.9438586068617258, "grad_norm": 0.5292758941650391, "learning_rate": 2.4336848358829643e-05, "loss": 0.0722, "step": 12710 }, { "epoch": 0.9446012178820734, "grad_norm": 2.253980875015259, "learning_rate": 2.433239269270756e-05, "loss": 0.0689, "step": 12720 }, { "epoch": 0.9453438289024209, "grad_norm": 2.0630314350128174, "learning_rate": 2.4327937026585477e-05, "loss": 0.0867, "step": 12730 }, { "epoch": 0.9460864399227684, "grad_norm": 0.9975630640983582, "learning_rate": 2.4323481360463388e-05, "loss": 0.1088, "step": 12740 }, { "epoch": 0.946829050943116, "grad_norm": 1.6637675762176514, "learning_rate": 2.4319025694341303e-05, "loss": 0.103, "step": 12750 }, { "epoch": 0.9475716619634635, "grad_norm": 1.508355975151062, "learning_rate": 2.431457002821922e-05, "loss": 0.1022, "step": 12760 }, { "epoch": 0.9483142729838111, "grad_norm": 1.989896297454834, "learning_rate": 2.4310114362097133e-05, "loss": 0.0647, "step": 12770 }, { "epoch": 0.9490568840041587, "grad_norm": 0.7862587571144104, "learning_rate": 2.4305658695975048e-05, "loss": 0.0754, "step": 12780 }, { "epoch": 0.9497994950245061, "grad_norm": 0.7664479613304138, "learning_rate": 2.4301203029852967e-05, "loss": 0.0759, "step": 12790 }, { "epoch": 0.9505421060448537, "grad_norm": 1.1246778964996338, "learning_rate": 2.4296747363730878e-05, "loss": 0.0671, "step": 12800 }, { "epoch": 0.9512847170652012, "grad_norm": 1.6385598182678223, "learning_rate": 2.4292291697608793e-05, "loss": 0.1062, "step": 12810 }, { "epoch": 0.9520273280855488, "grad_norm": 1.8573966026306152, "learning_rate": 2.4287836031486705e-05, "loss": 0.0691, "step": 12820 }, { "epoch": 0.9527699391058964, "grad_norm": 1.2876501083374023, "learning_rate": 2.4283380365364623e-05, "loss": 0.0699, "step": 12830 }, { "epoch": 0.9535125501262439, "grad_norm": 3.32975435256958, "learning_rate": 2.4278924699242538e-05, "loss": 0.0984, "step": 12840 }, { "epoch": 0.9542551611465914, "grad_norm": 2.0247581005096436, "learning_rate": 2.427446903312045e-05, "loss": 0.0784, "step": 12850 }, { "epoch": 0.9549977721669389, "grad_norm": 3.556913375854492, "learning_rate": 2.4270013366998368e-05, "loss": 0.0756, "step": 12860 }, { "epoch": 0.9557403831872865, "grad_norm": 1.9502661228179932, "learning_rate": 2.4265557700876283e-05, "loss": 0.1019, "step": 12870 }, { "epoch": 0.956482994207634, "grad_norm": 1.5548827648162842, "learning_rate": 2.4261102034754195e-05, "loss": 0.077, "step": 12880 }, { "epoch": 0.9572256052279816, "grad_norm": 2.959385871887207, "learning_rate": 2.4256646368632113e-05, "loss": 0.0799, "step": 12890 }, { "epoch": 0.9579682162483292, "grad_norm": 0.9673056602478027, "learning_rate": 2.4252190702510025e-05, "loss": 0.0768, "step": 12900 }, { "epoch": 0.9587108272686766, "grad_norm": 1.747621774673462, "learning_rate": 2.424773503638794e-05, "loss": 0.0604, "step": 12910 }, { "epoch": 0.9594534382890242, "grad_norm": 1.6321135759353638, "learning_rate": 2.4243279370265855e-05, "loss": 0.0706, "step": 12920 }, { "epoch": 0.9601960493093717, "grad_norm": 2.2727482318878174, "learning_rate": 2.423882370414377e-05, "loss": 0.0939, "step": 12930 }, { "epoch": 0.9609386603297193, "grad_norm": 0.6536130309104919, "learning_rate": 2.4234368038021685e-05, "loss": 0.0881, "step": 12940 }, { "epoch": 0.9616812713500669, "grad_norm": 2.0956132411956787, "learning_rate": 2.42299123718996e-05, "loss": 0.0837, "step": 12950 }, { "epoch": 0.9624238823704144, "grad_norm": 1.1719980239868164, "learning_rate": 2.4225456705777515e-05, "loss": 0.0789, "step": 12960 }, { "epoch": 0.9631664933907619, "grad_norm": 3.233799934387207, "learning_rate": 2.422100103965543e-05, "loss": 0.0597, "step": 12970 }, { "epoch": 0.9639091044111094, "grad_norm": 3.5093204975128174, "learning_rate": 2.4216545373533345e-05, "loss": 0.1026, "step": 12980 }, { "epoch": 0.964651715431457, "grad_norm": 1.597965121269226, "learning_rate": 2.4212089707411256e-05, "loss": 0.0852, "step": 12990 }, { "epoch": 0.9653943264518046, "grad_norm": 1.1015452146530151, "learning_rate": 2.4207634041289175e-05, "loss": 0.0651, "step": 13000 }, { "epoch": 0.9661369374721521, "grad_norm": 0.7174215316772461, "learning_rate": 2.4203178375167086e-05, "loss": 0.0693, "step": 13010 }, { "epoch": 0.9668795484924996, "grad_norm": 1.493241786956787, "learning_rate": 2.4198722709045e-05, "loss": 0.0757, "step": 13020 }, { "epoch": 0.9676221595128471, "grad_norm": 2.9904990196228027, "learning_rate": 2.419426704292292e-05, "loss": 0.0782, "step": 13030 }, { "epoch": 0.9683647705331947, "grad_norm": 1.8118109703063965, "learning_rate": 2.418981137680083e-05, "loss": 0.0718, "step": 13040 }, { "epoch": 0.9691073815535423, "grad_norm": 3.275655508041382, "learning_rate": 2.4185355710678746e-05, "loss": 0.0977, "step": 13050 }, { "epoch": 0.9698499925738898, "grad_norm": 1.704103708267212, "learning_rate": 2.4180900044556665e-05, "loss": 0.0578, "step": 13060 }, { "epoch": 0.9705926035942374, "grad_norm": 1.3468433618545532, "learning_rate": 2.4176444378434576e-05, "loss": 0.0595, "step": 13070 }, { "epoch": 0.9713352146145848, "grad_norm": 0.6979770064353943, "learning_rate": 2.417198871231249e-05, "loss": 0.0875, "step": 13080 }, { "epoch": 0.9720778256349324, "grad_norm": 3.1347808837890625, "learning_rate": 2.4167533046190406e-05, "loss": 0.1095, "step": 13090 }, { "epoch": 0.97282043665528, "grad_norm": 2.341815710067749, "learning_rate": 2.416307738006832e-05, "loss": 0.0909, "step": 13100 }, { "epoch": 0.9735630476756275, "grad_norm": 1.4380031824111938, "learning_rate": 2.4158621713946236e-05, "loss": 0.0855, "step": 13110 }, { "epoch": 0.9743056586959751, "grad_norm": 2.384162425994873, "learning_rate": 2.4154166047824148e-05, "loss": 0.0851, "step": 13120 }, { "epoch": 0.9750482697163226, "grad_norm": 3.7416675090789795, "learning_rate": 2.4149710381702066e-05, "loss": 0.0855, "step": 13130 }, { "epoch": 0.9757908807366701, "grad_norm": 1.3692728281021118, "learning_rate": 2.414525471557998e-05, "loss": 0.0492, "step": 13140 }, { "epoch": 0.9765334917570176, "grad_norm": 2.30066180229187, "learning_rate": 2.4140799049457893e-05, "loss": 0.0818, "step": 13150 }, { "epoch": 0.9772761027773652, "grad_norm": 0.9965130686759949, "learning_rate": 2.4136343383335808e-05, "loss": 0.08, "step": 13160 }, { "epoch": 0.9780187137977128, "grad_norm": 3.0066161155700684, "learning_rate": 2.4131887717213726e-05, "loss": 0.0932, "step": 13170 }, { "epoch": 0.9787613248180603, "grad_norm": 2.1586639881134033, "learning_rate": 2.4127432051091638e-05, "loss": 0.0582, "step": 13180 }, { "epoch": 0.9795039358384079, "grad_norm": 1.0595694780349731, "learning_rate": 2.4122976384969553e-05, "loss": 0.0731, "step": 13190 }, { "epoch": 0.9802465468587553, "grad_norm": 1.56610906124115, "learning_rate": 2.411852071884747e-05, "loss": 0.054, "step": 13200 }, { "epoch": 0.9809891578791029, "grad_norm": 0.9875765442848206, "learning_rate": 2.4114065052725383e-05, "loss": 0.0748, "step": 13210 }, { "epoch": 0.9817317688994505, "grad_norm": 0.9391055703163147, "learning_rate": 2.4109609386603298e-05, "loss": 0.0635, "step": 13220 }, { "epoch": 0.982474379919798, "grad_norm": 2.4511687755584717, "learning_rate": 2.410515372048121e-05, "loss": 0.0709, "step": 13230 }, { "epoch": 0.9832169909401456, "grad_norm": 1.185548186302185, "learning_rate": 2.4100698054359128e-05, "loss": 0.0477, "step": 13240 }, { "epoch": 0.9839596019604931, "grad_norm": 2.511913299560547, "learning_rate": 2.4096242388237043e-05, "loss": 0.055, "step": 13250 }, { "epoch": 0.9847022129808406, "grad_norm": 3.263899326324463, "learning_rate": 2.4091786722114954e-05, "loss": 0.0808, "step": 13260 }, { "epoch": 0.9854448240011882, "grad_norm": 1.0159684419631958, "learning_rate": 2.4087331055992873e-05, "loss": 0.0897, "step": 13270 }, { "epoch": 0.9861874350215357, "grad_norm": 1.7687329053878784, "learning_rate": 2.4082875389870788e-05, "loss": 0.0789, "step": 13280 }, { "epoch": 0.9869300460418833, "grad_norm": 3.7545151710510254, "learning_rate": 2.40784197237487e-05, "loss": 0.0785, "step": 13290 }, { "epoch": 0.9876726570622308, "grad_norm": 4.160385608673096, "learning_rate": 2.4073964057626618e-05, "loss": 0.0784, "step": 13300 }, { "epoch": 0.9884152680825783, "grad_norm": 2.456456184387207, "learning_rate": 2.4069508391504533e-05, "loss": 0.0863, "step": 13310 }, { "epoch": 0.9891578791029259, "grad_norm": 1.3430705070495605, "learning_rate": 2.4065052725382444e-05, "loss": 0.0961, "step": 13320 }, { "epoch": 0.9899004901232734, "grad_norm": 1.3270246982574463, "learning_rate": 2.406059705926036e-05, "loss": 0.0721, "step": 13330 }, { "epoch": 0.990643101143621, "grad_norm": 1.4000052213668823, "learning_rate": 2.4056141393138274e-05, "loss": 0.0679, "step": 13340 }, { "epoch": 0.9913857121639685, "grad_norm": 3.377154588699341, "learning_rate": 2.405168572701619e-05, "loss": 0.0651, "step": 13350 }, { "epoch": 0.9921283231843161, "grad_norm": 1.1014114618301392, "learning_rate": 2.4047230060894104e-05, "loss": 0.1156, "step": 13360 }, { "epoch": 0.9928709342046635, "grad_norm": 2.2198503017425537, "learning_rate": 2.404277439477202e-05, "loss": 0.0821, "step": 13370 }, { "epoch": 0.9936135452250111, "grad_norm": 2.7557029724121094, "learning_rate": 2.4038318728649934e-05, "loss": 0.0833, "step": 13380 }, { "epoch": 0.9943561562453587, "grad_norm": 3.052049160003662, "learning_rate": 2.403386306252785e-05, "loss": 0.0832, "step": 13390 }, { "epoch": 0.9950987672657062, "grad_norm": 1.0992413759231567, "learning_rate": 2.402940739640576e-05, "loss": 0.1037, "step": 13400 }, { "epoch": 0.9958413782860538, "grad_norm": 0.9476717114448547, "learning_rate": 2.402495173028368e-05, "loss": 0.0732, "step": 13410 }, { "epoch": 0.9965839893064014, "grad_norm": 0.403255820274353, "learning_rate": 2.402049606416159e-05, "loss": 0.0565, "step": 13420 }, { "epoch": 0.9973266003267488, "grad_norm": 0.33830782771110535, "learning_rate": 2.4016040398039506e-05, "loss": 0.0614, "step": 13430 }, { "epoch": 0.9980692113470964, "grad_norm": 1.7131567001342773, "learning_rate": 2.4011584731917424e-05, "loss": 0.1047, "step": 13440 }, { "epoch": 0.9988118223674439, "grad_norm": 1.484653115272522, "learning_rate": 2.4007129065795336e-05, "loss": 0.089, "step": 13450 }, { "epoch": 0.9995544333877915, "grad_norm": 1.4303562641143799, "learning_rate": 2.400267339967325e-05, "loss": 0.0672, "step": 13460 }, { "epoch": 1.0, "eval_f1": 0.0, "eval_loss": 0.06958512961864471, "eval_runtime": 835.1475, "eval_samples_per_second": 45.524, "eval_steps_per_second": 2.846, "step": 13466 }, { "epoch": 1.000297044408139, "grad_norm": 1.3870664834976196, "learning_rate": 2.399821773355117e-05, "loss": 0.0639, "step": 13470 }, { "epoch": 1.0010396554284866, "grad_norm": 1.3017977476119995, "learning_rate": 2.399376206742908e-05, "loss": 0.0541, "step": 13480 }, { "epoch": 1.001782266448834, "grad_norm": 3.119060516357422, "learning_rate": 2.3989306401306996e-05, "loss": 0.0774, "step": 13490 }, { "epoch": 1.0025248774691817, "grad_norm": 1.463212013244629, "learning_rate": 2.398485073518491e-05, "loss": 0.0869, "step": 13500 }, { "epoch": 1.0032674884895292, "grad_norm": 1.4256367683410645, "learning_rate": 2.3980395069062826e-05, "loss": 0.0856, "step": 13510 }, { "epoch": 1.0040100995098766, "grad_norm": 2.062420606613159, "learning_rate": 2.397593940294074e-05, "loss": 0.0575, "step": 13520 }, { "epoch": 1.0047527105302243, "grad_norm": 1.5311766862869263, "learning_rate": 2.3971483736818653e-05, "loss": 0.1091, "step": 13530 }, { "epoch": 1.0054953215505718, "grad_norm": 1.6785446405410767, "learning_rate": 2.396702807069657e-05, "loss": 0.0978, "step": 13540 }, { "epoch": 1.0062379325709194, "grad_norm": 2.021958827972412, "learning_rate": 2.3962572404574486e-05, "loss": 0.1013, "step": 13550 }, { "epoch": 1.0069805435912669, "grad_norm": 1.236924409866333, "learning_rate": 2.3958116738452397e-05, "loss": 0.0621, "step": 13560 }, { "epoch": 1.0077231546116145, "grad_norm": 0.7201011180877686, "learning_rate": 2.3953661072330312e-05, "loss": 0.0832, "step": 13570 }, { "epoch": 1.008465765631962, "grad_norm": 2.1270737648010254, "learning_rate": 2.394920540620823e-05, "loss": 0.0553, "step": 13580 }, { "epoch": 1.0092083766523094, "grad_norm": 2.2044267654418945, "learning_rate": 2.3944749740086142e-05, "loss": 0.091, "step": 13590 }, { "epoch": 1.0099509876726571, "grad_norm": 1.7166577577590942, "learning_rate": 2.3940294073964057e-05, "loss": 0.0774, "step": 13600 }, { "epoch": 1.0106935986930046, "grad_norm": 1.7393256425857544, "learning_rate": 2.3935838407841976e-05, "loss": 0.0649, "step": 13610 }, { "epoch": 1.0114362097133522, "grad_norm": 1.1891884803771973, "learning_rate": 2.3931382741719887e-05, "loss": 0.0873, "step": 13620 }, { "epoch": 1.0121788207336997, "grad_norm": 0.8433098793029785, "learning_rate": 2.3926927075597802e-05, "loss": 0.0742, "step": 13630 }, { "epoch": 1.0129214317540471, "grad_norm": 1.3263208866119385, "learning_rate": 2.3922471409475717e-05, "loss": 0.0663, "step": 13640 }, { "epoch": 1.0136640427743948, "grad_norm": 2.3335938453674316, "learning_rate": 2.3918015743353632e-05, "loss": 0.0824, "step": 13650 }, { "epoch": 1.0144066537947423, "grad_norm": 0.9380444884300232, "learning_rate": 2.3913560077231547e-05, "loss": 0.0787, "step": 13660 }, { "epoch": 1.01514926481509, "grad_norm": 1.832839846611023, "learning_rate": 2.390910441110946e-05, "loss": 0.0656, "step": 13670 }, { "epoch": 1.0158918758354374, "grad_norm": 1.5540494918823242, "learning_rate": 2.3904648744987377e-05, "loss": 0.0403, "step": 13680 }, { "epoch": 1.0166344868557848, "grad_norm": 3.9998695850372314, "learning_rate": 2.3900193078865292e-05, "loss": 0.1071, "step": 13690 }, { "epoch": 1.0173770978761325, "grad_norm": 2.738377571105957, "learning_rate": 2.3895737412743204e-05, "loss": 0.0826, "step": 13700 }, { "epoch": 1.01811970889648, "grad_norm": 5.063292503356934, "learning_rate": 2.3891281746621122e-05, "loss": 0.0653, "step": 13710 }, { "epoch": 1.0188623199168276, "grad_norm": 0.8923290967941284, "learning_rate": 2.3886826080499037e-05, "loss": 0.089, "step": 13720 }, { "epoch": 1.019604930937175, "grad_norm": 0.7146623134613037, "learning_rate": 2.388237041437695e-05, "loss": 0.0731, "step": 13730 }, { "epoch": 1.0203475419575228, "grad_norm": 5.034801006317139, "learning_rate": 2.3877914748254864e-05, "loss": 0.0837, "step": 13740 }, { "epoch": 1.0210901529778702, "grad_norm": 2.207108736038208, "learning_rate": 2.387345908213278e-05, "loss": 0.0966, "step": 13750 }, { "epoch": 1.0218327639982177, "grad_norm": 4.187524795532227, "learning_rate": 2.3869003416010694e-05, "loss": 0.0873, "step": 13760 }, { "epoch": 1.0225753750185653, "grad_norm": 2.0011796951293945, "learning_rate": 2.386454774988861e-05, "loss": 0.1067, "step": 13770 }, { "epoch": 1.0233179860389128, "grad_norm": 1.419082760810852, "learning_rate": 2.3860092083766524e-05, "loss": 0.1013, "step": 13780 }, { "epoch": 1.0240605970592604, "grad_norm": 1.378090500831604, "learning_rate": 2.385563641764444e-05, "loss": 0.0844, "step": 13790 }, { "epoch": 1.024803208079608, "grad_norm": 1.8442440032958984, "learning_rate": 2.3851180751522354e-05, "loss": 0.0922, "step": 13800 }, { "epoch": 1.0255458190999553, "grad_norm": 1.9411593675613403, "learning_rate": 2.3846725085400266e-05, "loss": 0.0699, "step": 13810 }, { "epoch": 1.026288430120303, "grad_norm": 1.1153783798217773, "learning_rate": 2.3842269419278184e-05, "loss": 0.0986, "step": 13820 }, { "epoch": 1.0270310411406505, "grad_norm": 2.119503974914551, "learning_rate": 2.3837813753156096e-05, "loss": 0.0851, "step": 13830 }, { "epoch": 1.0277736521609981, "grad_norm": 2.0890700817108154, "learning_rate": 2.383335808703401e-05, "loss": 0.0687, "step": 13840 }, { "epoch": 1.0285162631813456, "grad_norm": 3.1581473350524902, "learning_rate": 2.382890242091193e-05, "loss": 0.0774, "step": 13850 }, { "epoch": 1.0292588742016933, "grad_norm": 1.0169066190719604, "learning_rate": 2.382444675478984e-05, "loss": 0.0636, "step": 13860 }, { "epoch": 1.0300014852220407, "grad_norm": 1.5723227262496948, "learning_rate": 2.3819991088667756e-05, "loss": 0.0878, "step": 13870 }, { "epoch": 1.0307440962423882, "grad_norm": 1.652784824371338, "learning_rate": 2.3815535422545674e-05, "loss": 0.0862, "step": 13880 }, { "epoch": 1.0314867072627358, "grad_norm": 1.3980462551116943, "learning_rate": 2.3811079756423586e-05, "loss": 0.0937, "step": 13890 }, { "epoch": 1.0322293182830833, "grad_norm": 0.9966709613800049, "learning_rate": 2.38066240903015e-05, "loss": 0.0551, "step": 13900 }, { "epoch": 1.032971929303431, "grad_norm": 1.0643346309661865, "learning_rate": 2.3802168424179416e-05, "loss": 0.0647, "step": 13910 }, { "epoch": 1.0337145403237784, "grad_norm": 1.1754989624023438, "learning_rate": 2.379771275805733e-05, "loss": 0.0859, "step": 13920 }, { "epoch": 1.0344571513441259, "grad_norm": 1.513095736503601, "learning_rate": 2.3793257091935245e-05, "loss": 0.1229, "step": 13930 }, { "epoch": 1.0351997623644735, "grad_norm": 2.310269832611084, "learning_rate": 2.3788801425813157e-05, "loss": 0.0641, "step": 13940 }, { "epoch": 1.035942373384821, "grad_norm": 1.0776907205581665, "learning_rate": 2.3784345759691075e-05, "loss": 0.0738, "step": 13950 }, { "epoch": 1.0366849844051687, "grad_norm": 1.2684451341629028, "learning_rate": 2.377989009356899e-05, "loss": 0.1101, "step": 13960 }, { "epoch": 1.037427595425516, "grad_norm": 2.617946147918701, "learning_rate": 2.3775434427446902e-05, "loss": 0.0603, "step": 13970 }, { "epoch": 1.0381702064458636, "grad_norm": 2.604550361633301, "learning_rate": 2.3770978761324817e-05, "loss": 0.1068, "step": 13980 }, { "epoch": 1.0389128174662112, "grad_norm": 2.7444822788238525, "learning_rate": 2.3766523095202735e-05, "loss": 0.081, "step": 13990 }, { "epoch": 1.0396554284865587, "grad_norm": 2.1153526306152344, "learning_rate": 2.3762067429080647e-05, "loss": 0.0752, "step": 14000 }, { "epoch": 1.0403980395069063, "grad_norm": 2.4944326877593994, "learning_rate": 2.3757611762958562e-05, "loss": 0.1094, "step": 14010 }, { "epoch": 1.0411406505272538, "grad_norm": 2.197497606277466, "learning_rate": 2.375315609683648e-05, "loss": 0.0758, "step": 14020 }, { "epoch": 1.0418832615476015, "grad_norm": 1.1447632312774658, "learning_rate": 2.3748700430714392e-05, "loss": 0.0637, "step": 14030 }, { "epoch": 1.042625872567949, "grad_norm": 2.18037486076355, "learning_rate": 2.3744244764592307e-05, "loss": 0.0899, "step": 14040 }, { "epoch": 1.0433684835882964, "grad_norm": 1.8311865329742432, "learning_rate": 2.3739789098470222e-05, "loss": 0.0924, "step": 14050 }, { "epoch": 1.044111094608644, "grad_norm": 1.3843854665756226, "learning_rate": 2.3735333432348137e-05, "loss": 0.0737, "step": 14060 }, { "epoch": 1.0448537056289915, "grad_norm": 2.358302116394043, "learning_rate": 2.3730877766226052e-05, "loss": 0.1003, "step": 14070 }, { "epoch": 1.0455963166493392, "grad_norm": 0.5193414688110352, "learning_rate": 2.3726422100103964e-05, "loss": 0.0753, "step": 14080 }, { "epoch": 1.0463389276696866, "grad_norm": 2.2045769691467285, "learning_rate": 2.3721966433981882e-05, "loss": 0.0484, "step": 14090 }, { "epoch": 1.047081538690034, "grad_norm": 1.636837124824524, "learning_rate": 2.3717510767859797e-05, "loss": 0.0846, "step": 14100 }, { "epoch": 1.0478241497103817, "grad_norm": 2.898538112640381, "learning_rate": 2.371305510173771e-05, "loss": 0.1106, "step": 14110 }, { "epoch": 1.0485667607307292, "grad_norm": 1.6339201927185059, "learning_rate": 2.3708599435615627e-05, "loss": 0.0969, "step": 14120 }, { "epoch": 1.0493093717510769, "grad_norm": 4.053697109222412, "learning_rate": 2.3704143769493542e-05, "loss": 0.0898, "step": 14130 }, { "epoch": 1.0500519827714243, "grad_norm": 2.793971300125122, "learning_rate": 2.3699688103371454e-05, "loss": 0.0996, "step": 14140 }, { "epoch": 1.050794593791772, "grad_norm": 1.560320258140564, "learning_rate": 2.369523243724937e-05, "loss": 0.1084, "step": 14150 }, { "epoch": 1.0515372048121194, "grad_norm": 2.7155508995056152, "learning_rate": 2.3690776771127284e-05, "loss": 0.0503, "step": 14160 }, { "epoch": 1.0522798158324669, "grad_norm": 1.7181273698806763, "learning_rate": 2.36863211050052e-05, "loss": 0.0793, "step": 14170 }, { "epoch": 1.0530224268528146, "grad_norm": 3.6145498752593994, "learning_rate": 2.3681865438883114e-05, "loss": 0.0727, "step": 14180 }, { "epoch": 1.053765037873162, "grad_norm": 1.1024489402770996, "learning_rate": 2.367740977276103e-05, "loss": 0.0675, "step": 14190 }, { "epoch": 1.0545076488935097, "grad_norm": 1.619287371635437, "learning_rate": 2.3672954106638944e-05, "loss": 0.0636, "step": 14200 }, { "epoch": 1.0552502599138571, "grad_norm": 2.327834367752075, "learning_rate": 2.366849844051686e-05, "loss": 0.0938, "step": 14210 }, { "epoch": 1.0559928709342046, "grad_norm": 2.3670992851257324, "learning_rate": 2.366404277439477e-05, "loss": 0.0786, "step": 14220 }, { "epoch": 1.0567354819545522, "grad_norm": 2.3187897205352783, "learning_rate": 2.365958710827269e-05, "loss": 0.1029, "step": 14230 }, { "epoch": 1.0574780929748997, "grad_norm": 2.0554943084716797, "learning_rate": 2.3655131442150604e-05, "loss": 0.0771, "step": 14240 }, { "epoch": 1.0582207039952474, "grad_norm": 1.3860465288162231, "learning_rate": 2.3650675776028515e-05, "loss": 0.0645, "step": 14250 }, { "epoch": 1.0589633150155948, "grad_norm": 0.33183351159095764, "learning_rate": 2.3646220109906434e-05, "loss": 0.0551, "step": 14260 }, { "epoch": 1.0597059260359423, "grad_norm": 0.7407335042953491, "learning_rate": 2.3641764443784345e-05, "loss": 0.1163, "step": 14270 }, { "epoch": 1.06044853705629, "grad_norm": 1.9329426288604736, "learning_rate": 2.363730877766226e-05, "loss": 0.0972, "step": 14280 }, { "epoch": 1.0611911480766374, "grad_norm": 0.46645310521125793, "learning_rate": 2.363285311154018e-05, "loss": 0.0547, "step": 14290 }, { "epoch": 1.061933759096985, "grad_norm": 0.9803817272186279, "learning_rate": 2.362839744541809e-05, "loss": 0.0556, "step": 14300 }, { "epoch": 1.0626763701173325, "grad_norm": 2.7999866008758545, "learning_rate": 2.3623941779296005e-05, "loss": 0.0965, "step": 14310 }, { "epoch": 1.0634189811376802, "grad_norm": 0.8801766037940979, "learning_rate": 2.361948611317392e-05, "loss": 0.0768, "step": 14320 }, { "epoch": 1.0641615921580276, "grad_norm": 1.0280200242996216, "learning_rate": 2.3615030447051835e-05, "loss": 0.0472, "step": 14330 }, { "epoch": 1.064904203178375, "grad_norm": 1.479634165763855, "learning_rate": 2.361057478092975e-05, "loss": 0.0671, "step": 14340 }, { "epoch": 1.0656468141987228, "grad_norm": 0.7711525559425354, "learning_rate": 2.3606119114807662e-05, "loss": 0.0725, "step": 14350 }, { "epoch": 1.0663894252190702, "grad_norm": 2.1043448448181152, "learning_rate": 2.360166344868558e-05, "loss": 0.0788, "step": 14360 }, { "epoch": 1.0671320362394179, "grad_norm": 1.8969895839691162, "learning_rate": 2.3597207782563495e-05, "loss": 0.0519, "step": 14370 }, { "epoch": 1.0678746472597653, "grad_norm": 3.172367811203003, "learning_rate": 2.3592752116441407e-05, "loss": 0.1093, "step": 14380 }, { "epoch": 1.0686172582801128, "grad_norm": 2.328660726547241, "learning_rate": 2.358829645031932e-05, "loss": 0.0743, "step": 14390 }, { "epoch": 1.0693598693004605, "grad_norm": 1.0215742588043213, "learning_rate": 2.358384078419724e-05, "loss": 0.0577, "step": 14400 }, { "epoch": 1.070102480320808, "grad_norm": 2.7807462215423584, "learning_rate": 2.357938511807515e-05, "loss": 0.0538, "step": 14410 }, { "epoch": 1.0708450913411556, "grad_norm": 2.544025421142578, "learning_rate": 2.3574929451953067e-05, "loss": 0.0984, "step": 14420 }, { "epoch": 1.071587702361503, "grad_norm": 1.8840546607971191, "learning_rate": 2.3570473785830985e-05, "loss": 0.0891, "step": 14430 }, { "epoch": 1.0723303133818507, "grad_norm": 1.2693723440170288, "learning_rate": 2.3566018119708897e-05, "loss": 0.098, "step": 14440 }, { "epoch": 1.0730729244021981, "grad_norm": 2.465930938720703, "learning_rate": 2.356156245358681e-05, "loss": 0.1154, "step": 14450 }, { "epoch": 1.0738155354225456, "grad_norm": 3.5287487506866455, "learning_rate": 2.3557106787464727e-05, "loss": 0.0794, "step": 14460 }, { "epoch": 1.0745581464428933, "grad_norm": 2.1311392784118652, "learning_rate": 2.355265112134264e-05, "loss": 0.0911, "step": 14470 }, { "epoch": 1.0753007574632407, "grad_norm": 1.7194443941116333, "learning_rate": 2.3548195455220557e-05, "loss": 0.0689, "step": 14480 }, { "epoch": 1.0760433684835884, "grad_norm": 1.9258701801300049, "learning_rate": 2.3543739789098468e-05, "loss": 0.0488, "step": 14490 }, { "epoch": 1.0767859795039358, "grad_norm": 1.2407386302947998, "learning_rate": 2.3539284122976387e-05, "loss": 0.066, "step": 14500 }, { "epoch": 1.0775285905242833, "grad_norm": 0.2325424998998642, "learning_rate": 2.35348284568543e-05, "loss": 0.0667, "step": 14510 }, { "epoch": 1.078271201544631, "grad_norm": 0.7023411989212036, "learning_rate": 2.3530372790732213e-05, "loss": 0.0774, "step": 14520 }, { "epoch": 1.0790138125649784, "grad_norm": 0.42877697944641113, "learning_rate": 2.352591712461013e-05, "loss": 0.0441, "step": 14530 }, { "epoch": 1.079756423585326, "grad_norm": 0.8632937669754028, "learning_rate": 2.3521461458488047e-05, "loss": 0.0564, "step": 14540 }, { "epoch": 1.0804990346056735, "grad_norm": 7.092894077301025, "learning_rate": 2.3517005792365958e-05, "loss": 0.0802, "step": 14550 }, { "epoch": 1.081241645626021, "grad_norm": 2.007236957550049, "learning_rate": 2.3512550126243873e-05, "loss": 0.1011, "step": 14560 }, { "epoch": 1.0819842566463687, "grad_norm": 2.4065308570861816, "learning_rate": 2.3508094460121788e-05, "loss": 0.0985, "step": 14570 }, { "epoch": 1.082726867666716, "grad_norm": 4.827093601226807, "learning_rate": 2.3503638793999703e-05, "loss": 0.1154, "step": 14580 }, { "epoch": 1.0834694786870638, "grad_norm": 2.402787208557129, "learning_rate": 2.3499183127877618e-05, "loss": 0.0893, "step": 14590 }, { "epoch": 1.0842120897074112, "grad_norm": 1.0034596920013428, "learning_rate": 2.3494727461755533e-05, "loss": 0.1013, "step": 14600 }, { "epoch": 1.084954700727759, "grad_norm": 1.7055171728134155, "learning_rate": 2.3490271795633448e-05, "loss": 0.0918, "step": 14610 }, { "epoch": 1.0856973117481064, "grad_norm": 2.0638813972473145, "learning_rate": 2.3485816129511363e-05, "loss": 0.0888, "step": 14620 }, { "epoch": 1.0864399227684538, "grad_norm": 1.2625740766525269, "learning_rate": 2.3481360463389275e-05, "loss": 0.0901, "step": 14630 }, { "epoch": 1.0871825337888015, "grad_norm": 1.9347448348999023, "learning_rate": 2.3476904797267193e-05, "loss": 0.0508, "step": 14640 }, { "epoch": 1.087925144809149, "grad_norm": 1.2374624013900757, "learning_rate": 2.3472449131145108e-05, "loss": 0.0544, "step": 14650 }, { "epoch": 1.0886677558294966, "grad_norm": 1.0130029916763306, "learning_rate": 2.346799346502302e-05, "loss": 0.0371, "step": 14660 }, { "epoch": 1.089410366849844, "grad_norm": 2.272545576095581, "learning_rate": 2.3463537798900938e-05, "loss": 0.0668, "step": 14670 }, { "epoch": 1.0901529778701915, "grad_norm": 1.1492127180099487, "learning_rate": 2.345908213277885e-05, "loss": 0.0909, "step": 14680 }, { "epoch": 1.0908955888905392, "grad_norm": 0.4936734735965729, "learning_rate": 2.3454626466656765e-05, "loss": 0.0579, "step": 14690 }, { "epoch": 1.0916381999108866, "grad_norm": 1.2032221555709839, "learning_rate": 2.3450170800534683e-05, "loss": 0.0953, "step": 14700 }, { "epoch": 1.0923808109312343, "grad_norm": 1.325255036354065, "learning_rate": 2.3445715134412595e-05, "loss": 0.0653, "step": 14710 }, { "epoch": 1.0931234219515817, "grad_norm": 2.2962052822113037, "learning_rate": 2.344125946829051e-05, "loss": 0.0618, "step": 14720 }, { "epoch": 1.0938660329719294, "grad_norm": 0.9786393642425537, "learning_rate": 2.3436803802168425e-05, "loss": 0.0784, "step": 14730 }, { "epoch": 1.0946086439922769, "grad_norm": 1.7134709358215332, "learning_rate": 2.343234813604634e-05, "loss": 0.0592, "step": 14740 }, { "epoch": 1.0953512550126243, "grad_norm": 0.9471766948699951, "learning_rate": 2.3427892469924255e-05, "loss": 0.112, "step": 14750 }, { "epoch": 1.096093866032972, "grad_norm": 0.9687842130661011, "learning_rate": 2.342343680380217e-05, "loss": 0.0799, "step": 14760 }, { "epoch": 1.0968364770533194, "grad_norm": 1.6770538091659546, "learning_rate": 2.3418981137680085e-05, "loss": 0.0894, "step": 14770 }, { "epoch": 1.097579088073667, "grad_norm": 2.252725124359131, "learning_rate": 2.3414525471558e-05, "loss": 0.0903, "step": 14780 }, { "epoch": 1.0983216990940146, "grad_norm": 1.531714677810669, "learning_rate": 2.341006980543591e-05, "loss": 0.0644, "step": 14790 }, { "epoch": 1.099064310114362, "grad_norm": 2.1551620960235596, "learning_rate": 2.3405614139313826e-05, "loss": 0.073, "step": 14800 }, { "epoch": 1.0998069211347097, "grad_norm": 1.436985969543457, "learning_rate": 2.3401158473191745e-05, "loss": 0.0695, "step": 14810 }, { "epoch": 1.1005495321550571, "grad_norm": 2.5473999977111816, "learning_rate": 2.3396702807069656e-05, "loss": 0.037, "step": 14820 }, { "epoch": 1.1012921431754048, "grad_norm": 2.2128050327301025, "learning_rate": 2.339224714094757e-05, "loss": 0.0767, "step": 14830 }, { "epoch": 1.1020347541957523, "grad_norm": 0.8083871006965637, "learning_rate": 2.338779147482549e-05, "loss": 0.0557, "step": 14840 }, { "epoch": 1.1027773652160997, "grad_norm": 2.2617440223693848, "learning_rate": 2.33833358087034e-05, "loss": 0.0922, "step": 14850 }, { "epoch": 1.1035199762364474, "grad_norm": 0.7747202515602112, "learning_rate": 2.3378880142581316e-05, "loss": 0.0625, "step": 14860 }, { "epoch": 1.1042625872567948, "grad_norm": 1.1605490446090698, "learning_rate": 2.337442447645923e-05, "loss": 0.0792, "step": 14870 }, { "epoch": 1.1050051982771425, "grad_norm": 1.8753223419189453, "learning_rate": 2.3369968810337146e-05, "loss": 0.0943, "step": 14880 }, { "epoch": 1.10574780929749, "grad_norm": 0.9175904393196106, "learning_rate": 2.336551314421506e-05, "loss": 0.0921, "step": 14890 }, { "epoch": 1.1064904203178376, "grad_norm": 0.9853323101997375, "learning_rate": 2.3361057478092973e-05, "loss": 0.0689, "step": 14900 }, { "epoch": 1.107233031338185, "grad_norm": 1.734784722328186, "learning_rate": 2.335660181197089e-05, "loss": 0.0748, "step": 14910 }, { "epoch": 1.1079756423585325, "grad_norm": 3.9313881397247314, "learning_rate": 2.3352146145848806e-05, "loss": 0.0829, "step": 14920 }, { "epoch": 1.1087182533788802, "grad_norm": 0.6658304929733276, "learning_rate": 2.3347690479726718e-05, "loss": 0.0597, "step": 14930 }, { "epoch": 1.1094608643992276, "grad_norm": 1.8570702075958252, "learning_rate": 2.3343234813604636e-05, "loss": 0.0969, "step": 14940 }, { "epoch": 1.1102034754195753, "grad_norm": 2.4052164554595947, "learning_rate": 2.333877914748255e-05, "loss": 0.0937, "step": 14950 }, { "epoch": 1.1109460864399228, "grad_norm": 2.2822628021240234, "learning_rate": 2.3334323481360463e-05, "loss": 0.086, "step": 14960 }, { "epoch": 1.1116886974602702, "grad_norm": 4.604571342468262, "learning_rate": 2.3329867815238378e-05, "loss": 0.1085, "step": 14970 }, { "epoch": 1.1124313084806179, "grad_norm": 2.1120212078094482, "learning_rate": 2.3325412149116293e-05, "loss": 0.0738, "step": 14980 }, { "epoch": 1.1131739195009653, "grad_norm": 2.652463912963867, "learning_rate": 2.3320956482994208e-05, "loss": 0.0612, "step": 14990 }, { "epoch": 1.113916530521313, "grad_norm": 1.0508460998535156, "learning_rate": 2.3316500816872123e-05, "loss": 0.0624, "step": 15000 }, { "epoch": 1.1146591415416605, "grad_norm": 2.2255547046661377, "learning_rate": 2.3312045150750038e-05, "loss": 0.0836, "step": 15010 }, { "epoch": 1.1154017525620081, "grad_norm": 1.7754733562469482, "learning_rate": 2.3307589484627953e-05, "loss": 0.1167, "step": 15020 }, { "epoch": 1.1161443635823556, "grad_norm": 1.857144832611084, "learning_rate": 2.3303133818505868e-05, "loss": 0.0596, "step": 15030 }, { "epoch": 1.116886974602703, "grad_norm": 3.7554049491882324, "learning_rate": 2.3298678152383783e-05, "loss": 0.0647, "step": 15040 }, { "epoch": 1.1176295856230507, "grad_norm": 3.861762762069702, "learning_rate": 2.3294222486261698e-05, "loss": 0.083, "step": 15050 }, { "epoch": 1.1183721966433982, "grad_norm": 0.333187997341156, "learning_rate": 2.3289766820139613e-05, "loss": 0.0567, "step": 15060 }, { "epoch": 1.1191148076637458, "grad_norm": 3.2010586261749268, "learning_rate": 2.3285311154017524e-05, "loss": 0.0925, "step": 15070 }, { "epoch": 1.1198574186840933, "grad_norm": 3.66748046875, "learning_rate": 2.3280855487895443e-05, "loss": 0.0884, "step": 15080 }, { "epoch": 1.1206000297044407, "grad_norm": 0.8364987969398499, "learning_rate": 2.3276399821773354e-05, "loss": 0.0644, "step": 15090 }, { "epoch": 1.1213426407247884, "grad_norm": 2.9807636737823486, "learning_rate": 2.327194415565127e-05, "loss": 0.0687, "step": 15100 }, { "epoch": 1.1220852517451358, "grad_norm": 0.977165937423706, "learning_rate": 2.3267488489529188e-05, "loss": 0.0714, "step": 15110 }, { "epoch": 1.1228278627654835, "grad_norm": 1.4593790769577026, "learning_rate": 2.32630328234071e-05, "loss": 0.0698, "step": 15120 }, { "epoch": 1.123570473785831, "grad_norm": 0.9217209219932556, "learning_rate": 2.3258577157285014e-05, "loss": 0.0764, "step": 15130 }, { "epoch": 1.1243130848061784, "grad_norm": 1.4398505687713623, "learning_rate": 2.325412149116293e-05, "loss": 0.1199, "step": 15140 }, { "epoch": 1.125055695826526, "grad_norm": 1.7291533946990967, "learning_rate": 2.3249665825040844e-05, "loss": 0.0899, "step": 15150 }, { "epoch": 1.1257983068468735, "grad_norm": 2.176523208618164, "learning_rate": 2.324521015891876e-05, "loss": 0.0746, "step": 15160 }, { "epoch": 1.1265409178672212, "grad_norm": 2.276003122329712, "learning_rate": 2.3240754492796674e-05, "loss": 0.1026, "step": 15170 }, { "epoch": 1.1272835288875687, "grad_norm": 2.0524089336395264, "learning_rate": 2.323629882667459e-05, "loss": 0.0692, "step": 15180 }, { "epoch": 1.1280261399079161, "grad_norm": 0.9038380980491638, "learning_rate": 2.3231843160552504e-05, "loss": 0.0775, "step": 15190 }, { "epoch": 1.1287687509282638, "grad_norm": 2.3695902824401855, "learning_rate": 2.3227387494430416e-05, "loss": 0.1008, "step": 15200 }, { "epoch": 1.1295113619486112, "grad_norm": 1.6643588542938232, "learning_rate": 2.322293182830833e-05, "loss": 0.0637, "step": 15210 }, { "epoch": 1.130253972968959, "grad_norm": 0.4091331660747528, "learning_rate": 2.321847616218625e-05, "loss": 0.0768, "step": 15220 }, { "epoch": 1.1309965839893064, "grad_norm": 2.0745925903320312, "learning_rate": 2.321402049606416e-05, "loss": 0.082, "step": 15230 }, { "epoch": 1.131739195009654, "grad_norm": 3.2917673587799072, "learning_rate": 2.3209564829942076e-05, "loss": 0.1074, "step": 15240 }, { "epoch": 1.1324818060300015, "grad_norm": 2.7250659465789795, "learning_rate": 2.3205109163819994e-05, "loss": 0.0742, "step": 15250 }, { "epoch": 1.133224417050349, "grad_norm": 0.4812146723270416, "learning_rate": 2.3200653497697906e-05, "loss": 0.0644, "step": 15260 }, { "epoch": 1.1339670280706966, "grad_norm": 1.7859218120574951, "learning_rate": 2.319619783157582e-05, "loss": 0.1025, "step": 15270 }, { "epoch": 1.134709639091044, "grad_norm": 1.6448826789855957, "learning_rate": 2.3191742165453736e-05, "loss": 0.0694, "step": 15280 }, { "epoch": 1.1354522501113917, "grad_norm": 1.5270838737487793, "learning_rate": 2.318728649933165e-05, "loss": 0.084, "step": 15290 }, { "epoch": 1.1361948611317392, "grad_norm": 2.1386685371398926, "learning_rate": 2.3182830833209566e-05, "loss": 0.0864, "step": 15300 }, { "epoch": 1.1369374721520868, "grad_norm": 2.1466033458709717, "learning_rate": 2.3178375167087477e-05, "loss": 0.0792, "step": 15310 }, { "epoch": 1.1376800831724343, "grad_norm": 1.733211874961853, "learning_rate": 2.3173919500965396e-05, "loss": 0.0966, "step": 15320 }, { "epoch": 1.1384226941927817, "grad_norm": 0.9286133646965027, "learning_rate": 2.316946383484331e-05, "loss": 0.0736, "step": 15330 }, { "epoch": 1.1391653052131294, "grad_norm": 3.003708839416504, "learning_rate": 2.3165008168721222e-05, "loss": 0.1012, "step": 15340 }, { "epoch": 1.1399079162334769, "grad_norm": 0.6814678311347961, "learning_rate": 2.316055250259914e-05, "loss": 0.0611, "step": 15350 }, { "epoch": 1.1406505272538245, "grad_norm": 1.6721028089523315, "learning_rate": 2.3156096836477056e-05, "loss": 0.0972, "step": 15360 }, { "epoch": 1.141393138274172, "grad_norm": 1.2787104845046997, "learning_rate": 2.3151641170354967e-05, "loss": 0.0652, "step": 15370 }, { "epoch": 1.1421357492945194, "grad_norm": 1.4805560111999512, "learning_rate": 2.3147185504232882e-05, "loss": 0.0678, "step": 15380 }, { "epoch": 1.1428783603148671, "grad_norm": 0.9559769034385681, "learning_rate": 2.3142729838110797e-05, "loss": 0.0718, "step": 15390 }, { "epoch": 1.1436209713352146, "grad_norm": 2.3336753845214844, "learning_rate": 2.3138274171988712e-05, "loss": 0.0716, "step": 15400 }, { "epoch": 1.1443635823555622, "grad_norm": 1.7206687927246094, "learning_rate": 2.3133818505866627e-05, "loss": 0.0532, "step": 15410 }, { "epoch": 1.1451061933759097, "grad_norm": 4.165546894073486, "learning_rate": 2.3129362839744542e-05, "loss": 0.0606, "step": 15420 }, { "epoch": 1.1458488043962571, "grad_norm": 3.124039888381958, "learning_rate": 2.3124907173622457e-05, "loss": 0.111, "step": 15430 }, { "epoch": 1.1465914154166048, "grad_norm": 0.9052489995956421, "learning_rate": 2.3120451507500372e-05, "loss": 0.0805, "step": 15440 }, { "epoch": 1.1473340264369523, "grad_norm": 1.5409538745880127, "learning_rate": 2.3115995841378287e-05, "loss": 0.0552, "step": 15450 }, { "epoch": 1.1480766374573, "grad_norm": 0.6143955588340759, "learning_rate": 2.3111540175256202e-05, "loss": 0.0527, "step": 15460 }, { "epoch": 1.1488192484776474, "grad_norm": 2.081989288330078, "learning_rate": 2.3107084509134117e-05, "loss": 0.1173, "step": 15470 }, { "epoch": 1.1495618594979948, "grad_norm": 1.4942225217819214, "learning_rate": 2.310262884301203e-05, "loss": 0.0951, "step": 15480 }, { "epoch": 1.1503044705183425, "grad_norm": 3.5311174392700195, "learning_rate": 2.3098173176889947e-05, "loss": 0.1277, "step": 15490 }, { "epoch": 1.15104708153869, "grad_norm": 1.772064447402954, "learning_rate": 2.309371751076786e-05, "loss": 0.0568, "step": 15500 }, { "epoch": 1.1517896925590376, "grad_norm": 1.9447821378707886, "learning_rate": 2.3089261844645774e-05, "loss": 0.0542, "step": 15510 }, { "epoch": 1.152532303579385, "grad_norm": 2.5192413330078125, "learning_rate": 2.3084806178523692e-05, "loss": 0.0636, "step": 15520 }, { "epoch": 1.1532749145997327, "grad_norm": 0.6667538285255432, "learning_rate": 2.3080350512401604e-05, "loss": 0.0897, "step": 15530 }, { "epoch": 1.1540175256200802, "grad_norm": 1.988601803779602, "learning_rate": 2.307589484627952e-05, "loss": 0.1259, "step": 15540 }, { "epoch": 1.1547601366404276, "grad_norm": 1.6338027715682983, "learning_rate": 2.3071439180157434e-05, "loss": 0.0578, "step": 15550 }, { "epoch": 1.1555027476607753, "grad_norm": 3.52496600151062, "learning_rate": 2.306698351403535e-05, "loss": 0.074, "step": 15560 }, { "epoch": 1.1562453586811228, "grad_norm": 1.9696495532989502, "learning_rate": 2.3062527847913264e-05, "loss": 0.1064, "step": 15570 }, { "epoch": 1.1569879697014704, "grad_norm": 1.104413390159607, "learning_rate": 2.305807218179118e-05, "loss": 0.0936, "step": 15580 }, { "epoch": 1.157730580721818, "grad_norm": 2.5523598194122314, "learning_rate": 2.3053616515669094e-05, "loss": 0.0714, "step": 15590 }, { "epoch": 1.1584731917421656, "grad_norm": 1.7481622695922852, "learning_rate": 2.304916084954701e-05, "loss": 0.1121, "step": 15600 }, { "epoch": 1.159215802762513, "grad_norm": 3.4101874828338623, "learning_rate": 2.304470518342492e-05, "loss": 0.0916, "step": 15610 }, { "epoch": 1.1599584137828605, "grad_norm": 1.432702660560608, "learning_rate": 2.3040249517302835e-05, "loss": 0.1003, "step": 15620 }, { "epoch": 1.1607010248032081, "grad_norm": 0.6182481646537781, "learning_rate": 2.3035793851180754e-05, "loss": 0.0504, "step": 15630 }, { "epoch": 1.1614436358235556, "grad_norm": 2.7727530002593994, "learning_rate": 2.3031338185058665e-05, "loss": 0.0671, "step": 15640 }, { "epoch": 1.1621862468439033, "grad_norm": 0.7968599200248718, "learning_rate": 2.302688251893658e-05, "loss": 0.0742, "step": 15650 }, { "epoch": 1.1629288578642507, "grad_norm": 1.9094624519348145, "learning_rate": 2.30224268528145e-05, "loss": 0.0778, "step": 15660 }, { "epoch": 1.1636714688845982, "grad_norm": 2.032755136489868, "learning_rate": 2.301797118669241e-05, "loss": 0.0674, "step": 15670 }, { "epoch": 1.1644140799049458, "grad_norm": 2.188685417175293, "learning_rate": 2.3013515520570325e-05, "loss": 0.1031, "step": 15680 }, { "epoch": 1.1651566909252933, "grad_norm": 0.8258926868438721, "learning_rate": 2.3009059854448244e-05, "loss": 0.0641, "step": 15690 }, { "epoch": 1.165899301945641, "grad_norm": 0.9573965668678284, "learning_rate": 2.3004604188326155e-05, "loss": 0.0601, "step": 15700 }, { "epoch": 1.1666419129659884, "grad_norm": 1.2031244039535522, "learning_rate": 2.300014852220407e-05, "loss": 0.0518, "step": 15710 }, { "epoch": 1.1673845239863359, "grad_norm": 3.0771892070770264, "learning_rate": 2.2995692856081982e-05, "loss": 0.1124, "step": 15720 }, { "epoch": 1.1681271350066835, "grad_norm": 1.2991482019424438, "learning_rate": 2.29912371899599e-05, "loss": 0.0508, "step": 15730 }, { "epoch": 1.168869746027031, "grad_norm": 0.9079421162605286, "learning_rate": 2.2986781523837815e-05, "loss": 0.0638, "step": 15740 }, { "epoch": 1.1696123570473786, "grad_norm": 1.197899341583252, "learning_rate": 2.2982325857715727e-05, "loss": 0.0828, "step": 15750 }, { "epoch": 1.170354968067726, "grad_norm": 1.4822005033493042, "learning_rate": 2.2977870191593645e-05, "loss": 0.0511, "step": 15760 }, { "epoch": 1.1710975790880735, "grad_norm": 2.059238910675049, "learning_rate": 2.297341452547156e-05, "loss": 0.0762, "step": 15770 }, { "epoch": 1.1718401901084212, "grad_norm": 2.548032283782959, "learning_rate": 2.2968958859349472e-05, "loss": 0.0711, "step": 15780 }, { "epoch": 1.1725828011287687, "grad_norm": 2.6228041648864746, "learning_rate": 2.2964503193227387e-05, "loss": 0.0709, "step": 15790 }, { "epoch": 1.1733254121491163, "grad_norm": 1.9563509225845337, "learning_rate": 2.2960047527105302e-05, "loss": 0.0589, "step": 15800 }, { "epoch": 1.1740680231694638, "grad_norm": 1.4019722938537598, "learning_rate": 2.2955591860983217e-05, "loss": 0.0733, "step": 15810 }, { "epoch": 1.1748106341898115, "grad_norm": 1.6653647422790527, "learning_rate": 2.2951136194861132e-05, "loss": 0.0505, "step": 15820 }, { "epoch": 1.175553245210159, "grad_norm": 2.455420732498169, "learning_rate": 2.2946680528739047e-05, "loss": 0.1232, "step": 15830 }, { "epoch": 1.1762958562305064, "grad_norm": 1.1181570291519165, "learning_rate": 2.2942224862616962e-05, "loss": 0.0626, "step": 15840 }, { "epoch": 1.177038467250854, "grad_norm": 1.8775357007980347, "learning_rate": 2.2937769196494877e-05, "loss": 0.0971, "step": 15850 }, { "epoch": 1.1777810782712015, "grad_norm": 1.8063764572143555, "learning_rate": 2.2933313530372792e-05, "loss": 0.12, "step": 15860 }, { "epoch": 1.1785236892915492, "grad_norm": 0.9645183682441711, "learning_rate": 2.2928857864250707e-05, "loss": 0.0783, "step": 15870 }, { "epoch": 1.1792663003118966, "grad_norm": 2.029613971710205, "learning_rate": 2.2924402198128622e-05, "loss": 0.1101, "step": 15880 }, { "epoch": 1.1800089113322443, "grad_norm": 0.7178744673728943, "learning_rate": 2.2919946532006534e-05, "loss": 0.0689, "step": 15890 }, { "epoch": 1.1807515223525917, "grad_norm": 1.271041750907898, "learning_rate": 2.2915490865884452e-05, "loss": 0.0662, "step": 15900 }, { "epoch": 1.1814941333729392, "grad_norm": 2.572619915008545, "learning_rate": 2.2911035199762364e-05, "loss": 0.0677, "step": 15910 }, { "epoch": 1.1822367443932869, "grad_norm": 0.5422751307487488, "learning_rate": 2.290657953364028e-05, "loss": 0.0662, "step": 15920 }, { "epoch": 1.1829793554136343, "grad_norm": 2.011805295944214, "learning_rate": 2.2902123867518197e-05, "loss": 0.078, "step": 15930 }, { "epoch": 1.183721966433982, "grad_norm": 1.7894953489303589, "learning_rate": 2.289766820139611e-05, "loss": 0.0817, "step": 15940 }, { "epoch": 1.1844645774543294, "grad_norm": 0.9145591855049133, "learning_rate": 2.2893212535274024e-05, "loss": 0.0867, "step": 15950 }, { "epoch": 1.1852071884746769, "grad_norm": 2.027958631515503, "learning_rate": 2.288875686915194e-05, "loss": 0.0579, "step": 15960 }, { "epoch": 1.1859497994950245, "grad_norm": 0.9415296316146851, "learning_rate": 2.2884301203029853e-05, "loss": 0.0854, "step": 15970 }, { "epoch": 1.186692410515372, "grad_norm": 2.021057367324829, "learning_rate": 2.287984553690777e-05, "loss": 0.073, "step": 15980 }, { "epoch": 1.1874350215357197, "grad_norm": 1.6900122165679932, "learning_rate": 2.2875389870785683e-05, "loss": 0.0772, "step": 15990 }, { "epoch": 1.1881776325560671, "grad_norm": 0.6172839999198914, "learning_rate": 2.28709342046636e-05, "loss": 0.0522, "step": 16000 }, { "epoch": 1.1889202435764146, "grad_norm": 2.918687105178833, "learning_rate": 2.2866478538541513e-05, "loss": 0.0566, "step": 16010 }, { "epoch": 1.1896628545967622, "grad_norm": 1.5707764625549316, "learning_rate": 2.2862022872419425e-05, "loss": 0.0721, "step": 16020 }, { "epoch": 1.1904054656171097, "grad_norm": 1.0280346870422363, "learning_rate": 2.285756720629734e-05, "loss": 0.0646, "step": 16030 }, { "epoch": 1.1911480766374574, "grad_norm": 1.4963786602020264, "learning_rate": 2.285311154017526e-05, "loss": 0.0555, "step": 16040 }, { "epoch": 1.1918906876578048, "grad_norm": 0.9982985258102417, "learning_rate": 2.284865587405317e-05, "loss": 0.0647, "step": 16050 }, { "epoch": 1.1926332986781523, "grad_norm": 1.1375582218170166, "learning_rate": 2.2844200207931085e-05, "loss": 0.0728, "step": 16060 }, { "epoch": 1.1933759096985, "grad_norm": 4.054408073425293, "learning_rate": 2.2839744541809003e-05, "loss": 0.0874, "step": 16070 }, { "epoch": 1.1941185207188474, "grad_norm": 3.6856000423431396, "learning_rate": 2.2835288875686915e-05, "loss": 0.1011, "step": 16080 }, { "epoch": 1.194861131739195, "grad_norm": 3.7098820209503174, "learning_rate": 2.283083320956483e-05, "loss": 0.0638, "step": 16090 }, { "epoch": 1.1956037427595425, "grad_norm": 0.7960993647575378, "learning_rate": 2.282637754344275e-05, "loss": 0.0832, "step": 16100 }, { "epoch": 1.1963463537798902, "grad_norm": 1.0602184534072876, "learning_rate": 2.282192187732066e-05, "loss": 0.0824, "step": 16110 }, { "epoch": 1.1970889648002376, "grad_norm": 1.5129131078720093, "learning_rate": 2.2817466211198575e-05, "loss": 0.0734, "step": 16120 }, { "epoch": 1.197831575820585, "grad_norm": 2.3577966690063477, "learning_rate": 2.2813010545076487e-05, "loss": 0.0931, "step": 16130 }, { "epoch": 1.1985741868409328, "grad_norm": 2.6945302486419678, "learning_rate": 2.2808554878954405e-05, "loss": 0.1088, "step": 16140 }, { "epoch": 1.1993167978612802, "grad_norm": 1.0213791131973267, "learning_rate": 2.280409921283232e-05, "loss": 0.0683, "step": 16150 }, { "epoch": 1.2000594088816279, "grad_norm": 0.7909874320030212, "learning_rate": 2.279964354671023e-05, "loss": 0.0453, "step": 16160 }, { "epoch": 1.2008020199019753, "grad_norm": 1.2978131771087646, "learning_rate": 2.279518788058815e-05, "loss": 0.0787, "step": 16170 }, { "epoch": 1.201544630922323, "grad_norm": 0.5896059274673462, "learning_rate": 2.2790732214466065e-05, "loss": 0.0719, "step": 16180 }, { "epoch": 1.2022872419426704, "grad_norm": 2.9565205574035645, "learning_rate": 2.2786276548343977e-05, "loss": 0.1046, "step": 16190 }, { "epoch": 1.203029852963018, "grad_norm": 4.079509258270264, "learning_rate": 2.278182088222189e-05, "loss": 0.0686, "step": 16200 }, { "epoch": 1.2037724639833656, "grad_norm": 2.044127941131592, "learning_rate": 2.277736521609981e-05, "loss": 0.0618, "step": 16210 }, { "epoch": 1.204515075003713, "grad_norm": 2.2022972106933594, "learning_rate": 2.277290954997772e-05, "loss": 0.0973, "step": 16220 }, { "epoch": 1.2052576860240607, "grad_norm": 1.957979440689087, "learning_rate": 2.2768453883855637e-05, "loss": 0.0765, "step": 16230 }, { "epoch": 1.2060002970444081, "grad_norm": 0.7121636271476746, "learning_rate": 2.276399821773355e-05, "loss": 0.0732, "step": 16240 }, { "epoch": 1.2067429080647556, "grad_norm": 2.143155574798584, "learning_rate": 2.2759542551611467e-05, "loss": 0.0564, "step": 16250 }, { "epoch": 1.2074855190851033, "grad_norm": 2.7049849033355713, "learning_rate": 2.275508688548938e-05, "loss": 0.0806, "step": 16260 }, { "epoch": 1.2082281301054507, "grad_norm": 2.114739418029785, "learning_rate": 2.2750631219367297e-05, "loss": 0.0569, "step": 16270 }, { "epoch": 1.2089707411257984, "grad_norm": 2.1519935131073, "learning_rate": 2.274617555324521e-05, "loss": 0.0727, "step": 16280 }, { "epoch": 1.2097133521461458, "grad_norm": 0.7403703927993774, "learning_rate": 2.2741719887123127e-05, "loss": 0.0468, "step": 16290 }, { "epoch": 1.2104559631664933, "grad_norm": 1.2450178861618042, "learning_rate": 2.2737264221001038e-05, "loss": 0.09, "step": 16300 }, { "epoch": 1.211198574186841, "grad_norm": 2.1269819736480713, "learning_rate": 2.2732808554878957e-05, "loss": 0.0532, "step": 16310 }, { "epoch": 1.2119411852071884, "grad_norm": 1.752208948135376, "learning_rate": 2.2728352888756868e-05, "loss": 0.0908, "step": 16320 }, { "epoch": 1.212683796227536, "grad_norm": 2.0299859046936035, "learning_rate": 2.2723897222634783e-05, "loss": 0.1128, "step": 16330 }, { "epoch": 1.2134264072478835, "grad_norm": 0.9291142225265503, "learning_rate": 2.27194415565127e-05, "loss": 0.07, "step": 16340 }, { "epoch": 1.214169018268231, "grad_norm": 1.3614659309387207, "learning_rate": 2.2714985890390613e-05, "loss": 0.1037, "step": 16350 }, { "epoch": 1.2149116292885787, "grad_norm": 0.6833984851837158, "learning_rate": 2.2710530224268528e-05, "loss": 0.0574, "step": 16360 }, { "epoch": 1.215654240308926, "grad_norm": 1.6602541208267212, "learning_rate": 2.2706074558146443e-05, "loss": 0.0677, "step": 16370 }, { "epoch": 1.2163968513292738, "grad_norm": 3.005326509475708, "learning_rate": 2.2701618892024358e-05, "loss": 0.0806, "step": 16380 }, { "epoch": 1.2171394623496212, "grad_norm": 0.7319986820220947, "learning_rate": 2.2697163225902273e-05, "loss": 0.0621, "step": 16390 }, { "epoch": 1.217882073369969, "grad_norm": 1.5393048524856567, "learning_rate": 2.2692707559780188e-05, "loss": 0.0795, "step": 16400 }, { "epoch": 1.2186246843903163, "grad_norm": 1.8766040802001953, "learning_rate": 2.2688251893658103e-05, "loss": 0.0525, "step": 16410 }, { "epoch": 1.2193672954106638, "grad_norm": 0.824567437171936, "learning_rate": 2.2683796227536018e-05, "loss": 0.0746, "step": 16420 }, { "epoch": 1.2201099064310115, "grad_norm": 0.9083713293075562, "learning_rate": 2.267934056141393e-05, "loss": 0.0791, "step": 16430 }, { "epoch": 1.220852517451359, "grad_norm": 1.9115943908691406, "learning_rate": 2.2674884895291848e-05, "loss": 0.076, "step": 16440 }, { "epoch": 1.2215951284717066, "grad_norm": 1.7898435592651367, "learning_rate": 2.2670429229169763e-05, "loss": 0.0917, "step": 16450 }, { "epoch": 1.222337739492054, "grad_norm": 3.3459744453430176, "learning_rate": 2.2665973563047675e-05, "loss": 0.0693, "step": 16460 }, { "epoch": 1.2230803505124017, "grad_norm": 2.533830404281616, "learning_rate": 2.266151789692559e-05, "loss": 0.0948, "step": 16470 }, { "epoch": 1.2238229615327492, "grad_norm": 0.8340369462966919, "learning_rate": 2.2657062230803508e-05, "loss": 0.0518, "step": 16480 }, { "epoch": 1.2245655725530966, "grad_norm": 0.5460755825042725, "learning_rate": 2.265260656468142e-05, "loss": 0.0776, "step": 16490 }, { "epoch": 1.2253081835734443, "grad_norm": 0.6815189719200134, "learning_rate": 2.2648150898559335e-05, "loss": 0.0595, "step": 16500 }, { "epoch": 1.2260507945937917, "grad_norm": 1.1513220071792603, "learning_rate": 2.2643695232437253e-05, "loss": 0.146, "step": 16510 }, { "epoch": 1.2267934056141394, "grad_norm": 1.9079151153564453, "learning_rate": 2.2639239566315165e-05, "loss": 0.0827, "step": 16520 }, { "epoch": 1.2275360166344869, "grad_norm": 1.8923051357269287, "learning_rate": 2.263478390019308e-05, "loss": 0.1135, "step": 16530 }, { "epoch": 1.2282786276548343, "grad_norm": 1.8979130983352661, "learning_rate": 2.263032823407099e-05, "loss": 0.0895, "step": 16540 }, { "epoch": 1.229021238675182, "grad_norm": 1.6301295757293701, "learning_rate": 2.262587256794891e-05, "loss": 0.0891, "step": 16550 }, { "epoch": 1.2297638496955294, "grad_norm": 1.7979081869125366, "learning_rate": 2.2621416901826825e-05, "loss": 0.0765, "step": 16560 }, { "epoch": 1.230506460715877, "grad_norm": 0.5826703310012817, "learning_rate": 2.2616961235704736e-05, "loss": 0.0668, "step": 16570 }, { "epoch": 1.2312490717362246, "grad_norm": 1.6885042190551758, "learning_rate": 2.2612505569582655e-05, "loss": 0.0646, "step": 16580 }, { "epoch": 1.231991682756572, "grad_norm": 0.9739753603935242, "learning_rate": 2.260804990346057e-05, "loss": 0.0566, "step": 16590 }, { "epoch": 1.2327342937769197, "grad_norm": 0.9012984037399292, "learning_rate": 2.260359423733848e-05, "loss": 0.0785, "step": 16600 }, { "epoch": 1.2334769047972671, "grad_norm": 0.8952996134757996, "learning_rate": 2.2599138571216396e-05, "loss": 0.0655, "step": 16610 }, { "epoch": 1.2342195158176148, "grad_norm": 2.116847515106201, "learning_rate": 2.2594682905094315e-05, "loss": 0.077, "step": 16620 }, { "epoch": 1.2349621268379622, "grad_norm": 0.5466039180755615, "learning_rate": 2.2590227238972226e-05, "loss": 0.0754, "step": 16630 }, { "epoch": 1.2357047378583097, "grad_norm": 0.7208026647567749, "learning_rate": 2.258577157285014e-05, "loss": 0.0987, "step": 16640 }, { "epoch": 1.2364473488786574, "grad_norm": 1.444373607635498, "learning_rate": 2.2581315906728056e-05, "loss": 0.0711, "step": 16650 }, { "epoch": 1.2371899598990048, "grad_norm": 1.2086124420166016, "learning_rate": 2.257686024060597e-05, "loss": 0.0818, "step": 16660 }, { "epoch": 1.2379325709193525, "grad_norm": 1.3369284868240356, "learning_rate": 2.2572404574483886e-05, "loss": 0.0722, "step": 16670 }, { "epoch": 1.2386751819397, "grad_norm": 0.9845725893974304, "learning_rate": 2.25679489083618e-05, "loss": 0.072, "step": 16680 }, { "epoch": 1.2394177929600476, "grad_norm": 0.6263337135314941, "learning_rate": 2.2563493242239716e-05, "loss": 0.0455, "step": 16690 }, { "epoch": 1.240160403980395, "grad_norm": 1.889050841331482, "learning_rate": 2.255903757611763e-05, "loss": 0.0967, "step": 16700 }, { "epoch": 1.2409030150007425, "grad_norm": 0.5217537879943848, "learning_rate": 2.2554581909995543e-05, "loss": 0.0857, "step": 16710 }, { "epoch": 1.2416456260210902, "grad_norm": 1.730975866317749, "learning_rate": 2.255012624387346e-05, "loss": 0.0831, "step": 16720 }, { "epoch": 1.2423882370414376, "grad_norm": 0.8797131776809692, "learning_rate": 2.2545670577751373e-05, "loss": 0.071, "step": 16730 }, { "epoch": 1.2431308480617853, "grad_norm": 2.5811779499053955, "learning_rate": 2.2541214911629288e-05, "loss": 0.0956, "step": 16740 }, { "epoch": 1.2438734590821328, "grad_norm": 1.4201879501342773, "learning_rate": 2.2536759245507206e-05, "loss": 0.0722, "step": 16750 }, { "epoch": 1.2446160701024804, "grad_norm": 2.9858505725860596, "learning_rate": 2.2532303579385118e-05, "loss": 0.0867, "step": 16760 }, { "epoch": 1.2453586811228279, "grad_norm": 2.050238609313965, "learning_rate": 2.2527847913263033e-05, "loss": 0.0588, "step": 16770 }, { "epoch": 1.2461012921431753, "grad_norm": 3.033705472946167, "learning_rate": 2.2523392247140948e-05, "loss": 0.0687, "step": 16780 }, { "epoch": 1.246843903163523, "grad_norm": 1.8493062257766724, "learning_rate": 2.2518936581018863e-05, "loss": 0.0653, "step": 16790 }, { "epoch": 1.2475865141838705, "grad_norm": 2.4249043464660645, "learning_rate": 2.2514480914896778e-05, "loss": 0.1034, "step": 16800 }, { "epoch": 1.2483291252042181, "grad_norm": 0.8284013271331787, "learning_rate": 2.2510025248774693e-05, "loss": 0.0978, "step": 16810 }, { "epoch": 1.2490717362245656, "grad_norm": 1.3202928304672241, "learning_rate": 2.2505569582652608e-05, "loss": 0.0709, "step": 16820 }, { "epoch": 1.249814347244913, "grad_norm": 1.9100357294082642, "learning_rate": 2.2501113916530523e-05, "loss": 0.085, "step": 16830 }, { "epoch": 1.2505569582652607, "grad_norm": 1.8051328659057617, "learning_rate": 2.2496658250408434e-05, "loss": 0.0653, "step": 16840 }, { "epoch": 1.2512995692856081, "grad_norm": 0.7372807860374451, "learning_rate": 2.2492202584286353e-05, "loss": 0.0798, "step": 16850 }, { "epoch": 1.2520421803059558, "grad_norm": 1.493801474571228, "learning_rate": 2.2487746918164268e-05, "loss": 0.0862, "step": 16860 }, { "epoch": 1.2527847913263033, "grad_norm": 0.8077939748764038, "learning_rate": 2.248329125204218e-05, "loss": 0.0651, "step": 16870 }, { "epoch": 1.2535274023466507, "grad_norm": 1.4354842901229858, "learning_rate": 2.2478835585920094e-05, "loss": 0.0754, "step": 16880 }, { "epoch": 1.2542700133669984, "grad_norm": 1.9970204830169678, "learning_rate": 2.2474379919798013e-05, "loss": 0.0746, "step": 16890 }, { "epoch": 1.2550126243873458, "grad_norm": 0.7201411724090576, "learning_rate": 2.2469924253675924e-05, "loss": 0.0706, "step": 16900 }, { "epoch": 1.2557552354076935, "grad_norm": 2.7510123252868652, "learning_rate": 2.246546858755384e-05, "loss": 0.0799, "step": 16910 }, { "epoch": 1.256497846428041, "grad_norm": 1.3006263971328735, "learning_rate": 2.2461012921431758e-05, "loss": 0.0689, "step": 16920 }, { "epoch": 1.2572404574483884, "grad_norm": 2.1631722450256348, "learning_rate": 2.245655725530967e-05, "loss": 0.0484, "step": 16930 }, { "epoch": 1.257983068468736, "grad_norm": 0.6136536598205566, "learning_rate": 2.2452101589187584e-05, "loss": 0.1071, "step": 16940 }, { "epoch": 1.2587256794890835, "grad_norm": 2.494858503341675, "learning_rate": 2.2447645923065496e-05, "loss": 0.0693, "step": 16950 }, { "epoch": 1.2594682905094312, "grad_norm": 3.487287998199463, "learning_rate": 2.2443190256943414e-05, "loss": 0.0533, "step": 16960 }, { "epoch": 1.2602109015297787, "grad_norm": 1.1997121572494507, "learning_rate": 2.243873459082133e-05, "loss": 0.0713, "step": 16970 }, { "epoch": 1.260953512550126, "grad_norm": 1.8079684972763062, "learning_rate": 2.243427892469924e-05, "loss": 0.0849, "step": 16980 }, { "epoch": 1.2616961235704738, "grad_norm": 1.9124133586883545, "learning_rate": 2.242982325857716e-05, "loss": 0.0789, "step": 16990 }, { "epoch": 1.2624387345908215, "grad_norm": 1.6641535758972168, "learning_rate": 2.2425367592455074e-05, "loss": 0.0694, "step": 17000 }, { "epoch": 1.263181345611169, "grad_norm": 1.7782231569290161, "learning_rate": 2.2420911926332986e-05, "loss": 0.0773, "step": 17010 }, { "epoch": 1.2639239566315164, "grad_norm": 1.2206722497940063, "learning_rate": 2.24164562602109e-05, "loss": 0.0739, "step": 17020 }, { "epoch": 1.264666567651864, "grad_norm": 0.541761577129364, "learning_rate": 2.241200059408882e-05, "loss": 0.0639, "step": 17030 }, { "epoch": 1.2654091786722115, "grad_norm": 0.547622561454773, "learning_rate": 2.240754492796673e-05, "loss": 0.0702, "step": 17040 }, { "epoch": 1.2661517896925591, "grad_norm": 4.135508060455322, "learning_rate": 2.2403089261844646e-05, "loss": 0.0927, "step": 17050 }, { "epoch": 1.2668944007129066, "grad_norm": 3.19258975982666, "learning_rate": 2.239863359572256e-05, "loss": 0.0736, "step": 17060 }, { "epoch": 1.267637011733254, "grad_norm": 1.1759032011032104, "learning_rate": 2.2394177929600476e-05, "loss": 0.0686, "step": 17070 }, { "epoch": 1.2683796227536017, "grad_norm": 0.7790769338607788, "learning_rate": 2.238972226347839e-05, "loss": 0.0961, "step": 17080 }, { "epoch": 1.2691222337739492, "grad_norm": 0.9479905962944031, "learning_rate": 2.2385266597356306e-05, "loss": 0.0555, "step": 17090 }, { "epoch": 1.2698648447942968, "grad_norm": 1.7313250303268433, "learning_rate": 2.238081093123422e-05, "loss": 0.099, "step": 17100 }, { "epoch": 1.2706074558146443, "grad_norm": 0.5795320272445679, "learning_rate": 2.2376355265112136e-05, "loss": 0.0669, "step": 17110 }, { "epoch": 1.2713500668349917, "grad_norm": 1.634346604347229, "learning_rate": 2.2371899598990047e-05, "loss": 0.0771, "step": 17120 }, { "epoch": 1.2720926778553394, "grad_norm": 2.006558656692505, "learning_rate": 2.2367443932867966e-05, "loss": 0.0762, "step": 17130 }, { "epoch": 1.2728352888756869, "grad_norm": 1.0356532335281372, "learning_rate": 2.236298826674588e-05, "loss": 0.0903, "step": 17140 }, { "epoch": 1.2735778998960345, "grad_norm": 1.004071831703186, "learning_rate": 2.2358532600623792e-05, "loss": 0.0676, "step": 17150 }, { "epoch": 1.274320510916382, "grad_norm": 1.3854845762252808, "learning_rate": 2.235407693450171e-05, "loss": 0.0705, "step": 17160 }, { "epoch": 1.2750631219367294, "grad_norm": 2.1504805088043213, "learning_rate": 2.2349621268379622e-05, "loss": 0.072, "step": 17170 }, { "epoch": 1.275805732957077, "grad_norm": 1.1549479961395264, "learning_rate": 2.2345165602257537e-05, "loss": 0.0642, "step": 17180 }, { "epoch": 1.2765483439774246, "grad_norm": 2.8275554180145264, "learning_rate": 2.2340709936135452e-05, "loss": 0.0646, "step": 17190 }, { "epoch": 1.2772909549977722, "grad_norm": 0.6290885806083679, "learning_rate": 2.2336254270013367e-05, "loss": 0.0875, "step": 17200 }, { "epoch": 1.2780335660181197, "grad_norm": 2.0349278450012207, "learning_rate": 2.2331798603891282e-05, "loss": 0.0624, "step": 17210 }, { "epoch": 1.2787761770384671, "grad_norm": 1.7029626369476318, "learning_rate": 2.2327342937769197e-05, "loss": 0.057, "step": 17220 }, { "epoch": 1.2795187880588148, "grad_norm": 0.9866172075271606, "learning_rate": 2.2322887271647112e-05, "loss": 0.08, "step": 17230 }, { "epoch": 1.2802613990791623, "grad_norm": 1.6005713939666748, "learning_rate": 2.2318431605525027e-05, "loss": 0.0767, "step": 17240 }, { "epoch": 1.28100401009951, "grad_norm": 0.7228248119354248, "learning_rate": 2.231397593940294e-05, "loss": 0.1184, "step": 17250 }, { "epoch": 1.2817466211198574, "grad_norm": 2.2156078815460205, "learning_rate": 2.2309520273280857e-05, "loss": 0.1104, "step": 17260 }, { "epoch": 1.2824892321402048, "grad_norm": 1.3294280767440796, "learning_rate": 2.2305064607158772e-05, "loss": 0.0742, "step": 17270 }, { "epoch": 1.2832318431605525, "grad_norm": 1.8837758302688599, "learning_rate": 2.2300608941036684e-05, "loss": 0.0753, "step": 17280 }, { "epoch": 1.2839744541809002, "grad_norm": 1.9538664817810059, "learning_rate": 2.22961532749146e-05, "loss": 0.0685, "step": 17290 }, { "epoch": 1.2847170652012476, "grad_norm": 1.3086044788360596, "learning_rate": 2.2291697608792517e-05, "loss": 0.069, "step": 17300 }, { "epoch": 1.285459676221595, "grad_norm": 0.43306243419647217, "learning_rate": 2.228724194267043e-05, "loss": 0.0688, "step": 17310 }, { "epoch": 1.2862022872419427, "grad_norm": 1.7001709938049316, "learning_rate": 2.2282786276548344e-05, "loss": 0.0964, "step": 17320 }, { "epoch": 1.2869448982622902, "grad_norm": 0.9813358187675476, "learning_rate": 2.2278330610426262e-05, "loss": 0.0748, "step": 17330 }, { "epoch": 1.2876875092826379, "grad_norm": 1.8679172992706299, "learning_rate": 2.2273874944304174e-05, "loss": 0.0962, "step": 17340 }, { "epoch": 1.2884301203029853, "grad_norm": 1.7753219604492188, "learning_rate": 2.226941927818209e-05, "loss": 0.0818, "step": 17350 }, { "epoch": 1.2891727313233328, "grad_norm": 1.1424388885498047, "learning_rate": 2.226496361206e-05, "loss": 0.0718, "step": 17360 }, { "epoch": 1.2899153423436804, "grad_norm": 1.6380572319030762, "learning_rate": 2.226050794593792e-05, "loss": 0.0815, "step": 17370 }, { "epoch": 1.2906579533640279, "grad_norm": 0.8902571201324463, "learning_rate": 2.2256052279815834e-05, "loss": 0.075, "step": 17380 }, { "epoch": 1.2914005643843756, "grad_norm": 0.8039567470550537, "learning_rate": 2.2251596613693745e-05, "loss": 0.0554, "step": 17390 }, { "epoch": 1.292143175404723, "grad_norm": 2.0586135387420654, "learning_rate": 2.2247140947571664e-05, "loss": 0.0662, "step": 17400 }, { "epoch": 1.2928857864250705, "grad_norm": 3.4961864948272705, "learning_rate": 2.224268528144958e-05, "loss": 0.1205, "step": 17410 }, { "epoch": 1.2936283974454181, "grad_norm": 1.724418044090271, "learning_rate": 2.223822961532749e-05, "loss": 0.0581, "step": 17420 }, { "epoch": 1.2943710084657656, "grad_norm": 1.78573739528656, "learning_rate": 2.2233773949205405e-05, "loss": 0.0827, "step": 17430 }, { "epoch": 1.2951136194861133, "grad_norm": 1.7535440921783447, "learning_rate": 2.2229318283083324e-05, "loss": 0.0835, "step": 17440 }, { "epoch": 1.2958562305064607, "grad_norm": 0.7381752729415894, "learning_rate": 2.2224862616961235e-05, "loss": 0.0748, "step": 17450 }, { "epoch": 1.2965988415268082, "grad_norm": 2.1226701736450195, "learning_rate": 2.222040695083915e-05, "loss": 0.0663, "step": 17460 }, { "epoch": 1.2973414525471558, "grad_norm": 1.3175716400146484, "learning_rate": 2.2215951284717065e-05, "loss": 0.1009, "step": 17470 }, { "epoch": 1.2980840635675033, "grad_norm": 1.1516002416610718, "learning_rate": 2.221149561859498e-05, "loss": 0.075, "step": 17480 }, { "epoch": 1.298826674587851, "grad_norm": 2.0485615730285645, "learning_rate": 2.2207039952472895e-05, "loss": 0.0753, "step": 17490 }, { "epoch": 1.2995692856081984, "grad_norm": 1.492017149925232, "learning_rate": 2.220258428635081e-05, "loss": 0.0801, "step": 17500 }, { "epoch": 1.3003118966285458, "grad_norm": 1.2517192363739014, "learning_rate": 2.2198128620228725e-05, "loss": 0.0704, "step": 17510 }, { "epoch": 1.3010545076488935, "grad_norm": 1.73708176612854, "learning_rate": 2.219367295410664e-05, "loss": 0.0564, "step": 17520 }, { "epoch": 1.301797118669241, "grad_norm": 1.0200793743133545, "learning_rate": 2.2189217287984552e-05, "loss": 0.0521, "step": 17530 }, { "epoch": 1.3025397296895886, "grad_norm": 3.1325795650482178, "learning_rate": 2.218476162186247e-05, "loss": 0.0628, "step": 17540 }, { "epoch": 1.303282340709936, "grad_norm": 2.2476203441619873, "learning_rate": 2.2180305955740385e-05, "loss": 0.0747, "step": 17550 }, { "epoch": 1.3040249517302835, "grad_norm": 0.6878722906112671, "learning_rate": 2.2175850289618297e-05, "loss": 0.0894, "step": 17560 }, { "epoch": 1.3047675627506312, "grad_norm": 2.48412823677063, "learning_rate": 2.2171394623496215e-05, "loss": 0.0809, "step": 17570 }, { "epoch": 1.3055101737709789, "grad_norm": 1.0366617441177368, "learning_rate": 2.2166938957374127e-05, "loss": 0.0724, "step": 17580 }, { "epoch": 1.3062527847913263, "grad_norm": 0.9051563739776611, "learning_rate": 2.2162483291252042e-05, "loss": 0.1042, "step": 17590 }, { "epoch": 1.3069953958116738, "grad_norm": 0.7146435976028442, "learning_rate": 2.2158027625129957e-05, "loss": 0.1003, "step": 17600 }, { "epoch": 1.3077380068320215, "grad_norm": 1.7611632347106934, "learning_rate": 2.2153571959007872e-05, "loss": 0.0837, "step": 17610 }, { "epoch": 1.308480617852369, "grad_norm": 1.3389374017715454, "learning_rate": 2.2149116292885787e-05, "loss": 0.0652, "step": 17620 }, { "epoch": 1.3092232288727166, "grad_norm": 2.716177463531494, "learning_rate": 2.2144660626763702e-05, "loss": 0.0718, "step": 17630 }, { "epoch": 1.309965839893064, "grad_norm": 0.6163918972015381, "learning_rate": 2.2140204960641617e-05, "loss": 0.0744, "step": 17640 }, { "epoch": 1.3107084509134115, "grad_norm": 2.4319982528686523, "learning_rate": 2.2135749294519532e-05, "loss": 0.097, "step": 17650 }, { "epoch": 1.3114510619337592, "grad_norm": 2.8777670860290527, "learning_rate": 2.2131293628397447e-05, "loss": 0.0849, "step": 17660 }, { "epoch": 1.3121936729541066, "grad_norm": 2.7861387729644775, "learning_rate": 2.2126837962275362e-05, "loss": 0.076, "step": 17670 }, { "epoch": 1.3129362839744543, "grad_norm": 0.650431752204895, "learning_rate": 2.2122382296153277e-05, "loss": 0.0888, "step": 17680 }, { "epoch": 1.3136788949948017, "grad_norm": 0.553596019744873, "learning_rate": 2.211792663003119e-05, "loss": 0.0619, "step": 17690 }, { "epoch": 1.3144215060151492, "grad_norm": 1.0089176893234253, "learning_rate": 2.2113470963909103e-05, "loss": 0.0777, "step": 17700 }, { "epoch": 1.3151641170354968, "grad_norm": 1.6355758905410767, "learning_rate": 2.2109015297787022e-05, "loss": 0.0507, "step": 17710 }, { "epoch": 1.3159067280558443, "grad_norm": 1.1922086477279663, "learning_rate": 2.2104559631664933e-05, "loss": 0.0612, "step": 17720 }, { "epoch": 1.316649339076192, "grad_norm": 3.1001734733581543, "learning_rate": 2.210010396554285e-05, "loss": 0.103, "step": 17730 }, { "epoch": 1.3173919500965394, "grad_norm": 2.3296868801116943, "learning_rate": 2.2095648299420767e-05, "loss": 0.0626, "step": 17740 }, { "epoch": 1.3181345611168869, "grad_norm": 1.527961015701294, "learning_rate": 2.209119263329868e-05, "loss": 0.0762, "step": 17750 }, { "epoch": 1.3188771721372345, "grad_norm": 2.4450912475585938, "learning_rate": 2.2086736967176593e-05, "loss": 0.0799, "step": 17760 }, { "epoch": 1.319619783157582, "grad_norm": 1.477561354637146, "learning_rate": 2.2082281301054505e-05, "loss": 0.0466, "step": 17770 }, { "epoch": 1.3203623941779297, "grad_norm": 2.102966070175171, "learning_rate": 2.2077825634932423e-05, "loss": 0.0812, "step": 17780 }, { "epoch": 1.3211050051982771, "grad_norm": 1.731831669807434, "learning_rate": 2.207336996881034e-05, "loss": 0.0636, "step": 17790 }, { "epoch": 1.3218476162186246, "grad_norm": 1.5313726663589478, "learning_rate": 2.206891430268825e-05, "loss": 0.0619, "step": 17800 }, { "epoch": 1.3225902272389722, "grad_norm": 1.2742550373077393, "learning_rate": 2.206445863656617e-05, "loss": 0.0647, "step": 17810 }, { "epoch": 1.3233328382593197, "grad_norm": 0.7429075241088867, "learning_rate": 2.2060002970444083e-05, "loss": 0.0574, "step": 17820 }, { "epoch": 1.3240754492796674, "grad_norm": 2.3844103813171387, "learning_rate": 2.2055547304321995e-05, "loss": 0.0455, "step": 17830 }, { "epoch": 1.3248180603000148, "grad_norm": 3.4696733951568604, "learning_rate": 2.2051091638199913e-05, "loss": 0.0856, "step": 17840 }, { "epoch": 1.3255606713203623, "grad_norm": 1.2580214738845825, "learning_rate": 2.204663597207783e-05, "loss": 0.0525, "step": 17850 }, { "epoch": 1.32630328234071, "grad_norm": 2.1355206966400146, "learning_rate": 2.204218030595574e-05, "loss": 0.0532, "step": 17860 }, { "epoch": 1.3270458933610576, "grad_norm": 3.5435104370117188, "learning_rate": 2.2037724639833655e-05, "loss": 0.0589, "step": 17870 }, { "epoch": 1.327788504381405, "grad_norm": 0.7375121712684631, "learning_rate": 2.203326897371157e-05, "loss": 0.0516, "step": 17880 }, { "epoch": 1.3285311154017525, "grad_norm": 1.3405452966690063, "learning_rate": 2.2028813307589485e-05, "loss": 0.0609, "step": 17890 }, { "epoch": 1.3292737264221002, "grad_norm": 1.448654294013977, "learning_rate": 2.20243576414674e-05, "loss": 0.0716, "step": 17900 }, { "epoch": 1.3300163374424476, "grad_norm": 1.0215409994125366, "learning_rate": 2.2019901975345315e-05, "loss": 0.0627, "step": 17910 }, { "epoch": 1.3307589484627953, "grad_norm": 2.50747013092041, "learning_rate": 2.201544630922323e-05, "loss": 0.0607, "step": 17920 }, { "epoch": 1.3315015594831427, "grad_norm": 0.947949230670929, "learning_rate": 2.2010990643101145e-05, "loss": 0.092, "step": 17930 }, { "epoch": 1.3322441705034902, "grad_norm": 1.444300889968872, "learning_rate": 2.2006534976979057e-05, "loss": 0.052, "step": 17940 }, { "epoch": 1.3329867815238379, "grad_norm": 1.492150068283081, "learning_rate": 2.2002079310856975e-05, "loss": 0.0644, "step": 17950 }, { "epoch": 1.3337293925441853, "grad_norm": 3.0664021968841553, "learning_rate": 2.199762364473489e-05, "loss": 0.1176, "step": 17960 }, { "epoch": 1.334472003564533, "grad_norm": 2.1446638107299805, "learning_rate": 2.19931679786128e-05, "loss": 0.0621, "step": 17970 }, { "epoch": 1.3352146145848804, "grad_norm": 2.1840789318084717, "learning_rate": 2.198871231249072e-05, "loss": 0.0744, "step": 17980 }, { "epoch": 1.335957225605228, "grad_norm": 6.352825164794922, "learning_rate": 2.198425664636863e-05, "loss": 0.0636, "step": 17990 }, { "epoch": 1.3366998366255756, "grad_norm": 1.7833071947097778, "learning_rate": 2.1979800980246546e-05, "loss": 0.0835, "step": 18000 }, { "epoch": 1.337442447645923, "grad_norm": 1.596863031387329, "learning_rate": 2.197534531412446e-05, "loss": 0.0639, "step": 18010 }, { "epoch": 1.3381850586662707, "grad_norm": 1.079342246055603, "learning_rate": 2.1970889648002376e-05, "loss": 0.0675, "step": 18020 }, { "epoch": 1.3389276696866181, "grad_norm": 2.5016486644744873, "learning_rate": 2.196643398188029e-05, "loss": 0.0717, "step": 18030 }, { "epoch": 1.3396702807069656, "grad_norm": 1.00153648853302, "learning_rate": 2.1961978315758206e-05, "loss": 0.0901, "step": 18040 }, { "epoch": 1.3404128917273133, "grad_norm": 0.4505369961261749, "learning_rate": 2.195752264963612e-05, "loss": 0.0617, "step": 18050 }, { "epoch": 1.3411555027476607, "grad_norm": 0.8627389669418335, "learning_rate": 2.1953066983514036e-05, "loss": 0.0915, "step": 18060 }, { "epoch": 1.3418981137680084, "grad_norm": 1.8708628416061401, "learning_rate": 2.194861131739195e-05, "loss": 0.087, "step": 18070 }, { "epoch": 1.3426407247883558, "grad_norm": 1.866942286491394, "learning_rate": 2.1944155651269866e-05, "loss": 0.0633, "step": 18080 }, { "epoch": 1.3433833358087033, "grad_norm": 3.1067397594451904, "learning_rate": 2.193969998514778e-05, "loss": 0.065, "step": 18090 }, { "epoch": 1.344125946829051, "grad_norm": 1.4775131940841675, "learning_rate": 2.1935244319025693e-05, "loss": 0.0858, "step": 18100 }, { "epoch": 1.3448685578493984, "grad_norm": 1.9869881868362427, "learning_rate": 2.1930788652903608e-05, "loss": 0.088, "step": 18110 }, { "epoch": 1.345611168869746, "grad_norm": 2.1801204681396484, "learning_rate": 2.1926332986781526e-05, "loss": 0.0941, "step": 18120 }, { "epoch": 1.3463537798900935, "grad_norm": 1.2219593524932861, "learning_rate": 2.1921877320659438e-05, "loss": 0.0897, "step": 18130 }, { "epoch": 1.347096390910441, "grad_norm": 1.6922121047973633, "learning_rate": 2.1917421654537353e-05, "loss": 0.0905, "step": 18140 }, { "epoch": 1.3478390019307886, "grad_norm": 4.362298488616943, "learning_rate": 2.191296598841527e-05, "loss": 0.0714, "step": 18150 }, { "epoch": 1.3485816129511363, "grad_norm": 1.9066132307052612, "learning_rate": 2.1908510322293183e-05, "loss": 0.076, "step": 18160 }, { "epoch": 1.3493242239714838, "grad_norm": 2.2827999591827393, "learning_rate": 2.1904054656171098e-05, "loss": 0.0872, "step": 18170 }, { "epoch": 1.3500668349918312, "grad_norm": 0.735640287399292, "learning_rate": 2.189959899004901e-05, "loss": 0.0668, "step": 18180 }, { "epoch": 1.350809446012179, "grad_norm": 1.7946842908859253, "learning_rate": 2.1895143323926928e-05, "loss": 0.0767, "step": 18190 }, { "epoch": 1.3515520570325263, "grad_norm": 0.793258547782898, "learning_rate": 2.1890687657804843e-05, "loss": 0.0484, "step": 18200 }, { "epoch": 1.352294668052874, "grad_norm": 1.9546618461608887, "learning_rate": 2.1886231991682755e-05, "loss": 0.1081, "step": 18210 }, { "epoch": 1.3530372790732215, "grad_norm": 1.6390115022659302, "learning_rate": 2.1881776325560673e-05, "loss": 0.051, "step": 18220 }, { "epoch": 1.353779890093569, "grad_norm": 0.6895598769187927, "learning_rate": 2.1877320659438588e-05, "loss": 0.0765, "step": 18230 }, { "epoch": 1.3545225011139166, "grad_norm": 1.3696023225784302, "learning_rate": 2.18728649933165e-05, "loss": 0.1083, "step": 18240 }, { "epoch": 1.355265112134264, "grad_norm": 1.8594785928726196, "learning_rate": 2.1868409327194418e-05, "loss": 0.0773, "step": 18250 }, { "epoch": 1.3560077231546117, "grad_norm": 1.0615592002868652, "learning_rate": 2.1863953661072333e-05, "loss": 0.079, "step": 18260 }, { "epoch": 1.3567503341749592, "grad_norm": 1.725924015045166, "learning_rate": 2.1859497994950245e-05, "loss": 0.0957, "step": 18270 }, { "epoch": 1.3574929451953066, "grad_norm": 0.7727744579315186, "learning_rate": 2.185504232882816e-05, "loss": 0.0664, "step": 18280 }, { "epoch": 1.3582355562156543, "grad_norm": 0.4827175438404083, "learning_rate": 2.1850586662706075e-05, "loss": 0.0786, "step": 18290 }, { "epoch": 1.3589781672360017, "grad_norm": 2.3868589401245117, "learning_rate": 2.184613099658399e-05, "loss": 0.0757, "step": 18300 }, { "epoch": 1.3597207782563494, "grad_norm": 1.4127172231674194, "learning_rate": 2.1841675330461905e-05, "loss": 0.0805, "step": 18310 }, { "epoch": 1.3604633892766969, "grad_norm": 3.6567182540893555, "learning_rate": 2.183721966433982e-05, "loss": 0.0501, "step": 18320 }, { "epoch": 1.3612060002970443, "grad_norm": 0.46795493364334106, "learning_rate": 2.1832763998217735e-05, "loss": 0.0702, "step": 18330 }, { "epoch": 1.361948611317392, "grad_norm": 1.1529920101165771, "learning_rate": 2.182830833209565e-05, "loss": 0.0891, "step": 18340 }, { "epoch": 1.3626912223377394, "grad_norm": 1.0906422138214111, "learning_rate": 2.182385266597356e-05, "loss": 0.0789, "step": 18350 }, { "epoch": 1.363433833358087, "grad_norm": 4.010624408721924, "learning_rate": 2.181939699985148e-05, "loss": 0.1198, "step": 18360 }, { "epoch": 1.3641764443784345, "grad_norm": 1.2487231492996216, "learning_rate": 2.1814941333729395e-05, "loss": 0.0811, "step": 18370 }, { "epoch": 1.364919055398782, "grad_norm": 1.9714354276657104, "learning_rate": 2.1810485667607306e-05, "loss": 0.065, "step": 18380 }, { "epoch": 1.3656616664191297, "grad_norm": 2.3405370712280273, "learning_rate": 2.1806030001485224e-05, "loss": 0.1, "step": 18390 }, { "epoch": 1.3664042774394771, "grad_norm": 0.7739295959472656, "learning_rate": 2.1801574335363136e-05, "loss": 0.0794, "step": 18400 }, { "epoch": 1.3671468884598248, "grad_norm": 1.5467528104782104, "learning_rate": 2.179711866924105e-05, "loss": 0.0625, "step": 18410 }, { "epoch": 1.3678894994801722, "grad_norm": 2.0062620639801025, "learning_rate": 2.1792663003118966e-05, "loss": 0.1144, "step": 18420 }, { "epoch": 1.3686321105005197, "grad_norm": 1.0899155139923096, "learning_rate": 2.178820733699688e-05, "loss": 0.0832, "step": 18430 }, { "epoch": 1.3693747215208674, "grad_norm": 1.5174329280853271, "learning_rate": 2.1783751670874796e-05, "loss": 0.0886, "step": 18440 }, { "epoch": 1.370117332541215, "grad_norm": 1.060883641242981, "learning_rate": 2.177929600475271e-05, "loss": 0.0619, "step": 18450 }, { "epoch": 1.3708599435615625, "grad_norm": 2.5630977153778076, "learning_rate": 2.1774840338630626e-05, "loss": 0.0697, "step": 18460 }, { "epoch": 1.37160255458191, "grad_norm": 1.6093450784683228, "learning_rate": 2.177038467250854e-05, "loss": 0.0803, "step": 18470 }, { "epoch": 1.3723451656022576, "grad_norm": 1.996664047241211, "learning_rate": 2.1765929006386456e-05, "loss": 0.0881, "step": 18480 }, { "epoch": 1.373087776622605, "grad_norm": 1.6483838558197021, "learning_rate": 2.176147334026437e-05, "loss": 0.0536, "step": 18490 }, { "epoch": 1.3738303876429527, "grad_norm": 2.682058572769165, "learning_rate": 2.1757017674142286e-05, "loss": 0.0822, "step": 18500 }, { "epoch": 1.3745729986633002, "grad_norm": 0.638530969619751, "learning_rate": 2.1752562008020198e-05, "loss": 0.0843, "step": 18510 }, { "epoch": 1.3753156096836476, "grad_norm": 0.7603070139884949, "learning_rate": 2.1748106341898113e-05, "loss": 0.0736, "step": 18520 }, { "epoch": 1.3760582207039953, "grad_norm": 1.7410355806350708, "learning_rate": 2.174365067577603e-05, "loss": 0.0999, "step": 18530 }, { "epoch": 1.3768008317243428, "grad_norm": 1.657575011253357, "learning_rate": 2.1739195009653943e-05, "loss": 0.0609, "step": 18540 }, { "epoch": 1.3775434427446904, "grad_norm": 0.7977071404457092, "learning_rate": 2.1734739343531858e-05, "loss": 0.0636, "step": 18550 }, { "epoch": 1.3782860537650379, "grad_norm": 0.5938560962677002, "learning_rate": 2.1730283677409776e-05, "loss": 0.0776, "step": 18560 }, { "epoch": 1.3790286647853853, "grad_norm": 2.3194963932037354, "learning_rate": 2.1725828011287688e-05, "loss": 0.0953, "step": 18570 }, { "epoch": 1.379771275805733, "grad_norm": 1.4146885871887207, "learning_rate": 2.1721372345165603e-05, "loss": 0.0885, "step": 18580 }, { "epoch": 1.3805138868260804, "grad_norm": 2.4094855785369873, "learning_rate": 2.1716916679043518e-05, "loss": 0.0629, "step": 18590 }, { "epoch": 1.3812564978464281, "grad_norm": 2.476471424102783, "learning_rate": 2.1712461012921433e-05, "loss": 0.0522, "step": 18600 }, { "epoch": 1.3819991088667756, "grad_norm": 1.0352263450622559, "learning_rate": 2.1708005346799348e-05, "loss": 0.0605, "step": 18610 }, { "epoch": 1.382741719887123, "grad_norm": 1.2471846342086792, "learning_rate": 2.170354968067726e-05, "loss": 0.0606, "step": 18620 }, { "epoch": 1.3834843309074707, "grad_norm": 0.748393177986145, "learning_rate": 2.1699094014555178e-05, "loss": 0.0825, "step": 18630 }, { "epoch": 1.3842269419278181, "grad_norm": 2.0027894973754883, "learning_rate": 2.1694638348433093e-05, "loss": 0.1089, "step": 18640 }, { "epoch": 1.3849695529481658, "grad_norm": 0.7631524205207825, "learning_rate": 2.1690182682311004e-05, "loss": 0.0627, "step": 18650 }, { "epoch": 1.3857121639685133, "grad_norm": 1.3359373807907104, "learning_rate": 2.1685727016188923e-05, "loss": 0.0581, "step": 18660 }, { "epoch": 1.3864547749888607, "grad_norm": 1.8302745819091797, "learning_rate": 2.1681271350066838e-05, "loss": 0.0578, "step": 18670 }, { "epoch": 1.3871973860092084, "grad_norm": 1.2512954473495483, "learning_rate": 2.167681568394475e-05, "loss": 0.0746, "step": 18680 }, { "epoch": 1.3879399970295558, "grad_norm": 1.0682908296585083, "learning_rate": 2.1672360017822664e-05, "loss": 0.0568, "step": 18690 }, { "epoch": 1.3886826080499035, "grad_norm": 0.7478026747703552, "learning_rate": 2.166790435170058e-05, "loss": 0.0947, "step": 18700 }, { "epoch": 1.389425219070251, "grad_norm": 1.646852731704712, "learning_rate": 2.1663448685578494e-05, "loss": 0.0718, "step": 18710 }, { "epoch": 1.3901678300905984, "grad_norm": 1.0658780336380005, "learning_rate": 2.165899301945641e-05, "loss": 0.0939, "step": 18720 }, { "epoch": 1.390910441110946, "grad_norm": 1.9189115762710571, "learning_rate": 2.1654537353334324e-05, "loss": 0.0844, "step": 18730 }, { "epoch": 1.3916530521312938, "grad_norm": 1.009257197380066, "learning_rate": 2.165008168721224e-05, "loss": 0.0549, "step": 18740 }, { "epoch": 1.3923956631516412, "grad_norm": 1.1717352867126465, "learning_rate": 2.1645626021090154e-05, "loss": 0.0741, "step": 18750 }, { "epoch": 1.3931382741719887, "grad_norm": 1.070173740386963, "learning_rate": 2.1641170354968066e-05, "loss": 0.099, "step": 18760 }, { "epoch": 1.3938808851923363, "grad_norm": 2.932997465133667, "learning_rate": 2.1636714688845984e-05, "loss": 0.0674, "step": 18770 }, { "epoch": 1.3946234962126838, "grad_norm": 0.7119723558425903, "learning_rate": 2.16322590227239e-05, "loss": 0.0768, "step": 18780 }, { "epoch": 1.3953661072330314, "grad_norm": 2.5483529567718506, "learning_rate": 2.162780335660181e-05, "loss": 0.0427, "step": 18790 }, { "epoch": 1.396108718253379, "grad_norm": 1.3585890531539917, "learning_rate": 2.162334769047973e-05, "loss": 0.0429, "step": 18800 }, { "epoch": 1.3968513292737263, "grad_norm": 1.786074161529541, "learning_rate": 2.161889202435764e-05, "loss": 0.0965, "step": 18810 }, { "epoch": 1.397593940294074, "grad_norm": 2.01636004447937, "learning_rate": 2.1614436358235556e-05, "loss": 0.0469, "step": 18820 }, { "epoch": 1.3983365513144215, "grad_norm": 1.713550090789795, "learning_rate": 2.160998069211347e-05, "loss": 0.0725, "step": 18830 }, { "epoch": 1.3990791623347691, "grad_norm": 4.37731409072876, "learning_rate": 2.1605525025991386e-05, "loss": 0.0788, "step": 18840 }, { "epoch": 1.3998217733551166, "grad_norm": 2.8098561763763428, "learning_rate": 2.16010693598693e-05, "loss": 0.0677, "step": 18850 }, { "epoch": 1.400564384375464, "grad_norm": 2.595644235610962, "learning_rate": 2.1596613693747216e-05, "loss": 0.088, "step": 18860 }, { "epoch": 1.4013069953958117, "grad_norm": 2.6050455570220947, "learning_rate": 2.159215802762513e-05, "loss": 0.0683, "step": 18870 }, { "epoch": 1.4020496064161592, "grad_norm": 1.2562687397003174, "learning_rate": 2.1587702361503046e-05, "loss": 0.0605, "step": 18880 }, { "epoch": 1.4027922174365068, "grad_norm": 0.9690125584602356, "learning_rate": 2.158324669538096e-05, "loss": 0.0506, "step": 18890 }, { "epoch": 1.4035348284568543, "grad_norm": 0.6721828579902649, "learning_rate": 2.1578791029258876e-05, "loss": 0.0553, "step": 18900 }, { "epoch": 1.4042774394772017, "grad_norm": 0.4955576956272125, "learning_rate": 2.157433536313679e-05, "loss": 0.065, "step": 18910 }, { "epoch": 1.4050200504975494, "grad_norm": 2.1765050888061523, "learning_rate": 2.1569879697014702e-05, "loss": 0.0669, "step": 18920 }, { "epoch": 1.4057626615178969, "grad_norm": 1.8637815713882446, "learning_rate": 2.1565424030892617e-05, "loss": 0.0629, "step": 18930 }, { "epoch": 1.4065052725382445, "grad_norm": 2.6646058559417725, "learning_rate": 2.1560968364770536e-05, "loss": 0.0729, "step": 18940 }, { "epoch": 1.407247883558592, "grad_norm": 1.4432345628738403, "learning_rate": 2.1556512698648447e-05, "loss": 0.0601, "step": 18950 }, { "epoch": 1.4079904945789394, "grad_norm": 0.9706814885139465, "learning_rate": 2.1552057032526362e-05, "loss": 0.0596, "step": 18960 }, { "epoch": 1.408733105599287, "grad_norm": 1.4810203313827515, "learning_rate": 2.154760136640428e-05, "loss": 0.0816, "step": 18970 }, { "epoch": 1.4094757166196346, "grad_norm": 0.5370448231697083, "learning_rate": 2.1543145700282192e-05, "loss": 0.0685, "step": 18980 }, { "epoch": 1.4102183276399822, "grad_norm": 1.4338277578353882, "learning_rate": 2.1538690034160107e-05, "loss": 0.0821, "step": 18990 }, { "epoch": 1.4109609386603297, "grad_norm": 1.7360191345214844, "learning_rate": 2.1534234368038022e-05, "loss": 0.0853, "step": 19000 }, { "epoch": 1.4117035496806771, "grad_norm": 1.8380330801010132, "learning_rate": 2.1529778701915937e-05, "loss": 0.0599, "step": 19010 }, { "epoch": 1.4124461607010248, "grad_norm": 0.9603464007377625, "learning_rate": 2.1525323035793852e-05, "loss": 0.0783, "step": 19020 }, { "epoch": 1.4131887717213725, "grad_norm": 1.198040246963501, "learning_rate": 2.1520867369671764e-05, "loss": 0.0801, "step": 19030 }, { "epoch": 1.41393138274172, "grad_norm": 2.3278064727783203, "learning_rate": 2.1516411703549682e-05, "loss": 0.0785, "step": 19040 }, { "epoch": 1.4146739937620674, "grad_norm": 2.149242877960205, "learning_rate": 2.1511956037427597e-05, "loss": 0.0699, "step": 19050 }, { "epoch": 1.415416604782415, "grad_norm": 1.5481926202774048, "learning_rate": 2.150750037130551e-05, "loss": 0.0785, "step": 19060 }, { "epoch": 1.4161592158027625, "grad_norm": 1.473336100578308, "learning_rate": 2.1503044705183427e-05, "loss": 0.1051, "step": 19070 }, { "epoch": 1.4169018268231102, "grad_norm": 1.15213143825531, "learning_rate": 2.1498589039061342e-05, "loss": 0.08, "step": 19080 }, { "epoch": 1.4176444378434576, "grad_norm": 1.5161607265472412, "learning_rate": 2.1494133372939254e-05, "loss": 0.08, "step": 19090 }, { "epoch": 1.418387048863805, "grad_norm": 1.6533968448638916, "learning_rate": 2.148967770681717e-05, "loss": 0.0633, "step": 19100 }, { "epoch": 1.4191296598841527, "grad_norm": 3.5496084690093994, "learning_rate": 2.1485222040695087e-05, "loss": 0.0849, "step": 19110 }, { "epoch": 1.4198722709045002, "grad_norm": 0.9919148683547974, "learning_rate": 2.1480766374573e-05, "loss": 0.0349, "step": 19120 }, { "epoch": 1.4206148819248479, "grad_norm": 1.724232792854309, "learning_rate": 2.1476310708450914e-05, "loss": 0.0672, "step": 19130 }, { "epoch": 1.4213574929451953, "grad_norm": 2.5495779514312744, "learning_rate": 2.147185504232883e-05, "loss": 0.0786, "step": 19140 }, { "epoch": 1.4221001039655428, "grad_norm": 1.9082344770431519, "learning_rate": 2.1467399376206744e-05, "loss": 0.0668, "step": 19150 }, { "epoch": 1.4228427149858904, "grad_norm": 0.5212798714637756, "learning_rate": 2.146294371008466e-05, "loss": 0.059, "step": 19160 }, { "epoch": 1.4235853260062379, "grad_norm": 1.6344729661941528, "learning_rate": 2.145848804396257e-05, "loss": 0.0821, "step": 19170 }, { "epoch": 1.4243279370265856, "grad_norm": 1.4449615478515625, "learning_rate": 2.145403237784049e-05, "loss": 0.0597, "step": 19180 }, { "epoch": 1.425070548046933, "grad_norm": 1.3735612630844116, "learning_rate": 2.1449576711718404e-05, "loss": 0.049, "step": 19190 }, { "epoch": 1.4258131590672805, "grad_norm": 1.045433521270752, "learning_rate": 2.1445121045596315e-05, "loss": 0.079, "step": 19200 }, { "epoch": 1.4265557700876281, "grad_norm": 0.6579065918922424, "learning_rate": 2.1440665379474234e-05, "loss": 0.0651, "step": 19210 }, { "epoch": 1.4272983811079756, "grad_norm": 2.254539728164673, "learning_rate": 2.1436209713352145e-05, "loss": 0.0883, "step": 19220 }, { "epoch": 1.4280409921283232, "grad_norm": 0.9771292209625244, "learning_rate": 2.143175404723006e-05, "loss": 0.0561, "step": 19230 }, { "epoch": 1.4287836031486707, "grad_norm": 1.2068768739700317, "learning_rate": 2.142729838110798e-05, "loss": 0.0761, "step": 19240 }, { "epoch": 1.4295262141690182, "grad_norm": 2.0585765838623047, "learning_rate": 2.142284271498589e-05, "loss": 0.0978, "step": 19250 }, { "epoch": 1.4302688251893658, "grad_norm": 1.229931354522705, "learning_rate": 2.1418387048863805e-05, "loss": 0.0664, "step": 19260 }, { "epoch": 1.4310114362097133, "grad_norm": 1.6777613162994385, "learning_rate": 2.141393138274172e-05, "loss": 0.0979, "step": 19270 }, { "epoch": 1.431754047230061, "grad_norm": 1.9495104551315308, "learning_rate": 2.1409475716619635e-05, "loss": 0.0684, "step": 19280 }, { "epoch": 1.4324966582504084, "grad_norm": 2.273432493209839, "learning_rate": 2.140502005049755e-05, "loss": 0.0975, "step": 19290 }, { "epoch": 1.4332392692707558, "grad_norm": 1.896386742591858, "learning_rate": 2.1400564384375465e-05, "loss": 0.0974, "step": 19300 }, { "epoch": 1.4339818802911035, "grad_norm": 0.9202299118041992, "learning_rate": 2.139610871825338e-05, "loss": 0.0809, "step": 19310 }, { "epoch": 1.4347244913114512, "grad_norm": 1.1943680047988892, "learning_rate": 2.1391653052131295e-05, "loss": 0.0808, "step": 19320 }, { "epoch": 1.4354671023317986, "grad_norm": 2.0343806743621826, "learning_rate": 2.1387197386009207e-05, "loss": 0.072, "step": 19330 }, { "epoch": 1.436209713352146, "grad_norm": 3.626370668411255, "learning_rate": 2.1382741719887122e-05, "loss": 0.0568, "step": 19340 }, { "epoch": 1.4369523243724938, "grad_norm": 5.546724796295166, "learning_rate": 2.137828605376504e-05, "loss": 0.0625, "step": 19350 }, { "epoch": 1.4376949353928412, "grad_norm": 1.03886878490448, "learning_rate": 2.1373830387642952e-05, "loss": 0.0897, "step": 19360 }, { "epoch": 1.4384375464131889, "grad_norm": 2.2745583057403564, "learning_rate": 2.1369374721520867e-05, "loss": 0.0781, "step": 19370 }, { "epoch": 1.4391801574335363, "grad_norm": 2.1215226650238037, "learning_rate": 2.1364919055398785e-05, "loss": 0.0764, "step": 19380 }, { "epoch": 1.4399227684538838, "grad_norm": 1.7415093183517456, "learning_rate": 2.1360463389276697e-05, "loss": 0.0959, "step": 19390 }, { "epoch": 1.4406653794742315, "grad_norm": 4.175904750823975, "learning_rate": 2.1356007723154612e-05, "loss": 0.0898, "step": 19400 }, { "epoch": 1.441407990494579, "grad_norm": 1.0260313749313354, "learning_rate": 2.1351552057032527e-05, "loss": 0.1057, "step": 19410 }, { "epoch": 1.4421506015149266, "grad_norm": 1.1393053531646729, "learning_rate": 2.1347096390910442e-05, "loss": 0.0632, "step": 19420 }, { "epoch": 1.442893212535274, "grad_norm": 0.7006543278694153, "learning_rate": 2.1342640724788357e-05, "loss": 0.0601, "step": 19430 }, { "epoch": 1.4436358235556215, "grad_norm": 1.356958270072937, "learning_rate": 2.133818505866627e-05, "loss": 0.0522, "step": 19440 }, { "epoch": 1.4443784345759692, "grad_norm": 0.9803171753883362, "learning_rate": 2.1333729392544187e-05, "loss": 0.0593, "step": 19450 }, { "epoch": 1.4451210455963166, "grad_norm": 0.602212131023407, "learning_rate": 2.1329273726422102e-05, "loss": 0.0412, "step": 19460 }, { "epoch": 1.4458636566166643, "grad_norm": 2.612487554550171, "learning_rate": 2.1324818060300013e-05, "loss": 0.1007, "step": 19470 }, { "epoch": 1.4466062676370117, "grad_norm": 2.2010037899017334, "learning_rate": 2.1320362394177932e-05, "loss": 0.0845, "step": 19480 }, { "epoch": 1.4473488786573592, "grad_norm": 1.3629838228225708, "learning_rate": 2.1315906728055847e-05, "loss": 0.0794, "step": 19490 }, { "epoch": 1.4480914896777068, "grad_norm": 1.0975171327590942, "learning_rate": 2.131145106193376e-05, "loss": 0.0656, "step": 19500 }, { "epoch": 1.4488341006980543, "grad_norm": 1.8527283668518066, "learning_rate": 2.1306995395811673e-05, "loss": 0.0724, "step": 19510 }, { "epoch": 1.449576711718402, "grad_norm": 1.6812669038772583, "learning_rate": 2.1302539729689592e-05, "loss": 0.0959, "step": 19520 }, { "epoch": 1.4503193227387494, "grad_norm": 1.0494896173477173, "learning_rate": 2.1298084063567503e-05, "loss": 0.0544, "step": 19530 }, { "epoch": 1.4510619337590969, "grad_norm": 0.5817059874534607, "learning_rate": 2.129362839744542e-05, "loss": 0.0808, "step": 19540 }, { "epoch": 1.4518045447794445, "grad_norm": 1.512558102607727, "learning_rate": 2.1289172731323333e-05, "loss": 0.0952, "step": 19550 }, { "epoch": 1.452547155799792, "grad_norm": 4.8917694091796875, "learning_rate": 2.128471706520125e-05, "loss": 0.0987, "step": 19560 }, { "epoch": 1.4532897668201397, "grad_norm": 2.3295490741729736, "learning_rate": 2.1280261399079163e-05, "loss": 0.0872, "step": 19570 }, { "epoch": 1.4540323778404871, "grad_norm": 1.3101624250411987, "learning_rate": 2.1275805732957075e-05, "loss": 0.0484, "step": 19580 }, { "epoch": 1.4547749888608346, "grad_norm": 3.150947332382202, "learning_rate": 2.1271350066834993e-05, "loss": 0.0876, "step": 19590 }, { "epoch": 1.4555175998811822, "grad_norm": 1.6120469570159912, "learning_rate": 2.1266894400712908e-05, "loss": 0.0806, "step": 19600 }, { "epoch": 1.45626021090153, "grad_norm": 1.210063099861145, "learning_rate": 2.126243873459082e-05, "loss": 0.0542, "step": 19610 }, { "epoch": 1.4570028219218774, "grad_norm": 2.553395986557007, "learning_rate": 2.1257983068468738e-05, "loss": 0.0743, "step": 19620 }, { "epoch": 1.4577454329422248, "grad_norm": 2.5680768489837646, "learning_rate": 2.125352740234665e-05, "loss": 0.0764, "step": 19630 }, { "epoch": 1.4584880439625725, "grad_norm": 1.1068662405014038, "learning_rate": 2.1249071736224565e-05, "loss": 0.0778, "step": 19640 }, { "epoch": 1.45923065498292, "grad_norm": 2.1891543865203857, "learning_rate": 2.1244616070102483e-05, "loss": 0.0654, "step": 19650 }, { "epoch": 1.4599732660032676, "grad_norm": 1.6965099573135376, "learning_rate": 2.1240160403980395e-05, "loss": 0.0634, "step": 19660 }, { "epoch": 1.460715877023615, "grad_norm": 2.3436694145202637, "learning_rate": 2.123570473785831e-05, "loss": 0.098, "step": 19670 }, { "epoch": 1.4614584880439625, "grad_norm": 0.8156054615974426, "learning_rate": 2.1231249071736225e-05, "loss": 0.0676, "step": 19680 }, { "epoch": 1.4622010990643102, "grad_norm": 1.2002021074295044, "learning_rate": 2.122679340561414e-05, "loss": 0.082, "step": 19690 }, { "epoch": 1.4629437100846576, "grad_norm": 1.8691362142562866, "learning_rate": 2.1222337739492055e-05, "loss": 0.0487, "step": 19700 }, { "epoch": 1.4636863211050053, "grad_norm": 2.66479229927063, "learning_rate": 2.121788207336997e-05, "loss": 0.0784, "step": 19710 }, { "epoch": 1.4644289321253527, "grad_norm": 1.1679737567901611, "learning_rate": 2.1213426407247885e-05, "loss": 0.0645, "step": 19720 }, { "epoch": 1.4651715431457002, "grad_norm": 1.454779028892517, "learning_rate": 2.12089707411258e-05, "loss": 0.0739, "step": 19730 }, { "epoch": 1.4659141541660479, "grad_norm": 2.86212420463562, "learning_rate": 2.120451507500371e-05, "loss": 0.0479, "step": 19740 }, { "epoch": 1.4666567651863953, "grad_norm": 1.7495118379592896, "learning_rate": 2.1200059408881626e-05, "loss": 0.0796, "step": 19750 }, { "epoch": 1.467399376206743, "grad_norm": 1.2656298875808716, "learning_rate": 2.1195603742759545e-05, "loss": 0.079, "step": 19760 }, { "epoch": 1.4681419872270904, "grad_norm": 0.7113642692565918, "learning_rate": 2.1191148076637456e-05, "loss": 0.0707, "step": 19770 }, { "epoch": 1.468884598247438, "grad_norm": 0.5482021570205688, "learning_rate": 2.118669241051537e-05, "loss": 0.0678, "step": 19780 }, { "epoch": 1.4696272092677856, "grad_norm": 2.1381452083587646, "learning_rate": 2.118223674439329e-05, "loss": 0.0847, "step": 19790 }, { "epoch": 1.470369820288133, "grad_norm": 1.8469703197479248, "learning_rate": 2.11777810782712e-05, "loss": 0.0654, "step": 19800 }, { "epoch": 1.4711124313084807, "grad_norm": 1.900571346282959, "learning_rate": 2.1173325412149116e-05, "loss": 0.05, "step": 19810 }, { "epoch": 1.4718550423288281, "grad_norm": 0.9532872438430786, "learning_rate": 2.116886974602703e-05, "loss": 0.0535, "step": 19820 }, { "epoch": 1.4725976533491756, "grad_norm": 1.2694770097732544, "learning_rate": 2.1164414079904946e-05, "loss": 0.0907, "step": 19830 }, { "epoch": 1.4733402643695233, "grad_norm": 0.9396808743476868, "learning_rate": 2.115995841378286e-05, "loss": 0.0592, "step": 19840 }, { "epoch": 1.4740828753898707, "grad_norm": 1.6375855207443237, "learning_rate": 2.1155502747660773e-05, "loss": 0.0707, "step": 19850 }, { "epoch": 1.4748254864102184, "grad_norm": 2.0587351322174072, "learning_rate": 2.115104708153869e-05, "loss": 0.084, "step": 19860 }, { "epoch": 1.4755680974305658, "grad_norm": 0.8906083106994629, "learning_rate": 2.1146591415416606e-05, "loss": 0.078, "step": 19870 }, { "epoch": 1.4763107084509133, "grad_norm": 0.7745434045791626, "learning_rate": 2.1142135749294518e-05, "loss": 0.0601, "step": 19880 }, { "epoch": 1.477053319471261, "grad_norm": 1.4954042434692383, "learning_rate": 2.1137680083172436e-05, "loss": 0.0797, "step": 19890 }, { "epoch": 1.4777959304916086, "grad_norm": 1.5005775690078735, "learning_rate": 2.113322441705035e-05, "loss": 0.0563, "step": 19900 }, { "epoch": 1.478538541511956, "grad_norm": 1.535308837890625, "learning_rate": 2.1128768750928263e-05, "loss": 0.0555, "step": 19910 }, { "epoch": 1.4792811525323035, "grad_norm": 1.6805989742279053, "learning_rate": 2.1124313084806178e-05, "loss": 0.0413, "step": 19920 }, { "epoch": 1.4800237635526512, "grad_norm": 1.622406244277954, "learning_rate": 2.1119857418684096e-05, "loss": 0.0634, "step": 19930 }, { "epoch": 1.4807663745729986, "grad_norm": 1.6346774101257324, "learning_rate": 2.1115401752562008e-05, "loss": 0.0793, "step": 19940 }, { "epoch": 1.4815089855933463, "grad_norm": 2.6986865997314453, "learning_rate": 2.1110946086439923e-05, "loss": 0.0689, "step": 19950 }, { "epoch": 1.4822515966136938, "grad_norm": 2.2464749813079834, "learning_rate": 2.1106490420317838e-05, "loss": 0.0736, "step": 19960 }, { "epoch": 1.4829942076340412, "grad_norm": 0.5846890211105347, "learning_rate": 2.1102034754195753e-05, "loss": 0.055, "step": 19970 }, { "epoch": 1.483736818654389, "grad_norm": 2.5977261066436768, "learning_rate": 2.1097579088073668e-05, "loss": 0.0637, "step": 19980 }, { "epoch": 1.4844794296747363, "grad_norm": 2.5025405883789062, "learning_rate": 2.109312342195158e-05, "loss": 0.0687, "step": 19990 }, { "epoch": 1.485222040695084, "grad_norm": 0.5519008040428162, "learning_rate": 2.1088667755829498e-05, "loss": 0.0369, "step": 20000 }, { "epoch": 1.4859646517154315, "grad_norm": 2.253185272216797, "learning_rate": 2.1084212089707413e-05, "loss": 0.0943, "step": 20010 }, { "epoch": 1.486707262735779, "grad_norm": 1.709266185760498, "learning_rate": 2.1079756423585325e-05, "loss": 0.0721, "step": 20020 }, { "epoch": 1.4874498737561266, "grad_norm": 1.548275351524353, "learning_rate": 2.1075300757463243e-05, "loss": 0.0819, "step": 20030 }, { "epoch": 1.488192484776474, "grad_norm": 0.7985262274742126, "learning_rate": 2.1070845091341158e-05, "loss": 0.0646, "step": 20040 }, { "epoch": 1.4889350957968217, "grad_norm": 2.2757515907287598, "learning_rate": 2.106638942521907e-05, "loss": 0.0766, "step": 20050 }, { "epoch": 1.4896777068171692, "grad_norm": 1.4162112474441528, "learning_rate": 2.1061933759096988e-05, "loss": 0.0765, "step": 20060 }, { "epoch": 1.4904203178375166, "grad_norm": 2.37214994430542, "learning_rate": 2.10574780929749e-05, "loss": 0.0794, "step": 20070 }, { "epoch": 1.4911629288578643, "grad_norm": 2.4197685718536377, "learning_rate": 2.1053022426852814e-05, "loss": 0.1291, "step": 20080 }, { "epoch": 1.4919055398782117, "grad_norm": 1.205228328704834, "learning_rate": 2.104856676073073e-05, "loss": 0.1016, "step": 20090 }, { "epoch": 1.4926481508985594, "grad_norm": 0.3539555072784424, "learning_rate": 2.1044111094608644e-05, "loss": 0.0769, "step": 20100 }, { "epoch": 1.4933907619189069, "grad_norm": 0.9795430302619934, "learning_rate": 2.103965542848656e-05, "loss": 0.0789, "step": 20110 }, { "epoch": 1.4941333729392543, "grad_norm": 2.009331703186035, "learning_rate": 2.1035199762364474e-05, "loss": 0.0557, "step": 20120 }, { "epoch": 1.494875983959602, "grad_norm": 2.3191285133361816, "learning_rate": 2.103074409624239e-05, "loss": 0.065, "step": 20130 }, { "epoch": 1.4956185949799494, "grad_norm": 2.4035158157348633, "learning_rate": 2.1026288430120304e-05, "loss": 0.0668, "step": 20140 }, { "epoch": 1.496361206000297, "grad_norm": 1.0998272895812988, "learning_rate": 2.1021832763998216e-05, "loss": 0.0536, "step": 20150 }, { "epoch": 1.4971038170206445, "grad_norm": 3.933292865753174, "learning_rate": 2.101737709787613e-05, "loss": 0.1027, "step": 20160 }, { "epoch": 1.497846428040992, "grad_norm": 0.8710070848464966, "learning_rate": 2.101292143175405e-05, "loss": 0.0521, "step": 20170 }, { "epoch": 1.4985890390613397, "grad_norm": 3.407097816467285, "learning_rate": 2.100846576563196e-05, "loss": 0.0823, "step": 20180 }, { "epoch": 1.4993316500816873, "grad_norm": 2.3524112701416016, "learning_rate": 2.1004010099509876e-05, "loss": 0.1098, "step": 20190 }, { "epoch": 1.5000742611020348, "grad_norm": 2.8110392093658447, "learning_rate": 2.0999554433387794e-05, "loss": 0.0903, "step": 20200 }, { "epoch": 1.5008168721223822, "grad_norm": 2.2008228302001953, "learning_rate": 2.0995098767265706e-05, "loss": 0.0546, "step": 20210 }, { "epoch": 1.5015594831427297, "grad_norm": 0.38597363233566284, "learning_rate": 2.099064310114362e-05, "loss": 0.049, "step": 20220 }, { "epoch": 1.5023020941630774, "grad_norm": 1.819575309753418, "learning_rate": 2.098618743502154e-05, "loss": 0.0668, "step": 20230 }, { "epoch": 1.503044705183425, "grad_norm": 1.7330279350280762, "learning_rate": 2.098173176889945e-05, "loss": 0.0746, "step": 20240 }, { "epoch": 1.5037873162037725, "grad_norm": 1.4571242332458496, "learning_rate": 2.0977276102777366e-05, "loss": 0.0804, "step": 20250 }, { "epoch": 1.50452992722412, "grad_norm": 0.8865562677383423, "learning_rate": 2.0972820436655278e-05, "loss": 0.08, "step": 20260 }, { "epoch": 1.5052725382444674, "grad_norm": 1.0869084596633911, "learning_rate": 2.0968364770533196e-05, "loss": 0.0526, "step": 20270 }, { "epoch": 1.506015149264815, "grad_norm": 1.397093653678894, "learning_rate": 2.096390910441111e-05, "loss": 0.0899, "step": 20280 }, { "epoch": 1.5067577602851627, "grad_norm": 1.4997838735580444, "learning_rate": 2.0959453438289023e-05, "loss": 0.0929, "step": 20290 }, { "epoch": 1.5075003713055102, "grad_norm": 2.05659556388855, "learning_rate": 2.095499777216694e-05, "loss": 0.0715, "step": 20300 }, { "epoch": 1.5082429823258576, "grad_norm": 2.342632293701172, "learning_rate": 2.0950542106044856e-05, "loss": 0.0825, "step": 20310 }, { "epoch": 1.5089855933462053, "grad_norm": 2.975311040878296, "learning_rate": 2.0946086439922768e-05, "loss": 0.0916, "step": 20320 }, { "epoch": 1.5097282043665528, "grad_norm": 2.030165672302246, "learning_rate": 2.0941630773800683e-05, "loss": 0.0953, "step": 20330 }, { "epoch": 1.5104708153869004, "grad_norm": 1.042168378829956, "learning_rate": 2.09371751076786e-05, "loss": 0.0659, "step": 20340 }, { "epoch": 1.5112134264072479, "grad_norm": 1.342262864112854, "learning_rate": 2.0932719441556513e-05, "loss": 0.0727, "step": 20350 }, { "epoch": 1.5119560374275953, "grad_norm": 1.2941288948059082, "learning_rate": 2.0928263775434428e-05, "loss": 0.0511, "step": 20360 }, { "epoch": 1.512698648447943, "grad_norm": 3.6500983238220215, "learning_rate": 2.0923808109312343e-05, "loss": 0.1324, "step": 20370 }, { "epoch": 1.5134412594682907, "grad_norm": 1.0113955736160278, "learning_rate": 2.0919352443190258e-05, "loss": 0.0696, "step": 20380 }, { "epoch": 1.5141838704886381, "grad_norm": 1.395719289779663, "learning_rate": 2.0914896777068173e-05, "loss": 0.0683, "step": 20390 }, { "epoch": 1.5149264815089856, "grad_norm": 1.7119672298431396, "learning_rate": 2.0910441110946084e-05, "loss": 0.062, "step": 20400 }, { "epoch": 1.515669092529333, "grad_norm": 1.9944767951965332, "learning_rate": 2.0905985444824003e-05, "loss": 0.0649, "step": 20410 }, { "epoch": 1.5164117035496807, "grad_norm": 1.1254253387451172, "learning_rate": 2.0901529778701917e-05, "loss": 0.0545, "step": 20420 }, { "epoch": 1.5171543145700284, "grad_norm": 2.864976406097412, "learning_rate": 2.089707411257983e-05, "loss": 0.0984, "step": 20430 }, { "epoch": 1.5178969255903758, "grad_norm": 2.1159725189208984, "learning_rate": 2.0892618446457747e-05, "loss": 0.06, "step": 20440 }, { "epoch": 1.5186395366107233, "grad_norm": 0.9338457584381104, "learning_rate": 2.0888162780335662e-05, "loss": 0.0659, "step": 20450 }, { "epoch": 1.5193821476310707, "grad_norm": 1.4100627899169922, "learning_rate": 2.0883707114213574e-05, "loss": 0.0716, "step": 20460 }, { "epoch": 1.5201247586514184, "grad_norm": 1.341646432876587, "learning_rate": 2.0879251448091492e-05, "loss": 0.0737, "step": 20470 }, { "epoch": 1.520867369671766, "grad_norm": 3.197697401046753, "learning_rate": 2.0874795781969404e-05, "loss": 0.0897, "step": 20480 }, { "epoch": 1.5216099806921135, "grad_norm": 0.8237298727035522, "learning_rate": 2.087034011584732e-05, "loss": 0.0553, "step": 20490 }, { "epoch": 1.522352591712461, "grad_norm": 1.78400456905365, "learning_rate": 2.0865884449725234e-05, "loss": 0.0745, "step": 20500 }, { "epoch": 1.5230952027328084, "grad_norm": 2.6086509227752686, "learning_rate": 2.086142878360315e-05, "loss": 0.1298, "step": 20510 }, { "epoch": 1.523837813753156, "grad_norm": 1.0803875923156738, "learning_rate": 2.0856973117481064e-05, "loss": 0.0673, "step": 20520 }, { "epoch": 1.5245804247735038, "grad_norm": 0.390265554189682, "learning_rate": 2.085251745135898e-05, "loss": 0.0434, "step": 20530 }, { "epoch": 1.5253230357938512, "grad_norm": 1.218558669090271, "learning_rate": 2.0848061785236894e-05, "loss": 0.0666, "step": 20540 }, { "epoch": 1.5260656468141987, "grad_norm": 1.0969456434249878, "learning_rate": 2.084360611911481e-05, "loss": 0.095, "step": 20550 }, { "epoch": 1.526808257834546, "grad_norm": 1.8818721771240234, "learning_rate": 2.0839150452992724e-05, "loss": 0.0808, "step": 20560 }, { "epoch": 1.5275508688548938, "grad_norm": 2.201045036315918, "learning_rate": 2.0834694786870636e-05, "loss": 0.1017, "step": 20570 }, { "epoch": 1.5282934798752414, "grad_norm": 1.147294521331787, "learning_rate": 2.0830239120748554e-05, "loss": 0.0744, "step": 20580 }, { "epoch": 1.529036090895589, "grad_norm": 0.8210429549217224, "learning_rate": 2.0825783454626466e-05, "loss": 0.0528, "step": 20590 }, { "epoch": 1.5297787019159363, "grad_norm": 3.2045974731445312, "learning_rate": 2.082132778850438e-05, "loss": 0.0918, "step": 20600 }, { "epoch": 1.530521312936284, "grad_norm": 3.5814733505249023, "learning_rate": 2.08168721223823e-05, "loss": 0.0625, "step": 20610 }, { "epoch": 1.5312639239566315, "grad_norm": 0.8140150308609009, "learning_rate": 2.081241645626021e-05, "loss": 0.1008, "step": 20620 }, { "epoch": 1.5320065349769791, "grad_norm": 2.3892927169799805, "learning_rate": 2.0807960790138126e-05, "loss": 0.0747, "step": 20630 }, { "epoch": 1.5327491459973266, "grad_norm": 1.1212306022644043, "learning_rate": 2.0803505124016044e-05, "loss": 0.0827, "step": 20640 }, { "epoch": 1.533491757017674, "grad_norm": 1.2150450944900513, "learning_rate": 2.0799049457893956e-05, "loss": 0.056, "step": 20650 }, { "epoch": 1.5342343680380217, "grad_norm": 2.157820463180542, "learning_rate": 2.079459379177187e-05, "loss": 0.0952, "step": 20660 }, { "epoch": 1.5349769790583694, "grad_norm": 2.5032124519348145, "learning_rate": 2.0790138125649782e-05, "loss": 0.0591, "step": 20670 }, { "epoch": 1.5357195900787168, "grad_norm": 1.2760533094406128, "learning_rate": 2.07856824595277e-05, "loss": 0.0746, "step": 20680 }, { "epoch": 1.5364622010990643, "grad_norm": 3.50283145904541, "learning_rate": 2.0781226793405616e-05, "loss": 0.0743, "step": 20690 }, { "epoch": 1.5372048121194117, "grad_norm": 2.669391393661499, "learning_rate": 2.0776771127283527e-05, "loss": 0.0721, "step": 20700 }, { "epoch": 1.5379474231397594, "grad_norm": 3.1741724014282227, "learning_rate": 2.0772315461161446e-05, "loss": 0.0815, "step": 20710 }, { "epoch": 1.538690034160107, "grad_norm": 2.0243866443634033, "learning_rate": 2.076785979503936e-05, "loss": 0.0678, "step": 20720 }, { "epoch": 1.5394326451804545, "grad_norm": 3.2684175968170166, "learning_rate": 2.0763404128917272e-05, "loss": 0.0838, "step": 20730 }, { "epoch": 1.540175256200802, "grad_norm": 1.0323513746261597, "learning_rate": 2.0758948462795187e-05, "loss": 0.0892, "step": 20740 }, { "epoch": 1.5409178672211494, "grad_norm": 0.852678656578064, "learning_rate": 2.0754492796673106e-05, "loss": 0.0938, "step": 20750 }, { "epoch": 1.541660478241497, "grad_norm": 1.296675205230713, "learning_rate": 2.0750037130551017e-05, "loss": 0.0761, "step": 20760 }, { "epoch": 1.5424030892618448, "grad_norm": 0.6336604952812195, "learning_rate": 2.0745581464428932e-05, "loss": 0.0521, "step": 20770 }, { "epoch": 1.5431457002821922, "grad_norm": 0.9696953296661377, "learning_rate": 2.0741125798306847e-05, "loss": 0.0773, "step": 20780 }, { "epoch": 1.5438883113025397, "grad_norm": 2.513984203338623, "learning_rate": 2.0736670132184762e-05, "loss": 0.0938, "step": 20790 }, { "epoch": 1.5446309223228871, "grad_norm": 1.3726938962936401, "learning_rate": 2.0732214466062677e-05, "loss": 0.0799, "step": 20800 }, { "epoch": 1.5453735333432348, "grad_norm": 1.0419371128082275, "learning_rate": 2.0727758799940592e-05, "loss": 0.0765, "step": 20810 }, { "epoch": 1.5461161443635825, "grad_norm": 4.229785442352295, "learning_rate": 2.0723303133818507e-05, "loss": 0.0761, "step": 20820 }, { "epoch": 1.54685875538393, "grad_norm": 3.3588624000549316, "learning_rate": 2.0718847467696422e-05, "loss": 0.0683, "step": 20830 }, { "epoch": 1.5476013664042774, "grad_norm": 1.541447401046753, "learning_rate": 2.0714391801574334e-05, "loss": 0.0785, "step": 20840 }, { "epoch": 1.5483439774246248, "grad_norm": 1.4682326316833496, "learning_rate": 2.0709936135452252e-05, "loss": 0.056, "step": 20850 }, { "epoch": 1.5490865884449725, "grad_norm": 1.9115557670593262, "learning_rate": 2.0705480469330167e-05, "loss": 0.0658, "step": 20860 }, { "epoch": 1.5498291994653202, "grad_norm": 2.6742310523986816, "learning_rate": 2.070102480320808e-05, "loss": 0.0542, "step": 20870 }, { "epoch": 1.5505718104856676, "grad_norm": 1.7176462411880493, "learning_rate": 2.0696569137085997e-05, "loss": 0.0604, "step": 20880 }, { "epoch": 1.551314421506015, "grad_norm": 2.445446252822876, "learning_rate": 2.069211347096391e-05, "loss": 0.1063, "step": 20890 }, { "epoch": 1.5520570325263627, "grad_norm": 2.274242401123047, "learning_rate": 2.0687657804841824e-05, "loss": 0.0842, "step": 20900 }, { "epoch": 1.5527996435467102, "grad_norm": 0.9645692110061646, "learning_rate": 2.068320213871974e-05, "loss": 0.0487, "step": 20910 }, { "epoch": 1.5535422545670579, "grad_norm": 2.782325506210327, "learning_rate": 2.0678746472597654e-05, "loss": 0.0783, "step": 20920 }, { "epoch": 1.5542848655874053, "grad_norm": 2.1064939498901367, "learning_rate": 2.067429080647557e-05, "loss": 0.0555, "step": 20930 }, { "epoch": 1.5550274766077528, "grad_norm": 2.168714761734009, "learning_rate": 2.0669835140353484e-05, "loss": 0.0864, "step": 20940 }, { "epoch": 1.5557700876281004, "grad_norm": 1.001583456993103, "learning_rate": 2.06653794742314e-05, "loss": 0.0761, "step": 20950 }, { "epoch": 1.556512698648448, "grad_norm": 1.3306684494018555, "learning_rate": 2.0660923808109314e-05, "loss": 0.0596, "step": 20960 }, { "epoch": 1.5572553096687956, "grad_norm": 0.9188536405563354, "learning_rate": 2.065646814198723e-05, "loss": 0.0653, "step": 20970 }, { "epoch": 1.557997920689143, "grad_norm": 1.159742832183838, "learning_rate": 2.065201247586514e-05, "loss": 0.0942, "step": 20980 }, { "epoch": 1.5587405317094905, "grad_norm": 3.285846471786499, "learning_rate": 2.064755680974306e-05, "loss": 0.0673, "step": 20990 }, { "epoch": 1.5594831427298381, "grad_norm": 1.9212596416473389, "learning_rate": 2.064310114362097e-05, "loss": 0.0477, "step": 21000 }, { "epoch": 1.5602257537501858, "grad_norm": 1.4942169189453125, "learning_rate": 2.0638645477498885e-05, "loss": 0.0735, "step": 21010 }, { "epoch": 1.5609683647705332, "grad_norm": 2.1586103439331055, "learning_rate": 2.0634189811376804e-05, "loss": 0.0594, "step": 21020 }, { "epoch": 1.5617109757908807, "grad_norm": 0.46512606739997864, "learning_rate": 2.0629734145254715e-05, "loss": 0.0662, "step": 21030 }, { "epoch": 1.5624535868112281, "grad_norm": 1.9914196729660034, "learning_rate": 2.062527847913263e-05, "loss": 0.0415, "step": 21040 }, { "epoch": 1.5631961978315758, "grad_norm": 0.8854469060897827, "learning_rate": 2.062082281301055e-05, "loss": 0.0776, "step": 21050 }, { "epoch": 1.5639388088519235, "grad_norm": 0.9890866875648499, "learning_rate": 2.061636714688846e-05, "loss": 0.0666, "step": 21060 }, { "epoch": 1.564681419872271, "grad_norm": 1.0312457084655762, "learning_rate": 2.0611911480766375e-05, "loss": 0.0777, "step": 21070 }, { "epoch": 1.5654240308926184, "grad_norm": 1.1136667728424072, "learning_rate": 2.0607455814644287e-05, "loss": 0.0548, "step": 21080 }, { "epoch": 1.5661666419129658, "grad_norm": 1.6554890871047974, "learning_rate": 2.0603000148522205e-05, "loss": 0.0659, "step": 21090 }, { "epoch": 1.5669092529333135, "grad_norm": 1.3619014024734497, "learning_rate": 2.059854448240012e-05, "loss": 0.0773, "step": 21100 }, { "epoch": 1.5676518639536612, "grad_norm": 2.3610949516296387, "learning_rate": 2.0594088816278032e-05, "loss": 0.0513, "step": 21110 }, { "epoch": 1.5683944749740086, "grad_norm": 1.5495983362197876, "learning_rate": 2.058963315015595e-05, "loss": 0.0767, "step": 21120 }, { "epoch": 1.569137085994356, "grad_norm": 0.9158768653869629, "learning_rate": 2.0585177484033865e-05, "loss": 0.0831, "step": 21130 }, { "epoch": 1.5698796970147035, "grad_norm": 0.8416073322296143, "learning_rate": 2.0580721817911777e-05, "loss": 0.0824, "step": 21140 }, { "epoch": 1.5706223080350512, "grad_norm": 4.620311737060547, "learning_rate": 2.0576266151789692e-05, "loss": 0.0638, "step": 21150 }, { "epoch": 1.5713649190553989, "grad_norm": 1.3405332565307617, "learning_rate": 2.057181048566761e-05, "loss": 0.0829, "step": 21160 }, { "epoch": 1.5721075300757463, "grad_norm": 1.7006781101226807, "learning_rate": 2.0567354819545522e-05, "loss": 0.0619, "step": 21170 }, { "epoch": 1.5728501410960938, "grad_norm": 2.3096494674682617, "learning_rate": 2.0562899153423437e-05, "loss": 0.069, "step": 21180 }, { "epoch": 1.5735927521164415, "grad_norm": 1.4846018552780151, "learning_rate": 2.0558443487301352e-05, "loss": 0.0623, "step": 21190 }, { "epoch": 1.574335363136789, "grad_norm": 1.648088812828064, "learning_rate": 2.0553987821179267e-05, "loss": 0.0544, "step": 21200 }, { "epoch": 1.5750779741571366, "grad_norm": 1.0111830234527588, "learning_rate": 2.0549532155057182e-05, "loss": 0.0544, "step": 21210 }, { "epoch": 1.575820585177484, "grad_norm": 3.3681936264038086, "learning_rate": 2.0545076488935097e-05, "loss": 0.0648, "step": 21220 }, { "epoch": 1.5765631961978315, "grad_norm": 2.3970768451690674, "learning_rate": 2.0540620822813012e-05, "loss": 0.0545, "step": 21230 }, { "epoch": 1.5773058072181791, "grad_norm": 3.242074489593506, "learning_rate": 2.0536165156690927e-05, "loss": 0.0729, "step": 21240 }, { "epoch": 1.5780484182385268, "grad_norm": 0.41246843338012695, "learning_rate": 2.0531709490568838e-05, "loss": 0.0549, "step": 21250 }, { "epoch": 1.5787910292588743, "grad_norm": 1.8131650686264038, "learning_rate": 2.0527253824446757e-05, "loss": 0.0633, "step": 21260 }, { "epoch": 1.5795336402792217, "grad_norm": 1.4795677661895752, "learning_rate": 2.052279815832467e-05, "loss": 0.0668, "step": 21270 }, { "epoch": 1.5802762512995692, "grad_norm": 0.927021324634552, "learning_rate": 2.0518342492202583e-05, "loss": 0.0673, "step": 21280 }, { "epoch": 1.5810188623199168, "grad_norm": 3.456859827041626, "learning_rate": 2.05138868260805e-05, "loss": 0.0764, "step": 21290 }, { "epoch": 1.5817614733402645, "grad_norm": 2.829115390777588, "learning_rate": 2.0509431159958413e-05, "loss": 0.0865, "step": 21300 }, { "epoch": 1.582504084360612, "grad_norm": 3.3823864459991455, "learning_rate": 2.0504975493836328e-05, "loss": 0.0578, "step": 21310 }, { "epoch": 1.5832466953809594, "grad_norm": 1.5947513580322266, "learning_rate": 2.0500519827714243e-05, "loss": 0.0997, "step": 21320 }, { "epoch": 1.5839893064013069, "grad_norm": 2.1573078632354736, "learning_rate": 2.0496064161592158e-05, "loss": 0.0788, "step": 21330 }, { "epoch": 1.5847319174216545, "grad_norm": 0.3694283962249756, "learning_rate": 2.0491608495470073e-05, "loss": 0.0275, "step": 21340 }, { "epoch": 1.5854745284420022, "grad_norm": 1.0279000997543335, "learning_rate": 2.0487152829347988e-05, "loss": 0.0967, "step": 21350 }, { "epoch": 1.5862171394623497, "grad_norm": 1.637056827545166, "learning_rate": 2.0482697163225903e-05, "loss": 0.0605, "step": 21360 }, { "epoch": 1.586959750482697, "grad_norm": 1.0048965215682983, "learning_rate": 2.0478241497103818e-05, "loss": 0.0656, "step": 21370 }, { "epoch": 1.5877023615030446, "grad_norm": 1.906299114227295, "learning_rate": 2.0473785830981733e-05, "loss": 0.0811, "step": 21380 }, { "epoch": 1.5884449725233922, "grad_norm": 3.761151075363159, "learning_rate": 2.0469330164859645e-05, "loss": 0.0829, "step": 21390 }, { "epoch": 1.58918758354374, "grad_norm": 3.385910749435425, "learning_rate": 2.0464874498737563e-05, "loss": 0.0476, "step": 21400 }, { "epoch": 1.5899301945640874, "grad_norm": 1.3071726560592651, "learning_rate": 2.0460418832615475e-05, "loss": 0.0931, "step": 21410 }, { "epoch": 1.5906728055844348, "grad_norm": 1.4268453121185303, "learning_rate": 2.045596316649339e-05, "loss": 0.1017, "step": 21420 }, { "epoch": 1.5914154166047823, "grad_norm": 0.96445232629776, "learning_rate": 2.0451507500371308e-05, "loss": 0.0711, "step": 21430 }, { "epoch": 1.59215802762513, "grad_norm": 1.8206923007965088, "learning_rate": 2.044705183424922e-05, "loss": 0.0682, "step": 21440 }, { "epoch": 1.5929006386454776, "grad_norm": 1.7625178098678589, "learning_rate": 2.0442596168127135e-05, "loss": 0.0502, "step": 21450 }, { "epoch": 1.593643249665825, "grad_norm": 1.0990653038024902, "learning_rate": 2.0438140502005053e-05, "loss": 0.0705, "step": 21460 }, { "epoch": 1.5943858606861725, "grad_norm": 0.8799698352813721, "learning_rate": 2.0433684835882965e-05, "loss": 0.0513, "step": 21470 }, { "epoch": 1.5951284717065202, "grad_norm": 1.2714344263076782, "learning_rate": 2.042922916976088e-05, "loss": 0.0877, "step": 21480 }, { "epoch": 1.5958710827268676, "grad_norm": 0.9506982564926147, "learning_rate": 2.0424773503638795e-05, "loss": 0.1113, "step": 21490 }, { "epoch": 1.5966136937472153, "grad_norm": 1.751642107963562, "learning_rate": 2.042031783751671e-05, "loss": 0.08, "step": 21500 }, { "epoch": 1.5973563047675627, "grad_norm": 3.1982038021087646, "learning_rate": 2.0415862171394625e-05, "loss": 0.074, "step": 21510 }, { "epoch": 1.5980989157879102, "grad_norm": 1.0957239866256714, "learning_rate": 2.0411406505272536e-05, "loss": 0.059, "step": 21520 }, { "epoch": 1.5988415268082579, "grad_norm": 0.893408477306366, "learning_rate": 2.0406950839150455e-05, "loss": 0.0789, "step": 21530 }, { "epoch": 1.5995841378286055, "grad_norm": 1.416986346244812, "learning_rate": 2.040249517302837e-05, "loss": 0.0941, "step": 21540 }, { "epoch": 1.600326748848953, "grad_norm": 0.7409796118736267, "learning_rate": 2.039803950690628e-05, "loss": 0.0687, "step": 21550 }, { "epoch": 1.6010693598693004, "grad_norm": 1.517598271369934, "learning_rate": 2.0393583840784196e-05, "loss": 0.0638, "step": 21560 }, { "epoch": 1.6018119708896479, "grad_norm": 1.0811076164245605, "learning_rate": 2.0389128174662115e-05, "loss": 0.0784, "step": 21570 }, { "epoch": 1.6025545819099956, "grad_norm": 3.014960765838623, "learning_rate": 2.0384672508540026e-05, "loss": 0.0529, "step": 21580 }, { "epoch": 1.6032971929303432, "grad_norm": 4.6855669021606445, "learning_rate": 2.038021684241794e-05, "loss": 0.0614, "step": 21590 }, { "epoch": 1.6040398039506907, "grad_norm": 2.0930423736572266, "learning_rate": 2.0375761176295856e-05, "loss": 0.0794, "step": 21600 }, { "epoch": 1.6047824149710381, "grad_norm": 2.3684639930725098, "learning_rate": 2.037130551017377e-05, "loss": 0.0806, "step": 21610 }, { "epoch": 1.6055250259913856, "grad_norm": 2.803929090499878, "learning_rate": 2.0366849844051686e-05, "loss": 0.1052, "step": 21620 }, { "epoch": 1.6062676370117333, "grad_norm": 1.4284909963607788, "learning_rate": 2.03623941779296e-05, "loss": 0.078, "step": 21630 }, { "epoch": 1.607010248032081, "grad_norm": 1.5730488300323486, "learning_rate": 2.0357938511807516e-05, "loss": 0.0623, "step": 21640 }, { "epoch": 1.6077528590524284, "grad_norm": 1.4216201305389404, "learning_rate": 2.035348284568543e-05, "loss": 0.0399, "step": 21650 }, { "epoch": 1.6084954700727758, "grad_norm": 1.4788250923156738, "learning_rate": 2.0349027179563343e-05, "loss": 0.0619, "step": 21660 }, { "epoch": 1.6092380810931233, "grad_norm": 1.5001707077026367, "learning_rate": 2.034457151344126e-05, "loss": 0.0683, "step": 21670 }, { "epoch": 1.609980692113471, "grad_norm": 2.592287063598633, "learning_rate": 2.0340115847319176e-05, "loss": 0.0971, "step": 21680 }, { "epoch": 1.6107233031338186, "grad_norm": 2.6354775428771973, "learning_rate": 2.0335660181197088e-05, "loss": 0.0832, "step": 21690 }, { "epoch": 1.611465914154166, "grad_norm": 0.5848486423492432, "learning_rate": 2.0331204515075006e-05, "loss": 0.066, "step": 21700 }, { "epoch": 1.6122085251745135, "grad_norm": 2.4620141983032227, "learning_rate": 2.0326748848952918e-05, "loss": 0.0688, "step": 21710 }, { "epoch": 1.612951136194861, "grad_norm": 0.8085265755653381, "learning_rate": 2.0322293182830833e-05, "loss": 0.0639, "step": 21720 }, { "epoch": 1.6136937472152086, "grad_norm": 2.16105580329895, "learning_rate": 2.0317837516708748e-05, "loss": 0.111, "step": 21730 }, { "epoch": 1.6144363582355563, "grad_norm": 2.140782356262207, "learning_rate": 2.0313381850586663e-05, "loss": 0.0902, "step": 21740 }, { "epoch": 1.6151789692559038, "grad_norm": 1.3173938989639282, "learning_rate": 2.0308926184464578e-05, "loss": 0.0528, "step": 21750 }, { "epoch": 1.6159215802762512, "grad_norm": 1.9319645166397095, "learning_rate": 2.0304470518342493e-05, "loss": 0.061, "step": 21760 }, { "epoch": 1.6166641912965989, "grad_norm": 1.2936400175094604, "learning_rate": 2.0300014852220408e-05, "loss": 0.0601, "step": 21770 }, { "epoch": 1.6174068023169463, "grad_norm": 0.23843184113502502, "learning_rate": 2.0295559186098323e-05, "loss": 0.0479, "step": 21780 }, { "epoch": 1.618149413337294, "grad_norm": 5.628422260284424, "learning_rate": 2.0291103519976238e-05, "loss": 0.0611, "step": 21790 }, { "epoch": 1.6188920243576415, "grad_norm": 1.189815640449524, "learning_rate": 2.028664785385415e-05, "loss": 0.0843, "step": 21800 }, { "epoch": 1.619634635377989, "grad_norm": 0.9391959309577942, "learning_rate": 2.0282192187732068e-05, "loss": 0.0786, "step": 21810 }, { "epoch": 1.6203772463983366, "grad_norm": 2.154853582382202, "learning_rate": 2.027773652160998e-05, "loss": 0.0816, "step": 21820 }, { "epoch": 1.6211198574186843, "grad_norm": 2.5236504077911377, "learning_rate": 2.0273280855487894e-05, "loss": 0.0779, "step": 21830 }, { "epoch": 1.6218624684390317, "grad_norm": 1.4820054769515991, "learning_rate": 2.0268825189365813e-05, "loss": 0.071, "step": 21840 }, { "epoch": 1.6226050794593792, "grad_norm": 0.6480633616447449, "learning_rate": 2.0264369523243724e-05, "loss": 0.0475, "step": 21850 }, { "epoch": 1.6233476904797266, "grad_norm": 2.3438937664031982, "learning_rate": 2.025991385712164e-05, "loss": 0.1059, "step": 21860 }, { "epoch": 1.6240903015000743, "grad_norm": 3.0132994651794434, "learning_rate": 2.0255458190999558e-05, "loss": 0.065, "step": 21870 }, { "epoch": 1.624832912520422, "grad_norm": 3.362128496170044, "learning_rate": 2.025100252487747e-05, "loss": 0.0946, "step": 21880 }, { "epoch": 1.6255755235407694, "grad_norm": 2.070509672164917, "learning_rate": 2.0246546858755384e-05, "loss": 0.0744, "step": 21890 }, { "epoch": 1.6263181345611168, "grad_norm": 2.716153383255005, "learning_rate": 2.02420911926333e-05, "loss": 0.0517, "step": 21900 }, { "epoch": 1.6270607455814643, "grad_norm": 0.9877446293830872, "learning_rate": 2.0237635526511214e-05, "loss": 0.0744, "step": 21910 }, { "epoch": 1.627803356601812, "grad_norm": 0.48732122778892517, "learning_rate": 2.023317986038913e-05, "loss": 0.0395, "step": 21920 }, { "epoch": 1.6285459676221596, "grad_norm": 2.898503541946411, "learning_rate": 2.022872419426704e-05, "loss": 0.0776, "step": 21930 }, { "epoch": 1.629288578642507, "grad_norm": 1.4797714948654175, "learning_rate": 2.022426852814496e-05, "loss": 0.0721, "step": 21940 }, { "epoch": 1.6300311896628545, "grad_norm": 2.628574848175049, "learning_rate": 2.0219812862022874e-05, "loss": 0.0751, "step": 21950 }, { "epoch": 1.630773800683202, "grad_norm": 0.8774972558021545, "learning_rate": 2.0215357195900786e-05, "loss": 0.0778, "step": 21960 }, { "epoch": 1.6315164117035497, "grad_norm": 3.261282205581665, "learning_rate": 2.02109015297787e-05, "loss": 0.0896, "step": 21970 }, { "epoch": 1.6322590227238973, "grad_norm": 1.9803194999694824, "learning_rate": 2.020644586365662e-05, "loss": 0.0978, "step": 21980 }, { "epoch": 1.6330016337442448, "grad_norm": 2.566403865814209, "learning_rate": 2.020199019753453e-05, "loss": 0.0794, "step": 21990 }, { "epoch": 1.6337442447645922, "grad_norm": 1.7836480140686035, "learning_rate": 2.0197534531412446e-05, "loss": 0.0688, "step": 22000 }, { "epoch": 1.6344868557849397, "grad_norm": 1.82455575466156, "learning_rate": 2.0193078865290364e-05, "loss": 0.0448, "step": 22010 }, { "epoch": 1.6352294668052874, "grad_norm": 2.849226713180542, "learning_rate": 2.0188623199168276e-05, "loss": 0.0601, "step": 22020 }, { "epoch": 1.635972077825635, "grad_norm": 2.3754022121429443, "learning_rate": 2.018416753304619e-05, "loss": 0.0588, "step": 22030 }, { "epoch": 1.6367146888459825, "grad_norm": 3.1693973541259766, "learning_rate": 2.0179711866924106e-05, "loss": 0.0812, "step": 22040 }, { "epoch": 1.63745729986633, "grad_norm": 1.5568816661834717, "learning_rate": 2.017525620080202e-05, "loss": 0.0859, "step": 22050 }, { "epoch": 1.6381999108866776, "grad_norm": 1.8013701438903809, "learning_rate": 2.0170800534679936e-05, "loss": 0.1001, "step": 22060 }, { "epoch": 1.638942521907025, "grad_norm": 1.7124766111373901, "learning_rate": 2.0166344868557847e-05, "loss": 0.0779, "step": 22070 }, { "epoch": 1.6396851329273727, "grad_norm": 3.0362048149108887, "learning_rate": 2.0161889202435766e-05, "loss": 0.0592, "step": 22080 }, { "epoch": 1.6404277439477202, "grad_norm": 3.313502788543701, "learning_rate": 2.015743353631368e-05, "loss": 0.0933, "step": 22090 }, { "epoch": 1.6411703549680676, "grad_norm": 1.0095113515853882, "learning_rate": 2.0152977870191592e-05, "loss": 0.0747, "step": 22100 }, { "epoch": 1.6419129659884153, "grad_norm": 1.5468275547027588, "learning_rate": 2.014852220406951e-05, "loss": 0.0409, "step": 22110 }, { "epoch": 1.642655577008763, "grad_norm": 4.015323162078857, "learning_rate": 2.0144066537947422e-05, "loss": 0.0768, "step": 22120 }, { "epoch": 1.6433981880291104, "grad_norm": 2.2448365688323975, "learning_rate": 2.0139610871825337e-05, "loss": 0.059, "step": 22130 }, { "epoch": 1.6441407990494579, "grad_norm": 1.0703582763671875, "learning_rate": 2.0135155205703252e-05, "loss": 0.0744, "step": 22140 }, { "epoch": 1.6448834100698053, "grad_norm": 1.5378329753875732, "learning_rate": 2.0130699539581167e-05, "loss": 0.0652, "step": 22150 }, { "epoch": 1.645626021090153, "grad_norm": 3.0120882987976074, "learning_rate": 2.0126243873459082e-05, "loss": 0.0799, "step": 22160 }, { "epoch": 1.6463686321105007, "grad_norm": 2.719409227371216, "learning_rate": 2.0121788207336997e-05, "loss": 0.0785, "step": 22170 }, { "epoch": 1.6471112431308481, "grad_norm": 2.3582966327667236, "learning_rate": 2.0117332541214912e-05, "loss": 0.0735, "step": 22180 }, { "epoch": 1.6478538541511956, "grad_norm": 1.0670444965362549, "learning_rate": 2.0112876875092827e-05, "loss": 0.0701, "step": 22190 }, { "epoch": 1.648596465171543, "grad_norm": 2.2394518852233887, "learning_rate": 2.0108421208970742e-05, "loss": 0.07, "step": 22200 }, { "epoch": 1.6493390761918907, "grad_norm": 1.6190416812896729, "learning_rate": 2.0103965542848657e-05, "loss": 0.0538, "step": 22210 }, { "epoch": 1.6500816872122384, "grad_norm": 3.418266773223877, "learning_rate": 2.0099509876726572e-05, "loss": 0.078, "step": 22220 }, { "epoch": 1.6508242982325858, "grad_norm": 1.8200223445892334, "learning_rate": 2.0095054210604484e-05, "loss": 0.0464, "step": 22230 }, { "epoch": 1.6515669092529333, "grad_norm": 1.1762034893035889, "learning_rate": 2.00905985444824e-05, "loss": 0.1071, "step": 22240 }, { "epoch": 1.6523095202732807, "grad_norm": 2.370851755142212, "learning_rate": 2.0086142878360317e-05, "loss": 0.072, "step": 22250 }, { "epoch": 1.6530521312936284, "grad_norm": 1.7569416761398315, "learning_rate": 2.008168721223823e-05, "loss": 0.0923, "step": 22260 }, { "epoch": 1.653794742313976, "grad_norm": 1.144127368927002, "learning_rate": 2.0077231546116144e-05, "loss": 0.0771, "step": 22270 }, { "epoch": 1.6545373533343235, "grad_norm": 2.696286201477051, "learning_rate": 2.0072775879994062e-05, "loss": 0.0989, "step": 22280 }, { "epoch": 1.655279964354671, "grad_norm": 1.715278148651123, "learning_rate": 2.0068320213871974e-05, "loss": 0.0781, "step": 22290 }, { "epoch": 1.6560225753750184, "grad_norm": 0.7108889818191528, "learning_rate": 2.006386454774989e-05, "loss": 0.0512, "step": 22300 }, { "epoch": 1.656765186395366, "grad_norm": 1.54939603805542, "learning_rate": 2.0059408881627804e-05, "loss": 0.073, "step": 22310 }, { "epoch": 1.6575077974157137, "grad_norm": 1.0879472494125366, "learning_rate": 2.005495321550572e-05, "loss": 0.0884, "step": 22320 }, { "epoch": 1.6582504084360612, "grad_norm": 0.8523910641670227, "learning_rate": 2.0050497549383634e-05, "loss": 0.0762, "step": 22330 }, { "epoch": 1.6589930194564086, "grad_norm": 0.9075714945793152, "learning_rate": 2.0046041883261546e-05, "loss": 0.0792, "step": 22340 }, { "epoch": 1.6597356304767563, "grad_norm": 1.5962119102478027, "learning_rate": 2.0041586217139464e-05, "loss": 0.0665, "step": 22350 }, { "epoch": 1.6604782414971038, "grad_norm": 2.9406886100769043, "learning_rate": 2.003713055101738e-05, "loss": 0.0658, "step": 22360 }, { "epoch": 1.6612208525174514, "grad_norm": 1.0619057416915894, "learning_rate": 2.003267488489529e-05, "loss": 0.0719, "step": 22370 }, { "epoch": 1.661963463537799, "grad_norm": 1.0932631492614746, "learning_rate": 2.0028219218773206e-05, "loss": 0.0747, "step": 22380 }, { "epoch": 1.6627060745581463, "grad_norm": 2.542506217956543, "learning_rate": 2.0023763552651124e-05, "loss": 0.067, "step": 22390 }, { "epoch": 1.663448685578494, "grad_norm": 2.6380186080932617, "learning_rate": 2.0019307886529036e-05, "loss": 0.033, "step": 22400 }, { "epoch": 1.6641912965988417, "grad_norm": 1.5135997533798218, "learning_rate": 2.001485222040695e-05, "loss": 0.0786, "step": 22410 }, { "epoch": 1.6649339076191891, "grad_norm": 0.8384225368499756, "learning_rate": 2.001039655428487e-05, "loss": 0.084, "step": 22420 }, { "epoch": 1.6656765186395366, "grad_norm": 2.0017759799957275, "learning_rate": 2.000594088816278e-05, "loss": 0.0913, "step": 22430 }, { "epoch": 1.666419129659884, "grad_norm": 2.2100701332092285, "learning_rate": 2.0001485222040696e-05, "loss": 0.0485, "step": 22440 }, { "epoch": 1.6671617406802317, "grad_norm": 3.30169939994812, "learning_rate": 1.999702955591861e-05, "loss": 0.0674, "step": 22450 }, { "epoch": 1.6679043517005794, "grad_norm": 0.9708051085472107, "learning_rate": 1.9992573889796525e-05, "loss": 0.0807, "step": 22460 }, { "epoch": 1.6686469627209268, "grad_norm": 0.9182741045951843, "learning_rate": 1.998811822367444e-05, "loss": 0.0432, "step": 22470 }, { "epoch": 1.6693895737412743, "grad_norm": 4.127451419830322, "learning_rate": 1.9983662557552352e-05, "loss": 0.0857, "step": 22480 }, { "epoch": 1.6701321847616217, "grad_norm": 1.7990877628326416, "learning_rate": 1.997920689143027e-05, "loss": 0.0993, "step": 22490 }, { "epoch": 1.6708747957819694, "grad_norm": 3.319918155670166, "learning_rate": 1.9974751225308185e-05, "loss": 0.0492, "step": 22500 }, { "epoch": 1.671617406802317, "grad_norm": 1.4016786813735962, "learning_rate": 1.9970295559186097e-05, "loss": 0.0593, "step": 22510 }, { "epoch": 1.6723600178226645, "grad_norm": 3.1249544620513916, "learning_rate": 1.9965839893064015e-05, "loss": 0.1065, "step": 22520 }, { "epoch": 1.673102628843012, "grad_norm": 1.5883194208145142, "learning_rate": 1.9961384226941927e-05, "loss": 0.083, "step": 22530 }, { "epoch": 1.6738452398633594, "grad_norm": 0.8119624257087708, "learning_rate": 1.9956928560819842e-05, "loss": 0.0768, "step": 22540 }, { "epoch": 1.674587850883707, "grad_norm": 1.9466767311096191, "learning_rate": 1.9952472894697757e-05, "loss": 0.0688, "step": 22550 }, { "epoch": 1.6753304619040548, "grad_norm": 1.9473903179168701, "learning_rate": 1.9948017228575672e-05, "loss": 0.0631, "step": 22560 }, { "epoch": 1.6760730729244022, "grad_norm": 1.485198974609375, "learning_rate": 1.9943561562453587e-05, "loss": 0.0685, "step": 22570 }, { "epoch": 1.6768156839447497, "grad_norm": 1.3554059267044067, "learning_rate": 1.9939105896331502e-05, "loss": 0.0596, "step": 22580 }, { "epoch": 1.6775582949650971, "grad_norm": 2.8601107597351074, "learning_rate": 1.9934650230209417e-05, "loss": 0.0897, "step": 22590 }, { "epoch": 1.6783009059854448, "grad_norm": 0.8527280688285828, "learning_rate": 1.9930194564087332e-05, "loss": 0.0749, "step": 22600 }, { "epoch": 1.6790435170057925, "grad_norm": 1.9120954275131226, "learning_rate": 1.9925738897965247e-05, "loss": 0.0733, "step": 22610 }, { "epoch": 1.67978612802614, "grad_norm": 0.8848724365234375, "learning_rate": 1.9921283231843162e-05, "loss": 0.047, "step": 22620 }, { "epoch": 1.6805287390464874, "grad_norm": 1.2848988771438599, "learning_rate": 1.9916827565721077e-05, "loss": 0.0584, "step": 22630 }, { "epoch": 1.681271350066835, "grad_norm": 1.2438756227493286, "learning_rate": 1.991237189959899e-05, "loss": 0.0784, "step": 22640 }, { "epoch": 1.6820139610871825, "grad_norm": 0.8794949054718018, "learning_rate": 1.9907916233476904e-05, "loss": 0.0759, "step": 22650 }, { "epoch": 1.6827565721075302, "grad_norm": 2.005244255065918, "learning_rate": 1.9903460567354822e-05, "loss": 0.0558, "step": 22660 }, { "epoch": 1.6834991831278776, "grad_norm": 1.2207728624343872, "learning_rate": 1.9899004901232734e-05, "loss": 0.0891, "step": 22670 }, { "epoch": 1.684241794148225, "grad_norm": 1.019566297531128, "learning_rate": 1.989454923511065e-05, "loss": 0.0762, "step": 22680 }, { "epoch": 1.6849844051685727, "grad_norm": 1.250605583190918, "learning_rate": 1.9890093568988567e-05, "loss": 0.0728, "step": 22690 }, { "epoch": 1.6857270161889204, "grad_norm": 2.422374963760376, "learning_rate": 1.988563790286648e-05, "loss": 0.0747, "step": 22700 }, { "epoch": 1.6864696272092679, "grad_norm": 2.8228814601898193, "learning_rate": 1.9881182236744394e-05, "loss": 0.0741, "step": 22710 }, { "epoch": 1.6872122382296153, "grad_norm": 2.183687448501587, "learning_rate": 1.987672657062231e-05, "loss": 0.0659, "step": 22720 }, { "epoch": 1.6879548492499628, "grad_norm": 1.0389907360076904, "learning_rate": 1.9872270904500224e-05, "loss": 0.0901, "step": 22730 }, { "epoch": 1.6886974602703104, "grad_norm": 2.4531607627868652, "learning_rate": 1.986781523837814e-05, "loss": 0.0963, "step": 22740 }, { "epoch": 1.689440071290658, "grad_norm": 1.1364638805389404, "learning_rate": 1.986335957225605e-05, "loss": 0.0745, "step": 22750 }, { "epoch": 1.6901826823110055, "grad_norm": 1.6292158365249634, "learning_rate": 1.985890390613397e-05, "loss": 0.0886, "step": 22760 }, { "epoch": 1.690925293331353, "grad_norm": 2.255054473876953, "learning_rate": 1.9854448240011884e-05, "loss": 0.0885, "step": 22770 }, { "epoch": 1.6916679043517004, "grad_norm": 0.5803804993629456, "learning_rate": 1.9849992573889795e-05, "loss": 0.0325, "step": 22780 }, { "epoch": 1.6924105153720481, "grad_norm": 1.7144925594329834, "learning_rate": 1.984553690776771e-05, "loss": 0.0705, "step": 22790 }, { "epoch": 1.6931531263923958, "grad_norm": 0.6633053421974182, "learning_rate": 1.984108124164563e-05, "loss": 0.0524, "step": 22800 }, { "epoch": 1.6938957374127432, "grad_norm": 3.752182960510254, "learning_rate": 1.983662557552354e-05, "loss": 0.0732, "step": 22810 }, { "epoch": 1.6946383484330907, "grad_norm": 1.711698055267334, "learning_rate": 1.9832169909401455e-05, "loss": 0.0639, "step": 22820 }, { "epoch": 1.6953809594534381, "grad_norm": 3.3594610691070557, "learning_rate": 1.9827714243279374e-05, "loss": 0.0828, "step": 22830 }, { "epoch": 1.6961235704737858, "grad_norm": 2.337766647338867, "learning_rate": 1.9823258577157285e-05, "loss": 0.0718, "step": 22840 }, { "epoch": 1.6968661814941335, "grad_norm": 0.9109551310539246, "learning_rate": 1.98188029110352e-05, "loss": 0.0757, "step": 22850 }, { "epoch": 1.697608792514481, "grad_norm": 2.4265153408050537, "learning_rate": 1.9814347244913115e-05, "loss": 0.1019, "step": 22860 }, { "epoch": 1.6983514035348284, "grad_norm": 2.5311357975006104, "learning_rate": 1.980989157879103e-05, "loss": 0.07, "step": 22870 }, { "epoch": 1.6990940145551758, "grad_norm": 1.0674959421157837, "learning_rate": 1.9805435912668945e-05, "loss": 0.0914, "step": 22880 }, { "epoch": 1.6998366255755235, "grad_norm": 1.6489328145980835, "learning_rate": 1.9800980246546857e-05, "loss": 0.0893, "step": 22890 }, { "epoch": 1.7005792365958712, "grad_norm": 1.366485595703125, "learning_rate": 1.9796524580424775e-05, "loss": 0.0594, "step": 22900 }, { "epoch": 1.7013218476162186, "grad_norm": 1.1169344186782837, "learning_rate": 1.979206891430269e-05, "loss": 0.0928, "step": 22910 }, { "epoch": 1.702064458636566, "grad_norm": 0.7352683544158936, "learning_rate": 1.97876132481806e-05, "loss": 0.0778, "step": 22920 }, { "epoch": 1.7028070696569138, "grad_norm": 1.1200909614562988, "learning_rate": 1.978315758205852e-05, "loss": 0.0808, "step": 22930 }, { "epoch": 1.7035496806772612, "grad_norm": 1.3726412057876587, "learning_rate": 1.9778701915936435e-05, "loss": 0.0678, "step": 22940 }, { "epoch": 1.7042922916976089, "grad_norm": 2.1558868885040283, "learning_rate": 1.9774246249814347e-05, "loss": 0.0842, "step": 22950 }, { "epoch": 1.7050349027179563, "grad_norm": 0.9559635519981384, "learning_rate": 1.976979058369226e-05, "loss": 0.0813, "step": 22960 }, { "epoch": 1.7057775137383038, "grad_norm": 0.5382719039916992, "learning_rate": 1.9765334917570177e-05, "loss": 0.0669, "step": 22970 }, { "epoch": 1.7065201247586514, "grad_norm": 1.2556627988815308, "learning_rate": 1.976087925144809e-05, "loss": 0.0981, "step": 22980 }, { "epoch": 1.7072627357789991, "grad_norm": 1.57675039768219, "learning_rate": 1.9756423585326007e-05, "loss": 0.0876, "step": 22990 }, { "epoch": 1.7080053467993466, "grad_norm": 1.055188536643982, "learning_rate": 1.975196791920392e-05, "loss": 0.0983, "step": 23000 }, { "epoch": 1.708747957819694, "grad_norm": 0.611940324306488, "learning_rate": 1.9747512253081837e-05, "loss": 0.0705, "step": 23010 }, { "epoch": 1.7094905688400415, "grad_norm": 2.1072449684143066, "learning_rate": 1.974305658695975e-05, "loss": 0.072, "step": 23020 }, { "epoch": 1.7102331798603891, "grad_norm": 1.6510035991668701, "learning_rate": 1.9738600920837667e-05, "loss": 0.0522, "step": 23030 }, { "epoch": 1.7109757908807368, "grad_norm": 0.40311571955680847, "learning_rate": 1.973414525471558e-05, "loss": 0.0883, "step": 23040 }, { "epoch": 1.7117184019010843, "grad_norm": 3.123772144317627, "learning_rate": 1.9729689588593493e-05, "loss": 0.0727, "step": 23050 }, { "epoch": 1.7124610129214317, "grad_norm": 0.9838127493858337, "learning_rate": 1.9725233922471408e-05, "loss": 0.0601, "step": 23060 }, { "epoch": 1.7132036239417792, "grad_norm": 0.6052844524383545, "learning_rate": 1.9720778256349327e-05, "loss": 0.0905, "step": 23070 }, { "epoch": 1.7139462349621268, "grad_norm": 2.6029303073883057, "learning_rate": 1.9716322590227238e-05, "loss": 0.0854, "step": 23080 }, { "epoch": 1.7146888459824745, "grad_norm": 1.6953434944152832, "learning_rate": 1.9711866924105153e-05, "loss": 0.0739, "step": 23090 }, { "epoch": 1.715431457002822, "grad_norm": 1.7296435832977295, "learning_rate": 1.970741125798307e-05, "loss": 0.0666, "step": 23100 }, { "epoch": 1.7161740680231694, "grad_norm": 1.8964383602142334, "learning_rate": 1.9702955591860983e-05, "loss": 0.0687, "step": 23110 }, { "epoch": 1.7169166790435169, "grad_norm": 1.0528844594955444, "learning_rate": 1.9698499925738898e-05, "loss": 0.0555, "step": 23120 }, { "epoch": 1.7176592900638645, "grad_norm": 2.4928388595581055, "learning_rate": 1.9694044259616813e-05, "loss": 0.0738, "step": 23130 }, { "epoch": 1.7184019010842122, "grad_norm": 2.2617714405059814, "learning_rate": 1.9689588593494728e-05, "loss": 0.1023, "step": 23140 }, { "epoch": 1.7191445121045597, "grad_norm": 1.0434247255325317, "learning_rate": 1.9685132927372643e-05, "loss": 0.0923, "step": 23150 }, { "epoch": 1.719887123124907, "grad_norm": 0.6594432592391968, "learning_rate": 1.9680677261250555e-05, "loss": 0.0626, "step": 23160 }, { "epoch": 1.7206297341452546, "grad_norm": 2.8370988368988037, "learning_rate": 1.9676221595128473e-05, "loss": 0.093, "step": 23170 }, { "epoch": 1.7213723451656022, "grad_norm": 0.7767960429191589, "learning_rate": 1.9671765929006388e-05, "loss": 0.0637, "step": 23180 }, { "epoch": 1.72211495618595, "grad_norm": 1.8710448741912842, "learning_rate": 1.96673102628843e-05, "loss": 0.0675, "step": 23190 }, { "epoch": 1.7228575672062973, "grad_norm": 3.039166212081909, "learning_rate": 1.9662854596762215e-05, "loss": 0.0883, "step": 23200 }, { "epoch": 1.7236001782266448, "grad_norm": 4.762219429016113, "learning_rate": 1.9658398930640133e-05, "loss": 0.0519, "step": 23210 }, { "epoch": 1.7243427892469925, "grad_norm": 1.641481876373291, "learning_rate": 1.9653943264518045e-05, "loss": 0.0918, "step": 23220 }, { "epoch": 1.72508540026734, "grad_norm": 0.6783468127250671, "learning_rate": 1.964948759839596e-05, "loss": 0.0757, "step": 23230 }, { "epoch": 1.7258280112876876, "grad_norm": 1.0476303100585938, "learning_rate": 1.9645031932273878e-05, "loss": 0.0607, "step": 23240 }, { "epoch": 1.726570622308035, "grad_norm": 1.5306792259216309, "learning_rate": 1.964057626615179e-05, "loss": 0.0644, "step": 23250 }, { "epoch": 1.7273132333283825, "grad_norm": 0.8044191598892212, "learning_rate": 1.9636120600029705e-05, "loss": 0.0881, "step": 23260 }, { "epoch": 1.7280558443487302, "grad_norm": 1.202543020248413, "learning_rate": 1.963166493390762e-05, "loss": 0.0665, "step": 23270 }, { "epoch": 1.7287984553690778, "grad_norm": 2.2589240074157715, "learning_rate": 1.9627209267785535e-05, "loss": 0.0831, "step": 23280 }, { "epoch": 1.7295410663894253, "grad_norm": 2.0191476345062256, "learning_rate": 1.962275360166345e-05, "loss": 0.0765, "step": 23290 }, { "epoch": 1.7302836774097727, "grad_norm": 2.8185505867004395, "learning_rate": 1.961829793554136e-05, "loss": 0.0791, "step": 23300 }, { "epoch": 1.7310262884301202, "grad_norm": 1.082022786140442, "learning_rate": 1.961384226941928e-05, "loss": 0.0978, "step": 23310 }, { "epoch": 1.7317688994504679, "grad_norm": 1.9204188585281372, "learning_rate": 1.9609386603297195e-05, "loss": 0.0893, "step": 23320 }, { "epoch": 1.7325115104708155, "grad_norm": 1.1445153951644897, "learning_rate": 1.9604930937175106e-05, "loss": 0.0547, "step": 23330 }, { "epoch": 1.733254121491163, "grad_norm": 0.6673332452774048, "learning_rate": 1.9600475271053025e-05, "loss": 0.0447, "step": 23340 }, { "epoch": 1.7339967325115104, "grad_norm": 4.474247455596924, "learning_rate": 1.959601960493094e-05, "loss": 0.11, "step": 23350 }, { "epoch": 1.7347393435318579, "grad_norm": 4.08743143081665, "learning_rate": 1.959156393880885e-05, "loss": 0.0967, "step": 23360 }, { "epoch": 1.7354819545522056, "grad_norm": 1.5097987651824951, "learning_rate": 1.9587108272686766e-05, "loss": 0.0715, "step": 23370 }, { "epoch": 1.7362245655725532, "grad_norm": 2.6313939094543457, "learning_rate": 1.958265260656468e-05, "loss": 0.0836, "step": 23380 }, { "epoch": 1.7369671765929007, "grad_norm": 0.9366971850395203, "learning_rate": 1.9578196940442596e-05, "loss": 0.0911, "step": 23390 }, { "epoch": 1.7377097876132481, "grad_norm": 0.614687979221344, "learning_rate": 1.957374127432051e-05, "loss": 0.0654, "step": 23400 }, { "epoch": 1.7384523986335956, "grad_norm": 1.9698867797851562, "learning_rate": 1.9569285608198426e-05, "loss": 0.0922, "step": 23410 }, { "epoch": 1.7391950096539432, "grad_norm": 2.608386754989624, "learning_rate": 1.956482994207634e-05, "loss": 0.0963, "step": 23420 }, { "epoch": 1.739937620674291, "grad_norm": 3.6688835620880127, "learning_rate": 1.9560374275954256e-05, "loss": 0.0771, "step": 23430 }, { "epoch": 1.7406802316946384, "grad_norm": 2.29097318649292, "learning_rate": 1.955591860983217e-05, "loss": 0.0723, "step": 23440 }, { "epoch": 1.7414228427149858, "grad_norm": 1.2678636312484741, "learning_rate": 1.9551462943710086e-05, "loss": 0.0709, "step": 23450 }, { "epoch": 1.7421654537353333, "grad_norm": 4.124483108520508, "learning_rate": 1.9547007277588e-05, "loss": 0.089, "step": 23460 }, { "epoch": 1.742908064755681, "grad_norm": 0.28768646717071533, "learning_rate": 1.9542551611465913e-05, "loss": 0.0933, "step": 23470 }, { "epoch": 1.7436506757760286, "grad_norm": 0.71043860912323, "learning_rate": 1.953809594534383e-05, "loss": 0.0806, "step": 23480 }, { "epoch": 1.744393286796376, "grad_norm": 1.6145790815353394, "learning_rate": 1.9533640279221743e-05, "loss": 0.0659, "step": 23490 }, { "epoch": 1.7451358978167235, "grad_norm": 1.6282793283462524, "learning_rate": 1.9529184613099658e-05, "loss": 0.1154, "step": 23500 }, { "epoch": 1.7458785088370712, "grad_norm": 1.4098920822143555, "learning_rate": 1.9524728946977576e-05, "loss": 0.073, "step": 23510 }, { "epoch": 1.7466211198574186, "grad_norm": 2.4502289295196533, "learning_rate": 1.9520273280855488e-05, "loss": 0.0688, "step": 23520 }, { "epoch": 1.7473637308777663, "grad_norm": 1.5324982404708862, "learning_rate": 1.9515817614733403e-05, "loss": 0.0789, "step": 23530 }, { "epoch": 1.7481063418981138, "grad_norm": 1.1918566226959229, "learning_rate": 1.9511361948611318e-05, "loss": 0.0879, "step": 23540 }, { "epoch": 1.7488489529184612, "grad_norm": 1.8699147701263428, "learning_rate": 1.9506906282489233e-05, "loss": 0.0677, "step": 23550 }, { "epoch": 1.7495915639388089, "grad_norm": 1.7043718099594116, "learning_rate": 1.9502450616367148e-05, "loss": 0.0751, "step": 23560 }, { "epoch": 1.7503341749591566, "grad_norm": 1.2603180408477783, "learning_rate": 1.949799495024506e-05, "loss": 0.0724, "step": 23570 }, { "epoch": 1.751076785979504, "grad_norm": 0.9619042277336121, "learning_rate": 1.9493539284122978e-05, "loss": 0.0815, "step": 23580 }, { "epoch": 1.7518193969998515, "grad_norm": 1.521238923072815, "learning_rate": 1.9489083618000893e-05, "loss": 0.0494, "step": 23590 }, { "epoch": 1.752562008020199, "grad_norm": 2.5039453506469727, "learning_rate": 1.9484627951878804e-05, "loss": 0.0824, "step": 23600 }, { "epoch": 1.7533046190405466, "grad_norm": 0.793013334274292, "learning_rate": 1.9480172285756723e-05, "loss": 0.0529, "step": 23610 }, { "epoch": 1.7540472300608942, "grad_norm": 2.6276259422302246, "learning_rate": 1.9475716619634638e-05, "loss": 0.0624, "step": 23620 }, { "epoch": 1.7547898410812417, "grad_norm": 0.7211153507232666, "learning_rate": 1.947126095351255e-05, "loss": 0.0859, "step": 23630 }, { "epoch": 1.7555324521015891, "grad_norm": 0.7055466175079346, "learning_rate": 1.9466805287390464e-05, "loss": 0.0813, "step": 23640 }, { "epoch": 1.7562750631219366, "grad_norm": 1.8342903852462769, "learning_rate": 1.9462349621268383e-05, "loss": 0.1125, "step": 23650 }, { "epoch": 1.7570176741422843, "grad_norm": 1.909565806388855, "learning_rate": 1.9457893955146294e-05, "loss": 0.1002, "step": 23660 }, { "epoch": 1.757760285162632, "grad_norm": 1.755029559135437, "learning_rate": 1.945343828902421e-05, "loss": 0.0785, "step": 23670 }, { "epoch": 1.7585028961829794, "grad_norm": 1.2598732709884644, "learning_rate": 1.9448982622902124e-05, "loss": 0.063, "step": 23680 }, { "epoch": 1.7592455072033268, "grad_norm": 1.9370228052139282, "learning_rate": 1.944452695678004e-05, "loss": 0.0481, "step": 23690 }, { "epoch": 1.7599881182236743, "grad_norm": 0.8600150942802429, "learning_rate": 1.9440071290657954e-05, "loss": 0.0818, "step": 23700 }, { "epoch": 1.760730729244022, "grad_norm": 1.0704541206359863, "learning_rate": 1.9435615624535866e-05, "loss": 0.0879, "step": 23710 }, { "epoch": 1.7614733402643696, "grad_norm": 0.8569614291191101, "learning_rate": 1.9431159958413784e-05, "loss": 0.0786, "step": 23720 }, { "epoch": 1.762215951284717, "grad_norm": 2.5720906257629395, "learning_rate": 1.94267042922917e-05, "loss": 0.0942, "step": 23730 }, { "epoch": 1.7629585623050645, "grad_norm": 1.6318458318710327, "learning_rate": 1.942224862616961e-05, "loss": 0.0819, "step": 23740 }, { "epoch": 1.763701173325412, "grad_norm": 0.5371285080909729, "learning_rate": 1.941779296004753e-05, "loss": 0.042, "step": 23750 }, { "epoch": 1.7644437843457597, "grad_norm": 0.4963701665401459, "learning_rate": 1.9413337293925444e-05, "loss": 0.0849, "step": 23760 }, { "epoch": 1.7651863953661073, "grad_norm": 4.991149425506592, "learning_rate": 1.9408881627803356e-05, "loss": 0.0797, "step": 23770 }, { "epoch": 1.7659290063864548, "grad_norm": 2.4451613426208496, "learning_rate": 1.940442596168127e-05, "loss": 0.0688, "step": 23780 }, { "epoch": 1.7666716174068022, "grad_norm": 0.5909414887428284, "learning_rate": 1.9399970295559186e-05, "loss": 0.0814, "step": 23790 }, { "epoch": 1.76741422842715, "grad_norm": 3.1932735443115234, "learning_rate": 1.93955146294371e-05, "loss": 0.0725, "step": 23800 }, { "epoch": 1.7681568394474974, "grad_norm": 1.4078024625778198, "learning_rate": 1.9391058963315016e-05, "loss": 0.0917, "step": 23810 }, { "epoch": 1.768899450467845, "grad_norm": 0.5630704164505005, "learning_rate": 1.938660329719293e-05, "loss": 0.049, "step": 23820 }, { "epoch": 1.7696420614881925, "grad_norm": 1.22433602809906, "learning_rate": 1.9382147631070846e-05, "loss": 0.0883, "step": 23830 }, { "epoch": 1.77038467250854, "grad_norm": 2.7386889457702637, "learning_rate": 1.937769196494876e-05, "loss": 0.0769, "step": 23840 }, { "epoch": 1.7711272835288876, "grad_norm": 0.52625572681427, "learning_rate": 1.9373236298826676e-05, "loss": 0.0807, "step": 23850 }, { "epoch": 1.7718698945492353, "grad_norm": 1.0019735097885132, "learning_rate": 1.936878063270459e-05, "loss": 0.074, "step": 23860 }, { "epoch": 1.7726125055695827, "grad_norm": 1.6387897729873657, "learning_rate": 1.9364324966582506e-05, "loss": 0.0592, "step": 23870 }, { "epoch": 1.7733551165899302, "grad_norm": 1.7129108905792236, "learning_rate": 1.9359869300460417e-05, "loss": 0.101, "step": 23880 }, { "epoch": 1.7740977276102776, "grad_norm": 2.9050674438476562, "learning_rate": 1.9355413634338336e-05, "loss": 0.0965, "step": 23890 }, { "epoch": 1.7748403386306253, "grad_norm": 2.6938226222991943, "learning_rate": 1.9350957968216247e-05, "loss": 0.0713, "step": 23900 }, { "epoch": 1.775582949650973, "grad_norm": 2.0255374908447266, "learning_rate": 1.9346502302094162e-05, "loss": 0.0711, "step": 23910 }, { "epoch": 1.7763255606713204, "grad_norm": 0.8787927627563477, "learning_rate": 1.934204663597208e-05, "loss": 0.0717, "step": 23920 }, { "epoch": 1.7770681716916679, "grad_norm": 0.6391984224319458, "learning_rate": 1.9337590969849992e-05, "loss": 0.0434, "step": 23930 }, { "epoch": 1.7778107827120153, "grad_norm": 2.4964632987976074, "learning_rate": 1.9333135303727907e-05, "loss": 0.0759, "step": 23940 }, { "epoch": 1.778553393732363, "grad_norm": 2.1311395168304443, "learning_rate": 1.9328679637605822e-05, "loss": 0.0499, "step": 23950 }, { "epoch": 1.7792960047527107, "grad_norm": 3.056412935256958, "learning_rate": 1.9324223971483737e-05, "loss": 0.0652, "step": 23960 }, { "epoch": 1.780038615773058, "grad_norm": 1.9002305269241333, "learning_rate": 1.9319768305361652e-05, "loss": 0.0528, "step": 23970 }, { "epoch": 1.7807812267934056, "grad_norm": 2.583705186843872, "learning_rate": 1.9315312639239564e-05, "loss": 0.0692, "step": 23980 }, { "epoch": 1.781523837813753, "grad_norm": 2.8918917179107666, "learning_rate": 1.9310856973117482e-05, "loss": 0.0405, "step": 23990 }, { "epoch": 1.7822664488341007, "grad_norm": 2.4750471115112305, "learning_rate": 1.9306401306995397e-05, "loss": 0.0704, "step": 24000 }, { "epoch": 1.7830090598544484, "grad_norm": 0.7255248427391052, "learning_rate": 1.930194564087331e-05, "loss": 0.0657, "step": 24010 }, { "epoch": 1.7837516708747958, "grad_norm": 1.313336968421936, "learning_rate": 1.9297489974751227e-05, "loss": 0.0548, "step": 24020 }, { "epoch": 1.7844942818951433, "grad_norm": 1.0988185405731201, "learning_rate": 1.9293034308629142e-05, "loss": 0.0681, "step": 24030 }, { "epoch": 1.7852368929154907, "grad_norm": 1.2511615753173828, "learning_rate": 1.9288578642507054e-05, "loss": 0.1, "step": 24040 }, { "epoch": 1.7859795039358384, "grad_norm": 0.9137929677963257, "learning_rate": 1.928412297638497e-05, "loss": 0.0922, "step": 24050 }, { "epoch": 1.786722114956186, "grad_norm": 1.0579396486282349, "learning_rate": 1.9279667310262887e-05, "loss": 0.0649, "step": 24060 }, { "epoch": 1.7874647259765335, "grad_norm": 2.3197665214538574, "learning_rate": 1.92752116441408e-05, "loss": 0.0646, "step": 24070 }, { "epoch": 1.788207336996881, "grad_norm": 0.7259221076965332, "learning_rate": 1.9270755978018714e-05, "loss": 0.0642, "step": 24080 }, { "epoch": 1.7889499480172286, "grad_norm": 1.1063398122787476, "learning_rate": 1.926630031189663e-05, "loss": 0.077, "step": 24090 }, { "epoch": 1.789692559037576, "grad_norm": 0.8561720848083496, "learning_rate": 1.9261844645774544e-05, "loss": 0.0799, "step": 24100 }, { "epoch": 1.7904351700579237, "grad_norm": 2.2343969345092773, "learning_rate": 1.925738897965246e-05, "loss": 0.0798, "step": 24110 }, { "epoch": 1.7911777810782712, "grad_norm": 0.8606523275375366, "learning_rate": 1.925293331353037e-05, "loss": 0.0636, "step": 24120 }, { "epoch": 1.7919203920986186, "grad_norm": 1.764185905456543, "learning_rate": 1.924847764740829e-05, "loss": 0.072, "step": 24130 }, { "epoch": 1.7926630031189663, "grad_norm": 2.313272714614868, "learning_rate": 1.9244021981286204e-05, "loss": 0.0701, "step": 24140 }, { "epoch": 1.793405614139314, "grad_norm": 0.961453914642334, "learning_rate": 1.9239566315164115e-05, "loss": 0.0845, "step": 24150 }, { "epoch": 1.7941482251596614, "grad_norm": 0.6591196060180664, "learning_rate": 1.9235110649042034e-05, "loss": 0.0578, "step": 24160 }, { "epoch": 1.7948908361800089, "grad_norm": 2.050917148590088, "learning_rate": 1.923065498291995e-05, "loss": 0.07, "step": 24170 }, { "epoch": 1.7956334472003563, "grad_norm": 1.651291847229004, "learning_rate": 1.922619931679786e-05, "loss": 0.1065, "step": 24180 }, { "epoch": 1.796376058220704, "grad_norm": 1.6536940336227417, "learning_rate": 1.9221743650675775e-05, "loss": 0.0675, "step": 24190 }, { "epoch": 1.7971186692410517, "grad_norm": 1.4677518606185913, "learning_rate": 1.921728798455369e-05, "loss": 0.0531, "step": 24200 }, { "epoch": 1.7978612802613991, "grad_norm": 0.6976707577705383, "learning_rate": 1.9212832318431605e-05, "loss": 0.0458, "step": 24210 }, { "epoch": 1.7986038912817466, "grad_norm": 2.0088629722595215, "learning_rate": 1.920837665230952e-05, "loss": 0.0756, "step": 24220 }, { "epoch": 1.799346502302094, "grad_norm": 1.7095671892166138, "learning_rate": 1.9203920986187435e-05, "loss": 0.0709, "step": 24230 }, { "epoch": 1.8000891133224417, "grad_norm": 1.893115758895874, "learning_rate": 1.919946532006535e-05, "loss": 0.0676, "step": 24240 }, { "epoch": 1.8008317243427894, "grad_norm": 2.080127716064453, "learning_rate": 1.9195009653943265e-05, "loss": 0.0533, "step": 24250 }, { "epoch": 1.8015743353631368, "grad_norm": 1.7303770780563354, "learning_rate": 1.919055398782118e-05, "loss": 0.0741, "step": 24260 }, { "epoch": 1.8023169463834843, "grad_norm": 1.3288146257400513, "learning_rate": 1.9186098321699095e-05, "loss": 0.0812, "step": 24270 }, { "epoch": 1.8030595574038317, "grad_norm": 2.545034408569336, "learning_rate": 1.918164265557701e-05, "loss": 0.0959, "step": 24280 }, { "epoch": 1.8038021684241794, "grad_norm": 2.7964491844177246, "learning_rate": 1.9177186989454922e-05, "loss": 0.0649, "step": 24290 }, { "epoch": 1.804544779444527, "grad_norm": 1.9604982137680054, "learning_rate": 1.917273132333284e-05, "loss": 0.0733, "step": 24300 }, { "epoch": 1.8052873904648745, "grad_norm": 2.917266607284546, "learning_rate": 1.9168275657210752e-05, "loss": 0.0903, "step": 24310 }, { "epoch": 1.806030001485222, "grad_norm": 2.3615570068359375, "learning_rate": 1.9163819991088667e-05, "loss": 0.065, "step": 24320 }, { "epoch": 1.8067726125055694, "grad_norm": 1.7897320985794067, "learning_rate": 1.9159364324966585e-05, "loss": 0.086, "step": 24330 }, { "epoch": 1.807515223525917, "grad_norm": 1.5693241357803345, "learning_rate": 1.9154908658844497e-05, "loss": 0.0876, "step": 24340 }, { "epoch": 1.8082578345462648, "grad_norm": 1.0281766653060913, "learning_rate": 1.9150452992722412e-05, "loss": 0.0841, "step": 24350 }, { "epoch": 1.8090004455666122, "grad_norm": 1.0320008993148804, "learning_rate": 1.9145997326600327e-05, "loss": 0.0852, "step": 24360 }, { "epoch": 1.8097430565869597, "grad_norm": 0.7011651396751404, "learning_rate": 1.9141541660478242e-05, "loss": 0.0549, "step": 24370 }, { "epoch": 1.8104856676073073, "grad_norm": 2.5355286598205566, "learning_rate": 1.9137085994356157e-05, "loss": 0.0809, "step": 24380 }, { "epoch": 1.8112282786276548, "grad_norm": 1.371387243270874, "learning_rate": 1.9132630328234072e-05, "loss": 0.0568, "step": 24390 }, { "epoch": 1.8119708896480025, "grad_norm": 1.14397132396698, "learning_rate": 1.9128174662111987e-05, "loss": 0.08, "step": 24400 }, { "epoch": 1.81271350066835, "grad_norm": 0.4540915787220001, "learning_rate": 1.9123718995989902e-05, "loss": 0.0759, "step": 24410 }, { "epoch": 1.8134561116886974, "grad_norm": 0.6488813757896423, "learning_rate": 1.9119263329867814e-05, "loss": 0.0805, "step": 24420 }, { "epoch": 1.814198722709045, "grad_norm": 2.155545949935913, "learning_rate": 1.9114807663745732e-05, "loss": 0.06, "step": 24430 }, { "epoch": 1.8149413337293927, "grad_norm": 2.4752142429351807, "learning_rate": 1.9110351997623647e-05, "loss": 0.0823, "step": 24440 }, { "epoch": 1.8156839447497402, "grad_norm": 1.4653511047363281, "learning_rate": 1.910589633150156e-05, "loss": 0.0863, "step": 24450 }, { "epoch": 1.8164265557700876, "grad_norm": 1.1317205429077148, "learning_rate": 1.9101440665379474e-05, "loss": 0.0554, "step": 24460 }, { "epoch": 1.817169166790435, "grad_norm": 1.3294923305511475, "learning_rate": 1.9096984999257392e-05, "loss": 0.0619, "step": 24470 }, { "epoch": 1.8179117778107827, "grad_norm": 2.8235435485839844, "learning_rate": 1.9092529333135304e-05, "loss": 0.0769, "step": 24480 }, { "epoch": 1.8186543888311304, "grad_norm": 0.2612536549568176, "learning_rate": 1.908807366701322e-05, "loss": 0.0782, "step": 24490 }, { "epoch": 1.8193969998514778, "grad_norm": 2.1067092418670654, "learning_rate": 1.9083618000891133e-05, "loss": 0.0641, "step": 24500 }, { "epoch": 1.8201396108718253, "grad_norm": 1.507546067237854, "learning_rate": 1.907916233476905e-05, "loss": 0.0673, "step": 24510 }, { "epoch": 1.8208822218921727, "grad_norm": 1.1641823053359985, "learning_rate": 1.9074706668646963e-05, "loss": 0.0735, "step": 24520 }, { "epoch": 1.8216248329125204, "grad_norm": 1.6520384550094604, "learning_rate": 1.9070251002524875e-05, "loss": 0.0714, "step": 24530 }, { "epoch": 1.822367443932868, "grad_norm": 1.908644676208496, "learning_rate": 1.9065795336402793e-05, "loss": 0.0808, "step": 24540 }, { "epoch": 1.8231100549532155, "grad_norm": 1.606765866279602, "learning_rate": 1.906133967028071e-05, "loss": 0.0944, "step": 24550 }, { "epoch": 1.823852665973563, "grad_norm": 1.8764336109161377, "learning_rate": 1.905688400415862e-05, "loss": 0.105, "step": 24560 }, { "epoch": 1.8245952769939104, "grad_norm": 1.009965419769287, "learning_rate": 1.905242833803654e-05, "loss": 0.039, "step": 24570 }, { "epoch": 1.8253378880142581, "grad_norm": 1.2469267845153809, "learning_rate": 1.9047972671914453e-05, "loss": 0.0539, "step": 24580 }, { "epoch": 1.8260804990346058, "grad_norm": 2.306645393371582, "learning_rate": 1.9043517005792365e-05, "loss": 0.0849, "step": 24590 }, { "epoch": 1.8268231100549532, "grad_norm": 2.0731003284454346, "learning_rate": 1.903906133967028e-05, "loss": 0.082, "step": 24600 }, { "epoch": 1.8275657210753007, "grad_norm": 2.53838849067688, "learning_rate": 1.9034605673548195e-05, "loss": 0.0964, "step": 24610 }, { "epoch": 1.8283083320956481, "grad_norm": 0.8338577747344971, "learning_rate": 1.903015000742611e-05, "loss": 0.0842, "step": 24620 }, { "epoch": 1.8290509431159958, "grad_norm": 0.40346524119377136, "learning_rate": 1.9025694341304025e-05, "loss": 0.0786, "step": 24630 }, { "epoch": 1.8297935541363435, "grad_norm": 3.7028415203094482, "learning_rate": 1.902123867518194e-05, "loss": 0.0967, "step": 24640 }, { "epoch": 1.830536165156691, "grad_norm": 1.1305569410324097, "learning_rate": 1.9016783009059855e-05, "loss": 0.0596, "step": 24650 }, { "epoch": 1.8312787761770384, "grad_norm": 1.63505220413208, "learning_rate": 1.901232734293777e-05, "loss": 0.0985, "step": 24660 }, { "epoch": 1.832021387197386, "grad_norm": 1.5454314947128296, "learning_rate": 1.9007871676815685e-05, "loss": 0.0677, "step": 24670 }, { "epoch": 1.8327639982177335, "grad_norm": 1.4274262189865112, "learning_rate": 1.90034160106936e-05, "loss": 0.0681, "step": 24680 }, { "epoch": 1.8335066092380812, "grad_norm": 0.888826847076416, "learning_rate": 1.8998960344571515e-05, "loss": 0.0922, "step": 24690 }, { "epoch": 1.8342492202584286, "grad_norm": 0.6009930968284607, "learning_rate": 1.8994504678449427e-05, "loss": 0.08, "step": 24700 }, { "epoch": 1.834991831278776, "grad_norm": 1.8012096881866455, "learning_rate": 1.8990049012327345e-05, "loss": 0.0726, "step": 24710 }, { "epoch": 1.8357344422991237, "grad_norm": 2.0512189865112305, "learning_rate": 1.8985593346205257e-05, "loss": 0.089, "step": 24720 }, { "epoch": 1.8364770533194714, "grad_norm": 1.3309962749481201, "learning_rate": 1.898113768008317e-05, "loss": 0.042, "step": 24730 }, { "epoch": 1.8372196643398189, "grad_norm": 2.834207057952881, "learning_rate": 1.897668201396109e-05, "loss": 0.0774, "step": 24740 }, { "epoch": 1.8379622753601663, "grad_norm": 0.6929059028625488, "learning_rate": 1.8972226347839e-05, "loss": 0.0601, "step": 24750 }, { "epoch": 1.8387048863805138, "grad_norm": 2.1518747806549072, "learning_rate": 1.8967770681716917e-05, "loss": 0.0647, "step": 24760 }, { "epoch": 1.8394474974008614, "grad_norm": 1.8092671632766724, "learning_rate": 1.896331501559483e-05, "loss": 0.0596, "step": 24770 }, { "epoch": 1.8401901084212091, "grad_norm": 1.499583125114441, "learning_rate": 1.8958859349472747e-05, "loss": 0.0625, "step": 24780 }, { "epoch": 1.8409327194415566, "grad_norm": 1.1313135623931885, "learning_rate": 1.895440368335066e-05, "loss": 0.0761, "step": 24790 }, { "epoch": 1.841675330461904, "grad_norm": 3.332144260406494, "learning_rate": 1.8949948017228577e-05, "loss": 0.1059, "step": 24800 }, { "epoch": 1.8424179414822515, "grad_norm": 1.193617820739746, "learning_rate": 1.894549235110649e-05, "loss": 0.0897, "step": 24810 }, { "epoch": 1.8431605525025991, "grad_norm": 1.0771690607070923, "learning_rate": 1.8941036684984407e-05, "loss": 0.0709, "step": 24820 }, { "epoch": 1.8439031635229468, "grad_norm": 2.919710874557495, "learning_rate": 1.8936581018862318e-05, "loss": 0.0729, "step": 24830 }, { "epoch": 1.8446457745432943, "grad_norm": 1.2123440504074097, "learning_rate": 1.8932125352740237e-05, "loss": 0.0586, "step": 24840 }, { "epoch": 1.8453883855636417, "grad_norm": 2.8051271438598633, "learning_rate": 1.892766968661815e-05, "loss": 0.1141, "step": 24850 }, { "epoch": 1.8461309965839892, "grad_norm": 1.1732330322265625, "learning_rate": 1.8923214020496063e-05, "loss": 0.0702, "step": 24860 }, { "epoch": 1.8468736076043368, "grad_norm": 1.468248963356018, "learning_rate": 1.8918758354373978e-05, "loss": 0.0755, "step": 24870 }, { "epoch": 1.8476162186246845, "grad_norm": 1.1212787628173828, "learning_rate": 1.8914302688251896e-05, "loss": 0.0733, "step": 24880 }, { "epoch": 1.848358829645032, "grad_norm": 1.3834295272827148, "learning_rate": 1.8909847022129808e-05, "loss": 0.0582, "step": 24890 }, { "epoch": 1.8491014406653794, "grad_norm": 1.426999807357788, "learning_rate": 1.8905391356007723e-05, "loss": 0.0763, "step": 24900 }, { "epoch": 1.8498440516857269, "grad_norm": 1.5629621744155884, "learning_rate": 1.890093568988564e-05, "loss": 0.0593, "step": 24910 }, { "epoch": 1.8505866627060745, "grad_norm": 1.6968507766723633, "learning_rate": 1.8896480023763553e-05, "loss": 0.1121, "step": 24920 }, { "epoch": 1.8513292737264222, "grad_norm": 1.8145391941070557, "learning_rate": 1.8892024357641468e-05, "loss": 0.0594, "step": 24930 }, { "epoch": 1.8520718847467696, "grad_norm": 2.452359914779663, "learning_rate": 1.888756869151938e-05, "loss": 0.0626, "step": 24940 }, { "epoch": 1.852814495767117, "grad_norm": 0.7949011921882629, "learning_rate": 1.8883113025397298e-05, "loss": 0.0816, "step": 24950 }, { "epoch": 1.8535571067874648, "grad_norm": 1.0505071878433228, "learning_rate": 1.8878657359275213e-05, "loss": 0.0874, "step": 24960 }, { "epoch": 1.8542997178078122, "grad_norm": 1.7781516313552856, "learning_rate": 1.8874201693153125e-05, "loss": 0.0677, "step": 24970 }, { "epoch": 1.85504232882816, "grad_norm": 1.9626563787460327, "learning_rate": 1.8869746027031043e-05, "loss": 0.0692, "step": 24980 }, { "epoch": 1.8557849398485073, "grad_norm": 0.7430385947227478, "learning_rate": 1.8865290360908958e-05, "loss": 0.0372, "step": 24990 }, { "epoch": 1.8565275508688548, "grad_norm": 2.076448440551758, "learning_rate": 1.886083469478687e-05, "loss": 0.0702, "step": 25000 }, { "epoch": 1.8572701618892025, "grad_norm": 1.5963934659957886, "learning_rate": 1.8856379028664788e-05, "loss": 0.0672, "step": 25010 }, { "epoch": 1.8580127729095501, "grad_norm": 2.167839765548706, "learning_rate": 1.88519233625427e-05, "loss": 0.0642, "step": 25020 }, { "epoch": 1.8587553839298976, "grad_norm": 1.1535776853561401, "learning_rate": 1.8847467696420615e-05, "loss": 0.1037, "step": 25030 }, { "epoch": 1.859497994950245, "grad_norm": 0.5450434684753418, "learning_rate": 1.884301203029853e-05, "loss": 0.0663, "step": 25040 }, { "epoch": 1.8602406059705925, "grad_norm": 1.2037556171417236, "learning_rate": 1.8838556364176445e-05, "loss": 0.0856, "step": 25050 }, { "epoch": 1.8609832169909402, "grad_norm": 1.5233439207077026, "learning_rate": 1.883410069805436e-05, "loss": 0.0765, "step": 25060 }, { "epoch": 1.8617258280112878, "grad_norm": 1.9642084836959839, "learning_rate": 1.8829645031932275e-05, "loss": 0.0701, "step": 25070 }, { "epoch": 1.8624684390316353, "grad_norm": 4.366384983062744, "learning_rate": 1.882518936581019e-05, "loss": 0.0879, "step": 25080 }, { "epoch": 1.8632110500519827, "grad_norm": 1.4830248355865479, "learning_rate": 1.8820733699688105e-05, "loss": 0.0753, "step": 25090 }, { "epoch": 1.8639536610723302, "grad_norm": 0.9438735842704773, "learning_rate": 1.881627803356602e-05, "loss": 0.0621, "step": 25100 }, { "epoch": 1.8646962720926779, "grad_norm": 1.960681438446045, "learning_rate": 1.881182236744393e-05, "loss": 0.102, "step": 25110 }, { "epoch": 1.8654388831130255, "grad_norm": 0.6813110709190369, "learning_rate": 1.880736670132185e-05, "loss": 0.0515, "step": 25120 }, { "epoch": 1.866181494133373, "grad_norm": 2.467663049697876, "learning_rate": 1.880291103519976e-05, "loss": 0.0824, "step": 25130 }, { "epoch": 1.8669241051537204, "grad_norm": 0.4229584336280823, "learning_rate": 1.8798455369077676e-05, "loss": 0.0507, "step": 25140 }, { "epoch": 1.8676667161740679, "grad_norm": 1.0609776973724365, "learning_rate": 1.8793999702955595e-05, "loss": 0.0637, "step": 25150 }, { "epoch": 1.8684093271944155, "grad_norm": 1.177756905555725, "learning_rate": 1.8789544036833506e-05, "loss": 0.0679, "step": 25160 }, { "epoch": 1.8691519382147632, "grad_norm": 1.098395586013794, "learning_rate": 1.878508837071142e-05, "loss": 0.0546, "step": 25170 }, { "epoch": 1.8698945492351107, "grad_norm": 0.5572521686553955, "learning_rate": 1.8780632704589336e-05, "loss": 0.0842, "step": 25180 }, { "epoch": 1.8706371602554581, "grad_norm": 1.3587162494659424, "learning_rate": 1.877617703846725e-05, "loss": 0.0654, "step": 25190 }, { "epoch": 1.8713797712758056, "grad_norm": 4.091668605804443, "learning_rate": 1.8771721372345166e-05, "loss": 0.1023, "step": 25200 }, { "epoch": 1.8721223822961532, "grad_norm": 2.425302267074585, "learning_rate": 1.876726570622308e-05, "loss": 0.0869, "step": 25210 }, { "epoch": 1.872864993316501, "grad_norm": 0.6244649291038513, "learning_rate": 1.8762810040100996e-05, "loss": 0.0898, "step": 25220 }, { "epoch": 1.8736076043368484, "grad_norm": 0.962476372718811, "learning_rate": 1.875835437397891e-05, "loss": 0.0778, "step": 25230 }, { "epoch": 1.8743502153571958, "grad_norm": 3.0341858863830566, "learning_rate": 1.8753898707856823e-05, "loss": 0.0679, "step": 25240 }, { "epoch": 1.8750928263775435, "grad_norm": 1.5342912673950195, "learning_rate": 1.874944304173474e-05, "loss": 0.0834, "step": 25250 }, { "epoch": 1.875835437397891, "grad_norm": 2.197174549102783, "learning_rate": 1.8744987375612656e-05, "loss": 0.0582, "step": 25260 }, { "epoch": 1.8765780484182386, "grad_norm": 1.7695233821868896, "learning_rate": 1.8740531709490568e-05, "loss": 0.0797, "step": 25270 }, { "epoch": 1.877320659438586, "grad_norm": 1.5914796590805054, "learning_rate": 1.8736076043368483e-05, "loss": 0.089, "step": 25280 }, { "epoch": 1.8780632704589335, "grad_norm": 3.123690128326416, "learning_rate": 1.87316203772464e-05, "loss": 0.069, "step": 25290 }, { "epoch": 1.8788058814792812, "grad_norm": 2.2254252433776855, "learning_rate": 1.8727164711124313e-05, "loss": 0.1029, "step": 25300 }, { "epoch": 1.8795484924996289, "grad_norm": 1.571058750152588, "learning_rate": 1.8722709045002228e-05, "loss": 0.0705, "step": 25310 }, { "epoch": 1.8802911035199763, "grad_norm": 1.3334083557128906, "learning_rate": 1.8718253378880146e-05, "loss": 0.0961, "step": 25320 }, { "epoch": 1.8810337145403238, "grad_norm": 3.364617109298706, "learning_rate": 1.8713797712758058e-05, "loss": 0.0876, "step": 25330 }, { "epoch": 1.8817763255606712, "grad_norm": 0.7864534258842468, "learning_rate": 1.8709342046635973e-05, "loss": 0.0615, "step": 25340 }, { "epoch": 1.8825189365810189, "grad_norm": 2.1435587406158447, "learning_rate": 1.8704886380513884e-05, "loss": 0.0666, "step": 25350 }, { "epoch": 1.8832615476013665, "grad_norm": 1.164170503616333, "learning_rate": 1.8700430714391803e-05, "loss": 0.0619, "step": 25360 }, { "epoch": 1.884004158621714, "grad_norm": 2.059136390686035, "learning_rate": 1.8695975048269718e-05, "loss": 0.0787, "step": 25370 }, { "epoch": 1.8847467696420614, "grad_norm": 0.4554833173751831, "learning_rate": 1.869151938214763e-05, "loss": 0.0513, "step": 25380 }, { "epoch": 1.885489380662409, "grad_norm": 0.9922448396682739, "learning_rate": 1.8687063716025548e-05, "loss": 0.0634, "step": 25390 }, { "epoch": 1.8862319916827566, "grad_norm": 1.9916400909423828, "learning_rate": 1.8682608049903463e-05, "loss": 0.0695, "step": 25400 }, { "epoch": 1.8869746027031042, "grad_norm": 0.31023046374320984, "learning_rate": 1.8678152383781374e-05, "loss": 0.0799, "step": 25410 }, { "epoch": 1.8877172137234517, "grad_norm": 1.627617597579956, "learning_rate": 1.8673696717659293e-05, "loss": 0.0685, "step": 25420 }, { "epoch": 1.8884598247437991, "grad_norm": 1.0137081146240234, "learning_rate": 1.8669241051537208e-05, "loss": 0.0887, "step": 25430 }, { "epoch": 1.8892024357641466, "grad_norm": 1.8060729503631592, "learning_rate": 1.866478538541512e-05, "loss": 0.0443, "step": 25440 }, { "epoch": 1.8899450467844943, "grad_norm": 1.671414852142334, "learning_rate": 1.8660329719293034e-05, "loss": 0.078, "step": 25450 }, { "epoch": 1.890687657804842, "grad_norm": 1.3879966735839844, "learning_rate": 1.865587405317095e-05, "loss": 0.0741, "step": 25460 }, { "epoch": 1.8914302688251894, "grad_norm": 2.2087647914886475, "learning_rate": 1.8651418387048864e-05, "loss": 0.0602, "step": 25470 }, { "epoch": 1.8921728798455368, "grad_norm": 1.051397681236267, "learning_rate": 1.864696272092678e-05, "loss": 0.0936, "step": 25480 }, { "epoch": 1.8929154908658843, "grad_norm": 1.432411789894104, "learning_rate": 1.8642507054804694e-05, "loss": 0.0838, "step": 25490 }, { "epoch": 1.893658101886232, "grad_norm": 2.1286797523498535, "learning_rate": 1.863805138868261e-05, "loss": 0.0594, "step": 25500 }, { "epoch": 1.8944007129065796, "grad_norm": 2.0686354637145996, "learning_rate": 1.8633595722560524e-05, "loss": 0.0601, "step": 25510 }, { "epoch": 1.895143323926927, "grad_norm": 1.1248515844345093, "learning_rate": 1.8629140056438436e-05, "loss": 0.0595, "step": 25520 }, { "epoch": 1.8958859349472745, "grad_norm": 1.444861650466919, "learning_rate": 1.8624684390316354e-05, "loss": 0.0484, "step": 25530 }, { "epoch": 1.8966285459676222, "grad_norm": 0.935176432132721, "learning_rate": 1.8620228724194266e-05, "loss": 0.0826, "step": 25540 }, { "epoch": 1.8973711569879697, "grad_norm": 1.5523897409439087, "learning_rate": 1.861577305807218e-05, "loss": 0.0676, "step": 25550 }, { "epoch": 1.8981137680083173, "grad_norm": 0.7804394364356995, "learning_rate": 1.86113173919501e-05, "loss": 0.0848, "step": 25560 }, { "epoch": 1.8988563790286648, "grad_norm": 2.2378127574920654, "learning_rate": 1.860686172582801e-05, "loss": 0.0884, "step": 25570 }, { "epoch": 1.8995989900490122, "grad_norm": 0.5532150268554688, "learning_rate": 1.8602406059705926e-05, "loss": 0.0641, "step": 25580 }, { "epoch": 1.90034160106936, "grad_norm": 0.7949833869934082, "learning_rate": 1.859795039358384e-05, "loss": 0.0543, "step": 25590 }, { "epoch": 1.9010842120897076, "grad_norm": 1.0134397745132446, "learning_rate": 1.8593494727461756e-05, "loss": 0.0702, "step": 25600 }, { "epoch": 1.901826823110055, "grad_norm": 1.708309531211853, "learning_rate": 1.858903906133967e-05, "loss": 0.0884, "step": 25610 }, { "epoch": 1.9025694341304025, "grad_norm": 0.8126017451286316, "learning_rate": 1.8584583395217586e-05, "loss": 0.0615, "step": 25620 }, { "epoch": 1.90331204515075, "grad_norm": 1.7196837663650513, "learning_rate": 1.85801277290955e-05, "loss": 0.0671, "step": 25630 }, { "epoch": 1.9040546561710976, "grad_norm": 2.439037561416626, "learning_rate": 1.8575672062973416e-05, "loss": 0.0789, "step": 25640 }, { "epoch": 1.9047972671914453, "grad_norm": 0.7953950762748718, "learning_rate": 1.8571216396851327e-05, "loss": 0.0709, "step": 25650 }, { "epoch": 1.9055398782117927, "grad_norm": 1.4773277044296265, "learning_rate": 1.8566760730729246e-05, "loss": 0.0926, "step": 25660 }, { "epoch": 1.9062824892321402, "grad_norm": 2.0613648891448975, "learning_rate": 1.856230506460716e-05, "loss": 0.0926, "step": 25670 }, { "epoch": 1.9070251002524876, "grad_norm": 1.0905838012695312, "learning_rate": 1.8557849398485072e-05, "loss": 0.0546, "step": 25680 }, { "epoch": 1.9077677112728353, "grad_norm": 0.6521821618080139, "learning_rate": 1.8553393732362987e-05, "loss": 0.0817, "step": 25690 }, { "epoch": 1.908510322293183, "grad_norm": 1.17433762550354, "learning_rate": 1.8548938066240906e-05, "loss": 0.0703, "step": 25700 }, { "epoch": 1.9092529333135304, "grad_norm": 1.6955013275146484, "learning_rate": 1.8544482400118817e-05, "loss": 0.0973, "step": 25710 }, { "epoch": 1.9099955443338779, "grad_norm": 2.2255401611328125, "learning_rate": 1.8540026733996732e-05, "loss": 0.0678, "step": 25720 }, { "epoch": 1.9107381553542253, "grad_norm": 1.3039476871490479, "learning_rate": 1.853557106787465e-05, "loss": 0.0667, "step": 25730 }, { "epoch": 1.911480766374573, "grad_norm": 0.46905070543289185, "learning_rate": 1.8531115401752562e-05, "loss": 0.0575, "step": 25740 }, { "epoch": 1.9122233773949207, "grad_norm": 0.5787822008132935, "learning_rate": 1.8526659735630477e-05, "loss": 0.0515, "step": 25750 }, { "epoch": 1.912965988415268, "grad_norm": 1.4693628549575806, "learning_rate": 1.852220406950839e-05, "loss": 0.0929, "step": 25760 }, { "epoch": 1.9137085994356156, "grad_norm": 1.5744353532791138, "learning_rate": 1.8517748403386307e-05, "loss": 0.0589, "step": 25770 }, { "epoch": 1.914451210455963, "grad_norm": 1.8583874702453613, "learning_rate": 1.8513292737264222e-05, "loss": 0.0586, "step": 25780 }, { "epoch": 1.9151938214763107, "grad_norm": 2.5291054248809814, "learning_rate": 1.8508837071142134e-05, "loss": 0.0623, "step": 25790 }, { "epoch": 1.9159364324966583, "grad_norm": 0.9900248050689697, "learning_rate": 1.8504381405020052e-05, "loss": 0.0273, "step": 25800 }, { "epoch": 1.9166790435170058, "grad_norm": 1.1880460977554321, "learning_rate": 1.8499925738897967e-05, "loss": 0.0901, "step": 25810 }, { "epoch": 1.9174216545373532, "grad_norm": 1.0495351552963257, "learning_rate": 1.849547007277588e-05, "loss": 0.0592, "step": 25820 }, { "epoch": 1.918164265557701, "grad_norm": 2.6768858432769775, "learning_rate": 1.8491014406653797e-05, "loss": 0.0908, "step": 25830 }, { "epoch": 1.9189068765780484, "grad_norm": 1.1184509992599487, "learning_rate": 1.8486558740531712e-05, "loss": 0.0594, "step": 25840 }, { "epoch": 1.919649487598396, "grad_norm": 4.1254401206970215, "learning_rate": 1.8482103074409624e-05, "loss": 0.0834, "step": 25850 }, { "epoch": 1.9203920986187435, "grad_norm": 0.6216328144073486, "learning_rate": 1.847764740828754e-05, "loss": 0.0647, "step": 25860 }, { "epoch": 1.921134709639091, "grad_norm": 2.5887441635131836, "learning_rate": 1.8473191742165454e-05, "loss": 0.063, "step": 25870 }, { "epoch": 1.9218773206594386, "grad_norm": 1.4074640274047852, "learning_rate": 1.846873607604337e-05, "loss": 0.0643, "step": 25880 }, { "epoch": 1.9226199316797863, "grad_norm": 1.3061528205871582, "learning_rate": 1.8464280409921284e-05, "loss": 0.0518, "step": 25890 }, { "epoch": 1.9233625427001337, "grad_norm": 4.136571884155273, "learning_rate": 1.84598247437992e-05, "loss": 0.0546, "step": 25900 }, { "epoch": 1.9241051537204812, "grad_norm": 2.307090997695923, "learning_rate": 1.8455369077677114e-05, "loss": 0.0685, "step": 25910 }, { "epoch": 1.9248477647408286, "grad_norm": 1.6441222429275513, "learning_rate": 1.845091341155503e-05, "loss": 0.0886, "step": 25920 }, { "epoch": 1.9255903757611763, "grad_norm": 0.6175203323364258, "learning_rate": 1.844645774543294e-05, "loss": 0.0401, "step": 25930 }, { "epoch": 1.926332986781524, "grad_norm": 0.5794946551322937, "learning_rate": 1.844200207931086e-05, "loss": 0.0627, "step": 25940 }, { "epoch": 1.9270755978018714, "grad_norm": 0.15005835890769958, "learning_rate": 1.843754641318877e-05, "loss": 0.0369, "step": 25950 }, { "epoch": 1.9278182088222189, "grad_norm": 2.209848403930664, "learning_rate": 1.8433090747066685e-05, "loss": 0.0724, "step": 25960 }, { "epoch": 1.9285608198425663, "grad_norm": 1.4631694555282593, "learning_rate": 1.8428635080944604e-05, "loss": 0.0872, "step": 25970 }, { "epoch": 1.929303430862914, "grad_norm": 1.996635913848877, "learning_rate": 1.8424179414822515e-05, "loss": 0.0592, "step": 25980 }, { "epoch": 1.9300460418832617, "grad_norm": 1.6855813264846802, "learning_rate": 1.841972374870043e-05, "loss": 0.0902, "step": 25990 }, { "epoch": 1.9307886529036091, "grad_norm": 1.024795413017273, "learning_rate": 1.8415268082578345e-05, "loss": 0.0532, "step": 26000 }, { "epoch": 1.9315312639239566, "grad_norm": 0.7905906438827515, "learning_rate": 1.841081241645626e-05, "loss": 0.0524, "step": 26010 }, { "epoch": 1.932273874944304, "grad_norm": 1.406113624572754, "learning_rate": 1.8406356750334175e-05, "loss": 0.0471, "step": 26020 }, { "epoch": 1.9330164859646517, "grad_norm": 2.197995185852051, "learning_rate": 1.840190108421209e-05, "loss": 0.0518, "step": 26030 }, { "epoch": 1.9337590969849994, "grad_norm": 0.5076370239257812, "learning_rate": 1.8397445418090005e-05, "loss": 0.056, "step": 26040 }, { "epoch": 1.9345017080053468, "grad_norm": 0.6380198001861572, "learning_rate": 1.839298975196792e-05, "loss": 0.0787, "step": 26050 }, { "epoch": 1.9352443190256943, "grad_norm": 0.7328625321388245, "learning_rate": 1.8388534085845832e-05, "loss": 0.064, "step": 26060 }, { "epoch": 1.9359869300460417, "grad_norm": 1.652590274810791, "learning_rate": 1.838407841972375e-05, "loss": 0.0843, "step": 26070 }, { "epoch": 1.9367295410663894, "grad_norm": 3.7497518062591553, "learning_rate": 1.8379622753601665e-05, "loss": 0.0686, "step": 26080 }, { "epoch": 1.937472152086737, "grad_norm": 0.787777304649353, "learning_rate": 1.8375167087479577e-05, "loss": 0.1322, "step": 26090 }, { "epoch": 1.9382147631070845, "grad_norm": 2.129948616027832, "learning_rate": 1.8370711421357492e-05, "loss": 0.0428, "step": 26100 }, { "epoch": 1.938957374127432, "grad_norm": 3.463418960571289, "learning_rate": 1.836625575523541e-05, "loss": 0.0811, "step": 26110 }, { "epoch": 1.9396999851477796, "grad_norm": 0.8304588794708252, "learning_rate": 1.8361800089113322e-05, "loss": 0.0744, "step": 26120 }, { "epoch": 1.940442596168127, "grad_norm": 0.47356998920440674, "learning_rate": 1.8357344422991237e-05, "loss": 0.0558, "step": 26130 }, { "epoch": 1.9411852071884748, "grad_norm": 1.1219099760055542, "learning_rate": 1.8352888756869155e-05, "loss": 0.0752, "step": 26140 }, { "epoch": 1.9419278182088222, "grad_norm": 1.6041889190673828, "learning_rate": 1.8348433090747067e-05, "loss": 0.0737, "step": 26150 }, { "epoch": 1.9426704292291697, "grad_norm": 0.80296790599823, "learning_rate": 1.8343977424624982e-05, "loss": 0.074, "step": 26160 }, { "epoch": 1.9434130402495173, "grad_norm": 2.8766558170318604, "learning_rate": 1.8339521758502893e-05, "loss": 0.0874, "step": 26170 }, { "epoch": 1.944155651269865, "grad_norm": 0.7357403635978699, "learning_rate": 1.8335066092380812e-05, "loss": 0.0765, "step": 26180 }, { "epoch": 1.9448982622902125, "grad_norm": 2.0172839164733887, "learning_rate": 1.8330610426258727e-05, "loss": 0.0734, "step": 26190 }, { "epoch": 1.94564087331056, "grad_norm": 1.475164771080017, "learning_rate": 1.832615476013664e-05, "loss": 0.092, "step": 26200 }, { "epoch": 1.9463834843309074, "grad_norm": 0.9614417552947998, "learning_rate": 1.8321699094014557e-05, "loss": 0.0785, "step": 26210 }, { "epoch": 1.947126095351255, "grad_norm": 2.393979549407959, "learning_rate": 1.8317243427892472e-05, "loss": 0.0762, "step": 26220 }, { "epoch": 1.9478687063716027, "grad_norm": 2.239128589630127, "learning_rate": 1.8312787761770383e-05, "loss": 0.0603, "step": 26230 }, { "epoch": 1.9486113173919501, "grad_norm": 0.7804839015007019, "learning_rate": 1.8308332095648302e-05, "loss": 0.0607, "step": 26240 }, { "epoch": 1.9493539284122976, "grad_norm": 1.400680422782898, "learning_rate": 1.8303876429526217e-05, "loss": 0.0694, "step": 26250 }, { "epoch": 1.950096539432645, "grad_norm": 1.6606298685073853, "learning_rate": 1.829942076340413e-05, "loss": 0.0692, "step": 26260 }, { "epoch": 1.9508391504529927, "grad_norm": 1.0099084377288818, "learning_rate": 1.8294965097282043e-05, "loss": 0.0588, "step": 26270 }, { "epoch": 1.9515817614733404, "grad_norm": 1.2126892805099487, "learning_rate": 1.829050943115996e-05, "loss": 0.0827, "step": 26280 }, { "epoch": 1.9523243724936878, "grad_norm": 2.0662033557891846, "learning_rate": 1.8286053765037873e-05, "loss": 0.057, "step": 26290 }, { "epoch": 1.9530669835140353, "grad_norm": 1.9331927299499512, "learning_rate": 1.828159809891579e-05, "loss": 0.073, "step": 26300 }, { "epoch": 1.9538095945343827, "grad_norm": 2.7068533897399902, "learning_rate": 1.8277142432793703e-05, "loss": 0.0848, "step": 26310 }, { "epoch": 1.9545522055547304, "grad_norm": 0.645256519317627, "learning_rate": 1.827268676667162e-05, "loss": 0.0731, "step": 26320 }, { "epoch": 1.955294816575078, "grad_norm": 0.46004560589790344, "learning_rate": 1.8268231100549533e-05, "loss": 0.0556, "step": 26330 }, { "epoch": 1.9560374275954255, "grad_norm": 2.7907826900482178, "learning_rate": 1.8263775434427445e-05, "loss": 0.076, "step": 26340 }, { "epoch": 1.956780038615773, "grad_norm": 0.48506757616996765, "learning_rate": 1.8259319768305363e-05, "loss": 0.0669, "step": 26350 }, { "epoch": 1.9575226496361204, "grad_norm": 2.199068546295166, "learning_rate": 1.825486410218328e-05, "loss": 0.0996, "step": 26360 }, { "epoch": 1.958265260656468, "grad_norm": 1.1990214586257935, "learning_rate": 1.825040843606119e-05, "loss": 0.0453, "step": 26370 }, { "epoch": 1.9590078716768158, "grad_norm": 1.2882795333862305, "learning_rate": 1.824595276993911e-05, "loss": 0.064, "step": 26380 }, { "epoch": 1.9597504826971632, "grad_norm": 0.6526997089385986, "learning_rate": 1.824149710381702e-05, "loss": 0.0817, "step": 26390 }, { "epoch": 1.9604930937175107, "grad_norm": 0.8225412368774414, "learning_rate": 1.8237041437694935e-05, "loss": 0.0611, "step": 26400 }, { "epoch": 1.9612357047378584, "grad_norm": 0.5846786499023438, "learning_rate": 1.8232585771572853e-05, "loss": 0.0877, "step": 26410 }, { "epoch": 1.9619783157582058, "grad_norm": 3.2029032707214355, "learning_rate": 1.8228130105450765e-05, "loss": 0.0773, "step": 26420 }, { "epoch": 1.9627209267785535, "grad_norm": 1.3183096647262573, "learning_rate": 1.822367443932868e-05, "loss": 0.052, "step": 26430 }, { "epoch": 1.963463537798901, "grad_norm": 0.731730043888092, "learning_rate": 1.8219218773206595e-05, "loss": 0.081, "step": 26440 }, { "epoch": 1.9642061488192484, "grad_norm": 1.890268087387085, "learning_rate": 1.821476310708451e-05, "loss": 0.0529, "step": 26450 }, { "epoch": 1.964948759839596, "grad_norm": 1.2270501852035522, "learning_rate": 1.8210307440962425e-05, "loss": 0.0869, "step": 26460 }, { "epoch": 1.9656913708599437, "grad_norm": 0.9441844820976257, "learning_rate": 1.8205851774840337e-05, "loss": 0.0552, "step": 26470 }, { "epoch": 1.9664339818802912, "grad_norm": 1.5903592109680176, "learning_rate": 1.8201396108718255e-05, "loss": 0.0693, "step": 26480 }, { "epoch": 1.9671765929006386, "grad_norm": 3.176476001739502, "learning_rate": 1.819694044259617e-05, "loss": 0.0863, "step": 26490 }, { "epoch": 1.967919203920986, "grad_norm": 0.41649898886680603, "learning_rate": 1.819248477647408e-05, "loss": 0.0417, "step": 26500 }, { "epoch": 1.9686618149413337, "grad_norm": 1.182588815689087, "learning_rate": 1.8188029110351997e-05, "loss": 0.0692, "step": 26510 }, { "epoch": 1.9694044259616814, "grad_norm": 1.535315990447998, "learning_rate": 1.8183573444229915e-05, "loss": 0.056, "step": 26520 }, { "epoch": 1.9701470369820289, "grad_norm": 0.7827800512313843, "learning_rate": 1.8179117778107826e-05, "loss": 0.0994, "step": 26530 }, { "epoch": 1.9708896480023763, "grad_norm": 1.4649193286895752, "learning_rate": 1.817466211198574e-05, "loss": 0.0624, "step": 26540 }, { "epoch": 1.9716322590227238, "grad_norm": 2.715514898300171, "learning_rate": 1.817020644586366e-05, "loss": 0.0686, "step": 26550 }, { "epoch": 1.9723748700430714, "grad_norm": 2.133049964904785, "learning_rate": 1.816575077974157e-05, "loss": 0.0645, "step": 26560 }, { "epoch": 1.973117481063419, "grad_norm": 1.230670690536499, "learning_rate": 1.8161295113619486e-05, "loss": 0.0873, "step": 26570 }, { "epoch": 1.9738600920837666, "grad_norm": 1.5158164501190186, "learning_rate": 1.8156839447497398e-05, "loss": 0.0826, "step": 26580 }, { "epoch": 1.974602703104114, "grad_norm": 2.380052089691162, "learning_rate": 1.8152383781375316e-05, "loss": 0.0839, "step": 26590 }, { "epoch": 1.9753453141244615, "grad_norm": 0.9779214262962341, "learning_rate": 1.814792811525323e-05, "loss": 0.1189, "step": 26600 }, { "epoch": 1.9760879251448091, "grad_norm": 0.7717707753181458, "learning_rate": 1.8143472449131143e-05, "loss": 0.0651, "step": 26610 }, { "epoch": 1.9768305361651568, "grad_norm": 1.0977226495742798, "learning_rate": 1.813901678300906e-05, "loss": 0.052, "step": 26620 }, { "epoch": 1.9775731471855043, "grad_norm": 1.963529348373413, "learning_rate": 1.8134561116886976e-05, "loss": 0.06, "step": 26630 }, { "epoch": 1.9783157582058517, "grad_norm": 0.4046013653278351, "learning_rate": 1.8130105450764888e-05, "loss": 0.0571, "step": 26640 }, { "epoch": 1.9790583692261992, "grad_norm": 1.8156684637069702, "learning_rate": 1.8125649784642806e-05, "loss": 0.0798, "step": 26650 }, { "epoch": 1.9798009802465468, "grad_norm": 0.9352402687072754, "learning_rate": 1.812119411852072e-05, "loss": 0.0706, "step": 26660 }, { "epoch": 1.9805435912668945, "grad_norm": 2.6892099380493164, "learning_rate": 1.8116738452398633e-05, "loss": 0.0716, "step": 26670 }, { "epoch": 1.981286202287242, "grad_norm": 1.3051759004592896, "learning_rate": 1.8112282786276548e-05, "loss": 0.0775, "step": 26680 }, { "epoch": 1.9820288133075894, "grad_norm": 0.3577052056789398, "learning_rate": 1.8107827120154463e-05, "loss": 0.0396, "step": 26690 }, { "epoch": 1.982771424327937, "grad_norm": 1.0343585014343262, "learning_rate": 1.8103371454032378e-05, "loss": 0.064, "step": 26700 }, { "epoch": 1.9835140353482845, "grad_norm": 2.462855100631714, "learning_rate": 1.8098915787910293e-05, "loss": 0.0882, "step": 26710 }, { "epoch": 1.9842566463686322, "grad_norm": 2.172545909881592, "learning_rate": 1.8094460121788208e-05, "loss": 0.0716, "step": 26720 }, { "epoch": 1.9849992573889796, "grad_norm": 1.9032946825027466, "learning_rate": 1.8090004455666123e-05, "loss": 0.0691, "step": 26730 }, { "epoch": 1.985741868409327, "grad_norm": 0.6433393359184265, "learning_rate": 1.8085548789544038e-05, "loss": 0.0452, "step": 26740 }, { "epoch": 1.9864844794296748, "grad_norm": 2.729414939880371, "learning_rate": 1.808109312342195e-05, "loss": 0.0704, "step": 26750 }, { "epoch": 1.9872270904500224, "grad_norm": 1.0757064819335938, "learning_rate": 1.8076637457299868e-05, "loss": 0.0853, "step": 26760 }, { "epoch": 1.9879697014703699, "grad_norm": 0.4232407212257385, "learning_rate": 1.8072181791177783e-05, "loss": 0.0757, "step": 26770 }, { "epoch": 1.9887123124907173, "grad_norm": 1.7221150398254395, "learning_rate": 1.8067726125055695e-05, "loss": 0.0709, "step": 26780 }, { "epoch": 1.9894549235110648, "grad_norm": 1.9497733116149902, "learning_rate": 1.8063270458933613e-05, "loss": 0.0886, "step": 26790 }, { "epoch": 1.9901975345314125, "grad_norm": 2.333503484725952, "learning_rate": 1.8058814792811525e-05, "loss": 0.0528, "step": 26800 }, { "epoch": 1.9909401455517601, "grad_norm": 0.809281051158905, "learning_rate": 1.805435912668944e-05, "loss": 0.0683, "step": 26810 }, { "epoch": 1.9916827565721076, "grad_norm": 3.6550369262695312, "learning_rate": 1.8049903460567358e-05, "loss": 0.0725, "step": 26820 }, { "epoch": 1.992425367592455, "grad_norm": 2.8974320888519287, "learning_rate": 1.804544779444527e-05, "loss": 0.0892, "step": 26830 }, { "epoch": 1.9931679786128025, "grad_norm": 0.9259861707687378, "learning_rate": 1.8040992128323185e-05, "loss": 0.0665, "step": 26840 }, { "epoch": 1.9939105896331502, "grad_norm": 2.125751495361328, "learning_rate": 1.80365364622011e-05, "loss": 0.0634, "step": 26850 }, { "epoch": 1.9946532006534978, "grad_norm": 1.0825103521347046, "learning_rate": 1.8032080796079015e-05, "loss": 0.0708, "step": 26860 }, { "epoch": 1.9953958116738453, "grad_norm": 1.3801538944244385, "learning_rate": 1.802762512995693e-05, "loss": 0.0787, "step": 26870 }, { "epoch": 1.9961384226941927, "grad_norm": 0.7324304580688477, "learning_rate": 1.8023169463834845e-05, "loss": 0.0689, "step": 26880 }, { "epoch": 1.9968810337145402, "grad_norm": 0.9306546449661255, "learning_rate": 1.801871379771276e-05, "loss": 0.0895, "step": 26890 }, { "epoch": 1.9976236447348878, "grad_norm": 3.5003910064697266, "learning_rate": 1.8014258131590675e-05, "loss": 0.057, "step": 26900 }, { "epoch": 1.9983662557552355, "grad_norm": 1.8445905447006226, "learning_rate": 1.8009802465468586e-05, "loss": 0.0806, "step": 26910 }, { "epoch": 1.999108866775583, "grad_norm": 2.1891441345214844, "learning_rate": 1.80053467993465e-05, "loss": 0.0742, "step": 26920 }, { "epoch": 1.9998514777959304, "grad_norm": 2.6680564880371094, "learning_rate": 1.800089113322442e-05, "loss": 0.0612, "step": 26930 }, { "epoch": 2.0, "eval_f1": 0.0, "eval_loss": 0.05881134420633316, "eval_runtime": 790.8114, "eval_samples_per_second": 48.076, "eval_steps_per_second": 3.006, "step": 26932 }, { "epoch": 2.000594088816278, "grad_norm": 1.1317378282546997, "learning_rate": 1.799643546710233e-05, "loss": 0.0591, "step": 26940 }, { "epoch": 2.0013366998366258, "grad_norm": 1.214632272720337, "learning_rate": 1.7991979800980246e-05, "loss": 0.0469, "step": 26950 }, { "epoch": 2.002079310856973, "grad_norm": 0.5876008868217468, "learning_rate": 1.7987524134858164e-05, "loss": 0.0565, "step": 26960 }, { "epoch": 2.0028219218773207, "grad_norm": 0.7250917553901672, "learning_rate": 1.7983068468736076e-05, "loss": 0.0597, "step": 26970 }, { "epoch": 2.003564532897668, "grad_norm": 1.3954814672470093, "learning_rate": 1.797861280261399e-05, "loss": 0.098, "step": 26980 }, { "epoch": 2.0043071439180156, "grad_norm": 0.8022347092628479, "learning_rate": 1.7974157136491906e-05, "loss": 0.0634, "step": 26990 }, { "epoch": 2.0050497549383635, "grad_norm": 1.9625482559204102, "learning_rate": 1.796970147036982e-05, "loss": 0.0774, "step": 27000 }, { "epoch": 2.005792365958711, "grad_norm": 1.0475565195083618, "learning_rate": 1.7965245804247736e-05, "loss": 0.0798, "step": 27010 }, { "epoch": 2.0065349769790584, "grad_norm": 1.3007713556289673, "learning_rate": 1.7960790138125648e-05, "loss": 0.0721, "step": 27020 }, { "epoch": 2.007277587999406, "grad_norm": 1.5825034379959106, "learning_rate": 1.7956334472003566e-05, "loss": 0.0741, "step": 27030 }, { "epoch": 2.0080201990197533, "grad_norm": 1.4288562536239624, "learning_rate": 1.795187880588148e-05, "loss": 0.0606, "step": 27040 }, { "epoch": 2.008762810040101, "grad_norm": 1.6191655397415161, "learning_rate": 1.7947423139759393e-05, "loss": 0.0893, "step": 27050 }, { "epoch": 2.0095054210604486, "grad_norm": 3.0551598072052, "learning_rate": 1.794296747363731e-05, "loss": 0.0609, "step": 27060 }, { "epoch": 2.010248032080796, "grad_norm": 0.9543463587760925, "learning_rate": 1.7938511807515226e-05, "loss": 0.0685, "step": 27070 }, { "epoch": 2.0109906431011435, "grad_norm": 1.9315248727798462, "learning_rate": 1.7934056141393138e-05, "loss": 0.0581, "step": 27080 }, { "epoch": 2.011733254121491, "grad_norm": 1.0615206956863403, "learning_rate": 1.7929600475271053e-05, "loss": 0.0834, "step": 27090 }, { "epoch": 2.012475865141839, "grad_norm": 3.055593729019165, "learning_rate": 1.7925144809148968e-05, "loss": 0.0777, "step": 27100 }, { "epoch": 2.0132184761621863, "grad_norm": 0.9650170207023621, "learning_rate": 1.7920689143026883e-05, "loss": 0.0563, "step": 27110 }, { "epoch": 2.0139610871825337, "grad_norm": 1.2124733924865723, "learning_rate": 1.7916233476904798e-05, "loss": 0.0975, "step": 27120 }, { "epoch": 2.014703698202881, "grad_norm": 2.321781635284424, "learning_rate": 1.7911777810782713e-05, "loss": 0.0769, "step": 27130 }, { "epoch": 2.015446309223229, "grad_norm": 1.0764652490615845, "learning_rate": 1.7907322144660628e-05, "loss": 0.0526, "step": 27140 }, { "epoch": 2.0161889202435765, "grad_norm": 1.4031742811203003, "learning_rate": 1.7902866478538543e-05, "loss": 0.061, "step": 27150 }, { "epoch": 2.016931531263924, "grad_norm": 0.9712595343589783, "learning_rate": 1.7898410812416454e-05, "loss": 0.0627, "step": 27160 }, { "epoch": 2.0176741422842714, "grad_norm": 1.150699496269226, "learning_rate": 1.7893955146294373e-05, "loss": 0.0839, "step": 27170 }, { "epoch": 2.018416753304619, "grad_norm": 1.0646690130233765, "learning_rate": 1.7889499480172288e-05, "loss": 0.084, "step": 27180 }, { "epoch": 2.019159364324967, "grad_norm": 1.4827255010604858, "learning_rate": 1.78850438140502e-05, "loss": 0.0448, "step": 27190 }, { "epoch": 2.0199019753453142, "grad_norm": 1.7980319261550903, "learning_rate": 1.7880588147928118e-05, "loss": 0.0918, "step": 27200 }, { "epoch": 2.0206445863656617, "grad_norm": 1.5512464046478271, "learning_rate": 1.787613248180603e-05, "loss": 0.0685, "step": 27210 }, { "epoch": 2.021387197386009, "grad_norm": 1.1397250890731812, "learning_rate": 1.7871676815683944e-05, "loss": 0.0731, "step": 27220 }, { "epoch": 2.0221298084063566, "grad_norm": 2.0807673931121826, "learning_rate": 1.7867221149561863e-05, "loss": 0.0851, "step": 27230 }, { "epoch": 2.0228724194267045, "grad_norm": 3.3200225830078125, "learning_rate": 1.7862765483439774e-05, "loss": 0.0615, "step": 27240 }, { "epoch": 2.023615030447052, "grad_norm": 1.3116739988327026, "learning_rate": 1.785830981731769e-05, "loss": 0.0569, "step": 27250 }, { "epoch": 2.0243576414673994, "grad_norm": 1.8456593751907349, "learning_rate": 1.7853854151195604e-05, "loss": 0.0873, "step": 27260 }, { "epoch": 2.025100252487747, "grad_norm": 1.7277987003326416, "learning_rate": 1.784939848507352e-05, "loss": 0.0576, "step": 27270 }, { "epoch": 2.0258428635080943, "grad_norm": 0.4091399013996124, "learning_rate": 1.7844942818951434e-05, "loss": 0.0437, "step": 27280 }, { "epoch": 2.026585474528442, "grad_norm": 3.058016300201416, "learning_rate": 1.784048715282935e-05, "loss": 0.0525, "step": 27290 }, { "epoch": 2.0273280855487896, "grad_norm": 3.718642234802246, "learning_rate": 1.7836031486707264e-05, "loss": 0.0712, "step": 27300 }, { "epoch": 2.028070696569137, "grad_norm": 2.157290458679199, "learning_rate": 1.783157582058518e-05, "loss": 0.0617, "step": 27310 }, { "epoch": 2.0288133075894845, "grad_norm": 2.4551494121551514, "learning_rate": 1.782712015446309e-05, "loss": 0.0888, "step": 27320 }, { "epoch": 2.029555918609832, "grad_norm": 1.5772738456726074, "learning_rate": 1.7822664488341006e-05, "loss": 0.0713, "step": 27330 }, { "epoch": 2.03029852963018, "grad_norm": 0.9461155533790588, "learning_rate": 1.7818208822218924e-05, "loss": 0.0612, "step": 27340 }, { "epoch": 2.0310411406505273, "grad_norm": 1.6049461364746094, "learning_rate": 1.7813753156096836e-05, "loss": 0.063, "step": 27350 }, { "epoch": 2.0317837516708748, "grad_norm": 0.41540223360061646, "learning_rate": 1.780929748997475e-05, "loss": 0.064, "step": 27360 }, { "epoch": 2.032526362691222, "grad_norm": 1.894874095916748, "learning_rate": 1.780484182385267e-05, "loss": 0.0669, "step": 27370 }, { "epoch": 2.0332689737115697, "grad_norm": 0.4419223964214325, "learning_rate": 1.780038615773058e-05, "loss": 0.0402, "step": 27380 }, { "epoch": 2.0340115847319176, "grad_norm": 0.962150514125824, "learning_rate": 1.7795930491608496e-05, "loss": 0.0849, "step": 27390 }, { "epoch": 2.034754195752265, "grad_norm": 1.1691762208938599, "learning_rate": 1.779147482548641e-05, "loss": 0.0736, "step": 27400 }, { "epoch": 2.0354968067726125, "grad_norm": 1.5563279390335083, "learning_rate": 1.7787019159364326e-05, "loss": 0.0569, "step": 27410 }, { "epoch": 2.03623941779296, "grad_norm": 2.58371639251709, "learning_rate": 1.778256349324224e-05, "loss": 0.0672, "step": 27420 }, { "epoch": 2.0369820288133074, "grad_norm": 0.5150777101516724, "learning_rate": 1.7778107827120152e-05, "loss": 0.0833, "step": 27430 }, { "epoch": 2.0377246398336553, "grad_norm": 2.1422712802886963, "learning_rate": 1.777365216099807e-05, "loss": 0.0657, "step": 27440 }, { "epoch": 2.0384672508540027, "grad_norm": 1.660101294517517, "learning_rate": 1.7769196494875986e-05, "loss": 0.0678, "step": 27450 }, { "epoch": 2.03920986187435, "grad_norm": 2.03857159614563, "learning_rate": 1.7764740828753897e-05, "loss": 0.0801, "step": 27460 }, { "epoch": 2.0399524728946976, "grad_norm": 1.208047866821289, "learning_rate": 1.7760285162631816e-05, "loss": 0.0321, "step": 27470 }, { "epoch": 2.0406950839150455, "grad_norm": 0.9345359206199646, "learning_rate": 1.775582949650973e-05, "loss": 0.0667, "step": 27480 }, { "epoch": 2.041437694935393, "grad_norm": 2.416853427886963, "learning_rate": 1.7751373830387642e-05, "loss": 0.089, "step": 27490 }, { "epoch": 2.0421803059557404, "grad_norm": 0.6501384377479553, "learning_rate": 1.7746918164265557e-05, "loss": 0.0976, "step": 27500 }, { "epoch": 2.042922916976088, "grad_norm": 1.3015940189361572, "learning_rate": 1.7742462498143472e-05, "loss": 0.0723, "step": 27510 }, { "epoch": 2.0436655279964353, "grad_norm": 1.5806890726089478, "learning_rate": 1.7738006832021387e-05, "loss": 0.0675, "step": 27520 }, { "epoch": 2.044408139016783, "grad_norm": 1.8144307136535645, "learning_rate": 1.7733551165899302e-05, "loss": 0.0506, "step": 27530 }, { "epoch": 2.0451507500371306, "grad_norm": 1.104903221130371, "learning_rate": 1.7729095499777217e-05, "loss": 0.0819, "step": 27540 }, { "epoch": 2.045893361057478, "grad_norm": 2.584608554840088, "learning_rate": 1.7724639833655132e-05, "loss": 0.0699, "step": 27550 }, { "epoch": 2.0466359720778255, "grad_norm": 2.7305595874786377, "learning_rate": 1.7720184167533047e-05, "loss": 0.0954, "step": 27560 }, { "epoch": 2.047378583098173, "grad_norm": 1.475791573524475, "learning_rate": 1.771572850141096e-05, "loss": 0.0682, "step": 27570 }, { "epoch": 2.048121194118521, "grad_norm": 0.9141472578048706, "learning_rate": 1.7711272835288877e-05, "loss": 0.0583, "step": 27580 }, { "epoch": 2.0488638051388683, "grad_norm": 1.4412683248519897, "learning_rate": 1.7706817169166792e-05, "loss": 0.0739, "step": 27590 }, { "epoch": 2.049606416159216, "grad_norm": 2.051802158355713, "learning_rate": 1.7702361503044704e-05, "loss": 0.0864, "step": 27600 }, { "epoch": 2.0503490271795632, "grad_norm": 1.61028254032135, "learning_rate": 1.7697905836922622e-05, "loss": 0.0379, "step": 27610 }, { "epoch": 2.0510916381999107, "grad_norm": 1.2123854160308838, "learning_rate": 1.7693450170800534e-05, "loss": 0.0821, "step": 27620 }, { "epoch": 2.0518342492202586, "grad_norm": 2.0213537216186523, "learning_rate": 1.768899450467845e-05, "loss": 0.0724, "step": 27630 }, { "epoch": 2.052576860240606, "grad_norm": 1.507546067237854, "learning_rate": 1.7684538838556367e-05, "loss": 0.0844, "step": 27640 }, { "epoch": 2.0533194712609535, "grad_norm": 1.3332023620605469, "learning_rate": 1.768008317243428e-05, "loss": 0.0679, "step": 27650 }, { "epoch": 2.054062082281301, "grad_norm": 1.5484570264816284, "learning_rate": 1.7675627506312194e-05, "loss": 0.0898, "step": 27660 }, { "epoch": 2.0548046933016484, "grad_norm": 1.267923355102539, "learning_rate": 1.767117184019011e-05, "loss": 0.1073, "step": 27670 }, { "epoch": 2.0555473043219963, "grad_norm": 2.02040958404541, "learning_rate": 1.7666716174068024e-05, "loss": 0.0747, "step": 27680 }, { "epoch": 2.0562899153423437, "grad_norm": 0.6762095093727112, "learning_rate": 1.766226050794594e-05, "loss": 0.0501, "step": 27690 }, { "epoch": 2.057032526362691, "grad_norm": 2.8425498008728027, "learning_rate": 1.7657804841823854e-05, "loss": 0.0708, "step": 27700 }, { "epoch": 2.0577751373830386, "grad_norm": 2.2198352813720703, "learning_rate": 1.765334917570177e-05, "loss": 0.0625, "step": 27710 }, { "epoch": 2.0585177484033865, "grad_norm": 0.9430508613586426, "learning_rate": 1.7648893509579684e-05, "loss": 0.0844, "step": 27720 }, { "epoch": 2.059260359423734, "grad_norm": 3.4453015327453613, "learning_rate": 1.7644437843457595e-05, "loss": 0.0821, "step": 27730 }, { "epoch": 2.0600029704440814, "grad_norm": 1.2415636777877808, "learning_rate": 1.763998217733551e-05, "loss": 0.0574, "step": 27740 }, { "epoch": 2.060745581464429, "grad_norm": 1.088160753250122, "learning_rate": 1.763552651121343e-05, "loss": 0.0639, "step": 27750 }, { "epoch": 2.0614881924847763, "grad_norm": 0.9937611818313599, "learning_rate": 1.763107084509134e-05, "loss": 0.0905, "step": 27760 }, { "epoch": 2.062230803505124, "grad_norm": 0.7093477845191956, "learning_rate": 1.7626615178969255e-05, "loss": 0.0714, "step": 27770 }, { "epoch": 2.0629734145254717, "grad_norm": 2.274669885635376, "learning_rate": 1.7622159512847174e-05, "loss": 0.0692, "step": 27780 }, { "epoch": 2.063716025545819, "grad_norm": 1.7703497409820557, "learning_rate": 1.7617703846725085e-05, "loss": 0.0621, "step": 27790 }, { "epoch": 2.0644586365661666, "grad_norm": 0.3393421173095703, "learning_rate": 1.7613248180603e-05, "loss": 0.0528, "step": 27800 }, { "epoch": 2.065201247586514, "grad_norm": 1.3570091724395752, "learning_rate": 1.760879251448092e-05, "loss": 0.0585, "step": 27810 }, { "epoch": 2.065943858606862, "grad_norm": 1.6953606605529785, "learning_rate": 1.760433684835883e-05, "loss": 0.0678, "step": 27820 }, { "epoch": 2.0666864696272094, "grad_norm": 0.6332347393035889, "learning_rate": 1.7599881182236745e-05, "loss": 0.0474, "step": 27830 }, { "epoch": 2.067429080647557, "grad_norm": 4.120887756347656, "learning_rate": 1.7595425516114657e-05, "loss": 0.1038, "step": 27840 }, { "epoch": 2.0681716916679043, "grad_norm": 2.6775243282318115, "learning_rate": 1.7590969849992575e-05, "loss": 0.0786, "step": 27850 }, { "epoch": 2.0689143026882517, "grad_norm": 1.4522796869277954, "learning_rate": 1.758651418387049e-05, "loss": 0.0603, "step": 27860 }, { "epoch": 2.0696569137085996, "grad_norm": 1.7313041687011719, "learning_rate": 1.7582058517748402e-05, "loss": 0.0938, "step": 27870 }, { "epoch": 2.070399524728947, "grad_norm": 0.7014159560203552, "learning_rate": 1.757760285162632e-05, "loss": 0.0905, "step": 27880 }, { "epoch": 2.0711421357492945, "grad_norm": 0.5226468443870544, "learning_rate": 1.7573147185504235e-05, "loss": 0.0669, "step": 27890 }, { "epoch": 2.071884746769642, "grad_norm": 0.4912814795970917, "learning_rate": 1.7568691519382147e-05, "loss": 0.0562, "step": 27900 }, { "epoch": 2.0726273577899894, "grad_norm": 0.46344590187072754, "learning_rate": 1.7564235853260062e-05, "loss": 0.0722, "step": 27910 }, { "epoch": 2.0733699688103373, "grad_norm": 0.5316863656044006, "learning_rate": 1.7559780187137977e-05, "loss": 0.0782, "step": 27920 }, { "epoch": 2.0741125798306848, "grad_norm": 1.1357983350753784, "learning_rate": 1.7555324521015892e-05, "loss": 0.0679, "step": 27930 }, { "epoch": 2.074855190851032, "grad_norm": 0.679740309715271, "learning_rate": 1.7550868854893807e-05, "loss": 0.0618, "step": 27940 }, { "epoch": 2.0755978018713797, "grad_norm": 1.716489553451538, "learning_rate": 1.7546413188771722e-05, "loss": 0.1001, "step": 27950 }, { "epoch": 2.076340412891727, "grad_norm": 1.4694855213165283, "learning_rate": 1.7541957522649637e-05, "loss": 0.068, "step": 27960 }, { "epoch": 2.077083023912075, "grad_norm": 2.162365436553955, "learning_rate": 1.7537501856527552e-05, "loss": 0.0555, "step": 27970 }, { "epoch": 2.0778256349324224, "grad_norm": 1.0880649089813232, "learning_rate": 1.7533046190405463e-05, "loss": 0.0558, "step": 27980 }, { "epoch": 2.07856824595277, "grad_norm": 2.1188676357269287, "learning_rate": 1.7528590524283382e-05, "loss": 0.0862, "step": 27990 }, { "epoch": 2.0793108569731173, "grad_norm": 1.4988460540771484, "learning_rate": 1.7524134858161297e-05, "loss": 0.0816, "step": 28000 }, { "epoch": 2.080053467993465, "grad_norm": 0.9901124238967896, "learning_rate": 1.751967919203921e-05, "loss": 0.0772, "step": 28010 }, { "epoch": 2.0807960790138127, "grad_norm": 1.7967792749404907, "learning_rate": 1.7515223525917127e-05, "loss": 0.062, "step": 28020 }, { "epoch": 2.08153869003416, "grad_norm": 1.3273664712905884, "learning_rate": 1.751076785979504e-05, "loss": 0.0599, "step": 28030 }, { "epoch": 2.0822813010545076, "grad_norm": 2.504648447036743, "learning_rate": 1.7506312193672953e-05, "loss": 0.0437, "step": 28040 }, { "epoch": 2.083023912074855, "grad_norm": 1.477177381515503, "learning_rate": 1.7501856527550872e-05, "loss": 0.0927, "step": 28050 }, { "epoch": 2.083766523095203, "grad_norm": 1.7123514413833618, "learning_rate": 1.7497400861428783e-05, "loss": 0.0999, "step": 28060 }, { "epoch": 2.0845091341155504, "grad_norm": 0.7901507616043091, "learning_rate": 1.74929451953067e-05, "loss": 0.0681, "step": 28070 }, { "epoch": 2.085251745135898, "grad_norm": 0.9315057396888733, "learning_rate": 1.7488489529184613e-05, "loss": 0.045, "step": 28080 }, { "epoch": 2.0859943561562453, "grad_norm": 0.80745929479599, "learning_rate": 1.748403386306253e-05, "loss": 0.0729, "step": 28090 }, { "epoch": 2.0867369671765927, "grad_norm": 1.3841748237609863, "learning_rate": 1.7479578196940443e-05, "loss": 0.0651, "step": 28100 }, { "epoch": 2.0874795781969406, "grad_norm": 4.800222873687744, "learning_rate": 1.747512253081836e-05, "loss": 0.0764, "step": 28110 }, { "epoch": 2.088222189217288, "grad_norm": 1.6602140665054321, "learning_rate": 1.7470666864696273e-05, "loss": 0.0666, "step": 28120 }, { "epoch": 2.0889648002376355, "grad_norm": 1.5725599527359009, "learning_rate": 1.7466211198574188e-05, "loss": 0.0759, "step": 28130 }, { "epoch": 2.089707411257983, "grad_norm": 2.319767713546753, "learning_rate": 1.74617555324521e-05, "loss": 0.1133, "step": 28140 }, { "epoch": 2.0904500222783304, "grad_norm": 2.916980266571045, "learning_rate": 1.7457299866330015e-05, "loss": 0.0703, "step": 28150 }, { "epoch": 2.0911926332986783, "grad_norm": 2.074702501296997, "learning_rate": 1.7452844200207933e-05, "loss": 0.0736, "step": 28160 }, { "epoch": 2.0919352443190258, "grad_norm": 1.4027667045593262, "learning_rate": 1.7448388534085845e-05, "loss": 0.0675, "step": 28170 }, { "epoch": 2.0926778553393732, "grad_norm": 2.333289861679077, "learning_rate": 1.744393286796376e-05, "loss": 0.0573, "step": 28180 }, { "epoch": 2.0934204663597207, "grad_norm": 1.7453340291976929, "learning_rate": 1.7439477201841678e-05, "loss": 0.0922, "step": 28190 }, { "epoch": 2.094163077380068, "grad_norm": 0.6930978894233704, "learning_rate": 1.743502153571959e-05, "loss": 0.0847, "step": 28200 }, { "epoch": 2.094905688400416, "grad_norm": 1.3762452602386475, "learning_rate": 1.7430565869597505e-05, "loss": 0.1109, "step": 28210 }, { "epoch": 2.0956482994207635, "grad_norm": 0.3616078197956085, "learning_rate": 1.7426110203475423e-05, "loss": 0.0753, "step": 28220 }, { "epoch": 2.096390910441111, "grad_norm": 2.1487832069396973, "learning_rate": 1.7421654537353335e-05, "loss": 0.0828, "step": 28230 }, { "epoch": 2.0971335214614584, "grad_norm": 0.9581325054168701, "learning_rate": 1.741719887123125e-05, "loss": 0.0691, "step": 28240 }, { "epoch": 2.097876132481806, "grad_norm": 2.8964858055114746, "learning_rate": 1.741274320510916e-05, "loss": 0.0917, "step": 28250 }, { "epoch": 2.0986187435021537, "grad_norm": 0.576937198638916, "learning_rate": 1.740828753898708e-05, "loss": 0.0624, "step": 28260 }, { "epoch": 2.099361354522501, "grad_norm": 1.5471432209014893, "learning_rate": 1.7403831872864995e-05, "loss": 0.0873, "step": 28270 }, { "epoch": 2.1001039655428486, "grad_norm": 1.0015627145767212, "learning_rate": 1.7399376206742906e-05, "loss": 0.0765, "step": 28280 }, { "epoch": 2.100846576563196, "grad_norm": 2.5197479724884033, "learning_rate": 1.7394920540620825e-05, "loss": 0.0679, "step": 28290 }, { "epoch": 2.101589187583544, "grad_norm": 1.3474407196044922, "learning_rate": 1.739046487449874e-05, "loss": 0.0497, "step": 28300 }, { "epoch": 2.1023317986038914, "grad_norm": 1.8976625204086304, "learning_rate": 1.738600920837665e-05, "loss": 0.0885, "step": 28310 }, { "epoch": 2.103074409624239, "grad_norm": 3.648165464401245, "learning_rate": 1.7381553542254566e-05, "loss": 0.0964, "step": 28320 }, { "epoch": 2.1038170206445863, "grad_norm": 0.9834181070327759, "learning_rate": 1.7377097876132485e-05, "loss": 0.0524, "step": 28330 }, { "epoch": 2.1045596316649338, "grad_norm": 2.057588815689087, "learning_rate": 1.7372642210010396e-05, "loss": 0.0609, "step": 28340 }, { "epoch": 2.1053022426852817, "grad_norm": 1.9514938592910767, "learning_rate": 1.736818654388831e-05, "loss": 0.09, "step": 28350 }, { "epoch": 2.106044853705629, "grad_norm": 0.6397399306297302, "learning_rate": 1.7363730877766226e-05, "loss": 0.0805, "step": 28360 }, { "epoch": 2.1067874647259766, "grad_norm": 0.7287691831588745, "learning_rate": 1.735927521164414e-05, "loss": 0.0517, "step": 28370 }, { "epoch": 2.107530075746324, "grad_norm": 0.5326383113861084, "learning_rate": 1.7354819545522056e-05, "loss": 0.0314, "step": 28380 }, { "epoch": 2.1082726867666715, "grad_norm": 0.252254843711853, "learning_rate": 1.735036387939997e-05, "loss": 0.0581, "step": 28390 }, { "epoch": 2.1090152977870193, "grad_norm": 3.9282045364379883, "learning_rate": 1.7345908213277886e-05, "loss": 0.0943, "step": 28400 }, { "epoch": 2.109757908807367, "grad_norm": 2.1069839000701904, "learning_rate": 1.73414525471558e-05, "loss": 0.0704, "step": 28410 }, { "epoch": 2.1105005198277142, "grad_norm": 1.8486924171447754, "learning_rate": 1.7336996881033713e-05, "loss": 0.07, "step": 28420 }, { "epoch": 2.1112431308480617, "grad_norm": 1.9672880172729492, "learning_rate": 1.733254121491163e-05, "loss": 0.0827, "step": 28430 }, { "epoch": 2.111985741868409, "grad_norm": 0.7764392495155334, "learning_rate": 1.7328085548789543e-05, "loss": 0.056, "step": 28440 }, { "epoch": 2.112728352888757, "grad_norm": 1.0631473064422607, "learning_rate": 1.7323629882667458e-05, "loss": 0.0485, "step": 28450 }, { "epoch": 2.1134709639091045, "grad_norm": 1.6349713802337646, "learning_rate": 1.7319174216545376e-05, "loss": 0.0778, "step": 28460 }, { "epoch": 2.114213574929452, "grad_norm": 2.0537021160125732, "learning_rate": 1.7314718550423288e-05, "loss": 0.072, "step": 28470 }, { "epoch": 2.1149561859497994, "grad_norm": 2.0460641384124756, "learning_rate": 1.7310262884301203e-05, "loss": 0.0889, "step": 28480 }, { "epoch": 2.115698796970147, "grad_norm": 1.6083009243011475, "learning_rate": 1.7305807218179118e-05, "loss": 0.0685, "step": 28490 }, { "epoch": 2.1164414079904947, "grad_norm": 2.218975305557251, "learning_rate": 1.7301351552057033e-05, "loss": 0.0664, "step": 28500 }, { "epoch": 2.117184019010842, "grad_norm": 1.3092341423034668, "learning_rate": 1.7296895885934948e-05, "loss": 0.0673, "step": 28510 }, { "epoch": 2.1179266300311896, "grad_norm": 1.1454136371612549, "learning_rate": 1.7292440219812863e-05, "loss": 0.0891, "step": 28520 }, { "epoch": 2.118669241051537, "grad_norm": 0.9033706188201904, "learning_rate": 1.7287984553690778e-05, "loss": 0.0671, "step": 28530 }, { "epoch": 2.1194118520718845, "grad_norm": 1.2205688953399658, "learning_rate": 1.7283528887568693e-05, "loss": 0.0726, "step": 28540 }, { "epoch": 2.1201544630922324, "grad_norm": 1.5144611597061157, "learning_rate": 1.7279073221446605e-05, "loss": 0.0493, "step": 28550 }, { "epoch": 2.12089707411258, "grad_norm": 0.9755750894546509, "learning_rate": 1.727461755532452e-05, "loss": 0.07, "step": 28560 }, { "epoch": 2.1216396851329273, "grad_norm": 1.5496515035629272, "learning_rate": 1.7270161889202438e-05, "loss": 0.0532, "step": 28570 }, { "epoch": 2.122382296153275, "grad_norm": 1.8168680667877197, "learning_rate": 1.726570622308035e-05, "loss": 0.0682, "step": 28580 }, { "epoch": 2.1231249071736222, "grad_norm": 1.872641682624817, "learning_rate": 1.7261250556958264e-05, "loss": 0.0572, "step": 28590 }, { "epoch": 2.12386751819397, "grad_norm": 1.2901333570480347, "learning_rate": 1.7256794890836183e-05, "loss": 0.0416, "step": 28600 }, { "epoch": 2.1246101292143176, "grad_norm": 1.7562663555145264, "learning_rate": 1.7252339224714094e-05, "loss": 0.0695, "step": 28610 }, { "epoch": 2.125352740234665, "grad_norm": 0.7023272514343262, "learning_rate": 1.724788355859201e-05, "loss": 0.047, "step": 28620 }, { "epoch": 2.1260953512550125, "grad_norm": 2.1506128311157227, "learning_rate": 1.7243427892469928e-05, "loss": 0.0718, "step": 28630 }, { "epoch": 2.1268379622753604, "grad_norm": 1.4215508699417114, "learning_rate": 1.723897222634784e-05, "loss": 0.0607, "step": 28640 }, { "epoch": 2.127580573295708, "grad_norm": 0.4577394723892212, "learning_rate": 1.7234516560225754e-05, "loss": 0.0454, "step": 28650 }, { "epoch": 2.1283231843160553, "grad_norm": 1.4944130182266235, "learning_rate": 1.7230060894103666e-05, "loss": 0.077, "step": 28660 }, { "epoch": 2.1290657953364027, "grad_norm": 1.4956315755844116, "learning_rate": 1.7225605227981584e-05, "loss": 0.0619, "step": 28670 }, { "epoch": 2.12980840635675, "grad_norm": 1.8317357301712036, "learning_rate": 1.72211495618595e-05, "loss": 0.0803, "step": 28680 }, { "epoch": 2.130551017377098, "grad_norm": 0.5078200101852417, "learning_rate": 1.721669389573741e-05, "loss": 0.0654, "step": 28690 }, { "epoch": 2.1312936283974455, "grad_norm": 1.6214134693145752, "learning_rate": 1.721223822961533e-05, "loss": 0.0542, "step": 28700 }, { "epoch": 2.132036239417793, "grad_norm": 2.1985042095184326, "learning_rate": 1.7207782563493244e-05, "loss": 0.0543, "step": 28710 }, { "epoch": 2.1327788504381404, "grad_norm": 1.8467528820037842, "learning_rate": 1.7203326897371156e-05, "loss": 0.0596, "step": 28720 }, { "epoch": 2.133521461458488, "grad_norm": 1.9839740991592407, "learning_rate": 1.719887123124907e-05, "loss": 0.082, "step": 28730 }, { "epoch": 2.1342640724788358, "grad_norm": 1.7829911708831787, "learning_rate": 1.719441556512699e-05, "loss": 0.0709, "step": 28740 }, { "epoch": 2.135006683499183, "grad_norm": 1.043108582496643, "learning_rate": 1.71899598990049e-05, "loss": 0.0571, "step": 28750 }, { "epoch": 2.1357492945195307, "grad_norm": 1.6130720376968384, "learning_rate": 1.7185504232882816e-05, "loss": 0.0577, "step": 28760 }, { "epoch": 2.136491905539878, "grad_norm": 0.3938934803009033, "learning_rate": 1.718104856676073e-05, "loss": 0.0713, "step": 28770 }, { "epoch": 2.1372345165602256, "grad_norm": 2.0002403259277344, "learning_rate": 1.7176592900638646e-05, "loss": 0.063, "step": 28780 }, { "epoch": 2.1379771275805735, "grad_norm": 1.1039925813674927, "learning_rate": 1.717213723451656e-05, "loss": 0.0555, "step": 28790 }, { "epoch": 2.138719738600921, "grad_norm": 1.2151685953140259, "learning_rate": 1.7167681568394476e-05, "loss": 0.0613, "step": 28800 }, { "epoch": 2.1394623496212684, "grad_norm": 0.6045921444892883, "learning_rate": 1.716322590227239e-05, "loss": 0.0827, "step": 28810 }, { "epoch": 2.140204960641616, "grad_norm": 1.541783094406128, "learning_rate": 1.7158770236150306e-05, "loss": 0.0771, "step": 28820 }, { "epoch": 2.1409475716619633, "grad_norm": 3.0538811683654785, "learning_rate": 1.7154314570028218e-05, "loss": 0.1097, "step": 28830 }, { "epoch": 2.141690182682311, "grad_norm": 1.1775662899017334, "learning_rate": 1.7149858903906136e-05, "loss": 0.0666, "step": 28840 }, { "epoch": 2.1424327937026586, "grad_norm": 1.498507022857666, "learning_rate": 1.7145403237784048e-05, "loss": 0.0671, "step": 28850 }, { "epoch": 2.143175404723006, "grad_norm": 0.7959874868392944, "learning_rate": 1.7140947571661963e-05, "loss": 0.057, "step": 28860 }, { "epoch": 2.1439180157433535, "grad_norm": 2.1774017810821533, "learning_rate": 1.713649190553988e-05, "loss": 0.1016, "step": 28870 }, { "epoch": 2.1446606267637014, "grad_norm": 0.7331980466842651, "learning_rate": 1.7132036239417793e-05, "loss": 0.0634, "step": 28880 }, { "epoch": 2.145403237784049, "grad_norm": 0.8138454556465149, "learning_rate": 1.7127580573295708e-05, "loss": 0.0474, "step": 28890 }, { "epoch": 2.1461458488043963, "grad_norm": 1.016754388809204, "learning_rate": 1.7123124907173623e-05, "loss": 0.0718, "step": 28900 }, { "epoch": 2.1468884598247437, "grad_norm": 1.882819414138794, "learning_rate": 1.7118669241051538e-05, "loss": 0.0809, "step": 28910 }, { "epoch": 2.147631070845091, "grad_norm": 0.7724311351776123, "learning_rate": 1.7114213574929453e-05, "loss": 0.0775, "step": 28920 }, { "epoch": 2.148373681865439, "grad_norm": 2.551377534866333, "learning_rate": 1.7109757908807368e-05, "loss": 0.0776, "step": 28930 }, { "epoch": 2.1491162928857865, "grad_norm": 1.454253911972046, "learning_rate": 1.7105302242685283e-05, "loss": 0.0635, "step": 28940 }, { "epoch": 2.149858903906134, "grad_norm": 0.8662858009338379, "learning_rate": 1.7100846576563197e-05, "loss": 0.0689, "step": 28950 }, { "epoch": 2.1506015149264814, "grad_norm": 1.5164152383804321, "learning_rate": 1.709639091044111e-05, "loss": 0.0956, "step": 28960 }, { "epoch": 2.151344125946829, "grad_norm": 1.9796892404556274, "learning_rate": 1.7091935244319024e-05, "loss": 0.08, "step": 28970 }, { "epoch": 2.152086736967177, "grad_norm": 2.7751400470733643, "learning_rate": 1.7087479578196942e-05, "loss": 0.0689, "step": 28980 }, { "epoch": 2.1528293479875242, "grad_norm": 0.9930230379104614, "learning_rate": 1.7083023912074854e-05, "loss": 0.0691, "step": 28990 }, { "epoch": 2.1535719590078717, "grad_norm": 0.8586207032203674, "learning_rate": 1.707856824595277e-05, "loss": 0.0729, "step": 29000 }, { "epoch": 2.154314570028219, "grad_norm": 1.4148691892623901, "learning_rate": 1.7074112579830687e-05, "loss": 0.0488, "step": 29010 }, { "epoch": 2.1550571810485666, "grad_norm": 1.3059778213500977, "learning_rate": 1.70696569137086e-05, "loss": 0.0812, "step": 29020 }, { "epoch": 2.1557997920689145, "grad_norm": 0.3190613389015198, "learning_rate": 1.7065201247586514e-05, "loss": 0.0524, "step": 29030 }, { "epoch": 2.156542403089262, "grad_norm": 2.4894535541534424, "learning_rate": 1.7060745581464432e-05, "loss": 0.1131, "step": 29040 }, { "epoch": 2.1572850141096094, "grad_norm": 2.3826353549957275, "learning_rate": 1.7056289915342344e-05, "loss": 0.0722, "step": 29050 }, { "epoch": 2.158027625129957, "grad_norm": 0.887143611907959, "learning_rate": 1.705183424922026e-05, "loss": 0.0498, "step": 29060 }, { "epoch": 2.1587702361503043, "grad_norm": 2.6809613704681396, "learning_rate": 1.704737858309817e-05, "loss": 0.1214, "step": 29070 }, { "epoch": 2.159512847170652, "grad_norm": 1.2329598665237427, "learning_rate": 1.704292291697609e-05, "loss": 0.0972, "step": 29080 }, { "epoch": 2.1602554581909996, "grad_norm": 2.446826457977295, "learning_rate": 1.7038467250854004e-05, "loss": 0.0659, "step": 29090 }, { "epoch": 2.160998069211347, "grad_norm": 1.961200475692749, "learning_rate": 1.7034011584731916e-05, "loss": 0.0396, "step": 29100 }, { "epoch": 2.1617406802316945, "grad_norm": 0.2081407755613327, "learning_rate": 1.7029555918609834e-05, "loss": 0.0452, "step": 29110 }, { "epoch": 2.162483291252042, "grad_norm": 1.8672429323196411, "learning_rate": 1.702510025248775e-05, "loss": 0.0674, "step": 29120 }, { "epoch": 2.16322590227239, "grad_norm": 2.0744621753692627, "learning_rate": 1.702064458636566e-05, "loss": 0.0899, "step": 29130 }, { "epoch": 2.1639685132927373, "grad_norm": 0.9854013323783875, "learning_rate": 1.7016188920243576e-05, "loss": 0.0731, "step": 29140 }, { "epoch": 2.1647111243130848, "grad_norm": 1.5703667402267456, "learning_rate": 1.7011733254121494e-05, "loss": 0.0887, "step": 29150 }, { "epoch": 2.165453735333432, "grad_norm": 1.483916997909546, "learning_rate": 1.7007277587999406e-05, "loss": 0.0863, "step": 29160 }, { "epoch": 2.1661963463537797, "grad_norm": 1.5202964544296265, "learning_rate": 1.700282192187732e-05, "loss": 0.0785, "step": 29170 }, { "epoch": 2.1669389573741276, "grad_norm": 0.9036029577255249, "learning_rate": 1.6998366255755236e-05, "loss": 0.0603, "step": 29180 }, { "epoch": 2.167681568394475, "grad_norm": 0.647527277469635, "learning_rate": 1.699391058963315e-05, "loss": 0.0313, "step": 29190 }, { "epoch": 2.1684241794148225, "grad_norm": 1.4471430778503418, "learning_rate": 1.6989454923511066e-05, "loss": 0.1026, "step": 29200 }, { "epoch": 2.16916679043517, "grad_norm": 1.5521758794784546, "learning_rate": 1.698499925738898e-05, "loss": 0.0581, "step": 29210 }, { "epoch": 2.169909401455518, "grad_norm": 0.8579624891281128, "learning_rate": 1.6980543591266896e-05, "loss": 0.0591, "step": 29220 }, { "epoch": 2.1706520124758653, "grad_norm": 0.8837199211120605, "learning_rate": 1.697608792514481e-05, "loss": 0.0776, "step": 29230 }, { "epoch": 2.1713946234962127, "grad_norm": 0.8631309866905212, "learning_rate": 1.6971632259022722e-05, "loss": 0.0709, "step": 29240 }, { "epoch": 2.17213723451656, "grad_norm": 1.2741137742996216, "learning_rate": 1.696717659290064e-05, "loss": 0.0607, "step": 29250 }, { "epoch": 2.1728798455369076, "grad_norm": 2.396149158477783, "learning_rate": 1.6962720926778556e-05, "loss": 0.0546, "step": 29260 }, { "epoch": 2.1736224565572555, "grad_norm": 1.4403908252716064, "learning_rate": 1.6958265260656467e-05, "loss": 0.0452, "step": 29270 }, { "epoch": 2.174365067577603, "grad_norm": 1.8442835807800293, "learning_rate": 1.6953809594534386e-05, "loss": 0.0869, "step": 29280 }, { "epoch": 2.1751076785979504, "grad_norm": 1.5323288440704346, "learning_rate": 1.6949353928412297e-05, "loss": 0.0881, "step": 29290 }, { "epoch": 2.175850289618298, "grad_norm": 0.6434569358825684, "learning_rate": 1.6944898262290212e-05, "loss": 0.0635, "step": 29300 }, { "epoch": 2.1765929006386453, "grad_norm": 0.8383660912513733, "learning_rate": 1.6940442596168127e-05, "loss": 0.0545, "step": 29310 }, { "epoch": 2.177335511658993, "grad_norm": 1.861343264579773, "learning_rate": 1.6935986930046042e-05, "loss": 0.0557, "step": 29320 }, { "epoch": 2.1780781226793406, "grad_norm": 1.4994820356369019, "learning_rate": 1.6931531263923957e-05, "loss": 0.0584, "step": 29330 }, { "epoch": 2.178820733699688, "grad_norm": 0.872047483921051, "learning_rate": 1.6927075597801872e-05, "loss": 0.1263, "step": 29340 }, { "epoch": 2.1795633447200355, "grad_norm": 1.3193352222442627, "learning_rate": 1.6922619931679787e-05, "loss": 0.0715, "step": 29350 }, { "epoch": 2.180305955740383, "grad_norm": 1.5889509916305542, "learning_rate": 1.6918164265557702e-05, "loss": 0.0859, "step": 29360 }, { "epoch": 2.181048566760731, "grad_norm": 1.012092113494873, "learning_rate": 1.6913708599435614e-05, "loss": 0.0596, "step": 29370 }, { "epoch": 2.1817911777810783, "grad_norm": 1.8053189516067505, "learning_rate": 1.6909252933313532e-05, "loss": 0.0625, "step": 29380 }, { "epoch": 2.182533788801426, "grad_norm": 0.7296652793884277, "learning_rate": 1.6904797267191447e-05, "loss": 0.0597, "step": 29390 }, { "epoch": 2.1832763998217732, "grad_norm": 1.110438346862793, "learning_rate": 1.690034160106936e-05, "loss": 0.0538, "step": 29400 }, { "epoch": 2.1840190108421207, "grad_norm": 2.128885507583618, "learning_rate": 1.6895885934947274e-05, "loss": 0.0559, "step": 29410 }, { "epoch": 2.1847616218624686, "grad_norm": 1.590346097946167, "learning_rate": 1.6891430268825192e-05, "loss": 0.0496, "step": 29420 }, { "epoch": 2.185504232882816, "grad_norm": 1.5324519872665405, "learning_rate": 1.6886974602703104e-05, "loss": 0.0778, "step": 29430 }, { "epoch": 2.1862468439031635, "grad_norm": 0.8141632080078125, "learning_rate": 1.688251893658102e-05, "loss": 0.079, "step": 29440 }, { "epoch": 2.186989454923511, "grad_norm": 2.3867433071136475, "learning_rate": 1.6878063270458937e-05, "loss": 0.0727, "step": 29450 }, { "epoch": 2.187732065943859, "grad_norm": 1.383835792541504, "learning_rate": 1.687360760433685e-05, "loss": 0.0553, "step": 29460 }, { "epoch": 2.1884746769642063, "grad_norm": 1.1126325130462646, "learning_rate": 1.6869151938214764e-05, "loss": 0.0698, "step": 29470 }, { "epoch": 2.1892172879845537, "grad_norm": 1.8178859949111938, "learning_rate": 1.6864696272092675e-05, "loss": 0.087, "step": 29480 }, { "epoch": 2.189959899004901, "grad_norm": 1.6789990663528442, "learning_rate": 1.6860240605970594e-05, "loss": 0.0652, "step": 29490 }, { "epoch": 2.1907025100252486, "grad_norm": 1.2279289960861206, "learning_rate": 1.685578493984851e-05, "loss": 0.0915, "step": 29500 }, { "epoch": 2.1914451210455965, "grad_norm": 1.7912808656692505, "learning_rate": 1.685132927372642e-05, "loss": 0.0849, "step": 29510 }, { "epoch": 2.192187732065944, "grad_norm": 0.8681305050849915, "learning_rate": 1.684687360760434e-05, "loss": 0.0771, "step": 29520 }, { "epoch": 2.1929303430862914, "grad_norm": 3.010956287384033, "learning_rate": 1.6842417941482254e-05, "loss": 0.0818, "step": 29530 }, { "epoch": 2.193672954106639, "grad_norm": 1.075864553451538, "learning_rate": 1.6837962275360165e-05, "loss": 0.0665, "step": 29540 }, { "epoch": 2.1944155651269863, "grad_norm": 1.1837166547775269, "learning_rate": 1.683350660923808e-05, "loss": 0.0608, "step": 29550 }, { "epoch": 2.195158176147334, "grad_norm": 0.6628900766372681, "learning_rate": 1.6829050943116e-05, "loss": 0.0717, "step": 29560 }, { "epoch": 2.1959007871676817, "grad_norm": 0.9537403583526611, "learning_rate": 1.682459527699391e-05, "loss": 0.0573, "step": 29570 }, { "epoch": 2.196643398188029, "grad_norm": 2.0913939476013184, "learning_rate": 1.6820139610871825e-05, "loss": 0.1074, "step": 29580 }, { "epoch": 2.1973860092083766, "grad_norm": 0.6338614821434021, "learning_rate": 1.681568394474974e-05, "loss": 0.0655, "step": 29590 }, { "epoch": 2.198128620228724, "grad_norm": 2.1373088359832764, "learning_rate": 1.6811228278627655e-05, "loss": 0.05, "step": 29600 }, { "epoch": 2.198871231249072, "grad_norm": 1.9427019357681274, "learning_rate": 1.680677261250557e-05, "loss": 0.0555, "step": 29610 }, { "epoch": 2.1996138422694194, "grad_norm": 1.4638181924819946, "learning_rate": 1.6802316946383485e-05, "loss": 0.0617, "step": 29620 }, { "epoch": 2.200356453289767, "grad_norm": 1.2105026245117188, "learning_rate": 1.67978612802614e-05, "loss": 0.0647, "step": 29630 }, { "epoch": 2.2010990643101143, "grad_norm": 1.4060852527618408, "learning_rate": 1.6793405614139315e-05, "loss": 0.0727, "step": 29640 }, { "epoch": 2.2018416753304617, "grad_norm": 2.5179665088653564, "learning_rate": 1.6788949948017227e-05, "loss": 0.0736, "step": 29650 }, { "epoch": 2.2025842863508096, "grad_norm": 2.2634148597717285, "learning_rate": 1.6784494281895145e-05, "loss": 0.0724, "step": 29660 }, { "epoch": 2.203326897371157, "grad_norm": 1.1711833477020264, "learning_rate": 1.678003861577306e-05, "loss": 0.0677, "step": 29670 }, { "epoch": 2.2040695083915045, "grad_norm": 1.9533751010894775, "learning_rate": 1.6775582949650972e-05, "loss": 0.0682, "step": 29680 }, { "epoch": 2.204812119411852, "grad_norm": 1.6772757768630981, "learning_rate": 1.677112728352889e-05, "loss": 0.0952, "step": 29690 }, { "epoch": 2.2055547304321994, "grad_norm": 1.3155872821807861, "learning_rate": 1.6766671617406802e-05, "loss": 0.0654, "step": 29700 }, { "epoch": 2.2062973414525473, "grad_norm": 0.7310417294502258, "learning_rate": 1.6762215951284717e-05, "loss": 0.0749, "step": 29710 }, { "epoch": 2.2070399524728948, "grad_norm": 1.559289813041687, "learning_rate": 1.6757760285162632e-05, "loss": 0.0524, "step": 29720 }, { "epoch": 2.207782563493242, "grad_norm": 1.340665578842163, "learning_rate": 1.6753304619040547e-05, "loss": 0.0692, "step": 29730 }, { "epoch": 2.2085251745135897, "grad_norm": 0.5830607414245605, "learning_rate": 1.6748848952918462e-05, "loss": 0.1112, "step": 29740 }, { "epoch": 2.209267785533937, "grad_norm": 1.7768720388412476, "learning_rate": 1.6744393286796377e-05, "loss": 0.0557, "step": 29750 }, { "epoch": 2.210010396554285, "grad_norm": 0.5409281849861145, "learning_rate": 1.6739937620674292e-05, "loss": 0.0565, "step": 29760 }, { "epoch": 2.2107530075746324, "grad_norm": 1.0394829511642456, "learning_rate": 1.6735481954552207e-05, "loss": 0.0581, "step": 29770 }, { "epoch": 2.21149561859498, "grad_norm": 1.975055456161499, "learning_rate": 1.673102628843012e-05, "loss": 0.101, "step": 29780 }, { "epoch": 2.2122382296153273, "grad_norm": 1.235195279121399, "learning_rate": 1.6726570622308037e-05, "loss": 0.0529, "step": 29790 }, { "epoch": 2.2129808406356752, "grad_norm": 3.506690263748169, "learning_rate": 1.672211495618595e-05, "loss": 0.0922, "step": 29800 }, { "epoch": 2.2137234516560227, "grad_norm": 2.5403103828430176, "learning_rate": 1.6717659290063863e-05, "loss": 0.0768, "step": 29810 }, { "epoch": 2.21446606267637, "grad_norm": 1.3894189596176147, "learning_rate": 1.6713203623941778e-05, "loss": 0.0642, "step": 29820 }, { "epoch": 2.2152086736967176, "grad_norm": 1.7695733308792114, "learning_rate": 1.6708747957819697e-05, "loss": 0.0699, "step": 29830 }, { "epoch": 2.215951284717065, "grad_norm": 1.9002711772918701, "learning_rate": 1.6704292291697608e-05, "loss": 0.0587, "step": 29840 }, { "epoch": 2.216693895737413, "grad_norm": 2.1885085105895996, "learning_rate": 1.6699836625575523e-05, "loss": 0.0808, "step": 29850 }, { "epoch": 2.2174365067577604, "grad_norm": 0.7245926856994629, "learning_rate": 1.669538095945344e-05, "loss": 0.0767, "step": 29860 }, { "epoch": 2.218179117778108, "grad_norm": 1.1416701078414917, "learning_rate": 1.6690925293331353e-05, "loss": 0.0552, "step": 29870 }, { "epoch": 2.2189217287984553, "grad_norm": 0.4477648138999939, "learning_rate": 1.6686469627209268e-05, "loss": 0.092, "step": 29880 }, { "epoch": 2.2196643398188027, "grad_norm": 1.4506340026855469, "learning_rate": 1.668201396108718e-05, "loss": 0.0803, "step": 29890 }, { "epoch": 2.2204069508391506, "grad_norm": 1.6978100538253784, "learning_rate": 1.6677558294965098e-05, "loss": 0.0516, "step": 29900 }, { "epoch": 2.221149561859498, "grad_norm": 0.5973829030990601, "learning_rate": 1.6673102628843013e-05, "loss": 0.0748, "step": 29910 }, { "epoch": 2.2218921728798455, "grad_norm": 1.091837763786316, "learning_rate": 1.6668646962720925e-05, "loss": 0.0831, "step": 29920 }, { "epoch": 2.222634783900193, "grad_norm": 0.9532679319381714, "learning_rate": 1.6664191296598843e-05, "loss": 0.0632, "step": 29930 }, { "epoch": 2.2233773949205404, "grad_norm": 1.0691170692443848, "learning_rate": 1.6659735630476758e-05, "loss": 0.0863, "step": 29940 }, { "epoch": 2.2241200059408883, "grad_norm": 0.724433958530426, "learning_rate": 1.665527996435467e-05, "loss": 0.0485, "step": 29950 }, { "epoch": 2.2248626169612358, "grad_norm": 2.8994836807250977, "learning_rate": 1.6650824298232585e-05, "loss": 0.0611, "step": 29960 }, { "epoch": 2.2256052279815832, "grad_norm": 0.5383403301239014, "learning_rate": 1.6646368632110503e-05, "loss": 0.0382, "step": 29970 }, { "epoch": 2.2263478390019307, "grad_norm": 4.375266075134277, "learning_rate": 1.6641912965988415e-05, "loss": 0.0434, "step": 29980 }, { "epoch": 2.227090450022278, "grad_norm": 4.827844619750977, "learning_rate": 1.663745729986633e-05, "loss": 0.0683, "step": 29990 }, { "epoch": 2.227833061042626, "grad_norm": 1.5270919799804688, "learning_rate": 1.6633001633744245e-05, "loss": 0.0859, "step": 30000 }, { "epoch": 2.2285756720629735, "grad_norm": 2.311201572418213, "learning_rate": 1.662854596762216e-05, "loss": 0.1089, "step": 30010 }, { "epoch": 2.229318283083321, "grad_norm": 1.0003308057785034, "learning_rate": 1.6624090301500075e-05, "loss": 0.0851, "step": 30020 }, { "epoch": 2.2300608941036684, "grad_norm": 1.064595103263855, "learning_rate": 1.661963463537799e-05, "loss": 0.0711, "step": 30030 }, { "epoch": 2.2308035051240163, "grad_norm": 2.0935966968536377, "learning_rate": 1.6615178969255905e-05, "loss": 0.073, "step": 30040 }, { "epoch": 2.2315461161443637, "grad_norm": 1.3619099855422974, "learning_rate": 1.661072330313382e-05, "loss": 0.0707, "step": 30050 }, { "epoch": 2.232288727164711, "grad_norm": 3.1474947929382324, "learning_rate": 1.660626763701173e-05, "loss": 0.0401, "step": 30060 }, { "epoch": 2.2330313381850586, "grad_norm": 2.438246250152588, "learning_rate": 1.660181197088965e-05, "loss": 0.0783, "step": 30070 }, { "epoch": 2.233773949205406, "grad_norm": 2.8391036987304688, "learning_rate": 1.6597356304767565e-05, "loss": 0.0673, "step": 30080 }, { "epoch": 2.234516560225754, "grad_norm": 1.0939983129501343, "learning_rate": 1.6592900638645476e-05, "loss": 0.0676, "step": 30090 }, { "epoch": 2.2352591712461014, "grad_norm": 1.0146929025650024, "learning_rate": 1.6588444972523395e-05, "loss": 0.0567, "step": 30100 }, { "epoch": 2.236001782266449, "grad_norm": 0.7500823140144348, "learning_rate": 1.6583989306401306e-05, "loss": 0.0618, "step": 30110 }, { "epoch": 2.2367443932867963, "grad_norm": 1.022570252418518, "learning_rate": 1.657953364027922e-05, "loss": 0.0878, "step": 30120 }, { "epoch": 2.2374870043071438, "grad_norm": 2.924889087677002, "learning_rate": 1.6575077974157136e-05, "loss": 0.0537, "step": 30130 }, { "epoch": 2.2382296153274917, "grad_norm": 1.7216036319732666, "learning_rate": 1.657062230803505e-05, "loss": 0.1027, "step": 30140 }, { "epoch": 2.238972226347839, "grad_norm": 0.9251584410667419, "learning_rate": 1.6566166641912966e-05, "loss": 0.0823, "step": 30150 }, { "epoch": 2.2397148373681866, "grad_norm": 3.20623779296875, "learning_rate": 1.656171097579088e-05, "loss": 0.0619, "step": 30160 }, { "epoch": 2.240457448388534, "grad_norm": 1.0136836767196655, "learning_rate": 1.6557255309668796e-05, "loss": 0.0755, "step": 30170 }, { "epoch": 2.2412000594088815, "grad_norm": 1.4571853876113892, "learning_rate": 1.655279964354671e-05, "loss": 0.1068, "step": 30180 }, { "epoch": 2.2419426704292293, "grad_norm": 1.8920878171920776, "learning_rate": 1.6548343977424626e-05, "loss": 0.0733, "step": 30190 }, { "epoch": 2.242685281449577, "grad_norm": 1.619733452796936, "learning_rate": 1.654388831130254e-05, "loss": 0.092, "step": 30200 }, { "epoch": 2.2434278924699242, "grad_norm": 2.0480968952178955, "learning_rate": 1.6539432645180456e-05, "loss": 0.061, "step": 30210 }, { "epoch": 2.2441705034902717, "grad_norm": 0.9134529829025269, "learning_rate": 1.6534976979058368e-05, "loss": 0.0773, "step": 30220 }, { "epoch": 2.244913114510619, "grad_norm": 1.261897325515747, "learning_rate": 1.6530521312936283e-05, "loss": 0.0656, "step": 30230 }, { "epoch": 2.245655725530967, "grad_norm": 1.7915825843811035, "learning_rate": 1.65260656468142e-05, "loss": 0.0685, "step": 30240 }, { "epoch": 2.2463983365513145, "grad_norm": 2.638289451599121, "learning_rate": 1.6521609980692113e-05, "loss": 0.0938, "step": 30250 }, { "epoch": 2.247140947571662, "grad_norm": 1.1665395498275757, "learning_rate": 1.6517154314570028e-05, "loss": 0.0999, "step": 30260 }, { "epoch": 2.2478835585920094, "grad_norm": 0.8219088912010193, "learning_rate": 1.6512698648447946e-05, "loss": 0.0732, "step": 30270 }, { "epoch": 2.248626169612357, "grad_norm": 1.1381323337554932, "learning_rate": 1.6508242982325858e-05, "loss": 0.1115, "step": 30280 }, { "epoch": 2.2493687806327047, "grad_norm": 2.4219374656677246, "learning_rate": 1.6503787316203773e-05, "loss": 0.0829, "step": 30290 }, { "epoch": 2.250111391653052, "grad_norm": 1.201055645942688, "learning_rate": 1.6499331650081684e-05, "loss": 0.047, "step": 30300 }, { "epoch": 2.2508540026733996, "grad_norm": 0.7391691207885742, "learning_rate": 1.6494875983959603e-05, "loss": 0.0452, "step": 30310 }, { "epoch": 2.251596613693747, "grad_norm": 0.3732907176017761, "learning_rate": 1.6490420317837518e-05, "loss": 0.0624, "step": 30320 }, { "epoch": 2.2523392247140945, "grad_norm": 0.5641918182373047, "learning_rate": 1.648596465171543e-05, "loss": 0.0906, "step": 30330 }, { "epoch": 2.2530818357344424, "grad_norm": 2.029506206512451, "learning_rate": 1.6481508985593348e-05, "loss": 0.0745, "step": 30340 }, { "epoch": 2.25382444675479, "grad_norm": 2.202345132827759, "learning_rate": 1.6477053319471263e-05, "loss": 0.0843, "step": 30350 }, { "epoch": 2.2545670577751373, "grad_norm": 2.1847546100616455, "learning_rate": 1.6472597653349174e-05, "loss": 0.0897, "step": 30360 }, { "epoch": 2.255309668795485, "grad_norm": 1.7908791303634644, "learning_rate": 1.646814198722709e-05, "loss": 0.0694, "step": 30370 }, { "epoch": 2.2560522798158322, "grad_norm": 3.3176610469818115, "learning_rate": 1.6463686321105008e-05, "loss": 0.0674, "step": 30380 }, { "epoch": 2.25679489083618, "grad_norm": 1.1117013692855835, "learning_rate": 1.645923065498292e-05, "loss": 0.0419, "step": 30390 }, { "epoch": 2.2575375018565276, "grad_norm": 2.5527663230895996, "learning_rate": 1.6454774988860834e-05, "loss": 0.0818, "step": 30400 }, { "epoch": 2.258280112876875, "grad_norm": 0.6371174454689026, "learning_rate": 1.645031932273875e-05, "loss": 0.0626, "step": 30410 }, { "epoch": 2.2590227238972225, "grad_norm": 1.5492578744888306, "learning_rate": 1.6445863656616664e-05, "loss": 0.0681, "step": 30420 }, { "epoch": 2.2597653349175704, "grad_norm": 0.8762646317481995, "learning_rate": 1.644140799049458e-05, "loss": 0.0724, "step": 30430 }, { "epoch": 2.260507945937918, "grad_norm": 2.67170786857605, "learning_rate": 1.6436952324372494e-05, "loss": 0.0614, "step": 30440 }, { "epoch": 2.2612505569582653, "grad_norm": 2.633847713470459, "learning_rate": 1.643249665825041e-05, "loss": 0.0799, "step": 30450 }, { "epoch": 2.2619931679786127, "grad_norm": 2.2049152851104736, "learning_rate": 1.6428040992128324e-05, "loss": 0.0641, "step": 30460 }, { "epoch": 2.26273577899896, "grad_norm": 6.4670186042785645, "learning_rate": 1.6423585326006236e-05, "loss": 0.0696, "step": 30470 }, { "epoch": 2.263478390019308, "grad_norm": 1.440319538116455, "learning_rate": 1.6419129659884154e-05, "loss": 0.0519, "step": 30480 }, { "epoch": 2.2642210010396555, "grad_norm": 1.5117939710617065, "learning_rate": 1.641467399376207e-05, "loss": 0.0679, "step": 30490 }, { "epoch": 2.264963612060003, "grad_norm": 0.705443799495697, "learning_rate": 1.641021832763998e-05, "loss": 0.0956, "step": 30500 }, { "epoch": 2.2657062230803504, "grad_norm": 2.2516098022460938, "learning_rate": 1.64057626615179e-05, "loss": 0.0704, "step": 30510 }, { "epoch": 2.266448834100698, "grad_norm": 1.6648731231689453, "learning_rate": 1.640130699539581e-05, "loss": 0.0531, "step": 30520 }, { "epoch": 2.2671914451210458, "grad_norm": 1.7640278339385986, "learning_rate": 1.6396851329273726e-05, "loss": 0.0787, "step": 30530 }, { "epoch": 2.267934056141393, "grad_norm": 0.9552247524261475, "learning_rate": 1.639239566315164e-05, "loss": 0.0744, "step": 30540 }, { "epoch": 2.2686766671617407, "grad_norm": 2.791886329650879, "learning_rate": 1.6387939997029556e-05, "loss": 0.0758, "step": 30550 }, { "epoch": 2.269419278182088, "grad_norm": 3.42543363571167, "learning_rate": 1.638348433090747e-05, "loss": 0.0743, "step": 30560 }, { "epoch": 2.270161889202436, "grad_norm": 2.195741653442383, "learning_rate": 1.6379028664785386e-05, "loss": 0.0793, "step": 30570 }, { "epoch": 2.2709045002227835, "grad_norm": 1.3603512048721313, "learning_rate": 1.63745729986633e-05, "loss": 0.094, "step": 30580 }, { "epoch": 2.271647111243131, "grad_norm": 0.9138447642326355, "learning_rate": 1.6370117332541216e-05, "loss": 0.0973, "step": 30590 }, { "epoch": 2.2723897222634784, "grad_norm": 1.8551127910614014, "learning_rate": 1.636566166641913e-05, "loss": 0.037, "step": 30600 }, { "epoch": 2.273132333283826, "grad_norm": 0.9217031002044678, "learning_rate": 1.6361206000297046e-05, "loss": 0.0537, "step": 30610 }, { "epoch": 2.2738749443041737, "grad_norm": 0.9425798654556274, "learning_rate": 1.635675033417496e-05, "loss": 0.067, "step": 30620 }, { "epoch": 2.274617555324521, "grad_norm": 2.4280683994293213, "learning_rate": 1.6352294668052872e-05, "loss": 0.0611, "step": 30630 }, { "epoch": 2.2753601663448686, "grad_norm": 1.6596300601959229, "learning_rate": 1.6347839001930787e-05, "loss": 0.1251, "step": 30640 }, { "epoch": 2.276102777365216, "grad_norm": 1.1469758749008179, "learning_rate": 1.6343383335808706e-05, "loss": 0.0594, "step": 30650 }, { "epoch": 2.2768453883855635, "grad_norm": 0.3723772466182709, "learning_rate": 1.6338927669686617e-05, "loss": 0.046, "step": 30660 }, { "epoch": 2.2775879994059114, "grad_norm": 1.2316060066223145, "learning_rate": 1.6334472003564532e-05, "loss": 0.0913, "step": 30670 }, { "epoch": 2.278330610426259, "grad_norm": 0.8177586793899536, "learning_rate": 1.633001633744245e-05, "loss": 0.0796, "step": 30680 }, { "epoch": 2.2790732214466063, "grad_norm": 2.4429774284362793, "learning_rate": 1.6325560671320362e-05, "loss": 0.0675, "step": 30690 }, { "epoch": 2.2798158324669537, "grad_norm": 2.337932586669922, "learning_rate": 1.6321105005198277e-05, "loss": 0.0905, "step": 30700 }, { "epoch": 2.280558443487301, "grad_norm": 1.1022733449935913, "learning_rate": 1.6316649339076192e-05, "loss": 0.0586, "step": 30710 }, { "epoch": 2.281301054507649, "grad_norm": 1.6780964136123657, "learning_rate": 1.6312193672954107e-05, "loss": 0.0661, "step": 30720 }, { "epoch": 2.2820436655279965, "grad_norm": 1.0337419509887695, "learning_rate": 1.6307738006832022e-05, "loss": 0.1078, "step": 30730 }, { "epoch": 2.282786276548344, "grad_norm": 0.539939284324646, "learning_rate": 1.6303282340709934e-05, "loss": 0.0679, "step": 30740 }, { "epoch": 2.2835288875686914, "grad_norm": 3.132404088973999, "learning_rate": 1.6298826674587852e-05, "loss": 0.0875, "step": 30750 }, { "epoch": 2.284271498589039, "grad_norm": 0.9195793271064758, "learning_rate": 1.6294371008465767e-05, "loss": 0.0704, "step": 30760 }, { "epoch": 2.285014109609387, "grad_norm": 1.0817722082138062, "learning_rate": 1.628991534234368e-05, "loss": 0.0503, "step": 30770 }, { "epoch": 2.2857567206297342, "grad_norm": 1.1843632459640503, "learning_rate": 1.6285459676221597e-05, "loss": 0.0481, "step": 30780 }, { "epoch": 2.2864993316500817, "grad_norm": 2.589695930480957, "learning_rate": 1.6281004010099512e-05, "loss": 0.0587, "step": 30790 }, { "epoch": 2.287241942670429, "grad_norm": 1.0527863502502441, "learning_rate": 1.6276548343977424e-05, "loss": 0.0863, "step": 30800 }, { "epoch": 2.2879845536907766, "grad_norm": 1.128021478652954, "learning_rate": 1.627209267785534e-05, "loss": 0.0738, "step": 30810 }, { "epoch": 2.2887271647111245, "grad_norm": 1.0027931928634644, "learning_rate": 1.6267637011733254e-05, "loss": 0.0423, "step": 30820 }, { "epoch": 2.289469775731472, "grad_norm": 1.4265313148498535, "learning_rate": 1.626318134561117e-05, "loss": 0.0381, "step": 30830 }, { "epoch": 2.2902123867518194, "grad_norm": 1.3314334154129028, "learning_rate": 1.6258725679489084e-05, "loss": 0.0521, "step": 30840 }, { "epoch": 2.290954997772167, "grad_norm": 1.9009617567062378, "learning_rate": 1.6254270013367e-05, "loss": 0.0532, "step": 30850 }, { "epoch": 2.2916976087925143, "grad_norm": 1.4951937198638916, "learning_rate": 1.6249814347244914e-05, "loss": 0.0764, "step": 30860 }, { "epoch": 2.292440219812862, "grad_norm": 1.393104910850525, "learning_rate": 1.624535868112283e-05, "loss": 0.0694, "step": 30870 }, { "epoch": 2.2931828308332096, "grad_norm": 1.1076534986495972, "learning_rate": 1.624090301500074e-05, "loss": 0.0542, "step": 30880 }, { "epoch": 2.293925441853557, "grad_norm": 2.4654695987701416, "learning_rate": 1.623644734887866e-05, "loss": 0.0647, "step": 30890 }, { "epoch": 2.2946680528739045, "grad_norm": 0.662090003490448, "learning_rate": 1.6231991682756574e-05, "loss": 0.0686, "step": 30900 }, { "epoch": 2.295410663894252, "grad_norm": 2.0039710998535156, "learning_rate": 1.6227536016634486e-05, "loss": 0.0792, "step": 30910 }, { "epoch": 2.2961532749146, "grad_norm": 0.9649547338485718, "learning_rate": 1.6223080350512404e-05, "loss": 0.0603, "step": 30920 }, { "epoch": 2.2968958859349473, "grad_norm": 1.8113723993301392, "learning_rate": 1.6218624684390316e-05, "loss": 0.0739, "step": 30930 }, { "epoch": 2.2976384969552948, "grad_norm": 0.8658888936042786, "learning_rate": 1.621416901826823e-05, "loss": 0.0417, "step": 30940 }, { "epoch": 2.298381107975642, "grad_norm": 1.820826530456543, "learning_rate": 1.6209713352146146e-05, "loss": 0.0584, "step": 30950 }, { "epoch": 2.2991237189959897, "grad_norm": 3.5532517433166504, "learning_rate": 1.620525768602406e-05, "loss": 0.0779, "step": 30960 }, { "epoch": 2.2998663300163376, "grad_norm": 1.2699205875396729, "learning_rate": 1.6200802019901976e-05, "loss": 0.0722, "step": 30970 }, { "epoch": 2.300608941036685, "grad_norm": 1.4592735767364502, "learning_rate": 1.619634635377989e-05, "loss": 0.0539, "step": 30980 }, { "epoch": 2.3013515520570325, "grad_norm": 1.4031466245651245, "learning_rate": 1.6191890687657805e-05, "loss": 0.0849, "step": 30990 }, { "epoch": 2.30209416307738, "grad_norm": 2.4945876598358154, "learning_rate": 1.618743502153572e-05, "loss": 0.0859, "step": 31000 }, { "epoch": 2.302836774097728, "grad_norm": 1.129449725151062, "learning_rate": 1.6182979355413635e-05, "loss": 0.0807, "step": 31010 }, { "epoch": 2.3035793851180753, "grad_norm": 3.043400526046753, "learning_rate": 1.617852368929155e-05, "loss": 0.056, "step": 31020 }, { "epoch": 2.3043219961384227, "grad_norm": 2.541620969772339, "learning_rate": 1.6174068023169465e-05, "loss": 0.0738, "step": 31030 }, { "epoch": 2.30506460715877, "grad_norm": 0.742087721824646, "learning_rate": 1.6169612357047377e-05, "loss": 0.0583, "step": 31040 }, { "epoch": 2.3058072181791176, "grad_norm": 2.608368158340454, "learning_rate": 1.6165156690925292e-05, "loss": 0.0645, "step": 31050 }, { "epoch": 2.3065498291994655, "grad_norm": 1.629696249961853, "learning_rate": 1.616070102480321e-05, "loss": 0.0897, "step": 31060 }, { "epoch": 2.307292440219813, "grad_norm": 1.6755260229110718, "learning_rate": 1.6156245358681122e-05, "loss": 0.065, "step": 31070 }, { "epoch": 2.3080350512401604, "grad_norm": 1.018589735031128, "learning_rate": 1.6151789692559037e-05, "loss": 0.0527, "step": 31080 }, { "epoch": 2.308777662260508, "grad_norm": 0.7608964443206787, "learning_rate": 1.6147334026436955e-05, "loss": 0.0697, "step": 31090 }, { "epoch": 2.3095202732808553, "grad_norm": 0.854860246181488, "learning_rate": 1.6142878360314867e-05, "loss": 0.0657, "step": 31100 }, { "epoch": 2.310262884301203, "grad_norm": 1.3979119062423706, "learning_rate": 1.6138422694192782e-05, "loss": 0.0776, "step": 31110 }, { "epoch": 2.3110054953215506, "grad_norm": 1.7942464351654053, "learning_rate": 1.6133967028070697e-05, "loss": 0.0688, "step": 31120 }, { "epoch": 2.311748106341898, "grad_norm": 1.9012426137924194, "learning_rate": 1.6129511361948612e-05, "loss": 0.0585, "step": 31130 }, { "epoch": 2.3124907173622455, "grad_norm": 1.9309673309326172, "learning_rate": 1.6125055695826527e-05, "loss": 0.0747, "step": 31140 }, { "epoch": 2.3132333283825934, "grad_norm": 1.511763572692871, "learning_rate": 1.612060002970444e-05, "loss": 0.0732, "step": 31150 }, { "epoch": 2.313975939402941, "grad_norm": 2.341627836227417, "learning_rate": 1.6116144363582357e-05, "loss": 0.0883, "step": 31160 }, { "epoch": 2.3147185504232883, "grad_norm": 2.5105557441711426, "learning_rate": 1.6111688697460272e-05, "loss": 0.0588, "step": 31170 }, { "epoch": 2.315461161443636, "grad_norm": 1.0133614540100098, "learning_rate": 1.6107233031338184e-05, "loss": 0.0887, "step": 31180 }, { "epoch": 2.3162037724639832, "grad_norm": 5.659106254577637, "learning_rate": 1.6102777365216102e-05, "loss": 0.0934, "step": 31190 }, { "epoch": 2.316946383484331, "grad_norm": 0.734591543674469, "learning_rate": 1.6098321699094017e-05, "loss": 0.0798, "step": 31200 }, { "epoch": 2.3176889945046786, "grad_norm": 2.1692800521850586, "learning_rate": 1.609386603297193e-05, "loss": 0.0662, "step": 31210 }, { "epoch": 2.318431605525026, "grad_norm": 0.6351516842842102, "learning_rate": 1.6089410366849844e-05, "loss": 0.0478, "step": 31220 }, { "epoch": 2.3191742165453735, "grad_norm": 1.4886484146118164, "learning_rate": 1.6084954700727762e-05, "loss": 0.0698, "step": 31230 }, { "epoch": 2.319916827565721, "grad_norm": 0.3388558626174927, "learning_rate": 1.6080499034605674e-05, "loss": 0.0557, "step": 31240 }, { "epoch": 2.320659438586069, "grad_norm": 0.2938145697116852, "learning_rate": 1.607604336848359e-05, "loss": 0.0889, "step": 31250 }, { "epoch": 2.3214020496064163, "grad_norm": 2.3439390659332275, "learning_rate": 1.6071587702361504e-05, "loss": 0.064, "step": 31260 }, { "epoch": 2.3221446606267637, "grad_norm": 1.6882779598236084, "learning_rate": 1.606713203623942e-05, "loss": 0.0456, "step": 31270 }, { "epoch": 2.322887271647111, "grad_norm": 1.6335844993591309, "learning_rate": 1.6062676370117334e-05, "loss": 0.0528, "step": 31280 }, { "epoch": 2.3236298826674586, "grad_norm": 1.1933094263076782, "learning_rate": 1.6058220703995245e-05, "loss": 0.0913, "step": 31290 }, { "epoch": 2.3243724936878065, "grad_norm": 2.4354095458984375, "learning_rate": 1.6053765037873164e-05, "loss": 0.0795, "step": 31300 }, { "epoch": 2.325115104708154, "grad_norm": 1.5631943941116333, "learning_rate": 1.604930937175108e-05, "loss": 0.0792, "step": 31310 }, { "epoch": 2.3258577157285014, "grad_norm": 2.0377357006073, "learning_rate": 1.604485370562899e-05, "loss": 0.0836, "step": 31320 }, { "epoch": 2.326600326748849, "grad_norm": 2.2633230686187744, "learning_rate": 1.604039803950691e-05, "loss": 0.0778, "step": 31330 }, { "epoch": 2.3273429377691963, "grad_norm": 1.7147982120513916, "learning_rate": 1.603594237338482e-05, "loss": 0.0757, "step": 31340 }, { "epoch": 2.328085548789544, "grad_norm": 2.1653573513031006, "learning_rate": 1.6031486707262735e-05, "loss": 0.0769, "step": 31350 }, { "epoch": 2.3288281598098917, "grad_norm": 1.5791271924972534, "learning_rate": 1.602703104114065e-05, "loss": 0.0756, "step": 31360 }, { "epoch": 2.329570770830239, "grad_norm": 2.118759870529175, "learning_rate": 1.6022575375018565e-05, "loss": 0.0939, "step": 31370 }, { "epoch": 2.3303133818505866, "grad_norm": 1.0190156698226929, "learning_rate": 1.601811970889648e-05, "loss": 0.0663, "step": 31380 }, { "epoch": 2.331055992870934, "grad_norm": 1.578240156173706, "learning_rate": 1.6013664042774395e-05, "loss": 0.0899, "step": 31390 }, { "epoch": 2.331798603891282, "grad_norm": 1.2670795917510986, "learning_rate": 1.600920837665231e-05, "loss": 0.056, "step": 31400 }, { "epoch": 2.3325412149116294, "grad_norm": 0.6794329285621643, "learning_rate": 1.6004752710530225e-05, "loss": 0.0792, "step": 31410 }, { "epoch": 2.333283825931977, "grad_norm": 1.2964015007019043, "learning_rate": 1.600029704440814e-05, "loss": 0.071, "step": 31420 }, { "epoch": 2.3340264369523243, "grad_norm": 1.2982021570205688, "learning_rate": 1.5995841378286055e-05, "loss": 0.0841, "step": 31430 }, { "epoch": 2.3347690479726717, "grad_norm": 2.294980049133301, "learning_rate": 1.599138571216397e-05, "loss": 0.0691, "step": 31440 }, { "epoch": 2.3355116589930196, "grad_norm": 1.0897853374481201, "learning_rate": 1.598693004604188e-05, "loss": 0.0898, "step": 31450 }, { "epoch": 2.336254270013367, "grad_norm": 2.2197341918945312, "learning_rate": 1.5982474379919797e-05, "loss": 0.0478, "step": 31460 }, { "epoch": 2.3369968810337145, "grad_norm": 2.2933666706085205, "learning_rate": 1.5978018713797715e-05, "loss": 0.039, "step": 31470 }, { "epoch": 2.337739492054062, "grad_norm": 2.1594624519348145, "learning_rate": 1.5973563047675627e-05, "loss": 0.0615, "step": 31480 }, { "epoch": 2.3384821030744094, "grad_norm": 0.8612133264541626, "learning_rate": 1.596910738155354e-05, "loss": 0.0652, "step": 31490 }, { "epoch": 2.3392247140947573, "grad_norm": 1.2774549722671509, "learning_rate": 1.596465171543146e-05, "loss": 0.0666, "step": 31500 }, { "epoch": 2.3399673251151047, "grad_norm": 0.8495298027992249, "learning_rate": 1.596019604930937e-05, "loss": 0.0859, "step": 31510 }, { "epoch": 2.340709936135452, "grad_norm": 0.653118908405304, "learning_rate": 1.5955740383187287e-05, "loss": 0.0565, "step": 31520 }, { "epoch": 2.3414525471557996, "grad_norm": 0.9799015522003174, "learning_rate": 1.59512847170652e-05, "loss": 0.0624, "step": 31530 }, { "epoch": 2.342195158176147, "grad_norm": 0.8276538848876953, "learning_rate": 1.5946829050943117e-05, "loss": 0.0819, "step": 31540 }, { "epoch": 2.342937769196495, "grad_norm": 2.8302557468414307, "learning_rate": 1.594237338482103e-05, "loss": 0.0728, "step": 31550 }, { "epoch": 2.3436803802168424, "grad_norm": 2.348175287246704, "learning_rate": 1.5937917718698943e-05, "loss": 0.0857, "step": 31560 }, { "epoch": 2.34442299123719, "grad_norm": 1.6633504629135132, "learning_rate": 1.593346205257686e-05, "loss": 0.0724, "step": 31570 }, { "epoch": 2.3451656022575373, "grad_norm": 0.9694968461990356, "learning_rate": 1.5929006386454777e-05, "loss": 0.0795, "step": 31580 }, { "epoch": 2.3459082132778852, "grad_norm": 2.0099871158599854, "learning_rate": 1.5924550720332688e-05, "loss": 0.0568, "step": 31590 }, { "epoch": 2.3466508242982327, "grad_norm": 1.2258661985397339, "learning_rate": 1.5920095054210607e-05, "loss": 0.0553, "step": 31600 }, { "epoch": 2.34739343531858, "grad_norm": 1.5101720094680786, "learning_rate": 1.591563938808852e-05, "loss": 0.0631, "step": 31610 }, { "epoch": 2.3481360463389276, "grad_norm": 1.2098814249038696, "learning_rate": 1.5911183721966433e-05, "loss": 0.0588, "step": 31620 }, { "epoch": 2.348878657359275, "grad_norm": 0.48147693276405334, "learning_rate": 1.5906728055844348e-05, "loss": 0.0648, "step": 31630 }, { "epoch": 2.349621268379623, "grad_norm": 1.1367077827453613, "learning_rate": 1.5902272389722267e-05, "loss": 0.0493, "step": 31640 }, { "epoch": 2.3503638793999704, "grad_norm": 1.2265082597732544, "learning_rate": 1.5897816723600178e-05, "loss": 0.067, "step": 31650 }, { "epoch": 2.351106490420318, "grad_norm": 3.4290337562561035, "learning_rate": 1.5893361057478093e-05, "loss": 0.0638, "step": 31660 }, { "epoch": 2.3518491014406653, "grad_norm": 1.1333622932434082, "learning_rate": 1.5888905391356008e-05, "loss": 0.0814, "step": 31670 }, { "epoch": 2.3525917124610127, "grad_norm": 1.8252911567687988, "learning_rate": 1.5884449725233923e-05, "loss": 0.0561, "step": 31680 }, { "epoch": 2.3533343234813606, "grad_norm": 0.9480688571929932, "learning_rate": 1.5879994059111838e-05, "loss": 0.0603, "step": 31690 }, { "epoch": 2.354076934501708, "grad_norm": 1.576391339302063, "learning_rate": 1.587553839298975e-05, "loss": 0.0568, "step": 31700 }, { "epoch": 2.3548195455220555, "grad_norm": 0.9777421951293945, "learning_rate": 1.5871082726867668e-05, "loss": 0.0481, "step": 31710 }, { "epoch": 2.355562156542403, "grad_norm": 1.60342538356781, "learning_rate": 1.5866627060745583e-05, "loss": 0.0618, "step": 31720 }, { "epoch": 2.356304767562751, "grad_norm": 1.1398688554763794, "learning_rate": 1.5862171394623495e-05, "loss": 0.0554, "step": 31730 }, { "epoch": 2.3570473785830983, "grad_norm": 0.9877955913543701, "learning_rate": 1.5857715728501413e-05, "loss": 0.0851, "step": 31740 }, { "epoch": 2.3577899896034458, "grad_norm": 1.1725736856460571, "learning_rate": 1.5853260062379325e-05, "loss": 0.0643, "step": 31750 }, { "epoch": 2.358532600623793, "grad_norm": 1.4532411098480225, "learning_rate": 1.584880439625724e-05, "loss": 0.057, "step": 31760 }, { "epoch": 2.3592752116441407, "grad_norm": 1.950925350189209, "learning_rate": 1.5844348730135155e-05, "loss": 0.039, "step": 31770 }, { "epoch": 2.3600178226644886, "grad_norm": 4.71244478225708, "learning_rate": 1.583989306401307e-05, "loss": 0.0499, "step": 31780 }, { "epoch": 2.360760433684836, "grad_norm": 3.2281157970428467, "learning_rate": 1.5835437397890985e-05, "loss": 0.0861, "step": 31790 }, { "epoch": 2.3615030447051835, "grad_norm": 1.568460464477539, "learning_rate": 1.58309817317689e-05, "loss": 0.0689, "step": 31800 }, { "epoch": 2.362245655725531, "grad_norm": 0.7501446008682251, "learning_rate": 1.5826526065646815e-05, "loss": 0.0754, "step": 31810 }, { "epoch": 2.3629882667458784, "grad_norm": 1.1231635808944702, "learning_rate": 1.582207039952473e-05, "loss": 0.088, "step": 31820 }, { "epoch": 2.3637308777662263, "grad_norm": 1.5443603992462158, "learning_rate": 1.5817614733402645e-05, "loss": 0.0677, "step": 31830 }, { "epoch": 2.3644734887865737, "grad_norm": 0.9648088216781616, "learning_rate": 1.581315906728056e-05, "loss": 0.0883, "step": 31840 }, { "epoch": 2.365216099806921, "grad_norm": 1.1336426734924316, "learning_rate": 1.5808703401158475e-05, "loss": 0.0545, "step": 31850 }, { "epoch": 2.3659587108272686, "grad_norm": 1.6498336791992188, "learning_rate": 1.5804247735036386e-05, "loss": 0.067, "step": 31860 }, { "epoch": 2.366701321847616, "grad_norm": 2.2792677879333496, "learning_rate": 1.57997920689143e-05, "loss": 0.0773, "step": 31870 }, { "epoch": 2.367443932867964, "grad_norm": 0.7917251586914062, "learning_rate": 1.579533640279222e-05, "loss": 0.0453, "step": 31880 }, { "epoch": 2.3681865438883114, "grad_norm": 1.3908026218414307, "learning_rate": 1.579088073667013e-05, "loss": 0.0596, "step": 31890 }, { "epoch": 2.368929154908659, "grad_norm": 0.8732894659042358, "learning_rate": 1.5786425070548046e-05, "loss": 0.0487, "step": 31900 }, { "epoch": 2.3696717659290063, "grad_norm": 0.30048489570617676, "learning_rate": 1.5781969404425965e-05, "loss": 0.0723, "step": 31910 }, { "epoch": 2.3704143769493538, "grad_norm": 1.2894927263259888, "learning_rate": 1.5777513738303876e-05, "loss": 0.0716, "step": 31920 }, { "epoch": 2.3711569879697016, "grad_norm": 0.718590259552002, "learning_rate": 1.577305807218179e-05, "loss": 0.0684, "step": 31930 }, { "epoch": 2.371899598990049, "grad_norm": 1.6592215299606323, "learning_rate": 1.5768602406059706e-05, "loss": 0.0799, "step": 31940 }, { "epoch": 2.3726422100103965, "grad_norm": 1.2301288843154907, "learning_rate": 1.576414673993762e-05, "loss": 0.0526, "step": 31950 }, { "epoch": 2.373384821030744, "grad_norm": 1.742710828781128, "learning_rate": 1.5759691073815536e-05, "loss": 0.051, "step": 31960 }, { "epoch": 2.3741274320510914, "grad_norm": 0.8959303498268127, "learning_rate": 1.5755235407693448e-05, "loss": 0.0439, "step": 31970 }, { "epoch": 2.3748700430714393, "grad_norm": 2.8201303482055664, "learning_rate": 1.5750779741571366e-05, "loss": 0.0608, "step": 31980 }, { "epoch": 2.375612654091787, "grad_norm": 1.701446294784546, "learning_rate": 1.574632407544928e-05, "loss": 0.0872, "step": 31990 }, { "epoch": 2.3763552651121342, "grad_norm": 0.7672728300094604, "learning_rate": 1.5741868409327193e-05, "loss": 0.0975, "step": 32000 }, { "epoch": 2.3770978761324817, "grad_norm": 2.049514055252075, "learning_rate": 1.573741274320511e-05, "loss": 0.0846, "step": 32010 }, { "epoch": 2.377840487152829, "grad_norm": 1.193021535873413, "learning_rate": 1.5732957077083026e-05, "loss": 0.0705, "step": 32020 }, { "epoch": 2.378583098173177, "grad_norm": 2.212050676345825, "learning_rate": 1.5728501410960938e-05, "loss": 0.0715, "step": 32030 }, { "epoch": 2.3793257091935245, "grad_norm": 1.0855233669281006, "learning_rate": 1.5724045744838853e-05, "loss": 0.0644, "step": 32040 }, { "epoch": 2.380068320213872, "grad_norm": 2.6349453926086426, "learning_rate": 1.571959007871677e-05, "loss": 0.0604, "step": 32050 }, { "epoch": 2.3808109312342194, "grad_norm": 0.948853611946106, "learning_rate": 1.5715134412594683e-05, "loss": 0.0721, "step": 32060 }, { "epoch": 2.381553542254567, "grad_norm": 1.4371938705444336, "learning_rate": 1.5710678746472598e-05, "loss": 0.0499, "step": 32070 }, { "epoch": 2.3822961532749147, "grad_norm": 0.6612533926963806, "learning_rate": 1.5706223080350513e-05, "loss": 0.0827, "step": 32080 }, { "epoch": 2.383038764295262, "grad_norm": 3.743394136428833, "learning_rate": 1.5701767414228428e-05, "loss": 0.0729, "step": 32090 }, { "epoch": 2.3837813753156096, "grad_norm": 1.6435579061508179, "learning_rate": 1.5697311748106343e-05, "loss": 0.0618, "step": 32100 }, { "epoch": 2.384523986335957, "grad_norm": 2.4289140701293945, "learning_rate": 1.5692856081984254e-05, "loss": 0.0822, "step": 32110 }, { "epoch": 2.3852665973563045, "grad_norm": 0.4796588122844696, "learning_rate": 1.5688400415862173e-05, "loss": 0.0544, "step": 32120 }, { "epoch": 2.3860092083766524, "grad_norm": 2.2078115940093994, "learning_rate": 1.5683944749740088e-05, "loss": 0.0714, "step": 32130 }, { "epoch": 2.386751819397, "grad_norm": 1.0921403169631958, "learning_rate": 1.5679489083618e-05, "loss": 0.0562, "step": 32140 }, { "epoch": 2.3874944304173473, "grad_norm": 2.1888418197631836, "learning_rate": 1.5675033417495918e-05, "loss": 0.0794, "step": 32150 }, { "epoch": 2.3882370414376948, "grad_norm": 2.4097537994384766, "learning_rate": 1.5670577751373833e-05, "loss": 0.1025, "step": 32160 }, { "epoch": 2.3889796524580427, "grad_norm": 2.782663583755493, "learning_rate": 1.5666122085251744e-05, "loss": 0.0667, "step": 32170 }, { "epoch": 2.38972226347839, "grad_norm": 2.659151554107666, "learning_rate": 1.5661666419129663e-05, "loss": 0.0515, "step": 32180 }, { "epoch": 2.3904648744987376, "grad_norm": 1.7082628011703491, "learning_rate": 1.5657210753007574e-05, "loss": 0.1092, "step": 32190 }, { "epoch": 2.391207485519085, "grad_norm": 1.8074676990509033, "learning_rate": 1.565275508688549e-05, "loss": 0.0675, "step": 32200 }, { "epoch": 2.3919500965394325, "grad_norm": 1.0085229873657227, "learning_rate": 1.5648299420763404e-05, "loss": 0.1148, "step": 32210 }, { "epoch": 2.3926927075597804, "grad_norm": 4.384804725646973, "learning_rate": 1.564384375464132e-05, "loss": 0.0551, "step": 32220 }, { "epoch": 2.393435318580128, "grad_norm": 1.1402451992034912, "learning_rate": 1.5639388088519234e-05, "loss": 0.0603, "step": 32230 }, { "epoch": 2.3941779296004753, "grad_norm": 0.9392279386520386, "learning_rate": 1.563493242239715e-05, "loss": 0.0836, "step": 32240 }, { "epoch": 2.3949205406208227, "grad_norm": 1.5614676475524902, "learning_rate": 1.5630476756275064e-05, "loss": 0.0956, "step": 32250 }, { "epoch": 2.39566315164117, "grad_norm": 1.321561574935913, "learning_rate": 1.562602109015298e-05, "loss": 0.0715, "step": 32260 }, { "epoch": 2.396405762661518, "grad_norm": 0.910446047782898, "learning_rate": 1.562156542403089e-05, "loss": 0.0434, "step": 32270 }, { "epoch": 2.3971483736818655, "grad_norm": 0.8245983123779297, "learning_rate": 1.5617109757908806e-05, "loss": 0.076, "step": 32280 }, { "epoch": 2.397890984702213, "grad_norm": 1.7808678150177002, "learning_rate": 1.5612654091786724e-05, "loss": 0.0731, "step": 32290 }, { "epoch": 2.3986335957225604, "grad_norm": 2.991241455078125, "learning_rate": 1.5608198425664636e-05, "loss": 0.087, "step": 32300 }, { "epoch": 2.3993762067429083, "grad_norm": 2.8807268142700195, "learning_rate": 1.560374275954255e-05, "loss": 0.0876, "step": 32310 }, { "epoch": 2.4001188177632558, "grad_norm": 0.5517368316650391, "learning_rate": 1.559928709342047e-05, "loss": 0.0773, "step": 32320 }, { "epoch": 2.400861428783603, "grad_norm": 1.6342768669128418, "learning_rate": 1.559483142729838e-05, "loss": 0.0609, "step": 32330 }, { "epoch": 2.4016040398039507, "grad_norm": 5.523561954498291, "learning_rate": 1.5590375761176296e-05, "loss": 0.0431, "step": 32340 }, { "epoch": 2.402346650824298, "grad_norm": 0.7796204090118408, "learning_rate": 1.558592009505421e-05, "loss": 0.0905, "step": 32350 }, { "epoch": 2.403089261844646, "grad_norm": 2.9090576171875, "learning_rate": 1.5581464428932126e-05, "loss": 0.0998, "step": 32360 }, { "epoch": 2.4038318728649934, "grad_norm": 0.7734149098396301, "learning_rate": 1.557700876281004e-05, "loss": 0.0585, "step": 32370 }, { "epoch": 2.404574483885341, "grad_norm": 1.852062702178955, "learning_rate": 1.5572553096687952e-05, "loss": 0.1048, "step": 32380 }, { "epoch": 2.4053170949056883, "grad_norm": 0.8608161807060242, "learning_rate": 1.556809743056587e-05, "loss": 0.0548, "step": 32390 }, { "epoch": 2.406059705926036, "grad_norm": 3.039947032928467, "learning_rate": 1.5563641764443786e-05, "loss": 0.0854, "step": 32400 }, { "epoch": 2.4068023169463837, "grad_norm": 2.2535059452056885, "learning_rate": 1.5559186098321697e-05, "loss": 0.0921, "step": 32410 }, { "epoch": 2.407544927966731, "grad_norm": 3.556171417236328, "learning_rate": 1.5554730432199616e-05, "loss": 0.0632, "step": 32420 }, { "epoch": 2.4082875389870786, "grad_norm": 2.6815476417541504, "learning_rate": 1.555027476607753e-05, "loss": 0.0689, "step": 32430 }, { "epoch": 2.409030150007426, "grad_norm": 0.38437405228614807, "learning_rate": 1.5545819099955442e-05, "loss": 0.0491, "step": 32440 }, { "epoch": 2.4097727610277735, "grad_norm": 1.429911494255066, "learning_rate": 1.5541363433833357e-05, "loss": 0.0423, "step": 32450 }, { "epoch": 2.4105153720481214, "grad_norm": 1.3753142356872559, "learning_rate": 1.5536907767711276e-05, "loss": 0.0816, "step": 32460 }, { "epoch": 2.411257983068469, "grad_norm": 1.1020511388778687, "learning_rate": 1.5532452101589187e-05, "loss": 0.0403, "step": 32470 }, { "epoch": 2.4120005940888163, "grad_norm": 3.2163803577423096, "learning_rate": 1.5527996435467102e-05, "loss": 0.09, "step": 32480 }, { "epoch": 2.4127432051091637, "grad_norm": 0.8695268630981445, "learning_rate": 1.5523540769345017e-05, "loss": 0.0565, "step": 32490 }, { "epoch": 2.413485816129511, "grad_norm": 0.9537298083305359, "learning_rate": 1.5519085103222932e-05, "loss": 0.0913, "step": 32500 }, { "epoch": 2.414228427149859, "grad_norm": 1.1136554479599, "learning_rate": 1.5514629437100847e-05, "loss": 0.0829, "step": 32510 }, { "epoch": 2.4149710381702065, "grad_norm": 1.2654924392700195, "learning_rate": 1.551017377097876e-05, "loss": 0.0915, "step": 32520 }, { "epoch": 2.415713649190554, "grad_norm": 3.5749497413635254, "learning_rate": 1.5505718104856677e-05, "loss": 0.0921, "step": 32530 }, { "epoch": 2.4164562602109014, "grad_norm": 1.6017907857894897, "learning_rate": 1.5501262438734592e-05, "loss": 0.0645, "step": 32540 }, { "epoch": 2.417198871231249, "grad_norm": 2.2795519828796387, "learning_rate": 1.5496806772612504e-05, "loss": 0.0624, "step": 32550 }, { "epoch": 2.4179414822515968, "grad_norm": 1.48874831199646, "learning_rate": 1.5492351106490422e-05, "loss": 0.0667, "step": 32560 }, { "epoch": 2.4186840932719442, "grad_norm": 3.069807529449463, "learning_rate": 1.5487895440368337e-05, "loss": 0.0767, "step": 32570 }, { "epoch": 2.4194267042922917, "grad_norm": 0.8146727085113525, "learning_rate": 1.548343977424625e-05, "loss": 0.09, "step": 32580 }, { "epoch": 2.420169315312639, "grad_norm": 2.0153746604919434, "learning_rate": 1.5478984108124167e-05, "loss": 0.0834, "step": 32590 }, { "epoch": 2.4209119263329866, "grad_norm": 1.9810839891433716, "learning_rate": 1.547452844200208e-05, "loss": 0.0843, "step": 32600 }, { "epoch": 2.4216545373533345, "grad_norm": 1.213164210319519, "learning_rate": 1.5470072775879994e-05, "loss": 0.0798, "step": 32610 }, { "epoch": 2.422397148373682, "grad_norm": 0.5485877394676208, "learning_rate": 1.546561710975791e-05, "loss": 0.0396, "step": 32620 }, { "epoch": 2.4231397593940294, "grad_norm": 1.8385777473449707, "learning_rate": 1.5461161443635824e-05, "loss": 0.0538, "step": 32630 }, { "epoch": 2.423882370414377, "grad_norm": 2.222101926803589, "learning_rate": 1.545670577751374e-05, "loss": 0.0494, "step": 32640 }, { "epoch": 2.4246249814347243, "grad_norm": 0.7490872740745544, "learning_rate": 1.5452250111391654e-05, "loss": 0.0645, "step": 32650 }, { "epoch": 2.425367592455072, "grad_norm": 1.1471827030181885, "learning_rate": 1.544779444526957e-05, "loss": 0.0842, "step": 32660 }, { "epoch": 2.4261102034754196, "grad_norm": 1.6201061010360718, "learning_rate": 1.5443338779147484e-05, "loss": 0.0834, "step": 32670 }, { "epoch": 2.426852814495767, "grad_norm": 1.3933659791946411, "learning_rate": 1.54388831130254e-05, "loss": 0.0674, "step": 32680 }, { "epoch": 2.4275954255161145, "grad_norm": 1.0082453489303589, "learning_rate": 1.543442744690331e-05, "loss": 0.043, "step": 32690 }, { "epoch": 2.428338036536462, "grad_norm": 0.7856671214103699, "learning_rate": 1.542997178078123e-05, "loss": 0.048, "step": 32700 }, { "epoch": 2.42908064755681, "grad_norm": 0.4259887635707855, "learning_rate": 1.542551611465914e-05, "loss": 0.0641, "step": 32710 }, { "epoch": 2.4298232585771573, "grad_norm": 0.42960453033447266, "learning_rate": 1.5421060448537055e-05, "loss": 0.0755, "step": 32720 }, { "epoch": 2.4305658695975048, "grad_norm": 3.8865599632263184, "learning_rate": 1.5416604782414974e-05, "loss": 0.0687, "step": 32730 }, { "epoch": 2.431308480617852, "grad_norm": 3.587674140930176, "learning_rate": 1.5412149116292885e-05, "loss": 0.0657, "step": 32740 }, { "epoch": 2.4320510916382, "grad_norm": 1.8166769742965698, "learning_rate": 1.54076934501708e-05, "loss": 0.0776, "step": 32750 }, { "epoch": 2.4327937026585476, "grad_norm": 2.718137264251709, "learning_rate": 1.5403237784048715e-05, "loss": 0.0761, "step": 32760 }, { "epoch": 2.433536313678895, "grad_norm": 1.776787281036377, "learning_rate": 1.539878211792663e-05, "loss": 0.0838, "step": 32770 }, { "epoch": 2.4342789246992425, "grad_norm": 0.9153753519058228, "learning_rate": 1.5394326451804545e-05, "loss": 0.065, "step": 32780 }, { "epoch": 2.43502153571959, "grad_norm": 1.0639044046401978, "learning_rate": 1.5389870785682457e-05, "loss": 0.0608, "step": 32790 }, { "epoch": 2.435764146739938, "grad_norm": 1.5037258863449097, "learning_rate": 1.5385415119560375e-05, "loss": 0.067, "step": 32800 }, { "epoch": 2.4365067577602852, "grad_norm": 1.892593502998352, "learning_rate": 1.538095945343829e-05, "loss": 0.061, "step": 32810 }, { "epoch": 2.4372493687806327, "grad_norm": 3.2514467239379883, "learning_rate": 1.5376503787316202e-05, "loss": 0.0801, "step": 32820 }, { "epoch": 2.43799197980098, "grad_norm": 1.7820117473602295, "learning_rate": 1.537204812119412e-05, "loss": 0.0887, "step": 32830 }, { "epoch": 2.4387345908213276, "grad_norm": 1.159784197807312, "learning_rate": 1.5367592455072035e-05, "loss": 0.0719, "step": 32840 }, { "epoch": 2.4394772018416755, "grad_norm": 2.0374605655670166, "learning_rate": 1.5363136788949947e-05, "loss": 0.0854, "step": 32850 }, { "epoch": 2.440219812862023, "grad_norm": 2.429708957672119, "learning_rate": 1.5358681122827862e-05, "loss": 0.0552, "step": 32860 }, { "epoch": 2.4409624238823704, "grad_norm": 2.368227243423462, "learning_rate": 1.535422545670578e-05, "loss": 0.07, "step": 32870 }, { "epoch": 2.441705034902718, "grad_norm": 0.7669575214385986, "learning_rate": 1.5349769790583692e-05, "loss": 0.0622, "step": 32880 }, { "epoch": 2.4424476459230657, "grad_norm": 0.9094696044921875, "learning_rate": 1.5345314124461607e-05, "loss": 0.0522, "step": 32890 }, { "epoch": 2.443190256943413, "grad_norm": 1.134655237197876, "learning_rate": 1.5340858458339522e-05, "loss": 0.0789, "step": 32900 }, { "epoch": 2.4439328679637606, "grad_norm": 1.5787122249603271, "learning_rate": 1.5336402792217437e-05, "loss": 0.0823, "step": 32910 }, { "epoch": 2.444675478984108, "grad_norm": 1.5267248153686523, "learning_rate": 1.5331947126095352e-05, "loss": 0.0557, "step": 32920 }, { "epoch": 2.4454180900044555, "grad_norm": 1.0408943891525269, "learning_rate": 1.5327491459973264e-05, "loss": 0.0481, "step": 32930 }, { "epoch": 2.4461607010248034, "grad_norm": 0.36609914898872375, "learning_rate": 1.5323035793851182e-05, "loss": 0.0744, "step": 32940 }, { "epoch": 2.446903312045151, "grad_norm": 0.8200104832649231, "learning_rate": 1.5318580127729097e-05, "loss": 0.087, "step": 32950 }, { "epoch": 2.4476459230654983, "grad_norm": 1.5445940494537354, "learning_rate": 1.531412446160701e-05, "loss": 0.0407, "step": 32960 }, { "epoch": 2.448388534085846, "grad_norm": 1.621883749961853, "learning_rate": 1.5309668795484927e-05, "loss": 0.0679, "step": 32970 }, { "epoch": 2.4491311451061932, "grad_norm": 0.8579855561256409, "learning_rate": 1.5305213129362842e-05, "loss": 0.0799, "step": 32980 }, { "epoch": 2.449873756126541, "grad_norm": 1.6563255786895752, "learning_rate": 1.5300757463240754e-05, "loss": 0.0634, "step": 32990 }, { "epoch": 2.4506163671468886, "grad_norm": 0.7070105671882629, "learning_rate": 1.5296301797118672e-05, "loss": 0.0956, "step": 33000 }, { "epoch": 2.451358978167236, "grad_norm": 2.0392887592315674, "learning_rate": 1.5291846130996584e-05, "loss": 0.0828, "step": 33010 }, { "epoch": 2.4521015891875835, "grad_norm": 1.0714646577835083, "learning_rate": 1.52873904648745e-05, "loss": 0.0737, "step": 33020 }, { "epoch": 2.452844200207931, "grad_norm": 1.0105502605438232, "learning_rate": 1.5282934798752413e-05, "loss": 0.0637, "step": 33030 }, { "epoch": 2.453586811228279, "grad_norm": 2.0838091373443604, "learning_rate": 1.527847913263033e-05, "loss": 0.0807, "step": 33040 }, { "epoch": 2.4543294222486263, "grad_norm": 3.914405107498169, "learning_rate": 1.5274023466508243e-05, "loss": 0.0905, "step": 33050 }, { "epoch": 2.4550720332689737, "grad_norm": 0.8861109018325806, "learning_rate": 1.526956780038616e-05, "loss": 0.0531, "step": 33060 }, { "epoch": 2.455814644289321, "grad_norm": 1.2929595708847046, "learning_rate": 1.5265112134264073e-05, "loss": 0.068, "step": 33070 }, { "epoch": 2.4565572553096686, "grad_norm": 1.6953091621398926, "learning_rate": 1.526065646814199e-05, "loss": 0.0674, "step": 33080 }, { "epoch": 2.4572998663300165, "grad_norm": 1.0414247512817383, "learning_rate": 1.5256200802019903e-05, "loss": 0.0852, "step": 33090 }, { "epoch": 2.458042477350364, "grad_norm": 1.8523513078689575, "learning_rate": 1.5251745135897817e-05, "loss": 0.0707, "step": 33100 }, { "epoch": 2.4587850883707114, "grad_norm": 1.1955831050872803, "learning_rate": 1.5247289469775732e-05, "loss": 0.0472, "step": 33110 }, { "epoch": 2.459527699391059, "grad_norm": 2.124166250228882, "learning_rate": 1.5242833803653645e-05, "loss": 0.0604, "step": 33120 }, { "epoch": 2.4602703104114063, "grad_norm": 2.213921070098877, "learning_rate": 1.5238378137531562e-05, "loss": 0.0677, "step": 33130 }, { "epoch": 2.461012921431754, "grad_norm": 0.6650950312614441, "learning_rate": 1.5233922471409477e-05, "loss": 0.0743, "step": 33140 }, { "epoch": 2.4617555324521017, "grad_norm": 0.972344160079956, "learning_rate": 1.522946680528739e-05, "loss": 0.0534, "step": 33150 }, { "epoch": 2.462498143472449, "grad_norm": 0.7517353296279907, "learning_rate": 1.5225011139165305e-05, "loss": 0.0736, "step": 33160 }, { "epoch": 2.4632407544927966, "grad_norm": 2.8627500534057617, "learning_rate": 1.5220555473043222e-05, "loss": 0.0417, "step": 33170 }, { "epoch": 2.463983365513144, "grad_norm": 0.2295779585838318, "learning_rate": 1.5216099806921135e-05, "loss": 0.0424, "step": 33180 }, { "epoch": 2.464725976533492, "grad_norm": 1.0014530420303345, "learning_rate": 1.521164414079905e-05, "loss": 0.0509, "step": 33190 }, { "epoch": 2.4654685875538394, "grad_norm": 0.8116422295570374, "learning_rate": 1.5207188474676963e-05, "loss": 0.0773, "step": 33200 }, { "epoch": 2.466211198574187, "grad_norm": 2.368131399154663, "learning_rate": 1.5202732808554878e-05, "loss": 0.0643, "step": 33210 }, { "epoch": 2.4669538095945343, "grad_norm": 0.573657751083374, "learning_rate": 1.5198277142432795e-05, "loss": 0.0482, "step": 33220 }, { "epoch": 2.4676964206148817, "grad_norm": 2.1423192024230957, "learning_rate": 1.5193821476310708e-05, "loss": 0.0896, "step": 33230 }, { "epoch": 2.4684390316352296, "grad_norm": 0.7535884976387024, "learning_rate": 1.5189365810188623e-05, "loss": 0.0625, "step": 33240 }, { "epoch": 2.469181642655577, "grad_norm": 1.2931323051452637, "learning_rate": 1.518491014406654e-05, "loss": 0.053, "step": 33250 }, { "epoch": 2.4699242536759245, "grad_norm": 2.7069478034973145, "learning_rate": 1.5180454477944452e-05, "loss": 0.0806, "step": 33260 }, { "epoch": 2.470666864696272, "grad_norm": 0.8650771379470825, "learning_rate": 1.5175998811822368e-05, "loss": 0.0689, "step": 33270 }, { "epoch": 2.4714094757166194, "grad_norm": 3.503204345703125, "learning_rate": 1.5171543145700283e-05, "loss": 0.0681, "step": 33280 }, { "epoch": 2.4721520867369673, "grad_norm": 0.5587957501411438, "learning_rate": 1.5167087479578197e-05, "loss": 0.0648, "step": 33290 }, { "epoch": 2.4728946977573147, "grad_norm": 2.2347841262817383, "learning_rate": 1.5162631813456113e-05, "loss": 0.0624, "step": 33300 }, { "epoch": 2.473637308777662, "grad_norm": 3.54778790473938, "learning_rate": 1.5158176147334025e-05, "loss": 0.0632, "step": 33310 }, { "epoch": 2.4743799197980096, "grad_norm": 1.227449893951416, "learning_rate": 1.5153720481211942e-05, "loss": 0.0717, "step": 33320 }, { "epoch": 2.4751225308183575, "grad_norm": 1.61305570602417, "learning_rate": 1.5149264815089857e-05, "loss": 0.0751, "step": 33330 }, { "epoch": 2.475865141838705, "grad_norm": 0.8405054807662964, "learning_rate": 1.514480914896777e-05, "loss": 0.0638, "step": 33340 }, { "epoch": 2.4766077528590524, "grad_norm": 0.8352612257003784, "learning_rate": 1.5140353482845687e-05, "loss": 0.0644, "step": 33350 }, { "epoch": 2.4773503638794, "grad_norm": 3.4174530506134033, "learning_rate": 1.5135897816723602e-05, "loss": 0.0562, "step": 33360 }, { "epoch": 2.4780929748997473, "grad_norm": 1.8341305255889893, "learning_rate": 1.5131442150601515e-05, "loss": 0.0558, "step": 33370 }, { "epoch": 2.4788355859200952, "grad_norm": 2.0367071628570557, "learning_rate": 1.512698648447943e-05, "loss": 0.0765, "step": 33380 }, { "epoch": 2.4795781969404427, "grad_norm": 2.6080222129821777, "learning_rate": 1.5122530818357347e-05, "loss": 0.0549, "step": 33390 }, { "epoch": 2.48032080796079, "grad_norm": 2.060661792755127, "learning_rate": 1.5118075152235258e-05, "loss": 0.0727, "step": 33400 }, { "epoch": 2.4810634189811376, "grad_norm": 0.36137092113494873, "learning_rate": 1.5113619486113175e-05, "loss": 0.0718, "step": 33410 }, { "epoch": 2.481806030001485, "grad_norm": 0.67535001039505, "learning_rate": 1.5109163819991088e-05, "loss": 0.0842, "step": 33420 }, { "epoch": 2.482548641021833, "grad_norm": 1.2539730072021484, "learning_rate": 1.5104708153869003e-05, "loss": 0.107, "step": 33430 }, { "epoch": 2.4832912520421804, "grad_norm": 1.4725462198257446, "learning_rate": 1.510025248774692e-05, "loss": 0.0611, "step": 33440 }, { "epoch": 2.484033863062528, "grad_norm": 2.846672534942627, "learning_rate": 1.5095796821624831e-05, "loss": 0.1161, "step": 33450 }, { "epoch": 2.4847764740828753, "grad_norm": 3.4515435695648193, "learning_rate": 1.5091341155502748e-05, "loss": 0.0771, "step": 33460 }, { "epoch": 2.485519085103223, "grad_norm": 2.5990917682647705, "learning_rate": 1.5086885489380665e-05, "loss": 0.0759, "step": 33470 }, { "epoch": 2.4862616961235706, "grad_norm": 0.4841431975364685, "learning_rate": 1.5082429823258576e-05, "loss": 0.0585, "step": 33480 }, { "epoch": 2.487004307143918, "grad_norm": 1.1416738033294678, "learning_rate": 1.5077974157136493e-05, "loss": 0.0593, "step": 33490 }, { "epoch": 2.4877469181642655, "grad_norm": 1.2249414920806885, "learning_rate": 1.5073518491014408e-05, "loss": 0.079, "step": 33500 }, { "epoch": 2.488489529184613, "grad_norm": 2.8479864597320557, "learning_rate": 1.5069062824892321e-05, "loss": 0.0688, "step": 33510 }, { "epoch": 2.489232140204961, "grad_norm": 0.494975745677948, "learning_rate": 1.5064607158770236e-05, "loss": 0.0848, "step": 33520 }, { "epoch": 2.4899747512253083, "grad_norm": 0.8648329973220825, "learning_rate": 1.506015149264815e-05, "loss": 0.079, "step": 33530 }, { "epoch": 2.4907173622456558, "grad_norm": 2.451526641845703, "learning_rate": 1.5055695826526066e-05, "loss": 0.0679, "step": 33540 }, { "epoch": 2.491459973266003, "grad_norm": 0.8754955530166626, "learning_rate": 1.5051240160403981e-05, "loss": 0.0766, "step": 33550 }, { "epoch": 2.4922025842863507, "grad_norm": 2.0408942699432373, "learning_rate": 1.5046784494281895e-05, "loss": 0.0696, "step": 33560 }, { "epoch": 2.4929451953066986, "grad_norm": 3.6392688751220703, "learning_rate": 1.504232882815981e-05, "loss": 0.0703, "step": 33570 }, { "epoch": 2.493687806327046, "grad_norm": 1.6540186405181885, "learning_rate": 1.5037873162037726e-05, "loss": 0.0852, "step": 33580 }, { "epoch": 2.4944304173473935, "grad_norm": 1.6503866910934448, "learning_rate": 1.503341749591564e-05, "loss": 0.0865, "step": 33590 }, { "epoch": 2.495173028367741, "grad_norm": 0.8147965669631958, "learning_rate": 1.5028961829793555e-05, "loss": 0.0757, "step": 33600 }, { "epoch": 2.4959156393880884, "grad_norm": 2.5200459957122803, "learning_rate": 1.5024506163671471e-05, "loss": 0.0582, "step": 33610 }, { "epoch": 2.4966582504084363, "grad_norm": 2.8182950019836426, "learning_rate": 1.5020050497549383e-05, "loss": 0.0855, "step": 33620 }, { "epoch": 2.4974008614287837, "grad_norm": 0.40025582909584045, "learning_rate": 1.50155948314273e-05, "loss": 0.0717, "step": 33630 }, { "epoch": 2.498143472449131, "grad_norm": 1.4286984205245972, "learning_rate": 1.5011139165305213e-05, "loss": 0.0483, "step": 33640 }, { "epoch": 2.4988860834694786, "grad_norm": 0.4235764145851135, "learning_rate": 1.5006683499183128e-05, "loss": 0.0372, "step": 33650 }, { "epoch": 2.499628694489826, "grad_norm": 1.5366827249526978, "learning_rate": 1.5002227833061045e-05, "loss": 0.0574, "step": 33660 }, { "epoch": 2.500371305510174, "grad_norm": 1.9339724779129028, "learning_rate": 1.4997772166938958e-05, "loss": 0.0612, "step": 33670 }, { "epoch": 2.5011139165305214, "grad_norm": 2.4985029697418213, "learning_rate": 1.4993316500816873e-05, "loss": 0.0724, "step": 33680 }, { "epoch": 2.501856527550869, "grad_norm": 1.3384310007095337, "learning_rate": 1.4988860834694786e-05, "loss": 0.0636, "step": 33690 }, { "epoch": 2.5025991385712163, "grad_norm": 0.9728517532348633, "learning_rate": 1.4984405168572701e-05, "loss": 0.0611, "step": 33700 }, { "epoch": 2.5033417495915637, "grad_norm": 1.3432775735855103, "learning_rate": 1.4979949502450618e-05, "loss": 0.0541, "step": 33710 }, { "epoch": 2.5040843606119116, "grad_norm": 0.8654043674468994, "learning_rate": 1.4975493836328531e-05, "loss": 0.065, "step": 33720 }, { "epoch": 2.504826971632259, "grad_norm": 0.9931495785713196, "learning_rate": 1.4971038170206446e-05, "loss": 0.0447, "step": 33730 }, { "epoch": 2.5055695826526065, "grad_norm": 0.5981758832931519, "learning_rate": 1.4966582504084361e-05, "loss": 0.0511, "step": 33740 }, { "epoch": 2.506312193672954, "grad_norm": 1.3194613456726074, "learning_rate": 1.4962126837962276e-05, "loss": 0.0902, "step": 33750 }, { "epoch": 2.5070548046933014, "grad_norm": 1.7330251932144165, "learning_rate": 1.4957671171840191e-05, "loss": 0.0522, "step": 33760 }, { "epoch": 2.5077974157136493, "grad_norm": 0.47713515162467957, "learning_rate": 1.4953215505718104e-05, "loss": 0.0676, "step": 33770 }, { "epoch": 2.508540026733997, "grad_norm": 2.3502461910247803, "learning_rate": 1.4948759839596021e-05, "loss": 0.0537, "step": 33780 }, { "epoch": 2.5092826377543442, "grad_norm": 0.9015212655067444, "learning_rate": 1.4944304173473934e-05, "loss": 0.0861, "step": 33790 }, { "epoch": 2.5100252487746917, "grad_norm": 0.7821179628372192, "learning_rate": 1.493984850735185e-05, "loss": 0.05, "step": 33800 }, { "epoch": 2.510767859795039, "grad_norm": 0.5906331539154053, "learning_rate": 1.4935392841229764e-05, "loss": 0.0414, "step": 33810 }, { "epoch": 2.511510470815387, "grad_norm": 3.766472101211548, "learning_rate": 1.493093717510768e-05, "loss": 0.087, "step": 33820 }, { "epoch": 2.5122530818357345, "grad_norm": 0.8629242777824402, "learning_rate": 1.4926481508985594e-05, "loss": 0.0452, "step": 33830 }, { "epoch": 2.512995692856082, "grad_norm": 0.4955524504184723, "learning_rate": 1.4922025842863508e-05, "loss": 0.0487, "step": 33840 }, { "epoch": 2.5137383038764294, "grad_norm": 1.7956591844558716, "learning_rate": 1.4917570176741423e-05, "loss": 0.0838, "step": 33850 }, { "epoch": 2.514480914896777, "grad_norm": 1.9371930360794067, "learning_rate": 1.4913114510619338e-05, "loss": 0.0987, "step": 33860 }, { "epoch": 2.5152235259171247, "grad_norm": 0.846228837966919, "learning_rate": 1.4908658844497253e-05, "loss": 0.0762, "step": 33870 }, { "epoch": 2.515966136937472, "grad_norm": 2.3206639289855957, "learning_rate": 1.4904203178375168e-05, "loss": 0.0978, "step": 33880 }, { "epoch": 2.5167087479578196, "grad_norm": 1.545559048652649, "learning_rate": 1.4899747512253083e-05, "loss": 0.0446, "step": 33890 }, { "epoch": 2.517451358978167, "grad_norm": 1.571700930595398, "learning_rate": 1.4895291846130998e-05, "loss": 0.0786, "step": 33900 }, { "epoch": 2.5181939699985145, "grad_norm": 1.5507023334503174, "learning_rate": 1.4890836180008911e-05, "loss": 0.0737, "step": 33910 }, { "epoch": 2.5189365810188624, "grad_norm": 1.5131269693374634, "learning_rate": 1.4886380513886826e-05, "loss": 0.0549, "step": 33920 }, { "epoch": 2.51967919203921, "grad_norm": 0.8395630121231079, "learning_rate": 1.4881924847764743e-05, "loss": 0.0665, "step": 33930 }, { "epoch": 2.5204218030595573, "grad_norm": 2.2483623027801514, "learning_rate": 1.4877469181642656e-05, "loss": 0.0329, "step": 33940 }, { "epoch": 2.521164414079905, "grad_norm": 1.996645212173462, "learning_rate": 1.4873013515520571e-05, "loss": 0.0658, "step": 33950 }, { "epoch": 2.521907025100252, "grad_norm": 1.3301950693130493, "learning_rate": 1.4868557849398484e-05, "loss": 0.0595, "step": 33960 }, { "epoch": 2.5226496361206, "grad_norm": 2.5241854190826416, "learning_rate": 1.4864102183276401e-05, "loss": 0.0837, "step": 33970 }, { "epoch": 2.5233922471409476, "grad_norm": 0.841391384601593, "learning_rate": 1.4859646517154314e-05, "loss": 0.0789, "step": 33980 }, { "epoch": 2.524134858161295, "grad_norm": 1.6445497274398804, "learning_rate": 1.485519085103223e-05, "loss": 0.0841, "step": 33990 }, { "epoch": 2.524877469181643, "grad_norm": 1.169519305229187, "learning_rate": 1.4850735184910146e-05, "loss": 0.0483, "step": 34000 }, { "epoch": 2.5256200802019904, "grad_norm": 1.069122076034546, "learning_rate": 1.484627951878806e-05, "loss": 0.0683, "step": 34010 }, { "epoch": 2.526362691222338, "grad_norm": 2.5198476314544678, "learning_rate": 1.4841823852665974e-05, "loss": 0.0837, "step": 34020 }, { "epoch": 2.5271053022426853, "grad_norm": 1.8398845195770264, "learning_rate": 1.4837368186543888e-05, "loss": 0.0589, "step": 34030 }, { "epoch": 2.5278479132630327, "grad_norm": 1.1466633081436157, "learning_rate": 1.4832912520421804e-05, "loss": 0.0439, "step": 34040 }, { "epoch": 2.5285905242833806, "grad_norm": 1.7048221826553345, "learning_rate": 1.482845685429972e-05, "loss": 0.0828, "step": 34050 }, { "epoch": 2.529333135303728, "grad_norm": 1.1695541143417358, "learning_rate": 1.4824001188177632e-05, "loss": 0.0756, "step": 34060 }, { "epoch": 2.5300757463240755, "grad_norm": 0.7336133718490601, "learning_rate": 1.4819545522055547e-05, "loss": 0.0519, "step": 34070 }, { "epoch": 2.530818357344423, "grad_norm": 3.3735294342041016, "learning_rate": 1.4815089855933462e-05, "loss": 0.0542, "step": 34080 }, { "epoch": 2.5315609683647704, "grad_norm": 4.34521484375, "learning_rate": 1.4810634189811377e-05, "loss": 0.0878, "step": 34090 }, { "epoch": 2.5323035793851183, "grad_norm": 2.667654037475586, "learning_rate": 1.480617852368929e-05, "loss": 0.0794, "step": 34100 }, { "epoch": 2.5330461904054657, "grad_norm": 3.387084722518921, "learning_rate": 1.4801722857567206e-05, "loss": 0.0941, "step": 34110 }, { "epoch": 2.533788801425813, "grad_norm": 2.6281583309173584, "learning_rate": 1.4797267191445122e-05, "loss": 0.0624, "step": 34120 }, { "epoch": 2.5345314124461606, "grad_norm": 1.0451610088348389, "learning_rate": 1.4792811525323036e-05, "loss": 0.0515, "step": 34130 }, { "epoch": 2.535274023466508, "grad_norm": 1.5573267936706543, "learning_rate": 1.478835585920095e-05, "loss": 0.0878, "step": 34140 }, { "epoch": 2.536016634486856, "grad_norm": 1.6244726181030273, "learning_rate": 1.4783900193078866e-05, "loss": 0.0647, "step": 34150 }, { "epoch": 2.5367592455072034, "grad_norm": 2.02620005607605, "learning_rate": 1.477944452695678e-05, "loss": 0.0734, "step": 34160 }, { "epoch": 2.537501856527551, "grad_norm": 1.4137933254241943, "learning_rate": 1.4774988860834696e-05, "loss": 0.0881, "step": 34170 }, { "epoch": 2.5382444675478983, "grad_norm": 1.8512823581695557, "learning_rate": 1.4770533194712609e-05, "loss": 0.0635, "step": 34180 }, { "epoch": 2.538987078568246, "grad_norm": 1.565675973892212, "learning_rate": 1.4766077528590526e-05, "loss": 0.0625, "step": 34190 }, { "epoch": 2.5397296895885937, "grad_norm": 0.9743090271949768, "learning_rate": 1.4761621862468439e-05, "loss": 0.0686, "step": 34200 }, { "epoch": 2.540472300608941, "grad_norm": 1.4764388799667358, "learning_rate": 1.4757166196346354e-05, "loss": 0.0563, "step": 34210 }, { "epoch": 2.5412149116292886, "grad_norm": 1.5421714782714844, "learning_rate": 1.4752710530224269e-05, "loss": 0.0727, "step": 34220 }, { "epoch": 2.541957522649636, "grad_norm": 0.654322624206543, "learning_rate": 1.4748254864102184e-05, "loss": 0.0523, "step": 34230 }, { "epoch": 2.5427001336699835, "grad_norm": 1.733870029449463, "learning_rate": 1.4743799197980099e-05, "loss": 0.0854, "step": 34240 }, { "epoch": 2.5434427446903314, "grad_norm": 2.0312111377716064, "learning_rate": 1.4739343531858012e-05, "loss": 0.0809, "step": 34250 }, { "epoch": 2.544185355710679, "grad_norm": 1.3134266138076782, "learning_rate": 1.4734887865735929e-05, "loss": 0.0722, "step": 34260 }, { "epoch": 2.5449279667310263, "grad_norm": 2.5417449474334717, "learning_rate": 1.4730432199613842e-05, "loss": 0.0819, "step": 34270 }, { "epoch": 2.5456705777513737, "grad_norm": 1.3097118139266968, "learning_rate": 1.4725976533491757e-05, "loss": 0.0818, "step": 34280 }, { "epoch": 2.546413188771721, "grad_norm": 1.2188619375228882, "learning_rate": 1.4721520867369672e-05, "loss": 0.0692, "step": 34290 }, { "epoch": 2.547155799792069, "grad_norm": 2.3822097778320312, "learning_rate": 1.4717065201247587e-05, "loss": 0.0847, "step": 34300 }, { "epoch": 2.5478984108124165, "grad_norm": 1.0267736911773682, "learning_rate": 1.4712609535125502e-05, "loss": 0.0844, "step": 34310 }, { "epoch": 2.548641021832764, "grad_norm": 1.53618323802948, "learning_rate": 1.4708153869003416e-05, "loss": 0.0804, "step": 34320 }, { "epoch": 2.5493836328531114, "grad_norm": 1.5292679071426392, "learning_rate": 1.470369820288133e-05, "loss": 0.0752, "step": 34330 }, { "epoch": 2.550126243873459, "grad_norm": 0.5811061263084412, "learning_rate": 1.4699242536759247e-05, "loss": 0.0684, "step": 34340 }, { "epoch": 2.5508688548938068, "grad_norm": 0.8153356313705444, "learning_rate": 1.469478687063716e-05, "loss": 0.048, "step": 34350 }, { "epoch": 2.551611465914154, "grad_norm": 1.236395001411438, "learning_rate": 1.4690331204515076e-05, "loss": 0.0614, "step": 34360 }, { "epoch": 2.5523540769345017, "grad_norm": 1.525625467300415, "learning_rate": 1.4685875538392989e-05, "loss": 0.0802, "step": 34370 }, { "epoch": 2.553096687954849, "grad_norm": 0.4339189827442169, "learning_rate": 1.4681419872270906e-05, "loss": 0.041, "step": 34380 }, { "epoch": 2.5538392989751966, "grad_norm": 1.1005926132202148, "learning_rate": 1.4676964206148819e-05, "loss": 0.0557, "step": 34390 }, { "epoch": 2.5545819099955445, "grad_norm": 2.0460987091064453, "learning_rate": 1.4672508540026734e-05, "loss": 0.0777, "step": 34400 }, { "epoch": 2.555324521015892, "grad_norm": 1.1774736642837524, "learning_rate": 1.466805287390465e-05, "loss": 0.0499, "step": 34410 }, { "epoch": 2.5560671320362394, "grad_norm": 1.0830439329147339, "learning_rate": 1.4663597207782564e-05, "loss": 0.0378, "step": 34420 }, { "epoch": 2.556809743056587, "grad_norm": 3.122680187225342, "learning_rate": 1.4659141541660479e-05, "loss": 0.0711, "step": 34430 }, { "epoch": 2.5575523540769343, "grad_norm": 1.451540231704712, "learning_rate": 1.4654685875538392e-05, "loss": 0.0909, "step": 34440 }, { "epoch": 2.558294965097282, "grad_norm": 2.591353416442871, "learning_rate": 1.4650230209416309e-05, "loss": 0.0714, "step": 34450 }, { "epoch": 2.5590375761176296, "grad_norm": 1.4591681957244873, "learning_rate": 1.4645774543294224e-05, "loss": 0.0777, "step": 34460 }, { "epoch": 2.559780187137977, "grad_norm": 0.7905107736587524, "learning_rate": 1.4641318877172137e-05, "loss": 0.056, "step": 34470 }, { "epoch": 2.5605227981583245, "grad_norm": 2.1354310512542725, "learning_rate": 1.4636863211050052e-05, "loss": 0.0647, "step": 34480 }, { "epoch": 2.561265409178672, "grad_norm": 0.35180187225341797, "learning_rate": 1.4632407544927967e-05, "loss": 0.056, "step": 34490 }, { "epoch": 2.56200802019902, "grad_norm": 1.2425521612167358, "learning_rate": 1.4627951878805882e-05, "loss": 0.0533, "step": 34500 }, { "epoch": 2.5627506312193673, "grad_norm": 2.0177273750305176, "learning_rate": 1.4623496212683797e-05, "loss": 0.039, "step": 34510 }, { "epoch": 2.5634932422397148, "grad_norm": 0.6943618059158325, "learning_rate": 1.461904054656171e-05, "loss": 0.0622, "step": 34520 }, { "epoch": 2.5642358532600626, "grad_norm": 2.4245269298553467, "learning_rate": 1.4614584880439627e-05, "loss": 0.0611, "step": 34530 }, { "epoch": 2.5649784642804097, "grad_norm": 0.730995237827301, "learning_rate": 1.461012921431754e-05, "loss": 0.0834, "step": 34540 }, { "epoch": 2.5657210753007575, "grad_norm": 0.8295930027961731, "learning_rate": 1.4605673548195455e-05, "loss": 0.0567, "step": 34550 }, { "epoch": 2.566463686321105, "grad_norm": 2.3141775131225586, "learning_rate": 1.460121788207337e-05, "loss": 0.0663, "step": 34560 }, { "epoch": 2.5672062973414524, "grad_norm": 1.6702011823654175, "learning_rate": 1.4596762215951285e-05, "loss": 0.1019, "step": 34570 }, { "epoch": 2.5679489083618003, "grad_norm": 1.9209109544754028, "learning_rate": 1.45923065498292e-05, "loss": 0.0763, "step": 34580 }, { "epoch": 2.568691519382148, "grad_norm": 1.5046935081481934, "learning_rate": 1.4587850883707114e-05, "loss": 0.0805, "step": 34590 }, { "epoch": 2.5694341304024952, "grad_norm": 0.8070915937423706, "learning_rate": 1.458339521758503e-05, "loss": 0.0334, "step": 34600 }, { "epoch": 2.5701767414228427, "grad_norm": 0.2428605705499649, "learning_rate": 1.4578939551462944e-05, "loss": 0.0457, "step": 34610 }, { "epoch": 2.57091935244319, "grad_norm": 0.3563167452812195, "learning_rate": 1.4574483885340859e-05, "loss": 0.0598, "step": 34620 }, { "epoch": 2.571661963463538, "grad_norm": 0.9736761450767517, "learning_rate": 1.4570028219218774e-05, "loss": 0.058, "step": 34630 }, { "epoch": 2.5724045744838855, "grad_norm": 1.3607254028320312, "learning_rate": 1.4565572553096689e-05, "loss": 0.069, "step": 34640 }, { "epoch": 2.573147185504233, "grad_norm": 0.6492049694061279, "learning_rate": 1.4561116886974604e-05, "loss": 0.0406, "step": 34650 }, { "epoch": 2.5738897965245804, "grad_norm": 1.3823007345199585, "learning_rate": 1.4556661220852517e-05, "loss": 0.062, "step": 34660 }, { "epoch": 2.574632407544928, "grad_norm": 3.305699110031128, "learning_rate": 1.4552205554730434e-05, "loss": 0.1082, "step": 34670 }, { "epoch": 2.5753750185652757, "grad_norm": 1.8386292457580566, "learning_rate": 1.4547749888608347e-05, "loss": 0.0667, "step": 34680 }, { "epoch": 2.576117629585623, "grad_norm": 0.8886379599571228, "learning_rate": 1.4543294222486262e-05, "loss": 0.0873, "step": 34690 }, { "epoch": 2.5768602406059706, "grad_norm": 1.2363057136535645, "learning_rate": 1.4538838556364177e-05, "loss": 0.0978, "step": 34700 }, { "epoch": 2.577602851626318, "grad_norm": 1.2348647117614746, "learning_rate": 1.4534382890242092e-05, "loss": 0.0801, "step": 34710 }, { "epoch": 2.5783454626466655, "grad_norm": 1.7312116622924805, "learning_rate": 1.4529927224120007e-05, "loss": 0.0794, "step": 34720 }, { "epoch": 2.5790880736670134, "grad_norm": 1.0014289617538452, "learning_rate": 1.452547155799792e-05, "loss": 0.061, "step": 34730 }, { "epoch": 2.579830684687361, "grad_norm": 0.8576075434684753, "learning_rate": 1.4521015891875835e-05, "loss": 0.0756, "step": 34740 }, { "epoch": 2.5805732957077083, "grad_norm": 2.0893824100494385, "learning_rate": 1.4516560225753752e-05, "loss": 0.0672, "step": 34750 }, { "epoch": 2.5813159067280558, "grad_norm": 0.5497003197669983, "learning_rate": 1.4512104559631665e-05, "loss": 0.037, "step": 34760 }, { "epoch": 2.5820585177484032, "grad_norm": 3.0224320888519287, "learning_rate": 1.450764889350958e-05, "loss": 0.0525, "step": 34770 }, { "epoch": 2.582801128768751, "grad_norm": 0.8441876173019409, "learning_rate": 1.4503193227387493e-05, "loss": 0.0875, "step": 34780 }, { "epoch": 2.5835437397890986, "grad_norm": 2.5795955657958984, "learning_rate": 1.449873756126541e-05, "loss": 0.0817, "step": 34790 }, { "epoch": 2.584286350809446, "grad_norm": 0.8166103363037109, "learning_rate": 1.4494281895143323e-05, "loss": 0.0503, "step": 34800 }, { "epoch": 2.5850289618297935, "grad_norm": 1.8198186159133911, "learning_rate": 1.4489826229021238e-05, "loss": 0.0388, "step": 34810 }, { "epoch": 2.585771572850141, "grad_norm": 4.351510524749756, "learning_rate": 1.4485370562899155e-05, "loss": 0.0762, "step": 34820 }, { "epoch": 2.586514183870489, "grad_norm": 0.9294065833091736, "learning_rate": 1.4480914896777068e-05, "loss": 0.0678, "step": 34830 }, { "epoch": 2.5872567948908363, "grad_norm": 1.0012507438659668, "learning_rate": 1.4476459230654983e-05, "loss": 0.0536, "step": 34840 }, { "epoch": 2.5879994059111837, "grad_norm": 0.6957378387451172, "learning_rate": 1.4472003564532897e-05, "loss": 0.0958, "step": 34850 }, { "epoch": 2.588742016931531, "grad_norm": 0.9014194011688232, "learning_rate": 1.4467547898410813e-05, "loss": 0.0647, "step": 34860 }, { "epoch": 2.5894846279518786, "grad_norm": 1.4799509048461914, "learning_rate": 1.4463092232288728e-05, "loss": 0.0757, "step": 34870 }, { "epoch": 2.5902272389722265, "grad_norm": 1.020585060119629, "learning_rate": 1.4458636566166642e-05, "loss": 0.0537, "step": 34880 }, { "epoch": 2.590969849992574, "grad_norm": 3.012230396270752, "learning_rate": 1.4454180900044557e-05, "loss": 0.0807, "step": 34890 }, { "epoch": 2.5917124610129214, "grad_norm": 1.4325774908065796, "learning_rate": 1.4449725233922472e-05, "loss": 0.0746, "step": 34900 }, { "epoch": 2.592455072033269, "grad_norm": 0.7965050339698792, "learning_rate": 1.4445269567800387e-05, "loss": 0.0488, "step": 34910 }, { "epoch": 2.5931976830536163, "grad_norm": 0.29672083258628845, "learning_rate": 1.4440813901678302e-05, "loss": 0.0621, "step": 34920 }, { "epoch": 2.593940294073964, "grad_norm": 1.9577540159225464, "learning_rate": 1.4436358235556217e-05, "loss": 0.0845, "step": 34930 }, { "epoch": 2.5946829050943117, "grad_norm": 2.028249502182007, "learning_rate": 1.4431902569434132e-05, "loss": 0.0853, "step": 34940 }, { "epoch": 2.595425516114659, "grad_norm": 0.9768528342247009, "learning_rate": 1.4427446903312045e-05, "loss": 0.0762, "step": 34950 }, { "epoch": 2.5961681271350066, "grad_norm": 0.36251920461654663, "learning_rate": 1.442299123718996e-05, "loss": 0.0513, "step": 34960 }, { "epoch": 2.596910738155354, "grad_norm": 1.7192022800445557, "learning_rate": 1.4418535571067875e-05, "loss": 0.0643, "step": 34970 }, { "epoch": 2.597653349175702, "grad_norm": 1.3157782554626465, "learning_rate": 1.441407990494579e-05, "loss": 0.0863, "step": 34980 }, { "epoch": 2.5983959601960493, "grad_norm": 1.921720027923584, "learning_rate": 1.4409624238823705e-05, "loss": 0.0806, "step": 34990 }, { "epoch": 2.599138571216397, "grad_norm": 0.9074265360832214, "learning_rate": 1.4405168572701618e-05, "loss": 0.0916, "step": 35000 }, { "epoch": 2.5998811822367442, "grad_norm": 1.1961455345153809, "learning_rate": 1.4400712906579535e-05, "loss": 0.0818, "step": 35010 }, { "epoch": 2.6006237932570917, "grad_norm": 1.6636606454849243, "learning_rate": 1.4396257240457448e-05, "loss": 0.0947, "step": 35020 }, { "epoch": 2.6013664042774396, "grad_norm": 1.7410112619400024, "learning_rate": 1.4391801574335363e-05, "loss": 0.0614, "step": 35030 }, { "epoch": 2.602109015297787, "grad_norm": 4.592065811157227, "learning_rate": 1.4387345908213278e-05, "loss": 0.0652, "step": 35040 }, { "epoch": 2.6028516263181345, "grad_norm": 2.8058197498321533, "learning_rate": 1.4382890242091193e-05, "loss": 0.0731, "step": 35050 }, { "epoch": 2.603594237338482, "grad_norm": 1.1537928581237793, "learning_rate": 1.4378434575969108e-05, "loss": 0.0789, "step": 35060 }, { "epoch": 2.6043368483588294, "grad_norm": 1.5462356805801392, "learning_rate": 1.4373978909847021e-05, "loss": 0.0735, "step": 35070 }, { "epoch": 2.6050794593791773, "grad_norm": 2.397684097290039, "learning_rate": 1.4369523243724938e-05, "loss": 0.0812, "step": 35080 }, { "epoch": 2.6058220703995247, "grad_norm": 0.8381139039993286, "learning_rate": 1.4365067577602851e-05, "loss": 0.0472, "step": 35090 }, { "epoch": 2.606564681419872, "grad_norm": 0.911646842956543, "learning_rate": 1.4360611911480766e-05, "loss": 0.0668, "step": 35100 }, { "epoch": 2.60730729244022, "grad_norm": 1.0542629957199097, "learning_rate": 1.4356156245358681e-05, "loss": 0.0711, "step": 35110 }, { "epoch": 2.608049903460567, "grad_norm": 0.4654415547847748, "learning_rate": 1.4351700579236596e-05, "loss": 0.0561, "step": 35120 }, { "epoch": 2.608792514480915, "grad_norm": 1.3107812404632568, "learning_rate": 1.4347244913114511e-05, "loss": 0.0691, "step": 35130 }, { "epoch": 2.6095351255012624, "grad_norm": 2.2061398029327393, "learning_rate": 1.4342789246992425e-05, "loss": 0.1093, "step": 35140 }, { "epoch": 2.61027773652161, "grad_norm": 1.198928952217102, "learning_rate": 1.433833358087034e-05, "loss": 0.0445, "step": 35150 }, { "epoch": 2.6110203475419578, "grad_norm": 1.055016279220581, "learning_rate": 1.4333877914748256e-05, "loss": 0.057, "step": 35160 }, { "epoch": 2.6117629585623052, "grad_norm": 1.0568102598190308, "learning_rate": 1.432942224862617e-05, "loss": 0.0598, "step": 35170 }, { "epoch": 2.6125055695826527, "grad_norm": 1.305461049079895, "learning_rate": 1.4324966582504085e-05, "loss": 0.0859, "step": 35180 }, { "epoch": 2.613248180603, "grad_norm": 1.0581294298171997, "learning_rate": 1.4320510916382e-05, "loss": 0.0662, "step": 35190 }, { "epoch": 2.6139907916233476, "grad_norm": 0.9426524639129639, "learning_rate": 1.4316055250259915e-05, "loss": 0.0559, "step": 35200 }, { "epoch": 2.6147334026436955, "grad_norm": 0.5946950316429138, "learning_rate": 1.431159958413783e-05, "loss": 0.0954, "step": 35210 }, { "epoch": 2.615476013664043, "grad_norm": 1.6754854917526245, "learning_rate": 1.4307143918015743e-05, "loss": 0.0866, "step": 35220 }, { "epoch": 2.6162186246843904, "grad_norm": 1.6336374282836914, "learning_rate": 1.430268825189366e-05, "loss": 0.0815, "step": 35230 }, { "epoch": 2.616961235704738, "grad_norm": 2.549908399581909, "learning_rate": 1.4298232585771573e-05, "loss": 0.0529, "step": 35240 }, { "epoch": 2.6177038467250853, "grad_norm": 1.3991200923919678, "learning_rate": 1.4293776919649488e-05, "loss": 0.0483, "step": 35250 }, { "epoch": 2.618446457745433, "grad_norm": 0.7398178577423096, "learning_rate": 1.4289321253527401e-05, "loss": 0.0677, "step": 35260 }, { "epoch": 2.6191890687657806, "grad_norm": 1.8208078145980835, "learning_rate": 1.4284865587405318e-05, "loss": 0.0539, "step": 35270 }, { "epoch": 2.619931679786128, "grad_norm": 1.7018234729766846, "learning_rate": 1.4280409921283233e-05, "loss": 0.0529, "step": 35280 }, { "epoch": 2.6206742908064755, "grad_norm": 2.4244110584259033, "learning_rate": 1.4275954255161146e-05, "loss": 0.0595, "step": 35290 }, { "epoch": 2.621416901826823, "grad_norm": 0.505042314529419, "learning_rate": 1.4271498589039061e-05, "loss": 0.0865, "step": 35300 }, { "epoch": 2.622159512847171, "grad_norm": 1.206248164176941, "learning_rate": 1.4267042922916976e-05, "loss": 0.0662, "step": 35310 }, { "epoch": 2.6229021238675183, "grad_norm": 1.3556511402130127, "learning_rate": 1.4262587256794891e-05, "loss": 0.0575, "step": 35320 }, { "epoch": 2.6236447348878658, "grad_norm": 0.7870049476623535, "learning_rate": 1.4258131590672806e-05, "loss": 0.0614, "step": 35330 }, { "epoch": 2.624387345908213, "grad_norm": 1.794494867324829, "learning_rate": 1.4253675924550721e-05, "loss": 0.097, "step": 35340 }, { "epoch": 2.6251299569285607, "grad_norm": 0.7083643078804016, "learning_rate": 1.4249220258428636e-05, "loss": 0.0648, "step": 35350 }, { "epoch": 2.6258725679489086, "grad_norm": 0.7302588224411011, "learning_rate": 1.424476459230655e-05, "loss": 0.0578, "step": 35360 }, { "epoch": 2.626615178969256, "grad_norm": 3.079280376434326, "learning_rate": 1.4240308926184465e-05, "loss": 0.0897, "step": 35370 }, { "epoch": 2.6273577899896035, "grad_norm": 0.824223518371582, "learning_rate": 1.423585326006238e-05, "loss": 0.08, "step": 35380 }, { "epoch": 2.628100401009951, "grad_norm": 1.3617218732833862, "learning_rate": 1.4231397593940295e-05, "loss": 0.0633, "step": 35390 }, { "epoch": 2.6288430120302984, "grad_norm": 0.5929654240608215, "learning_rate": 1.422694192781821e-05, "loss": 0.0467, "step": 35400 }, { "epoch": 2.6295856230506462, "grad_norm": 2.259077310562134, "learning_rate": 1.4222486261696123e-05, "loss": 0.0735, "step": 35410 }, { "epoch": 2.6303282340709937, "grad_norm": 1.6033036708831787, "learning_rate": 1.421803059557404e-05, "loss": 0.0972, "step": 35420 }, { "epoch": 2.631070845091341, "grad_norm": 1.9454528093338013, "learning_rate": 1.4213574929451953e-05, "loss": 0.0736, "step": 35430 }, { "epoch": 2.6318134561116886, "grad_norm": 0.5253037810325623, "learning_rate": 1.4209119263329868e-05, "loss": 0.0651, "step": 35440 }, { "epoch": 2.632556067132036, "grad_norm": 0.5184679627418518, "learning_rate": 1.4204663597207784e-05, "loss": 0.0403, "step": 35450 }, { "epoch": 2.633298678152384, "grad_norm": 0.7617425918579102, "learning_rate": 1.4200207931085698e-05, "loss": 0.0699, "step": 35460 }, { "epoch": 2.6340412891727314, "grad_norm": 1.4257124662399292, "learning_rate": 1.4195752264963613e-05, "loss": 0.0725, "step": 35470 }, { "epoch": 2.634783900193079, "grad_norm": 2.0177693367004395, "learning_rate": 1.4191296598841526e-05, "loss": 0.0693, "step": 35480 }, { "epoch": 2.6355265112134263, "grad_norm": 1.3134448528289795, "learning_rate": 1.4186840932719443e-05, "loss": 0.0739, "step": 35490 }, { "epoch": 2.6362691222337737, "grad_norm": 2.646014928817749, "learning_rate": 1.4182385266597356e-05, "loss": 0.0605, "step": 35500 }, { "epoch": 2.6370117332541216, "grad_norm": 1.8550797700881958, "learning_rate": 1.4177929600475271e-05, "loss": 0.0577, "step": 35510 }, { "epoch": 2.637754344274469, "grad_norm": 1.6561418771743774, "learning_rate": 1.4173473934353186e-05, "loss": 0.0798, "step": 35520 }, { "epoch": 2.6384969552948165, "grad_norm": 1.4984925985336304, "learning_rate": 1.4169018268231101e-05, "loss": 0.0764, "step": 35530 }, { "epoch": 2.639239566315164, "grad_norm": 2.3113274574279785, "learning_rate": 1.4164562602109016e-05, "loss": 0.0465, "step": 35540 }, { "epoch": 2.6399821773355114, "grad_norm": 2.2579538822174072, "learning_rate": 1.416010693598693e-05, "loss": 0.0767, "step": 35550 }, { "epoch": 2.6407247883558593, "grad_norm": 1.6482024192810059, "learning_rate": 1.4155651269864844e-05, "loss": 0.0514, "step": 35560 }, { "epoch": 2.641467399376207, "grad_norm": 2.0257744789123535, "learning_rate": 1.4151195603742761e-05, "loss": 0.0452, "step": 35570 }, { "epoch": 2.6422100103965542, "grad_norm": 1.083173394203186, "learning_rate": 1.4146739937620674e-05, "loss": 0.059, "step": 35580 }, { "epoch": 2.6429526214169017, "grad_norm": 1.5998643636703491, "learning_rate": 1.414228427149859e-05, "loss": 0.0557, "step": 35590 }, { "epoch": 2.643695232437249, "grad_norm": 0.8439221382141113, "learning_rate": 1.4137828605376504e-05, "loss": 0.0582, "step": 35600 }, { "epoch": 2.644437843457597, "grad_norm": 2.156799793243408, "learning_rate": 1.413337293925442e-05, "loss": 0.0562, "step": 35610 }, { "epoch": 2.6451804544779445, "grad_norm": 0.5263361930847168, "learning_rate": 1.4128917273132334e-05, "loss": 0.0844, "step": 35620 }, { "epoch": 2.645923065498292, "grad_norm": 2.5201919078826904, "learning_rate": 1.4124461607010248e-05, "loss": 0.0867, "step": 35630 }, { "epoch": 2.6466656765186394, "grad_norm": 1.7519117593765259, "learning_rate": 1.4120005940888164e-05, "loss": 0.0578, "step": 35640 }, { "epoch": 2.647408287538987, "grad_norm": 1.5524243116378784, "learning_rate": 1.4115550274766078e-05, "loss": 0.0662, "step": 35650 }, { "epoch": 2.6481508985593347, "grad_norm": 0.8685120344161987, "learning_rate": 1.4111094608643993e-05, "loss": 0.0509, "step": 35660 }, { "epoch": 2.648893509579682, "grad_norm": 2.201120376586914, "learning_rate": 1.4106638942521906e-05, "loss": 0.057, "step": 35670 }, { "epoch": 2.6496361206000296, "grad_norm": 0.4596274197101593, "learning_rate": 1.4102183276399823e-05, "loss": 0.0714, "step": 35680 }, { "epoch": 2.6503787316203775, "grad_norm": 2.369061231613159, "learning_rate": 1.4097727610277738e-05, "loss": 0.05, "step": 35690 }, { "epoch": 2.6511213426407245, "grad_norm": 0.30310168862342834, "learning_rate": 1.4093271944155651e-05, "loss": 0.073, "step": 35700 }, { "epoch": 2.6518639536610724, "grad_norm": 0.8447324633598328, "learning_rate": 1.4088816278033568e-05, "loss": 0.0769, "step": 35710 }, { "epoch": 2.65260656468142, "grad_norm": 1.5331531763076782, "learning_rate": 1.4084360611911481e-05, "loss": 0.0837, "step": 35720 }, { "epoch": 2.6533491757017673, "grad_norm": 0.9283561110496521, "learning_rate": 1.4079904945789396e-05, "loss": 0.0608, "step": 35730 }, { "epoch": 2.654091786722115, "grad_norm": 1.556794285774231, "learning_rate": 1.4075449279667311e-05, "loss": 0.0839, "step": 35740 }, { "epoch": 2.6548343977424627, "grad_norm": 2.0326085090637207, "learning_rate": 1.4070993613545226e-05, "loss": 0.079, "step": 35750 }, { "epoch": 2.65557700876281, "grad_norm": 1.1070688962936401, "learning_rate": 1.406653794742314e-05, "loss": 0.0598, "step": 35760 }, { "epoch": 2.6563196197831576, "grad_norm": 2.687786817550659, "learning_rate": 1.4062082281301054e-05, "loss": 0.0528, "step": 35770 }, { "epoch": 2.657062230803505, "grad_norm": 2.0500941276550293, "learning_rate": 1.4057626615178969e-05, "loss": 0.0586, "step": 35780 }, { "epoch": 2.657804841823853, "grad_norm": 1.7865089178085327, "learning_rate": 1.4053170949056884e-05, "loss": 0.0816, "step": 35790 }, { "epoch": 2.6585474528442004, "grad_norm": 1.018984317779541, "learning_rate": 1.4048715282934799e-05, "loss": 0.0338, "step": 35800 }, { "epoch": 2.659290063864548, "grad_norm": 0.3521486520767212, "learning_rate": 1.4044259616812714e-05, "loss": 0.0549, "step": 35810 }, { "epoch": 2.6600326748848953, "grad_norm": 2.4242541790008545, "learning_rate": 1.4039803950690627e-05, "loss": 0.1002, "step": 35820 }, { "epoch": 2.6607752859052427, "grad_norm": 1.1004574298858643, "learning_rate": 1.4035348284568544e-05, "loss": 0.0457, "step": 35830 }, { "epoch": 2.6615178969255906, "grad_norm": 1.469435214996338, "learning_rate": 1.4030892618446457e-05, "loss": 0.0819, "step": 35840 }, { "epoch": 2.662260507945938, "grad_norm": 0.20133927464485168, "learning_rate": 1.4026436952324372e-05, "loss": 0.0591, "step": 35850 }, { "epoch": 2.6630031189662855, "grad_norm": 1.4048292636871338, "learning_rate": 1.4021981286202289e-05, "loss": 0.0562, "step": 35860 }, { "epoch": 2.663745729986633, "grad_norm": 5.874467849731445, "learning_rate": 1.4017525620080202e-05, "loss": 0.0712, "step": 35870 }, { "epoch": 2.6644883410069804, "grad_norm": 1.0913431644439697, "learning_rate": 1.4013069953958117e-05, "loss": 0.0703, "step": 35880 }, { "epoch": 2.6652309520273283, "grad_norm": 0.3893365263938904, "learning_rate": 1.400861428783603e-05, "loss": 0.0593, "step": 35890 }, { "epoch": 2.6659735630476757, "grad_norm": 1.0970251560211182, "learning_rate": 1.4004158621713947e-05, "loss": 0.0716, "step": 35900 }, { "epoch": 2.666716174068023, "grad_norm": 1.392922282218933, "learning_rate": 1.3999702955591862e-05, "loss": 0.0589, "step": 35910 }, { "epoch": 2.6674587850883706, "grad_norm": 0.6275796890258789, "learning_rate": 1.3995247289469776e-05, "loss": 0.037, "step": 35920 }, { "epoch": 2.668201396108718, "grad_norm": 1.6049987077713013, "learning_rate": 1.399079162334769e-05, "loss": 0.0566, "step": 35930 }, { "epoch": 2.668944007129066, "grad_norm": 1.582099199295044, "learning_rate": 1.3986335957225606e-05, "loss": 0.0689, "step": 35940 }, { "epoch": 2.6696866181494134, "grad_norm": 0.8754908442497253, "learning_rate": 1.398188029110352e-05, "loss": 0.0926, "step": 35950 }, { "epoch": 2.670429229169761, "grad_norm": 2.45027756690979, "learning_rate": 1.3977424624981434e-05, "loss": 0.0468, "step": 35960 }, { "epoch": 2.6711718401901083, "grad_norm": 1.8053902387619019, "learning_rate": 1.3972968958859349e-05, "loss": 0.0489, "step": 35970 }, { "epoch": 2.671914451210456, "grad_norm": 3.0964303016662598, "learning_rate": 1.3968513292737266e-05, "loss": 0.0834, "step": 35980 }, { "epoch": 2.6726570622308037, "grad_norm": 2.1275410652160645, "learning_rate": 1.3964057626615179e-05, "loss": 0.0938, "step": 35990 }, { "epoch": 2.673399673251151, "grad_norm": 0.8171222805976868, "learning_rate": 1.3959601960493094e-05, "loss": 0.0669, "step": 36000 }, { "epoch": 2.6741422842714986, "grad_norm": 3.5696773529052734, "learning_rate": 1.3955146294371009e-05, "loss": 0.0935, "step": 36010 }, { "epoch": 2.674884895291846, "grad_norm": 1.2515684366226196, "learning_rate": 1.3950690628248924e-05, "loss": 0.0902, "step": 36020 }, { "epoch": 2.6756275063121935, "grad_norm": 3.313480854034424, "learning_rate": 1.3946234962126839e-05, "loss": 0.0592, "step": 36030 }, { "epoch": 2.6763701173325414, "grad_norm": 0.7134093046188354, "learning_rate": 1.3941779296004752e-05, "loss": 0.0455, "step": 36040 }, { "epoch": 2.677112728352889, "grad_norm": 0.5225452184677124, "learning_rate": 1.3937323629882669e-05, "loss": 0.0487, "step": 36050 }, { "epoch": 2.6778553393732363, "grad_norm": 0.6673758625984192, "learning_rate": 1.3932867963760582e-05, "loss": 0.079, "step": 36060 }, { "epoch": 2.6785979503935837, "grad_norm": 2.7382137775421143, "learning_rate": 1.3928412297638497e-05, "loss": 0.0757, "step": 36070 }, { "epoch": 2.679340561413931, "grad_norm": 0.9488750696182251, "learning_rate": 1.392395663151641e-05, "loss": 0.0542, "step": 36080 }, { "epoch": 2.680083172434279, "grad_norm": 4.239487648010254, "learning_rate": 1.3919500965394327e-05, "loss": 0.0986, "step": 36090 }, { "epoch": 2.6808257834546265, "grad_norm": 1.1247403621673584, "learning_rate": 1.3915045299272242e-05, "loss": 0.0535, "step": 36100 }, { "epoch": 2.681568394474974, "grad_norm": 1.4115970134735107, "learning_rate": 1.3910589633150155e-05, "loss": 0.0712, "step": 36110 }, { "epoch": 2.6823110054953214, "grad_norm": 1.4354156255722046, "learning_rate": 1.3906133967028072e-05, "loss": 0.0612, "step": 36120 }, { "epoch": 2.683053616515669, "grad_norm": 0.6323860287666321, "learning_rate": 1.3901678300905985e-05, "loss": 0.0839, "step": 36130 }, { "epoch": 2.6837962275360168, "grad_norm": 1.8889610767364502, "learning_rate": 1.38972226347839e-05, "loss": 0.0528, "step": 36140 }, { "epoch": 2.684538838556364, "grad_norm": 1.292384386062622, "learning_rate": 1.3892766968661815e-05, "loss": 0.0699, "step": 36150 }, { "epoch": 2.6852814495767117, "grad_norm": 1.048690676689148, "learning_rate": 1.388831130253973e-05, "loss": 0.0679, "step": 36160 }, { "epoch": 2.686024060597059, "grad_norm": 0.9815926551818848, "learning_rate": 1.3883855636417645e-05, "loss": 0.0747, "step": 36170 }, { "epoch": 2.6867666716174066, "grad_norm": 0.6208893060684204, "learning_rate": 1.3879399970295559e-05, "loss": 0.0517, "step": 36180 }, { "epoch": 2.6875092826377545, "grad_norm": 1.4446412324905396, "learning_rate": 1.3874944304173474e-05, "loss": 0.0549, "step": 36190 }, { "epoch": 2.688251893658102, "grad_norm": 0.39226940274238586, "learning_rate": 1.3870488638051389e-05, "loss": 0.0788, "step": 36200 }, { "epoch": 2.6889945046784494, "grad_norm": 1.446395754814148, "learning_rate": 1.3866032971929304e-05, "loss": 0.1031, "step": 36210 }, { "epoch": 2.689737115698797, "grad_norm": 1.849453330039978, "learning_rate": 1.3861577305807219e-05, "loss": 0.0683, "step": 36220 }, { "epoch": 2.6904797267191443, "grad_norm": 1.7892287969589233, "learning_rate": 1.3857121639685132e-05, "loss": 0.0647, "step": 36230 }, { "epoch": 2.691222337739492, "grad_norm": 0.9724948406219482, "learning_rate": 1.3852665973563049e-05, "loss": 0.0798, "step": 36240 }, { "epoch": 2.6919649487598396, "grad_norm": 1.3628493547439575, "learning_rate": 1.3848210307440962e-05, "loss": 0.0556, "step": 36250 }, { "epoch": 2.692707559780187, "grad_norm": 1.9433894157409668, "learning_rate": 1.3843754641318877e-05, "loss": 0.0985, "step": 36260 }, { "epoch": 2.693450170800535, "grad_norm": 1.5981539487838745, "learning_rate": 1.3839298975196794e-05, "loss": 0.0582, "step": 36270 }, { "epoch": 2.694192781820882, "grad_norm": 0.5563480257987976, "learning_rate": 1.3834843309074707e-05, "loss": 0.0417, "step": 36280 }, { "epoch": 2.69493539284123, "grad_norm": 0.9371748566627502, "learning_rate": 1.3830387642952622e-05, "loss": 0.0775, "step": 36290 }, { "epoch": 2.6956780038615773, "grad_norm": 1.239343523979187, "learning_rate": 1.3825931976830535e-05, "loss": 0.0469, "step": 36300 }, { "epoch": 2.6964206148819247, "grad_norm": 0.48741811513900757, "learning_rate": 1.3821476310708452e-05, "loss": 0.0756, "step": 36310 }, { "epoch": 2.6971632259022726, "grad_norm": 3.158456802368164, "learning_rate": 1.3817020644586367e-05, "loss": 0.0276, "step": 36320 }, { "epoch": 2.69790583692262, "grad_norm": 1.2816053628921509, "learning_rate": 1.381256497846428e-05, "loss": 0.0553, "step": 36330 }, { "epoch": 2.6986484479429675, "grad_norm": 0.4111814796924591, "learning_rate": 1.3808109312342195e-05, "loss": 0.0452, "step": 36340 }, { "epoch": 2.699391058963315, "grad_norm": 1.7918380498886108, "learning_rate": 1.380365364622011e-05, "loss": 0.0432, "step": 36350 }, { "epoch": 2.7001336699836624, "grad_norm": 1.718360424041748, "learning_rate": 1.3799197980098025e-05, "loss": 0.1107, "step": 36360 }, { "epoch": 2.7008762810040103, "grad_norm": 0.892610490322113, "learning_rate": 1.3794742313975939e-05, "loss": 0.0709, "step": 36370 }, { "epoch": 2.701618892024358, "grad_norm": 0.5579351186752319, "learning_rate": 1.3790286647853855e-05, "loss": 0.048, "step": 36380 }, { "epoch": 2.7023615030447052, "grad_norm": 0.5769586563110352, "learning_rate": 1.378583098173177e-05, "loss": 0.0572, "step": 36390 }, { "epoch": 2.7031041140650527, "grad_norm": 2.853304624557495, "learning_rate": 1.3781375315609684e-05, "loss": 0.0766, "step": 36400 }, { "epoch": 2.7038467250854, "grad_norm": 3.345918655395508, "learning_rate": 1.3776919649487599e-05, "loss": 0.0761, "step": 36410 }, { "epoch": 2.704589336105748, "grad_norm": 1.423073649406433, "learning_rate": 1.3772463983365514e-05, "loss": 0.0698, "step": 36420 }, { "epoch": 2.7053319471260955, "grad_norm": 2.362412929534912, "learning_rate": 1.3768008317243429e-05, "loss": 0.08, "step": 36430 }, { "epoch": 2.706074558146443, "grad_norm": 2.3598601818084717, "learning_rate": 1.3763552651121344e-05, "loss": 0.0829, "step": 36440 }, { "epoch": 2.7068171691667904, "grad_norm": 1.5622998476028442, "learning_rate": 1.3759096984999257e-05, "loss": 0.0447, "step": 36450 }, { "epoch": 2.707559780187138, "grad_norm": 0.6126532554626465, "learning_rate": 1.3754641318877173e-05, "loss": 0.0703, "step": 36460 }, { "epoch": 2.7083023912074857, "grad_norm": 1.302445411682129, "learning_rate": 1.3750185652755087e-05, "loss": 0.0636, "step": 36470 }, { "epoch": 2.709045002227833, "grad_norm": 0.8420956134796143, "learning_rate": 1.3745729986633002e-05, "loss": 0.0789, "step": 36480 }, { "epoch": 2.7097876132481806, "grad_norm": 1.0522314310073853, "learning_rate": 1.3741274320510915e-05, "loss": 0.0619, "step": 36490 }, { "epoch": 2.710530224268528, "grad_norm": 1.1721168756484985, "learning_rate": 1.3736818654388832e-05, "loss": 0.047, "step": 36500 }, { "epoch": 2.7112728352888755, "grad_norm": 1.1313562393188477, "learning_rate": 1.3732362988266747e-05, "loss": 0.0793, "step": 36510 }, { "epoch": 2.7120154463092234, "grad_norm": 0.656134307384491, "learning_rate": 1.372790732214466e-05, "loss": 0.0709, "step": 36520 }, { "epoch": 2.712758057329571, "grad_norm": 0.298880934715271, "learning_rate": 1.3723451656022577e-05, "loss": 0.0516, "step": 36530 }, { "epoch": 2.7135006683499183, "grad_norm": 0.7972229719161987, "learning_rate": 1.371899598990049e-05, "loss": 0.0448, "step": 36540 }, { "epoch": 2.7142432793702658, "grad_norm": 1.8355180025100708, "learning_rate": 1.3714540323778405e-05, "loss": 0.0938, "step": 36550 }, { "epoch": 2.714985890390613, "grad_norm": 1.5195986032485962, "learning_rate": 1.371008465765632e-05, "loss": 0.0733, "step": 36560 }, { "epoch": 2.715728501410961, "grad_norm": 0.950968861579895, "learning_rate": 1.3705628991534235e-05, "loss": 0.038, "step": 36570 }, { "epoch": 2.7164711124313086, "grad_norm": 2.1783084869384766, "learning_rate": 1.370117332541215e-05, "loss": 0.0732, "step": 36580 }, { "epoch": 2.717213723451656, "grad_norm": 4.514249801635742, "learning_rate": 1.3696717659290063e-05, "loss": 0.0534, "step": 36590 }, { "epoch": 2.7179563344720035, "grad_norm": 1.7248497009277344, "learning_rate": 1.3692261993167978e-05, "loss": 0.0703, "step": 36600 }, { "epoch": 2.718698945492351, "grad_norm": 2.7249276638031006, "learning_rate": 1.3687806327045895e-05, "loss": 0.0967, "step": 36610 }, { "epoch": 2.719441556512699, "grad_norm": 0.6530225276947021, "learning_rate": 1.3683350660923808e-05, "loss": 0.0476, "step": 36620 }, { "epoch": 2.7201841675330463, "grad_norm": 0.6490178108215332, "learning_rate": 1.3678894994801723e-05, "loss": 0.0502, "step": 36630 }, { "epoch": 2.7209267785533937, "grad_norm": 2.1632354259490967, "learning_rate": 1.3674439328679638e-05, "loss": 0.0634, "step": 36640 }, { "epoch": 2.721669389573741, "grad_norm": 0.618598997592926, "learning_rate": 1.3669983662557553e-05, "loss": 0.0631, "step": 36650 }, { "epoch": 2.7224120005940886, "grad_norm": 0.579268753528595, "learning_rate": 1.3665527996435467e-05, "loss": 0.0632, "step": 36660 }, { "epoch": 2.7231546116144365, "grad_norm": 0.623193085193634, "learning_rate": 1.3661072330313382e-05, "loss": 0.074, "step": 36670 }, { "epoch": 2.723897222634784, "grad_norm": 0.6630807518959045, "learning_rate": 1.3656616664191298e-05, "loss": 0.0574, "step": 36680 }, { "epoch": 2.7246398336551314, "grad_norm": 1.1906079053878784, "learning_rate": 1.3652160998069212e-05, "loss": 0.0833, "step": 36690 }, { "epoch": 2.725382444675479, "grad_norm": 0.7799108624458313, "learning_rate": 1.3647705331947127e-05, "loss": 0.0696, "step": 36700 }, { "epoch": 2.7261250556958263, "grad_norm": 0.28752097487449646, "learning_rate": 1.364324966582504e-05, "loss": 0.0523, "step": 36710 }, { "epoch": 2.726867666716174, "grad_norm": 1.7490395307540894, "learning_rate": 1.3638793999702957e-05, "loss": 0.0729, "step": 36720 }, { "epoch": 2.7276102777365216, "grad_norm": 0.7951035499572754, "learning_rate": 1.3634338333580872e-05, "loss": 0.0777, "step": 36730 }, { "epoch": 2.728352888756869, "grad_norm": 1.3298048973083496, "learning_rate": 1.3629882667458785e-05, "loss": 0.0631, "step": 36740 }, { "epoch": 2.7290954997772165, "grad_norm": 0.4356074929237366, "learning_rate": 1.36254270013367e-05, "loss": 0.0439, "step": 36750 }, { "epoch": 2.729838110797564, "grad_norm": 0.5063789486885071, "learning_rate": 1.3620971335214615e-05, "loss": 0.0666, "step": 36760 }, { "epoch": 2.730580721817912, "grad_norm": 2.056678533554077, "learning_rate": 1.361651566909253e-05, "loss": 0.078, "step": 36770 }, { "epoch": 2.7313233328382593, "grad_norm": 1.4419987201690674, "learning_rate": 1.3612060002970443e-05, "loss": 0.0662, "step": 36780 }, { "epoch": 2.732065943858607, "grad_norm": 1.547361969947815, "learning_rate": 1.360760433684836e-05, "loss": 0.046, "step": 36790 }, { "epoch": 2.7328085548789542, "grad_norm": 2.8562614917755127, "learning_rate": 1.3603148670726275e-05, "loss": 0.0744, "step": 36800 }, { "epoch": 2.7335511658993017, "grad_norm": 3.6716110706329346, "learning_rate": 1.3598693004604188e-05, "loss": 0.0709, "step": 36810 }, { "epoch": 2.7342937769196496, "grad_norm": 1.2131446599960327, "learning_rate": 1.3594237338482103e-05, "loss": 0.0557, "step": 36820 }, { "epoch": 2.735036387939997, "grad_norm": 0.5412776470184326, "learning_rate": 1.3589781672360018e-05, "loss": 0.0738, "step": 36830 }, { "epoch": 2.7357789989603445, "grad_norm": 1.8566441535949707, "learning_rate": 1.3585326006237933e-05, "loss": 0.0771, "step": 36840 }, { "epoch": 2.7365216099806924, "grad_norm": 1.2451858520507812, "learning_rate": 1.3580870340115848e-05, "loss": 0.0773, "step": 36850 }, { "epoch": 2.7372642210010394, "grad_norm": 1.140763759613037, "learning_rate": 1.3576414673993761e-05, "loss": 0.0661, "step": 36860 }, { "epoch": 2.7380068320213873, "grad_norm": 1.0305567979812622, "learning_rate": 1.3571959007871678e-05, "loss": 0.0349, "step": 36870 }, { "epoch": 2.7387494430417347, "grad_norm": 1.3327600955963135, "learning_rate": 1.3567503341749591e-05, "loss": 0.0824, "step": 36880 }, { "epoch": 2.739492054062082, "grad_norm": 1.6335985660552979, "learning_rate": 1.3563047675627506e-05, "loss": 0.0727, "step": 36890 }, { "epoch": 2.74023466508243, "grad_norm": 1.2704558372497559, "learning_rate": 1.3558592009505421e-05, "loss": 0.0589, "step": 36900 }, { "epoch": 2.7409772761027775, "grad_norm": 1.6128848791122437, "learning_rate": 1.3554136343383336e-05, "loss": 0.0584, "step": 36910 }, { "epoch": 2.741719887123125, "grad_norm": 1.1986207962036133, "learning_rate": 1.3549680677261251e-05, "loss": 0.0891, "step": 36920 }, { "epoch": 2.7424624981434724, "grad_norm": 0.5783780813217163, "learning_rate": 1.3545225011139165e-05, "loss": 0.0748, "step": 36930 }, { "epoch": 2.74320510916382, "grad_norm": 2.1520473957061768, "learning_rate": 1.3540769345017081e-05, "loss": 0.0562, "step": 36940 }, { "epoch": 2.7439477201841678, "grad_norm": 1.8193594217300415, "learning_rate": 1.3536313678894995e-05, "loss": 0.0797, "step": 36950 }, { "epoch": 2.744690331204515, "grad_norm": 2.4824233055114746, "learning_rate": 1.353185801277291e-05, "loss": 0.0791, "step": 36960 }, { "epoch": 2.7454329422248627, "grad_norm": 1.889683723449707, "learning_rate": 1.3527402346650825e-05, "loss": 0.0832, "step": 36970 }, { "epoch": 2.74617555324521, "grad_norm": 1.3686025142669678, "learning_rate": 1.352294668052874e-05, "loss": 0.0714, "step": 36980 }, { "epoch": 2.7469181642655576, "grad_norm": 1.2818831205368042, "learning_rate": 1.3518491014406655e-05, "loss": 0.0581, "step": 36990 }, { "epoch": 2.7476607752859055, "grad_norm": 1.8120100498199463, "learning_rate": 1.3514035348284568e-05, "loss": 0.0515, "step": 37000 }, { "epoch": 2.748403386306253, "grad_norm": 2.3514275550842285, "learning_rate": 1.3509579682162483e-05, "loss": 0.0766, "step": 37010 }, { "epoch": 2.7491459973266004, "grad_norm": 0.7103281617164612, "learning_rate": 1.35051240160404e-05, "loss": 0.06, "step": 37020 }, { "epoch": 2.749888608346948, "grad_norm": 0.8013458251953125, "learning_rate": 1.3500668349918313e-05, "loss": 0.042, "step": 37030 }, { "epoch": 2.7506312193672953, "grad_norm": 0.9104951024055481, "learning_rate": 1.3496212683796228e-05, "loss": 0.0657, "step": 37040 }, { "epoch": 2.751373830387643, "grad_norm": 3.061896324157715, "learning_rate": 1.3491757017674143e-05, "loss": 0.0995, "step": 37050 }, { "epoch": 2.7521164414079906, "grad_norm": 2.806757688522339, "learning_rate": 1.3487301351552058e-05, "loss": 0.0623, "step": 37060 }, { "epoch": 2.752859052428338, "grad_norm": 0.8061108589172363, "learning_rate": 1.3482845685429971e-05, "loss": 0.0434, "step": 37070 }, { "epoch": 2.7536016634486855, "grad_norm": 0.7972543835639954, "learning_rate": 1.3478390019307886e-05, "loss": 0.0756, "step": 37080 }, { "epoch": 2.754344274469033, "grad_norm": 1.5233701467514038, "learning_rate": 1.3473934353185803e-05, "loss": 0.0669, "step": 37090 }, { "epoch": 2.755086885489381, "grad_norm": 3.342548370361328, "learning_rate": 1.3469478687063716e-05, "loss": 0.0706, "step": 37100 }, { "epoch": 2.7558294965097283, "grad_norm": 0.6841835975646973, "learning_rate": 1.3465023020941631e-05, "loss": 0.0526, "step": 37110 }, { "epoch": 2.7565721075300758, "grad_norm": 0.362078994512558, "learning_rate": 1.3460567354819544e-05, "loss": 0.0528, "step": 37120 }, { "epoch": 2.757314718550423, "grad_norm": 2.367532968521118, "learning_rate": 1.3456111688697461e-05, "loss": 0.0709, "step": 37130 }, { "epoch": 2.7580573295707707, "grad_norm": 1.0397535562515259, "learning_rate": 1.3451656022575376e-05, "loss": 0.0827, "step": 37140 }, { "epoch": 2.7587999405911185, "grad_norm": 2.048051118850708, "learning_rate": 1.344720035645329e-05, "loss": 0.0707, "step": 37150 }, { "epoch": 2.759542551611466, "grad_norm": 0.5190430283546448, "learning_rate": 1.3442744690331206e-05, "loss": 0.0711, "step": 37160 }, { "epoch": 2.7602851626318134, "grad_norm": 2.5641369819641113, "learning_rate": 1.343828902420912e-05, "loss": 0.0677, "step": 37170 }, { "epoch": 2.761027773652161, "grad_norm": 0.1991005390882492, "learning_rate": 1.3433833358087034e-05, "loss": 0.0298, "step": 37180 }, { "epoch": 2.7617703846725083, "grad_norm": 1.7412118911743164, "learning_rate": 1.3429377691964948e-05, "loss": 0.061, "step": 37190 }, { "epoch": 2.7625129956928562, "grad_norm": 1.8074331283569336, "learning_rate": 1.3424922025842864e-05, "loss": 0.0684, "step": 37200 }, { "epoch": 2.7632556067132037, "grad_norm": 1.0141547918319702, "learning_rate": 1.342046635972078e-05, "loss": 0.0713, "step": 37210 }, { "epoch": 2.763998217733551, "grad_norm": 1.6694709062576294, "learning_rate": 1.3416010693598693e-05, "loss": 0.0599, "step": 37220 }, { "epoch": 2.7647408287538986, "grad_norm": 2.54500675201416, "learning_rate": 1.3411555027476608e-05, "loss": 0.054, "step": 37230 }, { "epoch": 2.765483439774246, "grad_norm": 2.212883710861206, "learning_rate": 1.3407099361354523e-05, "loss": 0.0734, "step": 37240 }, { "epoch": 2.766226050794594, "grad_norm": 1.2556638717651367, "learning_rate": 1.3402643695232438e-05, "loss": 0.0718, "step": 37250 }, { "epoch": 2.7669686618149414, "grad_norm": 2.478182792663574, "learning_rate": 1.3398188029110353e-05, "loss": 0.0535, "step": 37260 }, { "epoch": 2.767711272835289, "grad_norm": 1.532631516456604, "learning_rate": 1.3393732362988266e-05, "loss": 0.0807, "step": 37270 }, { "epoch": 2.7684538838556363, "grad_norm": 1.9082090854644775, "learning_rate": 1.3389276696866183e-05, "loss": 0.0805, "step": 37280 }, { "epoch": 2.7691964948759837, "grad_norm": 1.0164248943328857, "learning_rate": 1.3384821030744096e-05, "loss": 0.0843, "step": 37290 }, { "epoch": 2.7699391058963316, "grad_norm": 0.967978298664093, "learning_rate": 1.3380365364622011e-05, "loss": 0.0693, "step": 37300 }, { "epoch": 2.770681716916679, "grad_norm": 1.1831194162368774, "learning_rate": 1.3375909698499928e-05, "loss": 0.0716, "step": 37310 }, { "epoch": 2.7714243279370265, "grad_norm": 2.0037786960601807, "learning_rate": 1.3371454032377841e-05, "loss": 0.0813, "step": 37320 }, { "epoch": 2.772166938957374, "grad_norm": 1.3486874103546143, "learning_rate": 1.3366998366255756e-05, "loss": 0.0912, "step": 37330 }, { "epoch": 2.7729095499777214, "grad_norm": 1.2542924880981445, "learning_rate": 1.336254270013367e-05, "loss": 0.067, "step": 37340 }, { "epoch": 2.7736521609980693, "grad_norm": 0.595507025718689, "learning_rate": 1.3358087034011586e-05, "loss": 0.0758, "step": 37350 }, { "epoch": 2.7743947720184168, "grad_norm": 1.162650465965271, "learning_rate": 1.33536313678895e-05, "loss": 0.0643, "step": 37360 }, { "epoch": 2.7751373830387642, "grad_norm": 0.5855199098587036, "learning_rate": 1.3349175701767414e-05, "loss": 0.0572, "step": 37370 }, { "epoch": 2.7758799940591117, "grad_norm": 3.25514554977417, "learning_rate": 1.334472003564533e-05, "loss": 0.0628, "step": 37380 }, { "epoch": 2.776622605079459, "grad_norm": 2.44706392288208, "learning_rate": 1.3340264369523244e-05, "loss": 0.0798, "step": 37390 }, { "epoch": 2.777365216099807, "grad_norm": 1.5468707084655762, "learning_rate": 1.333580870340116e-05, "loss": 0.0699, "step": 37400 }, { "epoch": 2.7781078271201545, "grad_norm": 3.0609419345855713, "learning_rate": 1.3331353037279073e-05, "loss": 0.0741, "step": 37410 }, { "epoch": 2.778850438140502, "grad_norm": 2.7245450019836426, "learning_rate": 1.3326897371156988e-05, "loss": 0.0528, "step": 37420 }, { "epoch": 2.77959304916085, "grad_norm": 0.9166297912597656, "learning_rate": 1.3322441705034904e-05, "loss": 0.0744, "step": 37430 }, { "epoch": 2.780335660181197, "grad_norm": 1.0476568937301636, "learning_rate": 1.3317986038912818e-05, "loss": 0.0556, "step": 37440 }, { "epoch": 2.7810782712015447, "grad_norm": 2.8554935455322266, "learning_rate": 1.3313530372790733e-05, "loss": 0.0651, "step": 37450 }, { "epoch": 2.781820882221892, "grad_norm": 1.7968850135803223, "learning_rate": 1.3309074706668648e-05, "loss": 0.0684, "step": 37460 }, { "epoch": 2.7825634932422396, "grad_norm": 3.455589532852173, "learning_rate": 1.3304619040546563e-05, "loss": 0.0739, "step": 37470 }, { "epoch": 2.7833061042625875, "grad_norm": 1.8191416263580322, "learning_rate": 1.3300163374424476e-05, "loss": 0.0527, "step": 37480 }, { "epoch": 2.784048715282935, "grad_norm": 1.094232201576233, "learning_rate": 1.329570770830239e-05, "loss": 0.076, "step": 37490 }, { "epoch": 2.7847913263032824, "grad_norm": 4.273893356323242, "learning_rate": 1.3291252042180307e-05, "loss": 0.086, "step": 37500 }, { "epoch": 2.78553393732363, "grad_norm": 2.4333572387695312, "learning_rate": 1.328679637605822e-05, "loss": 0.102, "step": 37510 }, { "epoch": 2.7862765483439773, "grad_norm": 1.4521609544754028, "learning_rate": 1.3282340709936136e-05, "loss": 0.0733, "step": 37520 }, { "epoch": 2.787019159364325, "grad_norm": 1.7854300737380981, "learning_rate": 1.3277885043814049e-05, "loss": 0.0426, "step": 37530 }, { "epoch": 2.7877617703846727, "grad_norm": 1.4833481311798096, "learning_rate": 1.3273429377691966e-05, "loss": 0.0501, "step": 37540 }, { "epoch": 2.78850438140502, "grad_norm": 3.3596692085266113, "learning_rate": 1.326897371156988e-05, "loss": 0.0536, "step": 37550 }, { "epoch": 2.7892469924253676, "grad_norm": 3.833606481552124, "learning_rate": 1.3264518045447794e-05, "loss": 0.0684, "step": 37560 }, { "epoch": 2.789989603445715, "grad_norm": 0.862786054611206, "learning_rate": 1.326006237932571e-05, "loss": 0.0419, "step": 37570 }, { "epoch": 2.790732214466063, "grad_norm": 0.7479864954948425, "learning_rate": 1.3255606713203624e-05, "loss": 0.062, "step": 37580 }, { "epoch": 2.7914748254864103, "grad_norm": 1.0688323974609375, "learning_rate": 1.3251151047081539e-05, "loss": 0.0577, "step": 37590 }, { "epoch": 2.792217436506758, "grad_norm": 1.409751057624817, "learning_rate": 1.3246695380959454e-05, "loss": 0.0587, "step": 37600 }, { "epoch": 2.7929600475271052, "grad_norm": 0.3575490415096283, "learning_rate": 1.3242239714837369e-05, "loss": 0.0798, "step": 37610 }, { "epoch": 2.7937026585474527, "grad_norm": 1.4559156894683838, "learning_rate": 1.3237784048715284e-05, "loss": 0.0729, "step": 37620 }, { "epoch": 2.7944452695678006, "grad_norm": 1.0611257553100586, "learning_rate": 1.3233328382593197e-05, "loss": 0.0695, "step": 37630 }, { "epoch": 2.795187880588148, "grad_norm": 1.5635493993759155, "learning_rate": 1.3228872716471112e-05, "loss": 0.0826, "step": 37640 }, { "epoch": 2.7959304916084955, "grad_norm": 0.6104263663291931, "learning_rate": 1.3224417050349027e-05, "loss": 0.0548, "step": 37650 }, { "epoch": 2.796673102628843, "grad_norm": 1.3987880945205688, "learning_rate": 1.3219961384226942e-05, "loss": 0.0678, "step": 37660 }, { "epoch": 2.7974157136491904, "grad_norm": 0.8820175528526306, "learning_rate": 1.3215505718104857e-05, "loss": 0.0589, "step": 37670 }, { "epoch": 2.7981583246695383, "grad_norm": 1.4698015451431274, "learning_rate": 1.321105005198277e-05, "loss": 0.0607, "step": 37680 }, { "epoch": 2.7989009356898857, "grad_norm": 2.0641324520111084, "learning_rate": 1.3206594385860687e-05, "loss": 0.0633, "step": 37690 }, { "epoch": 2.799643546710233, "grad_norm": 3.1822593212127686, "learning_rate": 1.32021387197386e-05, "loss": 0.0465, "step": 37700 }, { "epoch": 2.8003861577305806, "grad_norm": 1.4389050006866455, "learning_rate": 1.3197683053616516e-05, "loss": 0.0879, "step": 37710 }, { "epoch": 2.801128768750928, "grad_norm": 1.4606937170028687, "learning_rate": 1.3193227387494432e-05, "loss": 0.0548, "step": 37720 }, { "epoch": 2.801871379771276, "grad_norm": 2.7403457164764404, "learning_rate": 1.3188771721372346e-05, "loss": 0.0507, "step": 37730 }, { "epoch": 2.8026139907916234, "grad_norm": 2.382749080657959, "learning_rate": 1.318431605525026e-05, "loss": 0.0725, "step": 37740 }, { "epoch": 2.803356601811971, "grad_norm": 1.1223398447036743, "learning_rate": 1.3179860389128174e-05, "loss": 0.0901, "step": 37750 }, { "epoch": 2.8040992128323183, "grad_norm": 1.5770460367202759, "learning_rate": 1.317540472300609e-05, "loss": 0.0897, "step": 37760 }, { "epoch": 2.804841823852666, "grad_norm": 1.267210602760315, "learning_rate": 1.3170949056884004e-05, "loss": 0.0532, "step": 37770 }, { "epoch": 2.8055844348730137, "grad_norm": 0.7207576036453247, "learning_rate": 1.3166493390761919e-05, "loss": 0.0356, "step": 37780 }, { "epoch": 2.806327045893361, "grad_norm": 0.6520107984542847, "learning_rate": 1.3162037724639834e-05, "loss": 0.0663, "step": 37790 }, { "epoch": 2.8070696569137086, "grad_norm": 1.4991291761398315, "learning_rate": 1.3157582058517749e-05, "loss": 0.0732, "step": 37800 }, { "epoch": 2.807812267934056, "grad_norm": 2.074842691421509, "learning_rate": 1.3153126392395664e-05, "loss": 0.0878, "step": 37810 }, { "epoch": 2.8085548789544035, "grad_norm": 0.5263580083847046, "learning_rate": 1.3148670726273577e-05, "loss": 0.098, "step": 37820 }, { "epoch": 2.8092974899747514, "grad_norm": 2.034339427947998, "learning_rate": 1.3144215060151494e-05, "loss": 0.0592, "step": 37830 }, { "epoch": 2.810040100995099, "grad_norm": 1.7744560241699219, "learning_rate": 1.3139759394029409e-05, "loss": 0.0785, "step": 37840 }, { "epoch": 2.8107827120154463, "grad_norm": 1.718765377998352, "learning_rate": 1.3135303727907322e-05, "loss": 0.0602, "step": 37850 }, { "epoch": 2.8115253230357937, "grad_norm": 0.45636001229286194, "learning_rate": 1.3130848061785237e-05, "loss": 0.0625, "step": 37860 }, { "epoch": 2.812267934056141, "grad_norm": 0.8540383577346802, "learning_rate": 1.3126392395663152e-05, "loss": 0.0749, "step": 37870 }, { "epoch": 2.813010545076489, "grad_norm": 0.7994318604469299, "learning_rate": 1.3121936729541067e-05, "loss": 0.0738, "step": 37880 }, { "epoch": 2.8137531560968365, "grad_norm": 2.370769739151001, "learning_rate": 1.311748106341898e-05, "loss": 0.0594, "step": 37890 }, { "epoch": 2.814495767117184, "grad_norm": 3.0861258506774902, "learning_rate": 1.3113025397296895e-05, "loss": 0.0758, "step": 37900 }, { "epoch": 2.8152383781375314, "grad_norm": 0.46465158462524414, "learning_rate": 1.3108569731174812e-05, "loss": 0.0581, "step": 37910 }, { "epoch": 2.815980989157879, "grad_norm": 2.443127393722534, "learning_rate": 1.3104114065052725e-05, "loss": 0.0948, "step": 37920 }, { "epoch": 2.8167236001782268, "grad_norm": 2.299797534942627, "learning_rate": 1.309965839893064e-05, "loss": 0.0565, "step": 37930 }, { "epoch": 2.817466211198574, "grad_norm": 3.1325736045837402, "learning_rate": 1.3095202732808554e-05, "loss": 0.0834, "step": 37940 }, { "epoch": 2.8182088222189217, "grad_norm": 1.0582780838012695, "learning_rate": 1.309074706668647e-05, "loss": 0.0739, "step": 37950 }, { "epoch": 2.818951433239269, "grad_norm": 2.284137725830078, "learning_rate": 1.3086291400564385e-05, "loss": 0.0861, "step": 37960 }, { "epoch": 2.8196940442596166, "grad_norm": 0.8023969531059265, "learning_rate": 1.3081835734442299e-05, "loss": 0.0615, "step": 37970 }, { "epoch": 2.8204366552799645, "grad_norm": 1.1526970863342285, "learning_rate": 1.3077380068320215e-05, "loss": 0.0686, "step": 37980 }, { "epoch": 2.821179266300312, "grad_norm": 2.1727919578552246, "learning_rate": 1.3072924402198129e-05, "loss": 0.0874, "step": 37990 }, { "epoch": 2.8219218773206594, "grad_norm": 0.8734510540962219, "learning_rate": 1.3068468736076044e-05, "loss": 0.0577, "step": 38000 }, { "epoch": 2.8226644883410072, "grad_norm": 0.24533693492412567, "learning_rate": 1.3064013069953959e-05, "loss": 0.0887, "step": 38010 }, { "epoch": 2.8234070993613543, "grad_norm": 2.325021982192993, "learning_rate": 1.3059557403831874e-05, "loss": 0.0681, "step": 38020 }, { "epoch": 2.824149710381702, "grad_norm": 1.5730549097061157, "learning_rate": 1.3055101737709789e-05, "loss": 0.0522, "step": 38030 }, { "epoch": 2.8248923214020496, "grad_norm": 1.1065586805343628, "learning_rate": 1.3050646071587702e-05, "loss": 0.0581, "step": 38040 }, { "epoch": 2.825634932422397, "grad_norm": 0.48450005054473877, "learning_rate": 1.3046190405465617e-05, "loss": 0.0715, "step": 38050 }, { "epoch": 2.826377543442745, "grad_norm": 1.2192469835281372, "learning_rate": 1.3041734739343532e-05, "loss": 0.0557, "step": 38060 }, { "epoch": 2.8271201544630924, "grad_norm": 1.8304122686386108, "learning_rate": 1.3037279073221447e-05, "loss": 0.0832, "step": 38070 }, { "epoch": 2.82786276548344, "grad_norm": 1.7451564073562622, "learning_rate": 1.3032823407099362e-05, "loss": 0.0839, "step": 38080 }, { "epoch": 2.8286053765037873, "grad_norm": 1.190588355064392, "learning_rate": 1.3028367740977277e-05, "loss": 0.0659, "step": 38090 }, { "epoch": 2.8293479875241347, "grad_norm": 0.46794483065605164, "learning_rate": 1.3023912074855192e-05, "loss": 0.0554, "step": 38100 }, { "epoch": 2.8300905985444826, "grad_norm": 1.307004451751709, "learning_rate": 1.3019456408733105e-05, "loss": 0.0637, "step": 38110 }, { "epoch": 2.83083320956483, "grad_norm": 1.7343428134918213, "learning_rate": 1.301500074261102e-05, "loss": 0.0469, "step": 38120 }, { "epoch": 2.8315758205851775, "grad_norm": 1.2779510021209717, "learning_rate": 1.3010545076488937e-05, "loss": 0.0376, "step": 38130 }, { "epoch": 2.832318431605525, "grad_norm": 0.3328961431980133, "learning_rate": 1.300608941036685e-05, "loss": 0.0531, "step": 38140 }, { "epoch": 2.8330610426258724, "grad_norm": 1.039929986000061, "learning_rate": 1.3001633744244765e-05, "loss": 0.0729, "step": 38150 }, { "epoch": 2.8338036536462203, "grad_norm": 1.5998934507369995, "learning_rate": 1.2997178078122678e-05, "loss": 0.0736, "step": 38160 }, { "epoch": 2.834546264666568, "grad_norm": 0.7443707585334778, "learning_rate": 1.2992722412000595e-05, "loss": 0.0618, "step": 38170 }, { "epoch": 2.8352888756869152, "grad_norm": 1.4436475038528442, "learning_rate": 1.2988266745878508e-05, "loss": 0.0762, "step": 38180 }, { "epoch": 2.8360314867072627, "grad_norm": 1.3913630247116089, "learning_rate": 1.2983811079756423e-05, "loss": 0.0825, "step": 38190 }, { "epoch": 2.83677409772761, "grad_norm": 1.0317375659942627, "learning_rate": 1.2979355413634338e-05, "loss": 0.0693, "step": 38200 }, { "epoch": 2.837516708747958, "grad_norm": 1.5049179792404175, "learning_rate": 1.2974899747512253e-05, "loss": 0.0597, "step": 38210 }, { "epoch": 2.8382593197683055, "grad_norm": 1.5254199504852295, "learning_rate": 1.2970444081390168e-05, "loss": 0.0452, "step": 38220 }, { "epoch": 2.839001930788653, "grad_norm": 2.0400617122650146, "learning_rate": 1.2965988415268082e-05, "loss": 0.0726, "step": 38230 }, { "epoch": 2.8397445418090004, "grad_norm": 1.2715054750442505, "learning_rate": 1.2961532749145998e-05, "loss": 0.0657, "step": 38240 }, { "epoch": 2.840487152829348, "grad_norm": 1.5253748893737793, "learning_rate": 1.2957077083023913e-05, "loss": 0.093, "step": 38250 }, { "epoch": 2.8412297638496957, "grad_norm": 1.2937556505203247, "learning_rate": 1.2952621416901827e-05, "loss": 0.0773, "step": 38260 }, { "epoch": 2.841972374870043, "grad_norm": 0.9976204633712769, "learning_rate": 1.2948165750779742e-05, "loss": 0.0774, "step": 38270 }, { "epoch": 2.8427149858903906, "grad_norm": 0.886090874671936, "learning_rate": 1.2943710084657657e-05, "loss": 0.051, "step": 38280 }, { "epoch": 2.843457596910738, "grad_norm": 1.4611785411834717, "learning_rate": 1.2939254418535572e-05, "loss": 0.0612, "step": 38290 }, { "epoch": 2.8442002079310855, "grad_norm": 0.9807224869728088, "learning_rate": 1.2934798752413487e-05, "loss": 0.0654, "step": 38300 }, { "epoch": 2.8449428189514334, "grad_norm": 1.1847294569015503, "learning_rate": 1.29303430862914e-05, "loss": 0.0605, "step": 38310 }, { "epoch": 2.845685429971781, "grad_norm": 0.535963237285614, "learning_rate": 1.2925887420169317e-05, "loss": 0.0763, "step": 38320 }, { "epoch": 2.8464280409921283, "grad_norm": 2.856031894683838, "learning_rate": 1.292143175404723e-05, "loss": 0.064, "step": 38330 }, { "epoch": 2.8471706520124758, "grad_norm": 0.5636598467826843, "learning_rate": 1.2916976087925145e-05, "loss": 0.0675, "step": 38340 }, { "epoch": 2.847913263032823, "grad_norm": 1.67021644115448, "learning_rate": 1.291252042180306e-05, "loss": 0.0784, "step": 38350 }, { "epoch": 2.848655874053171, "grad_norm": 1.2641798257827759, "learning_rate": 1.2908064755680975e-05, "loss": 0.063, "step": 38360 }, { "epoch": 2.8493984850735186, "grad_norm": 0.549030601978302, "learning_rate": 1.290360908955889e-05, "loss": 0.0584, "step": 38370 }, { "epoch": 2.850141096093866, "grad_norm": 2.4092066287994385, "learning_rate": 1.2899153423436803e-05, "loss": 0.0935, "step": 38380 }, { "epoch": 2.8508837071142135, "grad_norm": 0.9181311726570129, "learning_rate": 1.289469775731472e-05, "loss": 0.0586, "step": 38390 }, { "epoch": 2.851626318134561, "grad_norm": 1.6261708736419678, "learning_rate": 1.2890242091192633e-05, "loss": 0.0871, "step": 38400 }, { "epoch": 2.852368929154909, "grad_norm": 1.0663944482803345, "learning_rate": 1.2885786425070548e-05, "loss": 0.0464, "step": 38410 }, { "epoch": 2.8531115401752563, "grad_norm": 1.0980523824691772, "learning_rate": 1.2881330758948463e-05, "loss": 0.0805, "step": 38420 }, { "epoch": 2.8538541511956037, "grad_norm": 1.2846684455871582, "learning_rate": 1.2876875092826378e-05, "loss": 0.1012, "step": 38430 }, { "epoch": 2.854596762215951, "grad_norm": 2.4955661296844482, "learning_rate": 1.2872419426704293e-05, "loss": 0.0585, "step": 38440 }, { "epoch": 2.8553393732362986, "grad_norm": 1.1014326810836792, "learning_rate": 1.2867963760582207e-05, "loss": 0.0656, "step": 38450 }, { "epoch": 2.8560819842566465, "grad_norm": 0.6860207915306091, "learning_rate": 1.2863508094460122e-05, "loss": 0.0522, "step": 38460 }, { "epoch": 2.856824595276994, "grad_norm": 1.087835669517517, "learning_rate": 1.2859052428338037e-05, "loss": 0.0673, "step": 38470 }, { "epoch": 2.8575672062973414, "grad_norm": 2.119745969772339, "learning_rate": 1.2854596762215952e-05, "loss": 0.0697, "step": 38480 }, { "epoch": 2.858309817317689, "grad_norm": 1.239503026008606, "learning_rate": 1.2850141096093867e-05, "loss": 0.0857, "step": 38490 }, { "epoch": 2.8590524283380363, "grad_norm": 4.281582832336426, "learning_rate": 1.2845685429971781e-05, "loss": 0.0598, "step": 38500 }, { "epoch": 2.859795039358384, "grad_norm": 2.14847993850708, "learning_rate": 1.2841229763849696e-05, "loss": 0.068, "step": 38510 }, { "epoch": 2.8605376503787316, "grad_norm": 4.647385120391846, "learning_rate": 1.283677409772761e-05, "loss": 0.0738, "step": 38520 }, { "epoch": 2.861280261399079, "grad_norm": 0.7696908116340637, "learning_rate": 1.2832318431605525e-05, "loss": 0.0458, "step": 38530 }, { "epoch": 2.8620228724194265, "grad_norm": 0.9217883348464966, "learning_rate": 1.2827862765483441e-05, "loss": 0.0427, "step": 38540 }, { "epoch": 2.862765483439774, "grad_norm": 2.1366024017333984, "learning_rate": 1.2823407099361355e-05, "loss": 0.0735, "step": 38550 }, { "epoch": 2.863508094460122, "grad_norm": 1.3464000225067139, "learning_rate": 1.281895143323927e-05, "loss": 0.0712, "step": 38560 }, { "epoch": 2.8642507054804693, "grad_norm": 0.7203729152679443, "learning_rate": 1.2814495767117183e-05, "loss": 0.0611, "step": 38570 }, { "epoch": 2.864993316500817, "grad_norm": 1.2216914892196655, "learning_rate": 1.28100401009951e-05, "loss": 0.0697, "step": 38580 }, { "epoch": 2.8657359275211647, "grad_norm": 1.4661186933517456, "learning_rate": 1.2805584434873013e-05, "loss": 0.0771, "step": 38590 }, { "epoch": 2.8664785385415117, "grad_norm": 0.9744288325309753, "learning_rate": 1.2801128768750928e-05, "loss": 0.0693, "step": 38600 }, { "epoch": 2.8672211495618596, "grad_norm": 0.961794912815094, "learning_rate": 1.2796673102628845e-05, "loss": 0.0956, "step": 38610 }, { "epoch": 2.867963760582207, "grad_norm": 2.7422969341278076, "learning_rate": 1.2792217436506758e-05, "loss": 0.0994, "step": 38620 }, { "epoch": 2.8687063716025545, "grad_norm": 1.6554310321807861, "learning_rate": 1.2787761770384673e-05, "loss": 0.0521, "step": 38630 }, { "epoch": 2.8694489826229024, "grad_norm": 2.223524808883667, "learning_rate": 1.2783306104262586e-05, "loss": 0.0773, "step": 38640 }, { "epoch": 2.87019159364325, "grad_norm": 0.8952299356460571, "learning_rate": 1.2778850438140503e-05, "loss": 0.0592, "step": 38650 }, { "epoch": 2.8709342046635973, "grad_norm": 0.7390848398208618, "learning_rate": 1.2774394772018418e-05, "loss": 0.0638, "step": 38660 }, { "epoch": 2.8716768156839447, "grad_norm": 1.8188756704330444, "learning_rate": 1.2769939105896331e-05, "loss": 0.0533, "step": 38670 }, { "epoch": 2.872419426704292, "grad_norm": 1.062387466430664, "learning_rate": 1.2765483439774246e-05, "loss": 0.0603, "step": 38680 }, { "epoch": 2.87316203772464, "grad_norm": 2.831735134124756, "learning_rate": 1.2761027773652161e-05, "loss": 0.0383, "step": 38690 }, { "epoch": 2.8739046487449875, "grad_norm": 0.8888131976127625, "learning_rate": 1.2756572107530076e-05, "loss": 0.0578, "step": 38700 }, { "epoch": 2.874647259765335, "grad_norm": 2.9310948848724365, "learning_rate": 1.2752116441407991e-05, "loss": 0.068, "step": 38710 }, { "epoch": 2.8753898707856824, "grad_norm": 0.4100227952003479, "learning_rate": 1.2747660775285905e-05, "loss": 0.0827, "step": 38720 }, { "epoch": 2.87613248180603, "grad_norm": 1.1365009546279907, "learning_rate": 1.2743205109163821e-05, "loss": 0.0718, "step": 38730 }, { "epoch": 2.8768750928263778, "grad_norm": 1.4055360555648804, "learning_rate": 1.2738749443041735e-05, "loss": 0.0788, "step": 38740 }, { "epoch": 2.877617703846725, "grad_norm": 1.0839377641677856, "learning_rate": 1.273429377691965e-05, "loss": 0.0649, "step": 38750 }, { "epoch": 2.8783603148670727, "grad_norm": 1.1226552724838257, "learning_rate": 1.2729838110797565e-05, "loss": 0.0717, "step": 38760 }, { "epoch": 2.87910292588742, "grad_norm": 0.4335779845714569, "learning_rate": 1.272538244467548e-05, "loss": 0.0535, "step": 38770 }, { "epoch": 2.8798455369077676, "grad_norm": 0.9374495148658752, "learning_rate": 1.2720926778553395e-05, "loss": 0.069, "step": 38780 }, { "epoch": 2.8805881479281155, "grad_norm": 1.4652955532073975, "learning_rate": 1.2716471112431308e-05, "loss": 0.0836, "step": 38790 }, { "epoch": 2.881330758948463, "grad_norm": 1.3489465713500977, "learning_rate": 1.2712015446309225e-05, "loss": 0.0757, "step": 38800 }, { "epoch": 2.8820733699688104, "grad_norm": 3.2947144508361816, "learning_rate": 1.2707559780187138e-05, "loss": 0.0454, "step": 38810 }, { "epoch": 2.882815980989158, "grad_norm": 1.3606735467910767, "learning_rate": 1.2703104114065053e-05, "loss": 0.0732, "step": 38820 }, { "epoch": 2.8835585920095053, "grad_norm": 0.7241372466087341, "learning_rate": 1.2698648447942968e-05, "loss": 0.0727, "step": 38830 }, { "epoch": 2.884301203029853, "grad_norm": 1.2209150791168213, "learning_rate": 1.2694192781820883e-05, "loss": 0.0625, "step": 38840 }, { "epoch": 2.8850438140502006, "grad_norm": 3.2763359546661377, "learning_rate": 1.2689737115698798e-05, "loss": 0.0619, "step": 38850 }, { "epoch": 2.885786425070548, "grad_norm": 0.7242000699043274, "learning_rate": 1.2685281449576711e-05, "loss": 0.0428, "step": 38860 }, { "epoch": 2.8865290360908955, "grad_norm": 1.276310682296753, "learning_rate": 1.2680825783454628e-05, "loss": 0.0873, "step": 38870 }, { "epoch": 2.887271647111243, "grad_norm": 0.70942223072052, "learning_rate": 1.2676370117332541e-05, "loss": 0.0563, "step": 38880 }, { "epoch": 2.888014258131591, "grad_norm": 1.4560551643371582, "learning_rate": 1.2671914451210456e-05, "loss": 0.0754, "step": 38890 }, { "epoch": 2.8887568691519383, "grad_norm": 3.525283098220825, "learning_rate": 1.2667458785088371e-05, "loss": 0.1017, "step": 38900 }, { "epoch": 2.8894994801722858, "grad_norm": 2.5498082637786865, "learning_rate": 1.2663003118966286e-05, "loss": 0.0777, "step": 38910 }, { "epoch": 2.890242091192633, "grad_norm": 1.0680961608886719, "learning_rate": 1.2658547452844201e-05, "loss": 0.058, "step": 38920 }, { "epoch": 2.8909847022129807, "grad_norm": 0.860130786895752, "learning_rate": 1.2654091786722114e-05, "loss": 0.0519, "step": 38930 }, { "epoch": 2.8917273132333285, "grad_norm": 0.4104064106941223, "learning_rate": 1.264963612060003e-05, "loss": 0.0721, "step": 38940 }, { "epoch": 2.892469924253676, "grad_norm": 1.9882882833480835, "learning_rate": 1.2645180454477946e-05, "loss": 0.0761, "step": 38950 }, { "epoch": 2.8932125352740234, "grad_norm": 0.7212990522384644, "learning_rate": 1.264072478835586e-05, "loss": 0.0602, "step": 38960 }, { "epoch": 2.893955146294371, "grad_norm": 0.5965471863746643, "learning_rate": 1.2636269122233774e-05, "loss": 0.0764, "step": 38970 }, { "epoch": 2.8946977573147183, "grad_norm": 2.7292585372924805, "learning_rate": 1.2631813456111688e-05, "loss": 0.0523, "step": 38980 }, { "epoch": 2.8954403683350662, "grad_norm": 0.5721240043640137, "learning_rate": 1.2627357789989604e-05, "loss": 0.0278, "step": 38990 }, { "epoch": 2.8961829793554137, "grad_norm": 2.3758206367492676, "learning_rate": 1.262290212386752e-05, "loss": 0.0495, "step": 39000 }, { "epoch": 2.896925590375761, "grad_norm": 0.5098469257354736, "learning_rate": 1.2618446457745433e-05, "loss": 0.0734, "step": 39010 }, { "epoch": 2.8976682013961086, "grad_norm": 1.3292839527130127, "learning_rate": 1.261399079162335e-05, "loss": 0.0594, "step": 39020 }, { "epoch": 2.898410812416456, "grad_norm": 0.4254518747329712, "learning_rate": 1.2609535125501263e-05, "loss": 0.0818, "step": 39030 }, { "epoch": 2.899153423436804, "grad_norm": 1.5062333345413208, "learning_rate": 1.2605079459379178e-05, "loss": 0.058, "step": 39040 }, { "epoch": 2.8998960344571514, "grad_norm": 1.215062141418457, "learning_rate": 1.2600623793257091e-05, "loss": 0.0422, "step": 39050 }, { "epoch": 2.900638645477499, "grad_norm": 1.1051884889602661, "learning_rate": 1.2596168127135008e-05, "loss": 0.0726, "step": 39060 }, { "epoch": 2.9013812564978463, "grad_norm": 0.9767510294914246, "learning_rate": 1.2591712461012923e-05, "loss": 0.0557, "step": 39070 }, { "epoch": 2.9021238675181937, "grad_norm": 3.3917346000671387, "learning_rate": 1.2587256794890836e-05, "loss": 0.0595, "step": 39080 }, { "epoch": 2.9028664785385416, "grad_norm": 2.1821186542510986, "learning_rate": 1.2582801128768751e-05, "loss": 0.0616, "step": 39090 }, { "epoch": 2.903609089558889, "grad_norm": 3.1181464195251465, "learning_rate": 1.2578345462646666e-05, "loss": 0.0614, "step": 39100 }, { "epoch": 2.9043517005792365, "grad_norm": 1.3670252561569214, "learning_rate": 1.2573889796524581e-05, "loss": 0.0571, "step": 39110 }, { "epoch": 2.905094311599584, "grad_norm": 2.3952903747558594, "learning_rate": 1.2569434130402496e-05, "loss": 0.0576, "step": 39120 }, { "epoch": 2.9058369226199314, "grad_norm": 3.1629419326782227, "learning_rate": 1.256497846428041e-05, "loss": 0.0895, "step": 39130 }, { "epoch": 2.9065795336402793, "grad_norm": 2.0828757286071777, "learning_rate": 1.2560522798158326e-05, "loss": 0.0773, "step": 39140 }, { "epoch": 2.9073221446606268, "grad_norm": 0.463571161031723, "learning_rate": 1.255606713203624e-05, "loss": 0.053, "step": 39150 }, { "epoch": 2.9080647556809742, "grad_norm": 1.1848664283752441, "learning_rate": 1.2551611465914154e-05, "loss": 0.072, "step": 39160 }, { "epoch": 2.908807366701322, "grad_norm": 0.9706199765205383, "learning_rate": 1.254715579979207e-05, "loss": 0.0496, "step": 39170 }, { "epoch": 2.909549977721669, "grad_norm": 2.834559440612793, "learning_rate": 1.2542700133669984e-05, "loss": 0.0794, "step": 39180 }, { "epoch": 2.910292588742017, "grad_norm": 2.5755867958068848, "learning_rate": 1.25382444675479e-05, "loss": 0.0941, "step": 39190 }, { "epoch": 2.9110351997623645, "grad_norm": 0.6742496490478516, "learning_rate": 1.2533788801425812e-05, "loss": 0.0551, "step": 39200 }, { "epoch": 2.911777810782712, "grad_norm": 1.3099998235702515, "learning_rate": 1.2529333135303729e-05, "loss": 0.0438, "step": 39210 }, { "epoch": 2.91252042180306, "grad_norm": 1.9794929027557373, "learning_rate": 1.2524877469181642e-05, "loss": 0.0746, "step": 39220 }, { "epoch": 2.9132630328234073, "grad_norm": 1.8509886264801025, "learning_rate": 1.2520421803059557e-05, "loss": 0.0832, "step": 39230 }, { "epoch": 2.9140056438437547, "grad_norm": 1.6332608461380005, "learning_rate": 1.2515966136937472e-05, "loss": 0.0653, "step": 39240 }, { "epoch": 2.914748254864102, "grad_norm": 1.8351725339889526, "learning_rate": 1.2511510470815387e-05, "loss": 0.075, "step": 39250 }, { "epoch": 2.9154908658844496, "grad_norm": 2.058716058731079, "learning_rate": 1.2507054804693302e-05, "loss": 0.0559, "step": 39260 }, { "epoch": 2.9162334769047975, "grad_norm": 0.9268501996994019, "learning_rate": 1.2502599138571216e-05, "loss": 0.0599, "step": 39270 }, { "epoch": 2.916976087925145, "grad_norm": 1.5155894756317139, "learning_rate": 1.2498143472449132e-05, "loss": 0.0994, "step": 39280 }, { "epoch": 2.9177186989454924, "grad_norm": 0.5851942896842957, "learning_rate": 1.2493687806327046e-05, "loss": 0.0485, "step": 39290 }, { "epoch": 2.91846130996584, "grad_norm": 3.38140606880188, "learning_rate": 1.248923214020496e-05, "loss": 0.0576, "step": 39300 }, { "epoch": 2.9192039209861873, "grad_norm": 1.3043699264526367, "learning_rate": 1.2484776474082876e-05, "loss": 0.0433, "step": 39310 }, { "epoch": 2.919946532006535, "grad_norm": 2.2233948707580566, "learning_rate": 1.248032080796079e-05, "loss": 0.0585, "step": 39320 }, { "epoch": 2.9206891430268827, "grad_norm": 2.5557191371917725, "learning_rate": 1.2475865141838706e-05, "loss": 0.0506, "step": 39330 }, { "epoch": 2.92143175404723, "grad_norm": 1.088620662689209, "learning_rate": 1.2471409475716619e-05, "loss": 0.0709, "step": 39340 }, { "epoch": 2.9221743650675776, "grad_norm": 1.4036482572555542, "learning_rate": 1.2466953809594534e-05, "loss": 0.0976, "step": 39350 }, { "epoch": 2.922916976087925, "grad_norm": 2.0633914470672607, "learning_rate": 1.246249814347245e-05, "loss": 0.0545, "step": 39360 }, { "epoch": 2.923659587108273, "grad_norm": 1.7512578964233398, "learning_rate": 1.2458042477350364e-05, "loss": 0.0749, "step": 39370 }, { "epoch": 2.9244021981286203, "grad_norm": 0.6414874792098999, "learning_rate": 1.2453586811228279e-05, "loss": 0.0569, "step": 39380 }, { "epoch": 2.925144809148968, "grad_norm": 1.033894419670105, "learning_rate": 1.2449131145106192e-05, "loss": 0.0706, "step": 39390 }, { "epoch": 2.9258874201693152, "grad_norm": 1.335205316543579, "learning_rate": 1.2444675478984109e-05, "loss": 0.085, "step": 39400 }, { "epoch": 2.9266300311896627, "grad_norm": 1.1902940273284912, "learning_rate": 1.2440219812862024e-05, "loss": 0.0359, "step": 39410 }, { "epoch": 2.9273726422100106, "grad_norm": 2.9166228771209717, "learning_rate": 1.2435764146739937e-05, "loss": 0.0584, "step": 39420 }, { "epoch": 2.928115253230358, "grad_norm": 0.8290153741836548, "learning_rate": 1.2431308480617854e-05, "loss": 0.0669, "step": 39430 }, { "epoch": 2.9288578642507055, "grad_norm": 0.8835294842720032, "learning_rate": 1.2426852814495767e-05, "loss": 0.0545, "step": 39440 }, { "epoch": 2.929600475271053, "grad_norm": 0.4865367114543915, "learning_rate": 1.2422397148373682e-05, "loss": 0.0727, "step": 39450 }, { "epoch": 2.9303430862914004, "grad_norm": 0.7041136622428894, "learning_rate": 1.2417941482251596e-05, "loss": 0.0369, "step": 39460 }, { "epoch": 2.9310856973117483, "grad_norm": 1.066720724105835, "learning_rate": 1.2413485816129512e-05, "loss": 0.0801, "step": 39470 }, { "epoch": 2.9318283083320957, "grad_norm": 0.8632923364639282, "learning_rate": 1.2409030150007427e-05, "loss": 0.0296, "step": 39480 }, { "epoch": 2.932570919352443, "grad_norm": 0.2960319221019745, "learning_rate": 1.240457448388534e-05, "loss": 0.0488, "step": 39490 }, { "epoch": 2.9333135303727906, "grad_norm": 2.5065932273864746, "learning_rate": 1.2400118817763256e-05, "loss": 0.068, "step": 39500 }, { "epoch": 2.934056141393138, "grad_norm": 0.5187915563583374, "learning_rate": 1.239566315164117e-05, "loss": 0.0766, "step": 39510 }, { "epoch": 2.934798752413486, "grad_norm": 1.234108805656433, "learning_rate": 1.2391207485519085e-05, "loss": 0.0883, "step": 39520 }, { "epoch": 2.9355413634338334, "grad_norm": 1.0708197355270386, "learning_rate": 1.2386751819397e-05, "loss": 0.0661, "step": 39530 }, { "epoch": 2.936283974454181, "grad_norm": 0.3668792247772217, "learning_rate": 1.2382296153274915e-05, "loss": 0.0474, "step": 39540 }, { "epoch": 2.9370265854745283, "grad_norm": 0.8638660311698914, "learning_rate": 1.237784048715283e-05, "loss": 0.0541, "step": 39550 }, { "epoch": 2.937769196494876, "grad_norm": 2.86773681640625, "learning_rate": 1.2373384821030744e-05, "loss": 0.0799, "step": 39560 }, { "epoch": 2.9385118075152237, "grad_norm": 3.684232473373413, "learning_rate": 1.2368929154908659e-05, "loss": 0.0852, "step": 39570 }, { "epoch": 2.939254418535571, "grad_norm": 1.3619695901870728, "learning_rate": 1.2364473488786574e-05, "loss": 0.0468, "step": 39580 }, { "epoch": 2.9399970295559186, "grad_norm": 1.2432461977005005, "learning_rate": 1.2360017822664489e-05, "loss": 0.0836, "step": 39590 }, { "epoch": 2.940739640576266, "grad_norm": 0.7977986335754395, "learning_rate": 1.2355562156542404e-05, "loss": 0.076, "step": 39600 }, { "epoch": 2.9414822515966135, "grad_norm": 1.1581476926803589, "learning_rate": 1.2351106490420317e-05, "loss": 0.0622, "step": 39610 }, { "epoch": 2.9422248626169614, "grad_norm": 3.3563334941864014, "learning_rate": 1.2346650824298234e-05, "loss": 0.0713, "step": 39620 }, { "epoch": 2.942967473637309, "grad_norm": 1.305243968963623, "learning_rate": 1.2342195158176147e-05, "loss": 0.0886, "step": 39630 }, { "epoch": 2.9437100846576563, "grad_norm": 0.5178385376930237, "learning_rate": 1.2337739492054062e-05, "loss": 0.0861, "step": 39640 }, { "epoch": 2.9444526956780037, "grad_norm": 0.6791629195213318, "learning_rate": 1.2333283825931977e-05, "loss": 0.0655, "step": 39650 }, { "epoch": 2.945195306698351, "grad_norm": 1.124177098274231, "learning_rate": 1.2328828159809892e-05, "loss": 0.08, "step": 39660 }, { "epoch": 2.945937917718699, "grad_norm": 1.458204984664917, "learning_rate": 1.2324372493687807e-05, "loss": 0.0777, "step": 39670 }, { "epoch": 2.9466805287390465, "grad_norm": 2.084416389465332, "learning_rate": 1.231991682756572e-05, "loss": 0.0814, "step": 39680 }, { "epoch": 2.947423139759394, "grad_norm": 1.0431878566741943, "learning_rate": 1.2315461161443637e-05, "loss": 0.0686, "step": 39690 }, { "epoch": 2.9481657507797414, "grad_norm": 0.5064348578453064, "learning_rate": 1.2311005495321552e-05, "loss": 0.0676, "step": 39700 }, { "epoch": 2.948908361800089, "grad_norm": 2.466647148132324, "learning_rate": 1.2306549829199465e-05, "loss": 0.072, "step": 39710 }, { "epoch": 2.9496509728204368, "grad_norm": 0.9796644449234009, "learning_rate": 1.230209416307738e-05, "loss": 0.0493, "step": 39720 }, { "epoch": 2.950393583840784, "grad_norm": 1.4864760637283325, "learning_rate": 1.2297638496955295e-05, "loss": 0.068, "step": 39730 }, { "epoch": 2.9511361948611317, "grad_norm": 1.9162673950195312, "learning_rate": 1.229318283083321e-05, "loss": 0.0799, "step": 39740 }, { "epoch": 2.9518788058814796, "grad_norm": 1.276904582977295, "learning_rate": 1.2288727164711124e-05, "loss": 0.0578, "step": 39750 }, { "epoch": 2.9526214169018266, "grad_norm": 1.9684635400772095, "learning_rate": 1.2284271498589039e-05, "loss": 0.0608, "step": 39760 }, { "epoch": 2.9533640279221745, "grad_norm": 1.3042725324630737, "learning_rate": 1.2279815832466955e-05, "loss": 0.0509, "step": 39770 }, { "epoch": 2.954106638942522, "grad_norm": 1.1622141599655151, "learning_rate": 1.2275360166344869e-05, "loss": 0.0634, "step": 39780 }, { "epoch": 2.9548492499628694, "grad_norm": 3.6292014122009277, "learning_rate": 1.2270904500222784e-05, "loss": 0.0781, "step": 39790 }, { "epoch": 2.9555918609832172, "grad_norm": 0.8196433186531067, "learning_rate": 1.2266448834100699e-05, "loss": 0.0627, "step": 39800 }, { "epoch": 2.9563344720035647, "grad_norm": 2.011394739151001, "learning_rate": 1.2261993167978614e-05, "loss": 0.0516, "step": 39810 }, { "epoch": 2.957077083023912, "grad_norm": 0.2600031793117523, "learning_rate": 1.2257537501856529e-05, "loss": 0.0698, "step": 39820 }, { "epoch": 2.9578196940442596, "grad_norm": 1.4970747232437134, "learning_rate": 1.2253081835734442e-05, "loss": 0.0725, "step": 39830 }, { "epoch": 2.958562305064607, "grad_norm": 0.6336455941200256, "learning_rate": 1.2248626169612359e-05, "loss": 0.0471, "step": 39840 }, { "epoch": 2.959304916084955, "grad_norm": 1.8660321235656738, "learning_rate": 1.2244170503490272e-05, "loss": 0.0905, "step": 39850 }, { "epoch": 2.9600475271053024, "grad_norm": 2.4397964477539062, "learning_rate": 1.2239714837368187e-05, "loss": 0.0487, "step": 39860 }, { "epoch": 2.96079013812565, "grad_norm": 2.365588426589966, "learning_rate": 1.22352591712461e-05, "loss": 0.0774, "step": 39870 }, { "epoch": 2.9615327491459973, "grad_norm": 2.236955404281616, "learning_rate": 1.2230803505124017e-05, "loss": 0.0712, "step": 39880 }, { "epoch": 2.9622753601663447, "grad_norm": 0.8604252338409424, "learning_rate": 1.2226347839001932e-05, "loss": 0.0643, "step": 39890 }, { "epoch": 2.9630179711866926, "grad_norm": 2.3100244998931885, "learning_rate": 1.2221892172879845e-05, "loss": 0.062, "step": 39900 }, { "epoch": 2.96376058220704, "grad_norm": 1.03587007522583, "learning_rate": 1.221743650675776e-05, "loss": 0.0636, "step": 39910 }, { "epoch": 2.9645031932273875, "grad_norm": 0.37145113945007324, "learning_rate": 1.2212980840635675e-05, "loss": 0.0633, "step": 39920 }, { "epoch": 2.965245804247735, "grad_norm": 2.6632423400878906, "learning_rate": 1.220852517451359e-05, "loss": 0.0567, "step": 39930 }, { "epoch": 2.9659884152680824, "grad_norm": 1.9122623205184937, "learning_rate": 1.2204069508391505e-05, "loss": 0.0449, "step": 39940 }, { "epoch": 2.9667310262884303, "grad_norm": 0.8057365417480469, "learning_rate": 1.219961384226942e-05, "loss": 0.0468, "step": 39950 }, { "epoch": 2.967473637308778, "grad_norm": 0.6970472931861877, "learning_rate": 1.2195158176147335e-05, "loss": 0.0478, "step": 39960 }, { "epoch": 2.9682162483291252, "grad_norm": 1.0414628982543945, "learning_rate": 1.2190702510025248e-05, "loss": 0.0621, "step": 39970 }, { "epoch": 2.9689588593494727, "grad_norm": 1.28219735622406, "learning_rate": 1.2186246843903163e-05, "loss": 0.0494, "step": 39980 }, { "epoch": 2.96970147036982, "grad_norm": 2.46976900100708, "learning_rate": 1.2181791177781078e-05, "loss": 0.0879, "step": 39990 }, { "epoch": 2.970444081390168, "grad_norm": 1.4787884950637817, "learning_rate": 1.2177335511658993e-05, "loss": 0.0699, "step": 40000 }, { "epoch": 2.9711866924105155, "grad_norm": 1.21670401096344, "learning_rate": 1.2172879845536908e-05, "loss": 0.073, "step": 40010 }, { "epoch": 2.971929303430863, "grad_norm": 0.8801470398902893, "learning_rate": 1.2168424179414822e-05, "loss": 0.0288, "step": 40020 }, { "epoch": 2.9726719144512104, "grad_norm": 1.1613928079605103, "learning_rate": 1.2163968513292738e-05, "loss": 0.0737, "step": 40030 }, { "epoch": 2.973414525471558, "grad_norm": 1.54849112033844, "learning_rate": 1.2159512847170652e-05, "loss": 0.1064, "step": 40040 }, { "epoch": 2.9741571364919057, "grad_norm": 0.6201350688934326, "learning_rate": 1.2155057181048567e-05, "loss": 0.0259, "step": 40050 }, { "epoch": 2.974899747512253, "grad_norm": 1.4951953887939453, "learning_rate": 1.2150601514926483e-05, "loss": 0.0576, "step": 40060 }, { "epoch": 2.9756423585326006, "grad_norm": 2.557687520980835, "learning_rate": 1.2146145848804397e-05, "loss": 0.0804, "step": 40070 }, { "epoch": 2.976384969552948, "grad_norm": 1.9214116334915161, "learning_rate": 1.2141690182682312e-05, "loss": 0.0941, "step": 40080 }, { "epoch": 2.9771275805732955, "grad_norm": 1.7784258127212524, "learning_rate": 1.2137234516560225e-05, "loss": 0.0808, "step": 40090 }, { "epoch": 2.9778701915936434, "grad_norm": 2.0200514793395996, "learning_rate": 1.2132778850438142e-05, "loss": 0.0691, "step": 40100 }, { "epoch": 2.978612802613991, "grad_norm": 2.5777747631073, "learning_rate": 1.2128323184316057e-05, "loss": 0.079, "step": 40110 }, { "epoch": 2.9793554136343383, "grad_norm": 0.5303300619125366, "learning_rate": 1.212386751819397e-05, "loss": 0.0507, "step": 40120 }, { "epoch": 2.9800980246546858, "grad_norm": 1.3025041818618774, "learning_rate": 1.2119411852071885e-05, "loss": 0.0579, "step": 40130 }, { "epoch": 2.980840635675033, "grad_norm": 1.2491486072540283, "learning_rate": 1.21149561859498e-05, "loss": 0.1129, "step": 40140 }, { "epoch": 2.981583246695381, "grad_norm": 1.2781689167022705, "learning_rate": 1.2110500519827715e-05, "loss": 0.0695, "step": 40150 }, { "epoch": 2.9823258577157286, "grad_norm": 0.6310214400291443, "learning_rate": 1.2106044853705628e-05, "loss": 0.0625, "step": 40160 }, { "epoch": 2.983068468736076, "grad_norm": 0.8008638024330139, "learning_rate": 1.2101589187583543e-05, "loss": 0.0615, "step": 40170 }, { "epoch": 2.9838110797564235, "grad_norm": 1.496964454650879, "learning_rate": 1.209713352146146e-05, "loss": 0.0569, "step": 40180 }, { "epoch": 2.984553690776771, "grad_norm": 1.446394681930542, "learning_rate": 1.2092677855339373e-05, "loss": 0.0606, "step": 40190 }, { "epoch": 2.985296301797119, "grad_norm": 2.3738341331481934, "learning_rate": 1.2088222189217288e-05, "loss": 0.0918, "step": 40200 }, { "epoch": 2.9860389128174663, "grad_norm": 2.0323574542999268, "learning_rate": 1.2083766523095203e-05, "loss": 0.0548, "step": 40210 }, { "epoch": 2.9867815238378137, "grad_norm": 1.84878408908844, "learning_rate": 1.2079310856973118e-05, "loss": 0.0715, "step": 40220 }, { "epoch": 2.987524134858161, "grad_norm": 3.0860447883605957, "learning_rate": 1.2074855190851033e-05, "loss": 0.0663, "step": 40230 }, { "epoch": 2.9882667458785086, "grad_norm": 2.3522326946258545, "learning_rate": 1.2070399524728946e-05, "loss": 0.0642, "step": 40240 }, { "epoch": 2.9890093568988565, "grad_norm": 3.3070790767669678, "learning_rate": 1.2065943858606863e-05, "loss": 0.0727, "step": 40250 }, { "epoch": 2.989751967919204, "grad_norm": 1.6434651613235474, "learning_rate": 1.2061488192484776e-05, "loss": 0.0662, "step": 40260 }, { "epoch": 2.9904945789395514, "grad_norm": 1.9186336994171143, "learning_rate": 1.2057032526362691e-05, "loss": 0.0882, "step": 40270 }, { "epoch": 2.991237189959899, "grad_norm": 2.3194291591644287, "learning_rate": 1.2052576860240605e-05, "loss": 0.0749, "step": 40280 }, { "epoch": 2.9919798009802463, "grad_norm": 1.5101096630096436, "learning_rate": 1.2048121194118521e-05, "loss": 0.0563, "step": 40290 }, { "epoch": 2.992722412000594, "grad_norm": 0.4890212118625641, "learning_rate": 1.2043665527996436e-05, "loss": 0.0693, "step": 40300 }, { "epoch": 2.9934650230209416, "grad_norm": 2.9807426929473877, "learning_rate": 1.203920986187435e-05, "loss": 0.0661, "step": 40310 }, { "epoch": 2.994207634041289, "grad_norm": 2.466383934020996, "learning_rate": 1.2034754195752266e-05, "loss": 0.0578, "step": 40320 }, { "epoch": 2.994950245061637, "grad_norm": 2.2628114223480225, "learning_rate": 1.203029852963018e-05, "loss": 0.0521, "step": 40330 }, { "epoch": 2.995692856081984, "grad_norm": 0.5551472306251526, "learning_rate": 1.2025842863508095e-05, "loss": 0.0203, "step": 40340 }, { "epoch": 2.996435467102332, "grad_norm": 0.6742831468582153, "learning_rate": 1.202138719738601e-05, "loss": 0.0729, "step": 40350 }, { "epoch": 2.9971780781226793, "grad_norm": 2.0665626525878906, "learning_rate": 1.2016931531263925e-05, "loss": 0.0778, "step": 40360 }, { "epoch": 2.997920689143027, "grad_norm": 1.7712310552597046, "learning_rate": 1.201247586514184e-05, "loss": 0.0635, "step": 40370 }, { "epoch": 2.9986633001633747, "grad_norm": 0.27435049414634705, "learning_rate": 1.2008020199019753e-05, "loss": 0.051, "step": 40380 }, { "epoch": 2.999405911183722, "grad_norm": 2.0611684322357178, "learning_rate": 1.2003564532897668e-05, "loss": 0.0496, "step": 40390 }, { "epoch": 3.0, "eval_f1": 0.0, "eval_loss": 0.055565182119607925, "eval_runtime": 795.9474, "eval_samples_per_second": 47.766, "eval_steps_per_second": 2.986, "step": 40398 }, { "epoch": 3.0001485222040696, "grad_norm": 2.1062331199645996, "learning_rate": 1.1999108866775585e-05, "loss": 0.0856, "step": 40400 }, { "epoch": 3.000891133224417, "grad_norm": 0.7246180772781372, "learning_rate": 1.1994653200653498e-05, "loss": 0.0521, "step": 40410 }, { "epoch": 3.0016337442447645, "grad_norm": 0.7559748888015747, "learning_rate": 1.1990197534531413e-05, "loss": 0.0641, "step": 40420 }, { "epoch": 3.002376355265112, "grad_norm": 1.5347598791122437, "learning_rate": 1.1985741868409326e-05, "loss": 0.0732, "step": 40430 }, { "epoch": 3.00311896628546, "grad_norm": 0.6457234025001526, "learning_rate": 1.1981286202287243e-05, "loss": 0.058, "step": 40440 }, { "epoch": 3.0038615773058073, "grad_norm": 0.716690719127655, "learning_rate": 1.1976830536165156e-05, "loss": 0.0528, "step": 40450 }, { "epoch": 3.0046041883261547, "grad_norm": 3.154327630996704, "learning_rate": 1.1972374870043071e-05, "loss": 0.09, "step": 40460 }, { "epoch": 3.005346799346502, "grad_norm": 0.7682334780693054, "learning_rate": 1.1967919203920988e-05, "loss": 0.0764, "step": 40470 }, { "epoch": 3.0060894103668496, "grad_norm": 0.4691618084907532, "learning_rate": 1.1963463537798901e-05, "loss": 0.0437, "step": 40480 }, { "epoch": 3.0068320213871975, "grad_norm": 1.4054096937179565, "learning_rate": 1.1959007871676816e-05, "loss": 0.065, "step": 40490 }, { "epoch": 3.007574632407545, "grad_norm": 2.800178050994873, "learning_rate": 1.195455220555473e-05, "loss": 0.1019, "step": 40500 }, { "epoch": 3.0083172434278924, "grad_norm": 1.4787908792495728, "learning_rate": 1.1950096539432646e-05, "loss": 0.0959, "step": 40510 }, { "epoch": 3.00905985444824, "grad_norm": 0.8632726073265076, "learning_rate": 1.1945640873310561e-05, "loss": 0.062, "step": 40520 }, { "epoch": 3.0098024654685878, "grad_norm": 0.41822558641433716, "learning_rate": 1.1941185207188475e-05, "loss": 0.0676, "step": 40530 }, { "epoch": 3.010545076488935, "grad_norm": 1.6045604944229126, "learning_rate": 1.193672954106639e-05, "loss": 0.0548, "step": 40540 }, { "epoch": 3.0112876875092827, "grad_norm": 4.851860523223877, "learning_rate": 1.1932273874944304e-05, "loss": 0.0509, "step": 40550 }, { "epoch": 3.01203029852963, "grad_norm": 1.4133330583572388, "learning_rate": 1.192781820882222e-05, "loss": 0.0475, "step": 40560 }, { "epoch": 3.0127729095499776, "grad_norm": 1.6727176904678345, "learning_rate": 1.1923362542700133e-05, "loss": 0.0687, "step": 40570 }, { "epoch": 3.0135155205703255, "grad_norm": 1.9212983846664429, "learning_rate": 1.1918906876578048e-05, "loss": 0.0982, "step": 40580 }, { "epoch": 3.014258131590673, "grad_norm": 2.5236656665802, "learning_rate": 1.1914451210455964e-05, "loss": 0.0831, "step": 40590 }, { "epoch": 3.0150007426110204, "grad_norm": 0.8461244106292725, "learning_rate": 1.1909995544333878e-05, "loss": 0.0635, "step": 40600 }, { "epoch": 3.015743353631368, "grad_norm": 1.5794192552566528, "learning_rate": 1.1905539878211793e-05, "loss": 0.0718, "step": 40610 }, { "epoch": 3.0164859646517153, "grad_norm": 0.7722788453102112, "learning_rate": 1.1901084212089708e-05, "loss": 0.0726, "step": 40620 }, { "epoch": 3.017228575672063, "grad_norm": 1.9075877666473389, "learning_rate": 1.1896628545967623e-05, "loss": 0.0537, "step": 40630 }, { "epoch": 3.0179711866924106, "grad_norm": 1.057822585105896, "learning_rate": 1.1892172879845538e-05, "loss": 0.0839, "step": 40640 }, { "epoch": 3.018713797712758, "grad_norm": 2.4780874252319336, "learning_rate": 1.1887717213723451e-05, "loss": 0.0492, "step": 40650 }, { "epoch": 3.0194564087331055, "grad_norm": 1.614044189453125, "learning_rate": 1.1883261547601368e-05, "loss": 0.0488, "step": 40660 }, { "epoch": 3.020199019753453, "grad_norm": 0.8229920268058777, "learning_rate": 1.1878805881479281e-05, "loss": 0.0803, "step": 40670 }, { "epoch": 3.020941630773801, "grad_norm": 0.7119345664978027, "learning_rate": 1.1874350215357196e-05, "loss": 0.048, "step": 40680 }, { "epoch": 3.0216842417941483, "grad_norm": 1.8598228693008423, "learning_rate": 1.1869894549235111e-05, "loss": 0.0658, "step": 40690 }, { "epoch": 3.0224268528144957, "grad_norm": 0.4016118347644806, "learning_rate": 1.1865438883113026e-05, "loss": 0.0351, "step": 40700 }, { "epoch": 3.023169463834843, "grad_norm": 1.3350958824157715, "learning_rate": 1.1860983216990941e-05, "loss": 0.0553, "step": 40710 }, { "epoch": 3.0239120748551906, "grad_norm": 0.8724972605705261, "learning_rate": 1.1856527550868854e-05, "loss": 0.0677, "step": 40720 }, { "epoch": 3.0246546858755385, "grad_norm": 0.5378849506378174, "learning_rate": 1.1852071884746771e-05, "loss": 0.0623, "step": 40730 }, { "epoch": 3.025397296895886, "grad_norm": 0.3792591094970703, "learning_rate": 1.1847616218624684e-05, "loss": 0.0654, "step": 40740 }, { "epoch": 3.0261399079162334, "grad_norm": 0.3281194567680359, "learning_rate": 1.18431605525026e-05, "loss": 0.0862, "step": 40750 }, { "epoch": 3.026882518936581, "grad_norm": 1.0597580671310425, "learning_rate": 1.1838704886380514e-05, "loss": 0.0651, "step": 40760 }, { "epoch": 3.0276251299569283, "grad_norm": 0.7202780842781067, "learning_rate": 1.183424922025843e-05, "loss": 0.0742, "step": 40770 }, { "epoch": 3.0283677409772762, "grad_norm": 0.5692980885505676, "learning_rate": 1.1829793554136344e-05, "loss": 0.0685, "step": 40780 }, { "epoch": 3.0291103519976237, "grad_norm": 0.774348258972168, "learning_rate": 1.1825337888014258e-05, "loss": 0.0549, "step": 40790 }, { "epoch": 3.029852963017971, "grad_norm": 1.022932529449463, "learning_rate": 1.1820882221892173e-05, "loss": 0.0687, "step": 40800 }, { "epoch": 3.0305955740383186, "grad_norm": 0.4947283864021301, "learning_rate": 1.181642655577009e-05, "loss": 0.056, "step": 40810 }, { "epoch": 3.0313381850586665, "grad_norm": 1.4515385627746582, "learning_rate": 1.1811970889648003e-05, "loss": 0.0671, "step": 40820 }, { "epoch": 3.032080796079014, "grad_norm": 2.044039487838745, "learning_rate": 1.1807515223525918e-05, "loss": 0.0713, "step": 40830 }, { "epoch": 3.0328234070993614, "grad_norm": 0.6720436215400696, "learning_rate": 1.1803059557403831e-05, "loss": 0.0685, "step": 40840 }, { "epoch": 3.033566018119709, "grad_norm": 1.2701702117919922, "learning_rate": 1.1798603891281748e-05, "loss": 0.0871, "step": 40850 }, { "epoch": 3.0343086291400563, "grad_norm": 0.6899190545082092, "learning_rate": 1.179414822515966e-05, "loss": 0.0761, "step": 40860 }, { "epoch": 3.035051240160404, "grad_norm": 2.306637763977051, "learning_rate": 1.1789692559037576e-05, "loss": 0.0697, "step": 40870 }, { "epoch": 3.0357938511807516, "grad_norm": 0.6029354333877563, "learning_rate": 1.1785236892915493e-05, "loss": 0.0677, "step": 40880 }, { "epoch": 3.036536462201099, "grad_norm": 2.0085299015045166, "learning_rate": 1.1780781226793406e-05, "loss": 0.0726, "step": 40890 }, { "epoch": 3.0372790732214465, "grad_norm": 1.0071104764938354, "learning_rate": 1.177632556067132e-05, "loss": 0.0556, "step": 40900 }, { "epoch": 3.038021684241794, "grad_norm": 1.3485918045043945, "learning_rate": 1.1771869894549234e-05, "loss": 0.0403, "step": 40910 }, { "epoch": 3.038764295262142, "grad_norm": 0.6336653232574463, "learning_rate": 1.176741422842715e-05, "loss": 0.0556, "step": 40920 }, { "epoch": 3.0395069062824893, "grad_norm": 0.9142085909843445, "learning_rate": 1.1762958562305066e-05, "loss": 0.0506, "step": 40930 }, { "epoch": 3.0402495173028368, "grad_norm": 2.9091007709503174, "learning_rate": 1.1758502896182979e-05, "loss": 0.075, "step": 40940 }, { "epoch": 3.040992128323184, "grad_norm": 1.0477584600448608, "learning_rate": 1.1754047230060894e-05, "loss": 0.0697, "step": 40950 }, { "epoch": 3.0417347393435317, "grad_norm": 0.5181246995925903, "learning_rate": 1.1749591563938809e-05, "loss": 0.0808, "step": 40960 }, { "epoch": 3.0424773503638796, "grad_norm": 2.0747780799865723, "learning_rate": 1.1745135897816724e-05, "loss": 0.0793, "step": 40970 }, { "epoch": 3.043219961384227, "grad_norm": 1.4035779237747192, "learning_rate": 1.1740680231694637e-05, "loss": 0.0503, "step": 40980 }, { "epoch": 3.0439625724045745, "grad_norm": 1.336506962776184, "learning_rate": 1.1736224565572554e-05, "loss": 0.027, "step": 40990 }, { "epoch": 3.044705183424922, "grad_norm": 1.5096478462219238, "learning_rate": 1.1731768899450469e-05, "loss": 0.0526, "step": 41000 }, { "epoch": 3.0454477944452694, "grad_norm": 0.9661771655082703, "learning_rate": 1.1727313233328382e-05, "loss": 0.0593, "step": 41010 }, { "epoch": 3.0461904054656173, "grad_norm": 2.019800901412964, "learning_rate": 1.1722857567206297e-05, "loss": 0.0584, "step": 41020 }, { "epoch": 3.0469330164859647, "grad_norm": 1.9931749105453491, "learning_rate": 1.1718401901084212e-05, "loss": 0.0661, "step": 41030 }, { "epoch": 3.047675627506312, "grad_norm": 1.6220228672027588, "learning_rate": 1.1713946234962127e-05, "loss": 0.0753, "step": 41040 }, { "epoch": 3.0484182385266596, "grad_norm": 0.8533129692077637, "learning_rate": 1.1709490568840042e-05, "loss": 0.0538, "step": 41050 }, { "epoch": 3.0491608495470075, "grad_norm": 2.1048035621643066, "learning_rate": 1.1705034902717956e-05, "loss": 0.0699, "step": 41060 }, { "epoch": 3.049903460567355, "grad_norm": 1.1397531032562256, "learning_rate": 1.1700579236595872e-05, "loss": 0.0385, "step": 41070 }, { "epoch": 3.0506460715877024, "grad_norm": 1.218272089958191, "learning_rate": 1.1696123570473786e-05, "loss": 0.0511, "step": 41080 }, { "epoch": 3.05138868260805, "grad_norm": 1.6609820127487183, "learning_rate": 1.16916679043517e-05, "loss": 0.0693, "step": 41090 }, { "epoch": 3.0521312936283973, "grad_norm": 0.7362266778945923, "learning_rate": 1.1687212238229616e-05, "loss": 0.0834, "step": 41100 }, { "epoch": 3.052873904648745, "grad_norm": 1.8617407083511353, "learning_rate": 1.168275657210753e-05, "loss": 0.0672, "step": 41110 }, { "epoch": 3.0536165156690926, "grad_norm": 1.8097994327545166, "learning_rate": 1.1678300905985446e-05, "loss": 0.0693, "step": 41120 }, { "epoch": 3.05435912668944, "grad_norm": 0.9621978402137756, "learning_rate": 1.1673845239863359e-05, "loss": 0.0697, "step": 41130 }, { "epoch": 3.0551017377097875, "grad_norm": 0.6268504858016968, "learning_rate": 1.1669389573741276e-05, "loss": 0.0801, "step": 41140 }, { "epoch": 3.055844348730135, "grad_norm": 0.5146762132644653, "learning_rate": 1.1664933907619189e-05, "loss": 0.0516, "step": 41150 }, { "epoch": 3.056586959750483, "grad_norm": 1.8789063692092896, "learning_rate": 1.1660478241497104e-05, "loss": 0.0604, "step": 41160 }, { "epoch": 3.0573295707708303, "grad_norm": 1.16319739818573, "learning_rate": 1.1656022575375019e-05, "loss": 0.0819, "step": 41170 }, { "epoch": 3.058072181791178, "grad_norm": 1.3086446523666382, "learning_rate": 1.1651566909252934e-05, "loss": 0.0805, "step": 41180 }, { "epoch": 3.0588147928115252, "grad_norm": 1.2489312887191772, "learning_rate": 1.1647111243130849e-05, "loss": 0.07, "step": 41190 }, { "epoch": 3.0595574038318727, "grad_norm": 1.8217955827713013, "learning_rate": 1.1642655577008762e-05, "loss": 0.0798, "step": 41200 }, { "epoch": 3.0603000148522206, "grad_norm": 1.6634607315063477, "learning_rate": 1.1638199910886677e-05, "loss": 0.0426, "step": 41210 }, { "epoch": 3.061042625872568, "grad_norm": 2.512523889541626, "learning_rate": 1.1633744244764594e-05, "loss": 0.0639, "step": 41220 }, { "epoch": 3.0617852368929155, "grad_norm": 1.9372293949127197, "learning_rate": 1.1629288578642507e-05, "loss": 0.1061, "step": 41230 }, { "epoch": 3.062527847913263, "grad_norm": 1.2639974355697632, "learning_rate": 1.1624832912520422e-05, "loss": 0.0663, "step": 41240 }, { "epoch": 3.0632704589336104, "grad_norm": 1.19036066532135, "learning_rate": 1.1620377246398337e-05, "loss": 0.0901, "step": 41250 }, { "epoch": 3.0640130699539583, "grad_norm": 2.261476993560791, "learning_rate": 1.1615921580276252e-05, "loss": 0.1083, "step": 41260 }, { "epoch": 3.0647556809743057, "grad_norm": 1.9970070123672485, "learning_rate": 1.1611465914154165e-05, "loss": 0.0546, "step": 41270 }, { "epoch": 3.065498291994653, "grad_norm": 1.31303870677948, "learning_rate": 1.160701024803208e-05, "loss": 0.0756, "step": 41280 }, { "epoch": 3.0662409030150006, "grad_norm": 1.9843600988388062, "learning_rate": 1.1602554581909997e-05, "loss": 0.0427, "step": 41290 }, { "epoch": 3.066983514035348, "grad_norm": 2.369338035583496, "learning_rate": 1.159809891578791e-05, "loss": 0.0688, "step": 41300 }, { "epoch": 3.067726125055696, "grad_norm": 1.3789310455322266, "learning_rate": 1.1593643249665825e-05, "loss": 0.0755, "step": 41310 }, { "epoch": 3.0684687360760434, "grad_norm": 2.209085702896118, "learning_rate": 1.1589187583543739e-05, "loss": 0.08, "step": 41320 }, { "epoch": 3.069211347096391, "grad_norm": 1.1442301273345947, "learning_rate": 1.1584731917421655e-05, "loss": 0.0652, "step": 41330 }, { "epoch": 3.0699539581167383, "grad_norm": 1.0715082883834839, "learning_rate": 1.158027625129957e-05, "loss": 0.0595, "step": 41340 }, { "epoch": 3.0706965691370858, "grad_norm": 2.274426221847534, "learning_rate": 1.1575820585177484e-05, "loss": 0.0624, "step": 41350 }, { "epoch": 3.0714391801574337, "grad_norm": 2.2110979557037354, "learning_rate": 1.1571364919055399e-05, "loss": 0.0696, "step": 41360 }, { "epoch": 3.072181791177781, "grad_norm": 0.7689408659934998, "learning_rate": 1.1566909252933314e-05, "loss": 0.0498, "step": 41370 }, { "epoch": 3.0729244021981286, "grad_norm": 1.7183451652526855, "learning_rate": 1.1562453586811229e-05, "loss": 0.0641, "step": 41380 }, { "epoch": 3.073667013218476, "grad_norm": 0.8684793710708618, "learning_rate": 1.1557997920689144e-05, "loss": 0.0747, "step": 41390 }, { "epoch": 3.074409624238824, "grad_norm": 1.1358157396316528, "learning_rate": 1.1553542254567059e-05, "loss": 0.0625, "step": 41400 }, { "epoch": 3.0751522352591714, "grad_norm": 1.4486446380615234, "learning_rate": 1.1549086588444974e-05, "loss": 0.0625, "step": 41410 }, { "epoch": 3.075894846279519, "grad_norm": 0.8137945532798767, "learning_rate": 1.1544630922322887e-05, "loss": 0.0589, "step": 41420 }, { "epoch": 3.0766374572998663, "grad_norm": 2.614501476287842, "learning_rate": 1.1540175256200802e-05, "loss": 0.0617, "step": 41430 }, { "epoch": 3.0773800683202137, "grad_norm": 1.8231887817382812, "learning_rate": 1.1535719590078717e-05, "loss": 0.0626, "step": 41440 }, { "epoch": 3.0781226793405616, "grad_norm": 1.6901496648788452, "learning_rate": 1.1531263923956632e-05, "loss": 0.0533, "step": 41450 }, { "epoch": 3.078865290360909, "grad_norm": 2.780428171157837, "learning_rate": 1.1526808257834547e-05, "loss": 0.0599, "step": 41460 }, { "epoch": 3.0796079013812565, "grad_norm": 0.7014702558517456, "learning_rate": 1.152235259171246e-05, "loss": 0.0325, "step": 41470 }, { "epoch": 3.080350512401604, "grad_norm": 1.5613539218902588, "learning_rate": 1.1517896925590377e-05, "loss": 0.0723, "step": 41480 }, { "epoch": 3.0810931234219514, "grad_norm": 0.42727401852607727, "learning_rate": 1.151344125946829e-05, "loss": 0.0387, "step": 41490 }, { "epoch": 3.0818357344422993, "grad_norm": 0.7476786971092224, "learning_rate": 1.1508985593346205e-05, "loss": 0.0964, "step": 41500 }, { "epoch": 3.0825783454626468, "grad_norm": 0.780316948890686, "learning_rate": 1.1504529927224122e-05, "loss": 0.0734, "step": 41510 }, { "epoch": 3.083320956482994, "grad_norm": 1.4033887386322021, "learning_rate": 1.1500074261102035e-05, "loss": 0.068, "step": 41520 }, { "epoch": 3.0840635675033417, "grad_norm": 2.3569159507751465, "learning_rate": 1.149561859497995e-05, "loss": 0.0773, "step": 41530 }, { "epoch": 3.084806178523689, "grad_norm": 2.889099359512329, "learning_rate": 1.1491162928857864e-05, "loss": 0.0708, "step": 41540 }, { "epoch": 3.085548789544037, "grad_norm": 0.9511964321136475, "learning_rate": 1.148670726273578e-05, "loss": 0.0688, "step": 41550 }, { "epoch": 3.0862914005643844, "grad_norm": 0.7817783951759338, "learning_rate": 1.1482251596613693e-05, "loss": 0.0624, "step": 41560 }, { "epoch": 3.087034011584732, "grad_norm": 1.0421441793441772, "learning_rate": 1.1477795930491608e-05, "loss": 0.0506, "step": 41570 }, { "epoch": 3.0877766226050793, "grad_norm": 1.8338831663131714, "learning_rate": 1.1473340264369523e-05, "loss": 0.0341, "step": 41580 }, { "epoch": 3.088519233625427, "grad_norm": 0.8225168585777283, "learning_rate": 1.1468884598247438e-05, "loss": 0.0541, "step": 41590 }, { "epoch": 3.0892618446457747, "grad_norm": 0.7224980592727661, "learning_rate": 1.1464428932125353e-05, "loss": 0.0588, "step": 41600 }, { "epoch": 3.090004455666122, "grad_norm": 1.8559728860855103, "learning_rate": 1.1459973266003267e-05, "loss": 0.0867, "step": 41610 }, { "epoch": 3.0907470666864696, "grad_norm": 2.965693712234497, "learning_rate": 1.1455517599881182e-05, "loss": 0.0735, "step": 41620 }, { "epoch": 3.091489677706817, "grad_norm": 1.150742530822754, "learning_rate": 1.1451061933759098e-05, "loss": 0.0847, "step": 41630 }, { "epoch": 3.092232288727165, "grad_norm": 2.492800712585449, "learning_rate": 1.1446606267637012e-05, "loss": 0.0813, "step": 41640 }, { "epoch": 3.0929748997475124, "grad_norm": 1.1938602924346924, "learning_rate": 1.1442150601514927e-05, "loss": 0.0843, "step": 41650 }, { "epoch": 3.09371751076786, "grad_norm": 1.2840536832809448, "learning_rate": 1.1437694935392842e-05, "loss": 0.0537, "step": 41660 }, { "epoch": 3.0944601217882073, "grad_norm": 0.45311644673347473, "learning_rate": 1.1433239269270757e-05, "loss": 0.0419, "step": 41670 }, { "epoch": 3.0952027328085547, "grad_norm": 1.7243297100067139, "learning_rate": 1.142878360314867e-05, "loss": 0.0568, "step": 41680 }, { "epoch": 3.0959453438289026, "grad_norm": 1.793365240097046, "learning_rate": 1.1424327937026585e-05, "loss": 0.0749, "step": 41690 }, { "epoch": 3.09668795484925, "grad_norm": 3.0934557914733887, "learning_rate": 1.1419872270904502e-05, "loss": 0.0775, "step": 41700 }, { "epoch": 3.0974305658695975, "grad_norm": 0.9718724489212036, "learning_rate": 1.1415416604782415e-05, "loss": 0.0614, "step": 41710 }, { "epoch": 3.098173176889945, "grad_norm": 1.1722973585128784, "learning_rate": 1.141096093866033e-05, "loss": 0.0926, "step": 41720 }, { "epoch": 3.0989157879102924, "grad_norm": 2.2873425483703613, "learning_rate": 1.1406505272538243e-05, "loss": 0.0894, "step": 41730 }, { "epoch": 3.0996583989306403, "grad_norm": 0.9070118069648743, "learning_rate": 1.140204960641616e-05, "loss": 0.0558, "step": 41740 }, { "epoch": 3.1004010099509878, "grad_norm": 2.142220973968506, "learning_rate": 1.1397593940294075e-05, "loss": 0.0754, "step": 41750 }, { "epoch": 3.1011436209713352, "grad_norm": 0.7095875144004822, "learning_rate": 1.1393138274171988e-05, "loss": 0.0474, "step": 41760 }, { "epoch": 3.1018862319916827, "grad_norm": 0.9688817262649536, "learning_rate": 1.1388682608049905e-05, "loss": 0.0508, "step": 41770 }, { "epoch": 3.10262884301203, "grad_norm": 1.3128563165664673, "learning_rate": 1.1384226941927818e-05, "loss": 0.0785, "step": 41780 }, { "epoch": 3.103371454032378, "grad_norm": 1.9782556295394897, "learning_rate": 1.1379771275805733e-05, "loss": 0.0642, "step": 41790 }, { "epoch": 3.1041140650527255, "grad_norm": 2.7178022861480713, "learning_rate": 1.1375315609683648e-05, "loss": 0.0747, "step": 41800 }, { "epoch": 3.104856676073073, "grad_norm": 0.8021518588066101, "learning_rate": 1.1370859943561563e-05, "loss": 0.0784, "step": 41810 }, { "epoch": 3.1055992870934204, "grad_norm": 1.1395660638809204, "learning_rate": 1.1366404277439478e-05, "loss": 0.0693, "step": 41820 }, { "epoch": 3.106341898113768, "grad_norm": 0.3719038963317871, "learning_rate": 1.1361948611317392e-05, "loss": 0.0576, "step": 41830 }, { "epoch": 3.1070845091341157, "grad_norm": 3.742137908935547, "learning_rate": 1.1357492945195307e-05, "loss": 0.0891, "step": 41840 }, { "epoch": 3.107827120154463, "grad_norm": 1.4700413942337036, "learning_rate": 1.1353037279073222e-05, "loss": 0.0648, "step": 41850 }, { "epoch": 3.1085697311748106, "grad_norm": 1.799091100692749, "learning_rate": 1.1348581612951137e-05, "loss": 0.0811, "step": 41860 }, { "epoch": 3.109312342195158, "grad_norm": 0.4236965477466583, "learning_rate": 1.1344125946829052e-05, "loss": 0.0569, "step": 41870 }, { "epoch": 3.1100549532155055, "grad_norm": 0.9053602814674377, "learning_rate": 1.1339670280706965e-05, "loss": 0.0476, "step": 41880 }, { "epoch": 3.1107975642358534, "grad_norm": 1.6792991161346436, "learning_rate": 1.1335214614584882e-05, "loss": 0.0675, "step": 41890 }, { "epoch": 3.111540175256201, "grad_norm": 0.48913243412971497, "learning_rate": 1.1330758948462795e-05, "loss": 0.0632, "step": 41900 }, { "epoch": 3.1122827862765483, "grad_norm": 2.1380555629730225, "learning_rate": 1.132630328234071e-05, "loss": 0.0699, "step": 41910 }, { "epoch": 3.1130253972968958, "grad_norm": 3.3821558952331543, "learning_rate": 1.1321847616218627e-05, "loss": 0.0529, "step": 41920 }, { "epoch": 3.113768008317243, "grad_norm": 1.7938247919082642, "learning_rate": 1.131739195009654e-05, "loss": 0.0707, "step": 41930 }, { "epoch": 3.114510619337591, "grad_norm": 1.2361664772033691, "learning_rate": 1.1312936283974455e-05, "loss": 0.0633, "step": 41940 }, { "epoch": 3.1152532303579386, "grad_norm": 1.2739269733428955, "learning_rate": 1.1308480617852368e-05, "loss": 0.0466, "step": 41950 }, { "epoch": 3.115995841378286, "grad_norm": 2.4915049076080322, "learning_rate": 1.1304024951730285e-05, "loss": 0.0761, "step": 41960 }, { "epoch": 3.1167384523986335, "grad_norm": 1.7226744890213013, "learning_rate": 1.1299569285608198e-05, "loss": 0.0604, "step": 41970 }, { "epoch": 3.1174810634189813, "grad_norm": 1.8600285053253174, "learning_rate": 1.1295113619486113e-05, "loss": 0.0596, "step": 41980 }, { "epoch": 3.118223674439329, "grad_norm": 2.710045099258423, "learning_rate": 1.1290657953364028e-05, "loss": 0.0905, "step": 41990 }, { "epoch": 3.1189662854596762, "grad_norm": 0.9297268390655518, "learning_rate": 1.1286202287241943e-05, "loss": 0.0513, "step": 42000 }, { "epoch": 3.1197088964800237, "grad_norm": 2.148306131362915, "learning_rate": 1.1281746621119858e-05, "loss": 0.0729, "step": 42010 }, { "epoch": 3.120451507500371, "grad_norm": 1.510986328125, "learning_rate": 1.1277290954997771e-05, "loss": 0.082, "step": 42020 }, { "epoch": 3.121194118520719, "grad_norm": 2.6012394428253174, "learning_rate": 1.1272835288875686e-05, "loss": 0.0769, "step": 42030 }, { "epoch": 3.1219367295410665, "grad_norm": 2.248591184616089, "learning_rate": 1.1268379622753603e-05, "loss": 0.0889, "step": 42040 }, { "epoch": 3.122679340561414, "grad_norm": 1.4806914329528809, "learning_rate": 1.1263923956631516e-05, "loss": 0.0694, "step": 42050 }, { "epoch": 3.1234219515817614, "grad_norm": 2.7519147396087646, "learning_rate": 1.1259468290509431e-05, "loss": 0.0503, "step": 42060 }, { "epoch": 3.124164562602109, "grad_norm": 2.996624708175659, "learning_rate": 1.1255012624387346e-05, "loss": 0.0769, "step": 42070 }, { "epoch": 3.1249071736224567, "grad_norm": 0.765396237373352, "learning_rate": 1.1250556958265261e-05, "loss": 0.0529, "step": 42080 }, { "epoch": 3.125649784642804, "grad_norm": 0.4686828553676605, "learning_rate": 1.1246101292143176e-05, "loss": 0.0445, "step": 42090 }, { "epoch": 3.1263923956631516, "grad_norm": 0.8828471899032593, "learning_rate": 1.124164562602109e-05, "loss": 0.0721, "step": 42100 }, { "epoch": 3.127135006683499, "grad_norm": 3.2565178871154785, "learning_rate": 1.1237189959899006e-05, "loss": 0.0749, "step": 42110 }, { "epoch": 3.1278776177038465, "grad_norm": 1.6550853252410889, "learning_rate": 1.123273429377692e-05, "loss": 0.0714, "step": 42120 }, { "epoch": 3.1286202287241944, "grad_norm": 0.7988404631614685, "learning_rate": 1.1228278627654835e-05, "loss": 0.0889, "step": 42130 }, { "epoch": 3.129362839744542, "grad_norm": 0.8032083511352539, "learning_rate": 1.1223822961532748e-05, "loss": 0.0761, "step": 42140 }, { "epoch": 3.1301054507648893, "grad_norm": 1.479958176612854, "learning_rate": 1.1219367295410665e-05, "loss": 0.0505, "step": 42150 }, { "epoch": 3.130848061785237, "grad_norm": 1.5908888578414917, "learning_rate": 1.121491162928858e-05, "loss": 0.0723, "step": 42160 }, { "epoch": 3.1315906728055842, "grad_norm": 0.9829261302947998, "learning_rate": 1.1210455963166493e-05, "loss": 0.0565, "step": 42170 }, { "epoch": 3.132333283825932, "grad_norm": 0.9932742118835449, "learning_rate": 1.120600029704441e-05, "loss": 0.0773, "step": 42180 }, { "epoch": 3.1330758948462796, "grad_norm": 1.5291361808776855, "learning_rate": 1.1201544630922323e-05, "loss": 0.0945, "step": 42190 }, { "epoch": 3.133818505866627, "grad_norm": 0.7759218215942383, "learning_rate": 1.1197088964800238e-05, "loss": 0.0567, "step": 42200 }, { "epoch": 3.1345611168869745, "grad_norm": 1.5172669887542725, "learning_rate": 1.1192633298678153e-05, "loss": 0.041, "step": 42210 }, { "epoch": 3.1353037279073224, "grad_norm": 2.5315630435943604, "learning_rate": 1.1188177632556068e-05, "loss": 0.0596, "step": 42220 }, { "epoch": 3.13604633892767, "grad_norm": 1.723301649093628, "learning_rate": 1.1183721966433983e-05, "loss": 0.0474, "step": 42230 }, { "epoch": 3.1367889499480173, "grad_norm": 1.322437047958374, "learning_rate": 1.1179266300311896e-05, "loss": 0.0513, "step": 42240 }, { "epoch": 3.1375315609683647, "grad_norm": 1.4333685636520386, "learning_rate": 1.1174810634189811e-05, "loss": 0.0602, "step": 42250 }, { "epoch": 3.138274171988712, "grad_norm": 1.3001521825790405, "learning_rate": 1.1170354968067726e-05, "loss": 0.0504, "step": 42260 }, { "epoch": 3.13901678300906, "grad_norm": 0.24273203313350677, "learning_rate": 1.1165899301945641e-05, "loss": 0.033, "step": 42270 }, { "epoch": 3.1397593940294075, "grad_norm": 2.059208393096924, "learning_rate": 1.1161443635823556e-05, "loss": 0.0636, "step": 42280 }, { "epoch": 3.140502005049755, "grad_norm": 1.3567062616348267, "learning_rate": 1.115698796970147e-05, "loss": 0.0693, "step": 42290 }, { "epoch": 3.1412446160701024, "grad_norm": 0.29561829566955566, "learning_rate": 1.1152532303579386e-05, "loss": 0.0617, "step": 42300 }, { "epoch": 3.14198722709045, "grad_norm": 1.4350517988204956, "learning_rate": 1.11480766374573e-05, "loss": 0.0671, "step": 42310 }, { "epoch": 3.1427298381107978, "grad_norm": 1.5640891790390015, "learning_rate": 1.1143620971335214e-05, "loss": 0.069, "step": 42320 }, { "epoch": 3.143472449131145, "grad_norm": 1.763852596282959, "learning_rate": 1.1139165305213131e-05, "loss": 0.0703, "step": 42330 }, { "epoch": 3.1442150601514927, "grad_norm": 2.7091476917266846, "learning_rate": 1.1134709639091044e-05, "loss": 0.0564, "step": 42340 }, { "epoch": 3.14495767117184, "grad_norm": 1.8148163557052612, "learning_rate": 1.113025397296896e-05, "loss": 0.066, "step": 42350 }, { "epoch": 3.1457002821921876, "grad_norm": 1.7284859418869019, "learning_rate": 1.1125798306846873e-05, "loss": 0.0552, "step": 42360 }, { "epoch": 3.1464428932125355, "grad_norm": 0.8898233771324158, "learning_rate": 1.112134264072479e-05, "loss": 0.0448, "step": 42370 }, { "epoch": 3.147185504232883, "grad_norm": 0.5898759365081787, "learning_rate": 1.1116886974602703e-05, "loss": 0.0547, "step": 42380 }, { "epoch": 3.1479281152532304, "grad_norm": 0.21336278319358826, "learning_rate": 1.1112431308480618e-05, "loss": 0.0413, "step": 42390 }, { "epoch": 3.148670726273578, "grad_norm": 1.7114509344100952, "learning_rate": 1.1107975642358533e-05, "loss": 0.099, "step": 42400 }, { "epoch": 3.1494133372939253, "grad_norm": 0.577693521976471, "learning_rate": 1.1103519976236448e-05, "loss": 0.0631, "step": 42410 }, { "epoch": 3.150155948314273, "grad_norm": 0.7681446075439453, "learning_rate": 1.1099064310114363e-05, "loss": 0.0605, "step": 42420 }, { "epoch": 3.1508985593346206, "grad_norm": 0.7933735251426697, "learning_rate": 1.1094608643992276e-05, "loss": 0.0827, "step": 42430 }, { "epoch": 3.151641170354968, "grad_norm": 0.5828210115432739, "learning_rate": 1.1090152977870193e-05, "loss": 0.0485, "step": 42440 }, { "epoch": 3.1523837813753155, "grad_norm": 0.2343250811100006, "learning_rate": 1.1085697311748108e-05, "loss": 0.0362, "step": 42450 }, { "epoch": 3.153126392395663, "grad_norm": 2.2434585094451904, "learning_rate": 1.1081241645626021e-05, "loss": 0.0569, "step": 42460 }, { "epoch": 3.153869003416011, "grad_norm": 1.9830187559127808, "learning_rate": 1.1076785979503936e-05, "loss": 0.0823, "step": 42470 }, { "epoch": 3.1546116144363583, "grad_norm": 1.5530307292938232, "learning_rate": 1.1072330313381851e-05, "loss": 0.0859, "step": 42480 }, { "epoch": 3.1553542254567057, "grad_norm": 1.3739688396453857, "learning_rate": 1.1067874647259766e-05, "loss": 0.0796, "step": 42490 }, { "epoch": 3.156096836477053, "grad_norm": 0.9068493247032166, "learning_rate": 1.1063418981137681e-05, "loss": 0.0461, "step": 42500 }, { "epoch": 3.1568394474974006, "grad_norm": 1.6886134147644043, "learning_rate": 1.1058963315015594e-05, "loss": 0.093, "step": 42510 }, { "epoch": 3.1575820585177485, "grad_norm": 2.079350233078003, "learning_rate": 1.1054507648893511e-05, "loss": 0.0642, "step": 42520 }, { "epoch": 3.158324669538096, "grad_norm": 1.6049011945724487, "learning_rate": 1.1050051982771424e-05, "loss": 0.0613, "step": 42530 }, { "epoch": 3.1590672805584434, "grad_norm": 0.36164718866348267, "learning_rate": 1.104559631664934e-05, "loss": 0.0669, "step": 42540 }, { "epoch": 3.159809891578791, "grad_norm": 1.008207082748413, "learning_rate": 1.1041140650527253e-05, "loss": 0.0988, "step": 42550 }, { "epoch": 3.1605525025991383, "grad_norm": 2.440133571624756, "learning_rate": 1.103668498440517e-05, "loss": 0.0465, "step": 42560 }, { "epoch": 3.1612951136194862, "grad_norm": 0.5878588557243347, "learning_rate": 1.1032229318283084e-05, "loss": 0.04, "step": 42570 }, { "epoch": 3.1620377246398337, "grad_norm": 1.898363709449768, "learning_rate": 1.1027773652160997e-05, "loss": 0.0494, "step": 42580 }, { "epoch": 3.162780335660181, "grad_norm": 1.029447078704834, "learning_rate": 1.1023317986038914e-05, "loss": 0.0658, "step": 42590 }, { "epoch": 3.1635229466805286, "grad_norm": 1.3765453100204468, "learning_rate": 1.1018862319916827e-05, "loss": 0.0479, "step": 42600 }, { "epoch": 3.1642655577008765, "grad_norm": 0.8187096118927002, "learning_rate": 1.1014406653794742e-05, "loss": 0.0677, "step": 42610 }, { "epoch": 3.165008168721224, "grad_norm": 3.0219662189483643, "learning_rate": 1.1009950987672657e-05, "loss": 0.0544, "step": 42620 }, { "epoch": 3.1657507797415714, "grad_norm": 0.4135105609893799, "learning_rate": 1.1005495321550572e-05, "loss": 0.0379, "step": 42630 }, { "epoch": 3.166493390761919, "grad_norm": 1.9586702585220337, "learning_rate": 1.1001039655428487e-05, "loss": 0.0644, "step": 42640 }, { "epoch": 3.1672360017822663, "grad_norm": 0.3655850887298584, "learning_rate": 1.09965839893064e-05, "loss": 0.0825, "step": 42650 }, { "epoch": 3.167978612802614, "grad_norm": 0.6566202640533447, "learning_rate": 1.0992128323184316e-05, "loss": 0.0597, "step": 42660 }, { "epoch": 3.1687212238229616, "grad_norm": 1.0529264211654663, "learning_rate": 1.098767265706223e-05, "loss": 0.0569, "step": 42670 }, { "epoch": 3.169463834843309, "grad_norm": 1.0065066814422607, "learning_rate": 1.0983216990940146e-05, "loss": 0.0674, "step": 42680 }, { "epoch": 3.1702064458636565, "grad_norm": 3.295680046081543, "learning_rate": 1.097876132481806e-05, "loss": 0.0754, "step": 42690 }, { "epoch": 3.170949056884004, "grad_norm": 1.2959693670272827, "learning_rate": 1.0974305658695976e-05, "loss": 0.0668, "step": 42700 }, { "epoch": 3.171691667904352, "grad_norm": 1.669705867767334, "learning_rate": 1.096984999257389e-05, "loss": 0.0788, "step": 42710 }, { "epoch": 3.1724342789246993, "grad_norm": 1.6456496715545654, "learning_rate": 1.0965394326451804e-05, "loss": 0.0564, "step": 42720 }, { "epoch": 3.1731768899450468, "grad_norm": 1.2554987668991089, "learning_rate": 1.0960938660329719e-05, "loss": 0.0777, "step": 42730 }, { "epoch": 3.173919500965394, "grad_norm": 2.1584696769714355, "learning_rate": 1.0956482994207636e-05, "loss": 0.0614, "step": 42740 }, { "epoch": 3.1746621119857417, "grad_norm": 1.2832754850387573, "learning_rate": 1.0952027328085549e-05, "loss": 0.0692, "step": 42750 }, { "epoch": 3.1754047230060896, "grad_norm": 0.7738613486289978, "learning_rate": 1.0947571661963464e-05, "loss": 0.0628, "step": 42760 }, { "epoch": 3.176147334026437, "grad_norm": 1.1968348026275635, "learning_rate": 1.0943115995841377e-05, "loss": 0.0578, "step": 42770 }, { "epoch": 3.1768899450467845, "grad_norm": 0.8434922099113464, "learning_rate": 1.0938660329719294e-05, "loss": 0.0698, "step": 42780 }, { "epoch": 3.177632556067132, "grad_norm": 0.869853675365448, "learning_rate": 1.0934204663597209e-05, "loss": 0.0595, "step": 42790 }, { "epoch": 3.17837516708748, "grad_norm": 1.2317126989364624, "learning_rate": 1.0929748997475122e-05, "loss": 0.0731, "step": 42800 }, { "epoch": 3.1791177781078273, "grad_norm": 2.5130200386047363, "learning_rate": 1.0925293331353037e-05, "loss": 0.0744, "step": 42810 }, { "epoch": 3.1798603891281747, "grad_norm": 1.538057565689087, "learning_rate": 1.0920837665230952e-05, "loss": 0.0544, "step": 42820 }, { "epoch": 3.180603000148522, "grad_norm": 2.4751596450805664, "learning_rate": 1.0916381999108867e-05, "loss": 0.0568, "step": 42830 }, { "epoch": 3.1813456111688696, "grad_norm": 0.298840194940567, "learning_rate": 1.091192633298678e-05, "loss": 0.0391, "step": 42840 }, { "epoch": 3.1820882221892175, "grad_norm": 1.3424323797225952, "learning_rate": 1.0907470666864697e-05, "loss": 0.0854, "step": 42850 }, { "epoch": 3.182830833209565, "grad_norm": 1.4192628860473633, "learning_rate": 1.0903015000742612e-05, "loss": 0.0538, "step": 42860 }, { "epoch": 3.1835734442299124, "grad_norm": 1.6128898859024048, "learning_rate": 1.0898559334620526e-05, "loss": 0.0526, "step": 42870 }, { "epoch": 3.18431605525026, "grad_norm": 1.4316538572311401, "learning_rate": 1.089410366849844e-05, "loss": 0.0555, "step": 42880 }, { "epoch": 3.1850586662706073, "grad_norm": 0.824476420879364, "learning_rate": 1.0889648002376356e-05, "loss": 0.0546, "step": 42890 }, { "epoch": 3.185801277290955, "grad_norm": 1.2334526777267456, "learning_rate": 1.088519233625427e-05, "loss": 0.0352, "step": 42900 }, { "epoch": 3.1865438883113026, "grad_norm": 1.7368484735488892, "learning_rate": 1.0880736670132186e-05, "loss": 0.0406, "step": 42910 }, { "epoch": 3.18728649933165, "grad_norm": 1.9491029977798462, "learning_rate": 1.0876281004010099e-05, "loss": 0.0542, "step": 42920 }, { "epoch": 3.1880291103519975, "grad_norm": 2.460498809814453, "learning_rate": 1.0871825337888016e-05, "loss": 0.0532, "step": 42930 }, { "epoch": 3.188771721372345, "grad_norm": 1.1754149198532104, "learning_rate": 1.0867369671765929e-05, "loss": 0.1044, "step": 42940 }, { "epoch": 3.189514332392693, "grad_norm": 1.4648820161819458, "learning_rate": 1.0862914005643844e-05, "loss": 0.0671, "step": 42950 }, { "epoch": 3.1902569434130403, "grad_norm": 1.5888352394104004, "learning_rate": 1.0858458339521759e-05, "loss": 0.054, "step": 42960 }, { "epoch": 3.190999554433388, "grad_norm": 0.9326488971710205, "learning_rate": 1.0854002673399674e-05, "loss": 0.067, "step": 42970 }, { "epoch": 3.1917421654537352, "grad_norm": 0.8771650195121765, "learning_rate": 1.0849547007277589e-05, "loss": 0.0449, "step": 42980 }, { "epoch": 3.1924847764740827, "grad_norm": 1.608896017074585, "learning_rate": 1.0845091341155502e-05, "loss": 0.0408, "step": 42990 }, { "epoch": 3.1932273874944306, "grad_norm": 1.1456100940704346, "learning_rate": 1.0840635675033419e-05, "loss": 0.0858, "step": 43000 }, { "epoch": 3.193969998514778, "grad_norm": 0.9099137187004089, "learning_rate": 1.0836180008911332e-05, "loss": 0.075, "step": 43010 }, { "epoch": 3.1947126095351255, "grad_norm": 3.812081813812256, "learning_rate": 1.0831724342789247e-05, "loss": 0.0499, "step": 43020 }, { "epoch": 3.195455220555473, "grad_norm": 3.0689592361450195, "learning_rate": 1.0827268676667162e-05, "loss": 0.0763, "step": 43030 }, { "epoch": 3.1961978315758204, "grad_norm": 1.6192634105682373, "learning_rate": 1.0822813010545077e-05, "loss": 0.0643, "step": 43040 }, { "epoch": 3.1969404425961683, "grad_norm": 2.0305936336517334, "learning_rate": 1.0818357344422992e-05, "loss": 0.0602, "step": 43050 }, { "epoch": 3.1976830536165157, "grad_norm": 3.326087713241577, "learning_rate": 1.0813901678300905e-05, "loss": 0.0737, "step": 43060 }, { "epoch": 3.198425664636863, "grad_norm": 1.6276055574417114, "learning_rate": 1.080944601217882e-05, "loss": 0.0827, "step": 43070 }, { "epoch": 3.1991682756572106, "grad_norm": 0.9700911641120911, "learning_rate": 1.0804990346056735e-05, "loss": 0.0455, "step": 43080 }, { "epoch": 3.199910886677558, "grad_norm": 0.9960930347442627, "learning_rate": 1.080053467993465e-05, "loss": 0.061, "step": 43090 }, { "epoch": 3.200653497697906, "grad_norm": 1.0367248058319092, "learning_rate": 1.0796079013812565e-05, "loss": 0.0802, "step": 43100 }, { "epoch": 3.2013961087182534, "grad_norm": 2.8516721725463867, "learning_rate": 1.079162334769048e-05, "loss": 0.0529, "step": 43110 }, { "epoch": 3.202138719738601, "grad_norm": 0.391720712184906, "learning_rate": 1.0787167681568395e-05, "loss": 0.0704, "step": 43120 }, { "epoch": 3.2028813307589483, "grad_norm": 2.677454710006714, "learning_rate": 1.0782712015446309e-05, "loss": 0.0784, "step": 43130 }, { "epoch": 3.2036239417792958, "grad_norm": 2.6573688983917236, "learning_rate": 1.0778256349324224e-05, "loss": 0.0293, "step": 43140 }, { "epoch": 3.2043665527996437, "grad_norm": 2.1728572845458984, "learning_rate": 1.077380068320214e-05, "loss": 0.0788, "step": 43150 }, { "epoch": 3.205109163819991, "grad_norm": 1.871918797492981, "learning_rate": 1.0769345017080054e-05, "loss": 0.0722, "step": 43160 }, { "epoch": 3.2058517748403386, "grad_norm": 0.8562690615653992, "learning_rate": 1.0764889350957969e-05, "loss": 0.0808, "step": 43170 }, { "epoch": 3.206594385860686, "grad_norm": 0.7276735305786133, "learning_rate": 1.0760433684835882e-05, "loss": 0.0452, "step": 43180 }, { "epoch": 3.207336996881034, "grad_norm": 1.5723731517791748, "learning_rate": 1.0755978018713799e-05, "loss": 0.0914, "step": 43190 }, { "epoch": 3.2080796079013814, "grad_norm": 2.1610429286956787, "learning_rate": 1.0751522352591714e-05, "loss": 0.069, "step": 43200 }, { "epoch": 3.208822218921729, "grad_norm": 2.077439069747925, "learning_rate": 1.0747066686469627e-05, "loss": 0.0626, "step": 43210 }, { "epoch": 3.2095648299420763, "grad_norm": 1.1080368757247925, "learning_rate": 1.0742611020347544e-05, "loss": 0.0603, "step": 43220 }, { "epoch": 3.2103074409624237, "grad_norm": 1.089012622833252, "learning_rate": 1.0738155354225457e-05, "loss": 0.046, "step": 43230 }, { "epoch": 3.2110500519827716, "grad_norm": 1.0629624128341675, "learning_rate": 1.0733699688103372e-05, "loss": 0.065, "step": 43240 }, { "epoch": 3.211792663003119, "grad_norm": 2.318930149078369, "learning_rate": 1.0729244021981285e-05, "loss": 0.0856, "step": 43250 }, { "epoch": 3.2125352740234665, "grad_norm": 0.6202425360679626, "learning_rate": 1.0724788355859202e-05, "loss": 0.0659, "step": 43260 }, { "epoch": 3.213277885043814, "grad_norm": 1.3956732749938965, "learning_rate": 1.0720332689737117e-05, "loss": 0.0679, "step": 43270 }, { "epoch": 3.2140204960641614, "grad_norm": 1.315119743347168, "learning_rate": 1.071587702361503e-05, "loss": 0.0776, "step": 43280 }, { "epoch": 3.2147631070845093, "grad_norm": 1.4344345331192017, "learning_rate": 1.0711421357492945e-05, "loss": 0.0826, "step": 43290 }, { "epoch": 3.2155057181048567, "grad_norm": 1.0249013900756836, "learning_rate": 1.070696569137086e-05, "loss": 0.052, "step": 43300 }, { "epoch": 3.216248329125204, "grad_norm": 0.4725293815135956, "learning_rate": 1.0702510025248775e-05, "loss": 0.0762, "step": 43310 }, { "epoch": 3.2169909401455516, "grad_norm": 2.4270472526550293, "learning_rate": 1.069805435912669e-05, "loss": 0.0802, "step": 43320 }, { "epoch": 3.217733551165899, "grad_norm": 1.2253289222717285, "learning_rate": 1.0693598693004603e-05, "loss": 0.0569, "step": 43330 }, { "epoch": 3.218476162186247, "grad_norm": 0.7013288140296936, "learning_rate": 1.068914302688252e-05, "loss": 0.0446, "step": 43340 }, { "epoch": 3.2192187732065944, "grad_norm": 1.2379825115203857, "learning_rate": 1.0684687360760433e-05, "loss": 0.0667, "step": 43350 }, { "epoch": 3.219961384226942, "grad_norm": 1.6265310049057007, "learning_rate": 1.0680231694638348e-05, "loss": 0.0971, "step": 43360 }, { "epoch": 3.2207039952472893, "grad_norm": 1.87282133102417, "learning_rate": 1.0675776028516263e-05, "loss": 0.0718, "step": 43370 }, { "epoch": 3.2214466062676372, "grad_norm": 1.285352349281311, "learning_rate": 1.0671320362394178e-05, "loss": 0.0982, "step": 43380 }, { "epoch": 3.2221892172879847, "grad_norm": 1.9336485862731934, "learning_rate": 1.0666864696272093e-05, "loss": 0.0469, "step": 43390 }, { "epoch": 3.222931828308332, "grad_norm": 2.0391123294830322, "learning_rate": 1.0662409030150007e-05, "loss": 0.0731, "step": 43400 }, { "epoch": 3.2236744393286796, "grad_norm": 0.8943817019462585, "learning_rate": 1.0657953364027923e-05, "loss": 0.0598, "step": 43410 }, { "epoch": 3.224417050349027, "grad_norm": 0.8489885330200195, "learning_rate": 1.0653497697905837e-05, "loss": 0.0584, "step": 43420 }, { "epoch": 3.225159661369375, "grad_norm": 0.9959657192230225, "learning_rate": 1.0649042031783752e-05, "loss": 0.064, "step": 43430 }, { "epoch": 3.2259022723897224, "grad_norm": 1.330881953239441, "learning_rate": 1.0644586365661667e-05, "loss": 0.069, "step": 43440 }, { "epoch": 3.22664488341007, "grad_norm": 2.326178550720215, "learning_rate": 1.0640130699539582e-05, "loss": 0.0519, "step": 43450 }, { "epoch": 3.2273874944304173, "grad_norm": 0.8001874089241028, "learning_rate": 1.0635675033417497e-05, "loss": 0.059, "step": 43460 }, { "epoch": 3.2281301054507647, "grad_norm": 1.8835375308990479, "learning_rate": 1.063121936729541e-05, "loss": 0.1137, "step": 43470 }, { "epoch": 3.2288727164711126, "grad_norm": 1.0874513387680054, "learning_rate": 1.0626763701173325e-05, "loss": 0.0764, "step": 43480 }, { "epoch": 3.22961532749146, "grad_norm": 0.6599858403205872, "learning_rate": 1.0622308035051242e-05, "loss": 0.0516, "step": 43490 }, { "epoch": 3.2303579385118075, "grad_norm": 0.5097527503967285, "learning_rate": 1.0617852368929155e-05, "loss": 0.0429, "step": 43500 }, { "epoch": 3.231100549532155, "grad_norm": 0.7686970829963684, "learning_rate": 1.061339670280707e-05, "loss": 0.0492, "step": 43510 }, { "epoch": 3.2318431605525024, "grad_norm": 1.8468888998031616, "learning_rate": 1.0608941036684985e-05, "loss": 0.0558, "step": 43520 }, { "epoch": 3.2325857715728503, "grad_norm": 0.26282399892807007, "learning_rate": 1.06044853705629e-05, "loss": 0.069, "step": 43530 }, { "epoch": 3.2333283825931978, "grad_norm": 2.4941744804382324, "learning_rate": 1.0600029704440813e-05, "loss": 0.053, "step": 43540 }, { "epoch": 3.234070993613545, "grad_norm": 2.5920748710632324, "learning_rate": 1.0595574038318728e-05, "loss": 0.0638, "step": 43550 }, { "epoch": 3.2348136046338927, "grad_norm": 0.590412437915802, "learning_rate": 1.0591118372196645e-05, "loss": 0.0606, "step": 43560 }, { "epoch": 3.23555621565424, "grad_norm": 1.5013396739959717, "learning_rate": 1.0586662706074558e-05, "loss": 0.0688, "step": 43570 }, { "epoch": 3.236298826674588, "grad_norm": 1.6271650791168213, "learning_rate": 1.0582207039952473e-05, "loss": 0.0675, "step": 43580 }, { "epoch": 3.2370414376949355, "grad_norm": 0.9938003420829773, "learning_rate": 1.0577751373830386e-05, "loss": 0.081, "step": 43590 }, { "epoch": 3.237784048715283, "grad_norm": 1.7118418216705322, "learning_rate": 1.0573295707708303e-05, "loss": 0.0595, "step": 43600 }, { "epoch": 3.2385266597356304, "grad_norm": 0.5836747884750366, "learning_rate": 1.0568840041586218e-05, "loss": 0.0638, "step": 43610 }, { "epoch": 3.239269270755978, "grad_norm": 0.8831083178520203, "learning_rate": 1.0564384375464131e-05, "loss": 0.0438, "step": 43620 }, { "epoch": 3.2400118817763257, "grad_norm": 1.3930721282958984, "learning_rate": 1.0559928709342048e-05, "loss": 0.0442, "step": 43630 }, { "epoch": 3.240754492796673, "grad_norm": 5.538564682006836, "learning_rate": 1.0555473043219961e-05, "loss": 0.0557, "step": 43640 }, { "epoch": 3.2414971038170206, "grad_norm": 1.8036948442459106, "learning_rate": 1.0551017377097876e-05, "loss": 0.0664, "step": 43650 }, { "epoch": 3.242239714837368, "grad_norm": 0.252446711063385, "learning_rate": 1.054656171097579e-05, "loss": 0.0578, "step": 43660 }, { "epoch": 3.2429823258577155, "grad_norm": 1.6370078325271606, "learning_rate": 1.0542106044853706e-05, "loss": 0.0668, "step": 43670 }, { "epoch": 3.2437249368780634, "grad_norm": 0.6032128930091858, "learning_rate": 1.0537650378731621e-05, "loss": 0.0697, "step": 43680 }, { "epoch": 3.244467547898411, "grad_norm": 1.6264759302139282, "learning_rate": 1.0533194712609535e-05, "loss": 0.0407, "step": 43690 }, { "epoch": 3.2452101589187583, "grad_norm": 1.4826754331588745, "learning_rate": 1.052873904648745e-05, "loss": 0.0342, "step": 43700 }, { "epoch": 3.2459527699391058, "grad_norm": 0.2272782027721405, "learning_rate": 1.0524283380365365e-05, "loss": 0.0744, "step": 43710 }, { "epoch": 3.246695380959453, "grad_norm": 2.538353204727173, "learning_rate": 1.051982771424328e-05, "loss": 0.063, "step": 43720 }, { "epoch": 3.247437991979801, "grad_norm": 1.0268744230270386, "learning_rate": 1.0515372048121195e-05, "loss": 0.0528, "step": 43730 }, { "epoch": 3.2481806030001485, "grad_norm": 0.812559962272644, "learning_rate": 1.0510916381999108e-05, "loss": 0.0557, "step": 43740 }, { "epoch": 3.248923214020496, "grad_norm": 0.4809117317199707, "learning_rate": 1.0506460715877025e-05, "loss": 0.0452, "step": 43750 }, { "epoch": 3.2496658250408434, "grad_norm": 2.713081121444702, "learning_rate": 1.0502005049754938e-05, "loss": 0.0444, "step": 43760 }, { "epoch": 3.2504084360611913, "grad_norm": 1.6045153141021729, "learning_rate": 1.0497549383632853e-05, "loss": 0.0835, "step": 43770 }, { "epoch": 3.251151047081539, "grad_norm": 1.182823657989502, "learning_rate": 1.049309371751077e-05, "loss": 0.0358, "step": 43780 }, { "epoch": 3.2518936581018862, "grad_norm": 1.261793613433838, "learning_rate": 1.0488638051388683e-05, "loss": 0.059, "step": 43790 }, { "epoch": 3.2526362691222337, "grad_norm": 1.5234590768814087, "learning_rate": 1.0484182385266598e-05, "loss": 0.0675, "step": 43800 }, { "epoch": 3.253378880142581, "grad_norm": 3.1365156173706055, "learning_rate": 1.0479726719144511e-05, "loss": 0.0543, "step": 43810 }, { "epoch": 3.254121491162929, "grad_norm": 2.5033278465270996, "learning_rate": 1.0475271053022428e-05, "loss": 0.0883, "step": 43820 }, { "epoch": 3.2548641021832765, "grad_norm": 1.3353326320648193, "learning_rate": 1.0470815386900341e-05, "loss": 0.0692, "step": 43830 }, { "epoch": 3.255606713203624, "grad_norm": 1.0387260913848877, "learning_rate": 1.0466359720778256e-05, "loss": 0.0863, "step": 43840 }, { "epoch": 3.2563493242239714, "grad_norm": 1.1819405555725098, "learning_rate": 1.0461904054656171e-05, "loss": 0.0635, "step": 43850 }, { "epoch": 3.257091935244319, "grad_norm": 0.8391949534416199, "learning_rate": 1.0457448388534086e-05, "loss": 0.0419, "step": 43860 }, { "epoch": 3.2578345462646667, "grad_norm": 3.538817882537842, "learning_rate": 1.0452992722412001e-05, "loss": 0.0757, "step": 43870 }, { "epoch": 3.258577157285014, "grad_norm": 1.986291766166687, "learning_rate": 1.0448537056289915e-05, "loss": 0.0663, "step": 43880 }, { "epoch": 3.2593197683053616, "grad_norm": 2.091001510620117, "learning_rate": 1.0444081390167831e-05, "loss": 0.0641, "step": 43890 }, { "epoch": 3.260062379325709, "grad_norm": 1.032757043838501, "learning_rate": 1.0439625724045746e-05, "loss": 0.0629, "step": 43900 }, { "epoch": 3.260804990346057, "grad_norm": 0.7859309911727905, "learning_rate": 1.043517005792366e-05, "loss": 0.0354, "step": 43910 }, { "epoch": 3.2615476013664044, "grad_norm": 0.43140801787376404, "learning_rate": 1.0430714391801575e-05, "loss": 0.0709, "step": 43920 }, { "epoch": 3.262290212386752, "grad_norm": 1.7433068752288818, "learning_rate": 1.042625872567949e-05, "loss": 0.088, "step": 43930 }, { "epoch": 3.2630328234070993, "grad_norm": 2.8857994079589844, "learning_rate": 1.0421803059557405e-05, "loss": 0.0876, "step": 43940 }, { "epoch": 3.2637754344274468, "grad_norm": 0.6673837304115295, "learning_rate": 1.0417347393435318e-05, "loss": 0.0525, "step": 43950 }, { "epoch": 3.2645180454477947, "grad_norm": 0.9741489887237549, "learning_rate": 1.0412891727313233e-05, "loss": 0.0626, "step": 43960 }, { "epoch": 3.265260656468142, "grad_norm": 0.6886597275733948, "learning_rate": 1.040843606119115e-05, "loss": 0.0607, "step": 43970 }, { "epoch": 3.2660032674884896, "grad_norm": 0.6309210062026978, "learning_rate": 1.0403980395069063e-05, "loss": 0.0695, "step": 43980 }, { "epoch": 3.266745878508837, "grad_norm": 3.206247329711914, "learning_rate": 1.0399524728946978e-05, "loss": 0.0573, "step": 43990 }, { "epoch": 3.2674884895291845, "grad_norm": 0.590715765953064, "learning_rate": 1.0395069062824891e-05, "loss": 0.0443, "step": 44000 }, { "epoch": 3.2682311005495324, "grad_norm": 3.0547935962677, "learning_rate": 1.0390613396702808e-05, "loss": 0.062, "step": 44010 }, { "epoch": 3.26897371156988, "grad_norm": 5.862617015838623, "learning_rate": 1.0386157730580723e-05, "loss": 0.0543, "step": 44020 }, { "epoch": 3.2697163225902273, "grad_norm": 0.2635265588760376, "learning_rate": 1.0381702064458636e-05, "loss": 0.0307, "step": 44030 }, { "epoch": 3.2704589336105747, "grad_norm": 1.2568199634552002, "learning_rate": 1.0377246398336553e-05, "loss": 0.0388, "step": 44040 }, { "epoch": 3.271201544630922, "grad_norm": 0.4090416133403778, "learning_rate": 1.0372790732214466e-05, "loss": 0.0327, "step": 44050 }, { "epoch": 3.27194415565127, "grad_norm": 1.8070887327194214, "learning_rate": 1.0368335066092381e-05, "loss": 0.0626, "step": 44060 }, { "epoch": 3.2726867666716175, "grad_norm": 1.7493088245391846, "learning_rate": 1.0363879399970296e-05, "loss": 0.0763, "step": 44070 }, { "epoch": 3.273429377691965, "grad_norm": 1.4830260276794434, "learning_rate": 1.0359423733848211e-05, "loss": 0.0501, "step": 44080 }, { "epoch": 3.2741719887123124, "grad_norm": 0.5901992917060852, "learning_rate": 1.0354968067726126e-05, "loss": 0.0614, "step": 44090 }, { "epoch": 3.27491459973266, "grad_norm": 2.098797082901001, "learning_rate": 1.035051240160404e-05, "loss": 0.058, "step": 44100 }, { "epoch": 3.2756572107530078, "grad_norm": 1.220317006111145, "learning_rate": 1.0346056735481954e-05, "loss": 0.0958, "step": 44110 }, { "epoch": 3.276399821773355, "grad_norm": 0.6847271919250488, "learning_rate": 1.034160106935987e-05, "loss": 0.0539, "step": 44120 }, { "epoch": 3.2771424327937027, "grad_norm": 1.1545261144638062, "learning_rate": 1.0337145403237784e-05, "loss": 0.0644, "step": 44130 }, { "epoch": 3.27788504381405, "grad_norm": 2.0466902256011963, "learning_rate": 1.03326897371157e-05, "loss": 0.0595, "step": 44140 }, { "epoch": 3.2786276548343976, "grad_norm": 2.453418493270874, "learning_rate": 1.0328234070993614e-05, "loss": 0.0978, "step": 44150 }, { "epoch": 3.2793702658547454, "grad_norm": 3.9171948432922363, "learning_rate": 1.032377840487153e-05, "loss": 0.0583, "step": 44160 }, { "epoch": 3.280112876875093, "grad_norm": 2.847308874130249, "learning_rate": 1.0319322738749443e-05, "loss": 0.0792, "step": 44170 }, { "epoch": 3.2808554878954403, "grad_norm": 1.2531743049621582, "learning_rate": 1.0314867072627358e-05, "loss": 0.0833, "step": 44180 }, { "epoch": 3.281598098915788, "grad_norm": 3.184342384338379, "learning_rate": 1.0310411406505274e-05, "loss": 0.0575, "step": 44190 }, { "epoch": 3.2823407099361352, "grad_norm": 1.0278104543685913, "learning_rate": 1.0305955740383188e-05, "loss": 0.0633, "step": 44200 }, { "epoch": 3.283083320956483, "grad_norm": 1.105556607246399, "learning_rate": 1.0301500074261103e-05, "loss": 0.0774, "step": 44210 }, { "epoch": 3.2838259319768306, "grad_norm": 2.209592342376709, "learning_rate": 1.0297044408139016e-05, "loss": 0.0427, "step": 44220 }, { "epoch": 3.284568542997178, "grad_norm": 1.6247801780700684, "learning_rate": 1.0292588742016933e-05, "loss": 0.0552, "step": 44230 }, { "epoch": 3.2853111540175255, "grad_norm": 1.5469924211502075, "learning_rate": 1.0288133075894846e-05, "loss": 0.0543, "step": 44240 }, { "epoch": 3.286053765037873, "grad_norm": 1.353393316268921, "learning_rate": 1.0283677409772761e-05, "loss": 0.0592, "step": 44250 }, { "epoch": 3.286796376058221, "grad_norm": 1.046172022819519, "learning_rate": 1.0279221743650676e-05, "loss": 0.0504, "step": 44260 }, { "epoch": 3.2875389870785683, "grad_norm": 0.5917429327964783, "learning_rate": 1.0274766077528591e-05, "loss": 0.0429, "step": 44270 }, { "epoch": 3.2882815980989157, "grad_norm": 1.0659432411193848, "learning_rate": 1.0270310411406506e-05, "loss": 0.0625, "step": 44280 }, { "epoch": 3.289024209119263, "grad_norm": 1.2359225749969482, "learning_rate": 1.0265854745284419e-05, "loss": 0.0623, "step": 44290 }, { "epoch": 3.2897668201396106, "grad_norm": 1.3712466955184937, "learning_rate": 1.0261399079162336e-05, "loss": 0.0367, "step": 44300 }, { "epoch": 3.2905094311599585, "grad_norm": 0.6402594447135925, "learning_rate": 1.025694341304025e-05, "loss": 0.06, "step": 44310 }, { "epoch": 3.291252042180306, "grad_norm": 3.1848597526550293, "learning_rate": 1.0252487746918164e-05, "loss": 0.0566, "step": 44320 }, { "epoch": 3.2919946532006534, "grad_norm": 0.769212543964386, "learning_rate": 1.0248032080796079e-05, "loss": 0.0579, "step": 44330 }, { "epoch": 3.292737264221001, "grad_norm": 2.3581175804138184, "learning_rate": 1.0243576414673994e-05, "loss": 0.0666, "step": 44340 }, { "epoch": 3.2934798752413488, "grad_norm": 0.7175013422966003, "learning_rate": 1.0239120748551909e-05, "loss": 0.0463, "step": 44350 }, { "epoch": 3.2942224862616962, "grad_norm": 1.026248574256897, "learning_rate": 1.0234665082429822e-05, "loss": 0.06, "step": 44360 }, { "epoch": 3.2949650972820437, "grad_norm": 1.5957810878753662, "learning_rate": 1.0230209416307737e-05, "loss": 0.0462, "step": 44370 }, { "epoch": 3.295707708302391, "grad_norm": 0.7169394493103027, "learning_rate": 1.0225753750185654e-05, "loss": 0.0872, "step": 44380 }, { "epoch": 3.2964503193227386, "grad_norm": 1.4847623109817505, "learning_rate": 1.0221298084063567e-05, "loss": 0.0513, "step": 44390 }, { "epoch": 3.2971929303430865, "grad_norm": 1.8581526279449463, "learning_rate": 1.0216842417941482e-05, "loss": 0.0663, "step": 44400 }, { "epoch": 3.297935541363434, "grad_norm": 0.3895215690135956, "learning_rate": 1.0212386751819397e-05, "loss": 0.0684, "step": 44410 }, { "epoch": 3.2986781523837814, "grad_norm": 0.4079320728778839, "learning_rate": 1.0207931085697312e-05, "loss": 0.0593, "step": 44420 }, { "epoch": 3.299420763404129, "grad_norm": 1.5425752401351929, "learning_rate": 1.0203475419575227e-05, "loss": 0.0927, "step": 44430 }, { "epoch": 3.3001633744244763, "grad_norm": 1.9693676233291626, "learning_rate": 1.019901975345314e-05, "loss": 0.091, "step": 44440 }, { "epoch": 3.300905985444824, "grad_norm": 1.5233045816421509, "learning_rate": 1.0194564087331057e-05, "loss": 0.081, "step": 44450 }, { "epoch": 3.3016485964651716, "grad_norm": 1.4039238691329956, "learning_rate": 1.019010842120897e-05, "loss": 0.0609, "step": 44460 }, { "epoch": 3.302391207485519, "grad_norm": 1.4866162538528442, "learning_rate": 1.0185652755086886e-05, "loss": 0.0607, "step": 44470 }, { "epoch": 3.3031338185058665, "grad_norm": 2.2782821655273438, "learning_rate": 1.01811970889648e-05, "loss": 0.0506, "step": 44480 }, { "epoch": 3.3038764295262144, "grad_norm": 1.1079455614089966, "learning_rate": 1.0176741422842716e-05, "loss": 0.0785, "step": 44490 }, { "epoch": 3.304619040546562, "grad_norm": 2.9163553714752197, "learning_rate": 1.017228575672063e-05, "loss": 0.0635, "step": 44500 }, { "epoch": 3.3053616515669093, "grad_norm": 2.6138124465942383, "learning_rate": 1.0167830090598544e-05, "loss": 0.0539, "step": 44510 }, { "epoch": 3.3061042625872568, "grad_norm": 1.3911471366882324, "learning_rate": 1.0163374424476459e-05, "loss": 0.0717, "step": 44520 }, { "epoch": 3.306846873607604, "grad_norm": 1.2493705749511719, "learning_rate": 1.0158918758354374e-05, "loss": 0.0833, "step": 44530 }, { "epoch": 3.307589484627952, "grad_norm": 0.6470643877983093, "learning_rate": 1.0154463092232289e-05, "loss": 0.0834, "step": 44540 }, { "epoch": 3.3083320956482996, "grad_norm": 1.346933126449585, "learning_rate": 1.0150007426110204e-05, "loss": 0.0417, "step": 44550 }, { "epoch": 3.309074706668647, "grad_norm": 1.7907460927963257, "learning_rate": 1.0145551759988119e-05, "loss": 0.0613, "step": 44560 }, { "epoch": 3.3098173176889945, "grad_norm": 3.1035265922546387, "learning_rate": 1.0141096093866034e-05, "loss": 0.0773, "step": 44570 }, { "epoch": 3.310559928709342, "grad_norm": 1.8176069259643555, "learning_rate": 1.0136640427743947e-05, "loss": 0.0679, "step": 44580 }, { "epoch": 3.31130253972969, "grad_norm": 1.3476163148880005, "learning_rate": 1.0132184761621862e-05, "loss": 0.066, "step": 44590 }, { "epoch": 3.3120451507500372, "grad_norm": 1.3628586530685425, "learning_rate": 1.0127729095499779e-05, "loss": 0.0671, "step": 44600 }, { "epoch": 3.3127877617703847, "grad_norm": 0.9032784104347229, "learning_rate": 1.0123273429377692e-05, "loss": 0.0828, "step": 44610 }, { "epoch": 3.313530372790732, "grad_norm": 1.4668526649475098, "learning_rate": 1.0118817763255607e-05, "loss": 0.0669, "step": 44620 }, { "epoch": 3.3142729838110796, "grad_norm": 0.5808708071708679, "learning_rate": 1.011436209713352e-05, "loss": 0.0926, "step": 44630 }, { "epoch": 3.3150155948314275, "grad_norm": 0.9334034323692322, "learning_rate": 1.0109906431011437e-05, "loss": 0.0695, "step": 44640 }, { "epoch": 3.315758205851775, "grad_norm": 2.833603858947754, "learning_rate": 1.010545076488935e-05, "loss": 0.0559, "step": 44650 }, { "epoch": 3.3165008168721224, "grad_norm": 0.9554916620254517, "learning_rate": 1.0100995098767265e-05, "loss": 0.0612, "step": 44660 }, { "epoch": 3.31724342789247, "grad_norm": 2.4863193035125732, "learning_rate": 1.0096539432645182e-05, "loss": 0.0585, "step": 44670 }, { "epoch": 3.3179860389128173, "grad_norm": 1.2843139171600342, "learning_rate": 1.0092083766523095e-05, "loss": 0.057, "step": 44680 }, { "epoch": 3.318728649933165, "grad_norm": 3.115098237991333, "learning_rate": 1.008762810040101e-05, "loss": 0.062, "step": 44690 }, { "epoch": 3.3194712609535126, "grad_norm": 1.55138099193573, "learning_rate": 1.0083172434278924e-05, "loss": 0.0757, "step": 44700 }, { "epoch": 3.32021387197386, "grad_norm": 2.5275778770446777, "learning_rate": 1.007871676815684e-05, "loss": 0.0794, "step": 44710 }, { "epoch": 3.3209564829942075, "grad_norm": 2.697035551071167, "learning_rate": 1.0074261102034755e-05, "loss": 0.0467, "step": 44720 }, { "epoch": 3.321699094014555, "grad_norm": 0.9199452996253967, "learning_rate": 1.0069805435912669e-05, "loss": 0.0822, "step": 44730 }, { "epoch": 3.322441705034903, "grad_norm": 2.4956889152526855, "learning_rate": 1.0065349769790584e-05, "loss": 0.0608, "step": 44740 }, { "epoch": 3.3231843160552503, "grad_norm": 2.050920248031616, "learning_rate": 1.0060894103668499e-05, "loss": 0.0895, "step": 44750 }, { "epoch": 3.323926927075598, "grad_norm": 1.369531273841858, "learning_rate": 1.0056438437546414e-05, "loss": 0.0515, "step": 44760 }, { "epoch": 3.3246695380959452, "grad_norm": 3.23797345161438, "learning_rate": 1.0051982771424329e-05, "loss": 0.0678, "step": 44770 }, { "epoch": 3.3254121491162927, "grad_norm": 0.7047389149665833, "learning_rate": 1.0047527105302242e-05, "loss": 0.0508, "step": 44780 }, { "epoch": 3.3261547601366406, "grad_norm": 0.7370045781135559, "learning_rate": 1.0043071439180159e-05, "loss": 0.0548, "step": 44790 }, { "epoch": 3.326897371156988, "grad_norm": 1.6927924156188965, "learning_rate": 1.0038615773058072e-05, "loss": 0.0859, "step": 44800 }, { "epoch": 3.3276399821773355, "grad_norm": 0.9936562180519104, "learning_rate": 1.0034160106935987e-05, "loss": 0.0641, "step": 44810 }, { "epoch": 3.328382593197683, "grad_norm": 1.1975995302200317, "learning_rate": 1.0029704440813902e-05, "loss": 0.0758, "step": 44820 }, { "epoch": 3.3291252042180304, "grad_norm": 0.6930918097496033, "learning_rate": 1.0025248774691817e-05, "loss": 0.0674, "step": 44830 }, { "epoch": 3.3298678152383783, "grad_norm": 1.631554365158081, "learning_rate": 1.0020793108569732e-05, "loss": 0.0528, "step": 44840 }, { "epoch": 3.3306104262587257, "grad_norm": 0.4644923508167267, "learning_rate": 1.0016337442447645e-05, "loss": 0.0543, "step": 44850 }, { "epoch": 3.331353037279073, "grad_norm": 0.8077749013900757, "learning_rate": 1.0011881776325562e-05, "loss": 0.0411, "step": 44860 }, { "epoch": 3.3320956482994206, "grad_norm": 0.5882359743118286, "learning_rate": 1.0007426110203475e-05, "loss": 0.038, "step": 44870 }, { "epoch": 3.332838259319768, "grad_norm": 2.1609537601470947, "learning_rate": 1.000297044408139e-05, "loss": 0.0709, "step": 44880 }, { "epoch": 3.333580870340116, "grad_norm": 2.9716951847076416, "learning_rate": 9.998514777959305e-06, "loss": 0.0759, "step": 44890 }, { "epoch": 3.3343234813604634, "grad_norm": 1.4051735401153564, "learning_rate": 9.99405911183722e-06, "loss": 0.0842, "step": 44900 }, { "epoch": 3.335066092380811, "grad_norm": 1.0370116233825684, "learning_rate": 9.989603445715135e-06, "loss": 0.0574, "step": 44910 }, { "epoch": 3.3358087034011583, "grad_norm": 2.2001404762268066, "learning_rate": 9.985147779593049e-06, "loss": 0.0711, "step": 44920 }, { "epoch": 3.336551314421506, "grad_norm": 1.836188554763794, "learning_rate": 9.980692113470964e-06, "loss": 0.0672, "step": 44930 }, { "epoch": 3.3372939254418537, "grad_norm": 3.0078184604644775, "learning_rate": 9.976236447348879e-06, "loss": 0.073, "step": 44940 }, { "epoch": 3.338036536462201, "grad_norm": 0.9165183305740356, "learning_rate": 9.971780781226794e-06, "loss": 0.0768, "step": 44950 }, { "epoch": 3.3387791474825486, "grad_norm": 1.1523520946502686, "learning_rate": 9.967325115104709e-06, "loss": 0.0325, "step": 44960 }, { "epoch": 3.339521758502896, "grad_norm": 0.701426088809967, "learning_rate": 9.962869448982624e-06, "loss": 0.0557, "step": 44970 }, { "epoch": 3.340264369523244, "grad_norm": 0.9896045327186584, "learning_rate": 9.958413782860539e-06, "loss": 0.0438, "step": 44980 }, { "epoch": 3.3410069805435914, "grad_norm": 1.784203290939331, "learning_rate": 9.953958116738452e-06, "loss": 0.0509, "step": 44990 }, { "epoch": 3.341749591563939, "grad_norm": 1.9945133924484253, "learning_rate": 9.949502450616367e-06, "loss": 0.0682, "step": 45000 }, { "epoch": 3.3424922025842863, "grad_norm": 2.4926345348358154, "learning_rate": 9.945046784494283e-06, "loss": 0.0784, "step": 45010 }, { "epoch": 3.3432348136046337, "grad_norm": 1.1978400945663452, "learning_rate": 9.940591118372197e-06, "loss": 0.0644, "step": 45020 }, { "epoch": 3.3439774246249816, "grad_norm": 0.44274619221687317, "learning_rate": 9.936135452250112e-06, "loss": 0.0558, "step": 45030 }, { "epoch": 3.344720035645329, "grad_norm": 0.9496433138847351, "learning_rate": 9.931679786128025e-06, "loss": 0.0406, "step": 45040 }, { "epoch": 3.3454626466656765, "grad_norm": 0.4227916896343231, "learning_rate": 9.927224120005942e-06, "loss": 0.0446, "step": 45050 }, { "epoch": 3.346205257686024, "grad_norm": 1.6861997842788696, "learning_rate": 9.922768453883855e-06, "loss": 0.0576, "step": 45060 }, { "epoch": 3.346947868706372, "grad_norm": 2.8230645656585693, "learning_rate": 9.91831278776177e-06, "loss": 0.055, "step": 45070 }, { "epoch": 3.3476904797267193, "grad_norm": 1.0160224437713623, "learning_rate": 9.913857121639687e-06, "loss": 0.09, "step": 45080 }, { "epoch": 3.3484330907470667, "grad_norm": 2.2444396018981934, "learning_rate": 9.9094014555176e-06, "loss": 0.0685, "step": 45090 }, { "epoch": 3.349175701767414, "grad_norm": 1.4811400175094604, "learning_rate": 9.904945789395515e-06, "loss": 0.0576, "step": 45100 }, { "epoch": 3.3499183127877616, "grad_norm": 2.1645710468292236, "learning_rate": 9.900490123273428e-06, "loss": 0.0926, "step": 45110 }, { "epoch": 3.3506609238081095, "grad_norm": 2.3082311153411865, "learning_rate": 9.896034457151345e-06, "loss": 0.0536, "step": 45120 }, { "epoch": 3.351403534828457, "grad_norm": 1.518615961074829, "learning_rate": 9.89157879102926e-06, "loss": 0.0877, "step": 45130 }, { "epoch": 3.3521461458488044, "grad_norm": 0.9290609955787659, "learning_rate": 9.887123124907173e-06, "loss": 0.0549, "step": 45140 }, { "epoch": 3.352888756869152, "grad_norm": 2.0847578048706055, "learning_rate": 9.882667458785088e-06, "loss": 0.0656, "step": 45150 }, { "epoch": 3.3536313678894993, "grad_norm": 2.6729955673217773, "learning_rate": 9.878211792663003e-06, "loss": 0.0772, "step": 45160 }, { "epoch": 3.3543739789098472, "grad_norm": 2.263134241104126, "learning_rate": 9.873756126540918e-06, "loss": 0.0689, "step": 45170 }, { "epoch": 3.3551165899301947, "grad_norm": 0.8726534247398376, "learning_rate": 9.869300460418833e-06, "loss": 0.0652, "step": 45180 }, { "epoch": 3.355859200950542, "grad_norm": 1.9389985799789429, "learning_rate": 9.864844794296747e-06, "loss": 0.0787, "step": 45190 }, { "epoch": 3.3566018119708896, "grad_norm": 2.6896302700042725, "learning_rate": 9.860389128174663e-06, "loss": 0.0622, "step": 45200 }, { "epoch": 3.357344422991237, "grad_norm": 1.6283527612686157, "learning_rate": 9.855933462052577e-06, "loss": 0.052, "step": 45210 }, { "epoch": 3.358087034011585, "grad_norm": 1.3546130657196045, "learning_rate": 9.851477795930492e-06, "loss": 0.0797, "step": 45220 }, { "epoch": 3.3588296450319324, "grad_norm": 1.457862377166748, "learning_rate": 9.847022129808407e-06, "loss": 0.0727, "step": 45230 }, { "epoch": 3.35957225605228, "grad_norm": 0.8017680644989014, "learning_rate": 9.842566463686322e-06, "loss": 0.0362, "step": 45240 }, { "epoch": 3.3603148670726273, "grad_norm": 1.1099777221679688, "learning_rate": 9.838110797564237e-06, "loss": 0.0549, "step": 45250 }, { "epoch": 3.3610574780929747, "grad_norm": 2.5298869609832764, "learning_rate": 9.83365513144215e-06, "loss": 0.0445, "step": 45260 }, { "epoch": 3.3618000891133226, "grad_norm": 0.5401008725166321, "learning_rate": 9.829199465320067e-06, "loss": 0.0526, "step": 45270 }, { "epoch": 3.36254270013367, "grad_norm": 1.3315315246582031, "learning_rate": 9.82474379919798e-06, "loss": 0.0675, "step": 45280 }, { "epoch": 3.3632853111540175, "grad_norm": 0.9110653400421143, "learning_rate": 9.820288133075895e-06, "loss": 0.0365, "step": 45290 }, { "epoch": 3.364027922174365, "grad_norm": 1.0673272609710693, "learning_rate": 9.81583246695381e-06, "loss": 0.0603, "step": 45300 }, { "epoch": 3.3647705331947124, "grad_norm": 1.7336030006408691, "learning_rate": 9.811376800831725e-06, "loss": 0.0634, "step": 45310 }, { "epoch": 3.3655131442150603, "grad_norm": 0.638027548789978, "learning_rate": 9.80692113470964e-06, "loss": 0.041, "step": 45320 }, { "epoch": 3.3662557552354078, "grad_norm": 0.6306934356689453, "learning_rate": 9.802465468587553e-06, "loss": 0.092, "step": 45330 }, { "epoch": 3.366998366255755, "grad_norm": 1.2354300022125244, "learning_rate": 9.79800980246547e-06, "loss": 0.0729, "step": 45340 }, { "epoch": 3.3677409772761027, "grad_norm": 0.46772605180740356, "learning_rate": 9.793554136343383e-06, "loss": 0.039, "step": 45350 }, { "epoch": 3.36848358829645, "grad_norm": 0.28647175431251526, "learning_rate": 9.789098470221298e-06, "loss": 0.0544, "step": 45360 }, { "epoch": 3.369226199316798, "grad_norm": 0.0804813876748085, "learning_rate": 9.784642804099213e-06, "loss": 0.0483, "step": 45370 }, { "epoch": 3.3699688103371455, "grad_norm": 2.941643476486206, "learning_rate": 9.780187137977128e-06, "loss": 0.0728, "step": 45380 }, { "epoch": 3.370711421357493, "grad_norm": 1.7482622861862183, "learning_rate": 9.775731471855043e-06, "loss": 0.0857, "step": 45390 }, { "epoch": 3.3714540323778404, "grad_norm": 3.30426025390625, "learning_rate": 9.771275805732956e-06, "loss": 0.0671, "step": 45400 }, { "epoch": 3.372196643398188, "grad_norm": 1.9652279615402222, "learning_rate": 9.766820139610871e-06, "loss": 0.0693, "step": 45410 }, { "epoch": 3.3729392544185357, "grad_norm": 0.7707439064979553, "learning_rate": 9.762364473488788e-06, "loss": 0.0572, "step": 45420 }, { "epoch": 3.373681865438883, "grad_norm": 0.6071941256523132, "learning_rate": 9.757908807366701e-06, "loss": 0.0451, "step": 45430 }, { "epoch": 3.3744244764592306, "grad_norm": 1.8582054376602173, "learning_rate": 9.753453141244616e-06, "loss": 0.0391, "step": 45440 }, { "epoch": 3.375167087479578, "grad_norm": 1.5152584314346313, "learning_rate": 9.74899747512253e-06, "loss": 0.0941, "step": 45450 }, { "epoch": 3.3759096984999255, "grad_norm": 0.9906954169273376, "learning_rate": 9.744541809000446e-06, "loss": 0.0514, "step": 45460 }, { "epoch": 3.3766523095202734, "grad_norm": 1.4278010129928589, "learning_rate": 9.740086142878361e-06, "loss": 0.0492, "step": 45470 }, { "epoch": 3.377394920540621, "grad_norm": 2.468226671218872, "learning_rate": 9.735630476756275e-06, "loss": 0.0692, "step": 45480 }, { "epoch": 3.3781375315609683, "grad_norm": 0.5829160809516907, "learning_rate": 9.731174810634191e-06, "loss": 0.0661, "step": 45490 }, { "epoch": 3.3788801425813157, "grad_norm": 1.5764065980911255, "learning_rate": 9.726719144512105e-06, "loss": 0.0695, "step": 45500 }, { "epoch": 3.3796227536016636, "grad_norm": 0.9860460162162781, "learning_rate": 9.72226347839002e-06, "loss": 0.0474, "step": 45510 }, { "epoch": 3.380365364622011, "grad_norm": 1.8442999124526978, "learning_rate": 9.717807812267933e-06, "loss": 0.1209, "step": 45520 }, { "epoch": 3.3811079756423585, "grad_norm": 3.0390632152557373, "learning_rate": 9.71335214614585e-06, "loss": 0.0604, "step": 45530 }, { "epoch": 3.381850586662706, "grad_norm": 1.5547683238983154, "learning_rate": 9.708896480023765e-06, "loss": 0.093, "step": 45540 }, { "epoch": 3.3825931976830534, "grad_norm": 2.504794120788574, "learning_rate": 9.704440813901678e-06, "loss": 0.0385, "step": 45550 }, { "epoch": 3.3833358087034013, "grad_norm": 2.069444417953491, "learning_rate": 9.699985147779593e-06, "loss": 0.0766, "step": 45560 }, { "epoch": 3.384078419723749, "grad_norm": 1.7017885446548462, "learning_rate": 9.695529481657508e-06, "loss": 0.066, "step": 45570 }, { "epoch": 3.3848210307440962, "grad_norm": 1.3779213428497314, "learning_rate": 9.691073815535423e-06, "loss": 0.0581, "step": 45580 }, { "epoch": 3.3855636417644437, "grad_norm": 2.676387071609497, "learning_rate": 9.686618149413338e-06, "loss": 0.0898, "step": 45590 }, { "epoch": 3.386306252784791, "grad_norm": 1.7272534370422363, "learning_rate": 9.682162483291253e-06, "loss": 0.0493, "step": 45600 }, { "epoch": 3.387048863805139, "grad_norm": 0.43985655903816223, "learning_rate": 9.677706817169168e-06, "loss": 0.0536, "step": 45610 }, { "epoch": 3.3877914748254865, "grad_norm": 2.7680649757385254, "learning_rate": 9.673251151047081e-06, "loss": 0.0442, "step": 45620 }, { "epoch": 3.388534085845834, "grad_norm": 2.375917911529541, "learning_rate": 9.668795484924996e-06, "loss": 0.0681, "step": 45630 }, { "epoch": 3.3892766968661814, "grad_norm": 1.5520646572113037, "learning_rate": 9.664339818802911e-06, "loss": 0.0533, "step": 45640 }, { "epoch": 3.3900193078865293, "grad_norm": 0.7106296420097351, "learning_rate": 9.659884152680826e-06, "loss": 0.0527, "step": 45650 }, { "epoch": 3.3907619189068767, "grad_norm": 1.5558034181594849, "learning_rate": 9.655428486558741e-06, "loss": 0.0664, "step": 45660 }, { "epoch": 3.391504529927224, "grad_norm": 2.0336802005767822, "learning_rate": 9.650972820436654e-06, "loss": 0.0744, "step": 45670 }, { "epoch": 3.3922471409475716, "grad_norm": 2.6379876136779785, "learning_rate": 9.646517154314571e-06, "loss": 0.0752, "step": 45680 }, { "epoch": 3.392989751967919, "grad_norm": 1.7118732929229736, "learning_rate": 9.642061488192484e-06, "loss": 0.0775, "step": 45690 }, { "epoch": 3.393732362988267, "grad_norm": 0.5552663803100586, "learning_rate": 9.6376058220704e-06, "loss": 0.0601, "step": 45700 }, { "epoch": 3.3944749740086144, "grad_norm": 1.9017750024795532, "learning_rate": 9.633150155948314e-06, "loss": 0.0687, "step": 45710 }, { "epoch": 3.395217585028962, "grad_norm": 0.6380416750907898, "learning_rate": 9.62869448982623e-06, "loss": 0.0657, "step": 45720 }, { "epoch": 3.3959601960493093, "grad_norm": 0.7644681930541992, "learning_rate": 9.624238823704144e-06, "loss": 0.0563, "step": 45730 }, { "epoch": 3.3967028070696568, "grad_norm": 3.4050753116607666, "learning_rate": 9.619783157582058e-06, "loss": 0.0557, "step": 45740 }, { "epoch": 3.3974454180900047, "grad_norm": 2.2620880603790283, "learning_rate": 9.615327491459974e-06, "loss": 0.0667, "step": 45750 }, { "epoch": 3.398188029110352, "grad_norm": 1.442107915878296, "learning_rate": 9.610871825337888e-06, "loss": 0.0644, "step": 45760 }, { "epoch": 3.3989306401306996, "grad_norm": 0.9981054067611694, "learning_rate": 9.606416159215803e-06, "loss": 0.0412, "step": 45770 }, { "epoch": 3.399673251151047, "grad_norm": 0.9032704830169678, "learning_rate": 9.601960493093718e-06, "loss": 0.0779, "step": 45780 }, { "epoch": 3.4004158621713945, "grad_norm": 1.9194782972335815, "learning_rate": 9.597504826971633e-06, "loss": 0.0679, "step": 45790 }, { "epoch": 3.4011584731917424, "grad_norm": 2.2495553493499756, "learning_rate": 9.593049160849548e-06, "loss": 0.0495, "step": 45800 }, { "epoch": 3.40190108421209, "grad_norm": 0.7401419878005981, "learning_rate": 9.588593494727461e-06, "loss": 0.0372, "step": 45810 }, { "epoch": 3.4026436952324373, "grad_norm": 2.4702985286712646, "learning_rate": 9.584137828605376e-06, "loss": 0.0629, "step": 45820 }, { "epoch": 3.4033863062527847, "grad_norm": 1.2022415399551392, "learning_rate": 9.579682162483293e-06, "loss": 0.0573, "step": 45830 }, { "epoch": 3.404128917273132, "grad_norm": 1.8239872455596924, "learning_rate": 9.575226496361206e-06, "loss": 0.0513, "step": 45840 }, { "epoch": 3.40487152829348, "grad_norm": 2.117016553878784, "learning_rate": 9.570770830239121e-06, "loss": 0.0671, "step": 45850 }, { "epoch": 3.4056141393138275, "grad_norm": 1.195753574371338, "learning_rate": 9.566315164117036e-06, "loss": 0.0544, "step": 45860 }, { "epoch": 3.406356750334175, "grad_norm": 0.719028890132904, "learning_rate": 9.561859497994951e-06, "loss": 0.058, "step": 45870 }, { "epoch": 3.4070993613545224, "grad_norm": 0.7662678360939026, "learning_rate": 9.557403831872866e-06, "loss": 0.06, "step": 45880 }, { "epoch": 3.40784197237487, "grad_norm": 1.3117008209228516, "learning_rate": 9.55294816575078e-06, "loss": 0.0367, "step": 45890 }, { "epoch": 3.4085845833952177, "grad_norm": 0.3628579080104828, "learning_rate": 9.548492499628696e-06, "loss": 0.0582, "step": 45900 }, { "epoch": 3.409327194415565, "grad_norm": 3.9150137901306152, "learning_rate": 9.54403683350661e-06, "loss": 0.0684, "step": 45910 }, { "epoch": 3.4100698054359126, "grad_norm": 1.5000407695770264, "learning_rate": 9.539581167384524e-06, "loss": 0.0871, "step": 45920 }, { "epoch": 3.41081241645626, "grad_norm": 1.804474115371704, "learning_rate": 9.535125501262438e-06, "loss": 0.0469, "step": 45930 }, { "epoch": 3.4115550274766075, "grad_norm": 1.7255808115005493, "learning_rate": 9.530669835140354e-06, "loss": 0.0358, "step": 45940 }, { "epoch": 3.4122976384969554, "grad_norm": 1.890074372291565, "learning_rate": 9.52621416901827e-06, "loss": 0.0541, "step": 45950 }, { "epoch": 3.413040249517303, "grad_norm": 1.5225468873977661, "learning_rate": 9.521758502896183e-06, "loss": 0.074, "step": 45960 }, { "epoch": 3.4137828605376503, "grad_norm": 1.2024898529052734, "learning_rate": 9.517302836774098e-06, "loss": 0.0931, "step": 45970 }, { "epoch": 3.414525471557998, "grad_norm": 5.313319683074951, "learning_rate": 9.512847170652013e-06, "loss": 0.0663, "step": 45980 }, { "epoch": 3.4152680825783452, "grad_norm": 1.3511769771575928, "learning_rate": 9.508391504529928e-06, "loss": 0.0887, "step": 45990 }, { "epoch": 3.416010693598693, "grad_norm": 2.674060344696045, "learning_rate": 9.503935838407843e-06, "loss": 0.0518, "step": 46000 }, { "epoch": 3.4167533046190406, "grad_norm": 0.3742149770259857, "learning_rate": 9.499480172285757e-06, "loss": 0.0483, "step": 46010 }, { "epoch": 3.417495915639388, "grad_norm": 0.4566430151462555, "learning_rate": 9.495024506163672e-06, "loss": 0.0641, "step": 46020 }, { "epoch": 3.4182385266597355, "grad_norm": 1.1455416679382324, "learning_rate": 9.490568840041586e-06, "loss": 0.0723, "step": 46030 }, { "epoch": 3.418981137680083, "grad_norm": 0.7190825343132019, "learning_rate": 9.4861131739195e-06, "loss": 0.0622, "step": 46040 }, { "epoch": 3.419723748700431, "grad_norm": 0.6445209383964539, "learning_rate": 9.481657507797416e-06, "loss": 0.0357, "step": 46050 }, { "epoch": 3.4204663597207783, "grad_norm": 2.212388038635254, "learning_rate": 9.47720184167533e-06, "loss": 0.0446, "step": 46060 }, { "epoch": 3.4212089707411257, "grad_norm": 1.0653600692749023, "learning_rate": 9.472746175553246e-06, "loss": 0.099, "step": 46070 }, { "epoch": 3.421951581761473, "grad_norm": 2.42584228515625, "learning_rate": 9.468290509431159e-06, "loss": 0.0587, "step": 46080 }, { "epoch": 3.422694192781821, "grad_norm": 2.0575003623962402, "learning_rate": 9.463834843309076e-06, "loss": 0.0777, "step": 46090 }, { "epoch": 3.4234368038021685, "grad_norm": 2.0597803592681885, "learning_rate": 9.459379177186989e-06, "loss": 0.0569, "step": 46100 }, { "epoch": 3.424179414822516, "grad_norm": 1.2137857675552368, "learning_rate": 9.454923511064904e-06, "loss": 0.0696, "step": 46110 }, { "epoch": 3.4249220258428634, "grad_norm": 1.2219140529632568, "learning_rate": 9.45046784494282e-06, "loss": 0.0644, "step": 46120 }, { "epoch": 3.425664636863211, "grad_norm": 0.3102966845035553, "learning_rate": 9.446012178820734e-06, "loss": 0.0485, "step": 46130 }, { "epoch": 3.4264072478835588, "grad_norm": 3.548948287963867, "learning_rate": 9.441556512698649e-06, "loss": 0.0802, "step": 46140 }, { "epoch": 3.427149858903906, "grad_norm": 1.5830320119857788, "learning_rate": 9.437100846576562e-06, "loss": 0.0685, "step": 46150 }, { "epoch": 3.4278924699242537, "grad_norm": 0.3730054795742035, "learning_rate": 9.432645180454479e-06, "loss": 0.0555, "step": 46160 }, { "epoch": 3.428635080944601, "grad_norm": 2.1286816596984863, "learning_rate": 9.428189514332394e-06, "loss": 0.0546, "step": 46170 }, { "epoch": 3.4293776919649486, "grad_norm": 0.7147573828697205, "learning_rate": 9.423733848210307e-06, "loss": 0.0516, "step": 46180 }, { "epoch": 3.4301203029852965, "grad_norm": 2.009298324584961, "learning_rate": 9.419278182088222e-06, "loss": 0.0667, "step": 46190 }, { "epoch": 3.430862914005644, "grad_norm": 1.3036754131317139, "learning_rate": 9.414822515966137e-06, "loss": 0.0738, "step": 46200 }, { "epoch": 3.4316055250259914, "grad_norm": 1.2708832025527954, "learning_rate": 9.410366849844052e-06, "loss": 0.0378, "step": 46210 }, { "epoch": 3.432348136046339, "grad_norm": 1.8291300535202026, "learning_rate": 9.405911183721966e-06, "loss": 0.0484, "step": 46220 }, { "epoch": 3.4330907470666867, "grad_norm": 3.277575731277466, "learning_rate": 9.40145551759988e-06, "loss": 0.0766, "step": 46230 }, { "epoch": 3.433833358087034, "grad_norm": 2.973456859588623, "learning_rate": 9.396999851477797e-06, "loss": 0.0464, "step": 46240 }, { "epoch": 3.4345759691073816, "grad_norm": 1.2118042707443237, "learning_rate": 9.39254418535571e-06, "loss": 0.0573, "step": 46250 }, { "epoch": 3.435318580127729, "grad_norm": 1.3770191669464111, "learning_rate": 9.388088519233626e-06, "loss": 0.0598, "step": 46260 }, { "epoch": 3.4360611911480765, "grad_norm": 2.709092855453491, "learning_rate": 9.38363285311154e-06, "loss": 0.0743, "step": 46270 }, { "epoch": 3.4368038021684244, "grad_norm": 0.41751283407211304, "learning_rate": 9.379177186989456e-06, "loss": 0.0687, "step": 46280 }, { "epoch": 3.437546413188772, "grad_norm": 0.8437953591346741, "learning_rate": 9.37472152086737e-06, "loss": 0.047, "step": 46290 }, { "epoch": 3.4382890242091193, "grad_norm": 2.391899824142456, "learning_rate": 9.370265854745284e-06, "loss": 0.0499, "step": 46300 }, { "epoch": 3.4390316352294668, "grad_norm": 1.9807353019714355, "learning_rate": 9.3658101886232e-06, "loss": 0.0596, "step": 46310 }, { "epoch": 3.439774246249814, "grad_norm": 0.2767632007598877, "learning_rate": 9.361354522501114e-06, "loss": 0.0419, "step": 46320 }, { "epoch": 3.440516857270162, "grad_norm": 2.447248697280884, "learning_rate": 9.356898856379029e-06, "loss": 0.0679, "step": 46330 }, { "epoch": 3.4412594682905095, "grad_norm": 1.484784722328186, "learning_rate": 9.352443190256942e-06, "loss": 0.039, "step": 46340 }, { "epoch": 3.442002079310857, "grad_norm": 1.6514251232147217, "learning_rate": 9.347987524134859e-06, "loss": 0.0346, "step": 46350 }, { "epoch": 3.4427446903312044, "grad_norm": 1.2701706886291504, "learning_rate": 9.343531858012774e-06, "loss": 0.0298, "step": 46360 }, { "epoch": 3.443487301351552, "grad_norm": 6.292190074920654, "learning_rate": 9.339076191890687e-06, "loss": 0.0471, "step": 46370 }, { "epoch": 3.4442299123719, "grad_norm": 1.1822702884674072, "learning_rate": 9.334620525768604e-06, "loss": 0.088, "step": 46380 }, { "epoch": 3.4449725233922472, "grad_norm": 2.342872142791748, "learning_rate": 9.330164859646517e-06, "loss": 0.0844, "step": 46390 }, { "epoch": 3.4457151344125947, "grad_norm": 0.5191331505775452, "learning_rate": 9.325709193524432e-06, "loss": 0.0737, "step": 46400 }, { "epoch": 3.446457745432942, "grad_norm": 2.7693073749542236, "learning_rate": 9.321253527402347e-06, "loss": 0.0634, "step": 46410 }, { "epoch": 3.4472003564532896, "grad_norm": 1.8164703845977783, "learning_rate": 9.316797861280262e-06, "loss": 0.0518, "step": 46420 }, { "epoch": 3.4479429674736375, "grad_norm": 0.6304923295974731, "learning_rate": 9.312342195158177e-06, "loss": 0.0483, "step": 46430 }, { "epoch": 3.448685578493985, "grad_norm": 0.35260432958602905, "learning_rate": 9.30788652903609e-06, "loss": 0.0711, "step": 46440 }, { "epoch": 3.4494281895143324, "grad_norm": 1.1337438821792603, "learning_rate": 9.303430862914005e-06, "loss": 0.0628, "step": 46450 }, { "epoch": 3.45017080053468, "grad_norm": 1.7715853452682495, "learning_rate": 9.29897519679192e-06, "loss": 0.0582, "step": 46460 }, { "epoch": 3.4509134115550273, "grad_norm": 2.768024206161499, "learning_rate": 9.294519530669835e-06, "loss": 0.0729, "step": 46470 }, { "epoch": 3.451656022575375, "grad_norm": 0.7756059169769287, "learning_rate": 9.29006386454775e-06, "loss": 0.0466, "step": 46480 }, { "epoch": 3.4523986335957226, "grad_norm": 1.0126781463623047, "learning_rate": 9.285608198425664e-06, "loss": 0.0615, "step": 46490 }, { "epoch": 3.45314124461607, "grad_norm": 1.432900071144104, "learning_rate": 9.28115253230358e-06, "loss": 0.0481, "step": 46500 }, { "epoch": 3.4538838556364175, "grad_norm": 1.1251558065414429, "learning_rate": 9.276696866181494e-06, "loss": 0.0573, "step": 46510 }, { "epoch": 3.454626466656765, "grad_norm": 1.9688692092895508, "learning_rate": 9.272241200059409e-06, "loss": 0.0425, "step": 46520 }, { "epoch": 3.455369077677113, "grad_norm": 0.8410460352897644, "learning_rate": 9.267785533937325e-06, "loss": 0.0555, "step": 46530 }, { "epoch": 3.4561116886974603, "grad_norm": 1.3958379030227661, "learning_rate": 9.263329867815239e-06, "loss": 0.0733, "step": 46540 }, { "epoch": 3.4568542997178078, "grad_norm": 0.4909604489803314, "learning_rate": 9.258874201693154e-06, "loss": 0.0681, "step": 46550 }, { "epoch": 3.4575969107381552, "grad_norm": 0.4906344413757324, "learning_rate": 9.254418535571067e-06, "loss": 0.0965, "step": 46560 }, { "epoch": 3.4583395217585027, "grad_norm": 0.9761192202568054, "learning_rate": 9.249962869448984e-06, "loss": 0.049, "step": 46570 }, { "epoch": 3.4590821327788506, "grad_norm": 0.2913404107093811, "learning_rate": 9.245507203326899e-06, "loss": 0.0506, "step": 46580 }, { "epoch": 3.459824743799198, "grad_norm": 1.4044737815856934, "learning_rate": 9.241051537204812e-06, "loss": 0.0496, "step": 46590 }, { "epoch": 3.4605673548195455, "grad_norm": 0.5915066599845886, "learning_rate": 9.236595871082727e-06, "loss": 0.0476, "step": 46600 }, { "epoch": 3.461309965839893, "grad_norm": 0.6072288751602173, "learning_rate": 9.232140204960642e-06, "loss": 0.0433, "step": 46610 }, { "epoch": 3.4620525768602404, "grad_norm": 1.144883632659912, "learning_rate": 9.227684538838557e-06, "loss": 0.0593, "step": 46620 }, { "epoch": 3.4627951878805883, "grad_norm": 1.200415015220642, "learning_rate": 9.22322887271647e-06, "loss": 0.0611, "step": 46630 }, { "epoch": 3.4635377989009357, "grad_norm": 1.8944742679595947, "learning_rate": 9.218773206594385e-06, "loss": 0.0679, "step": 46640 }, { "epoch": 3.464280409921283, "grad_norm": 1.1931655406951904, "learning_rate": 9.214317540472302e-06, "loss": 0.0392, "step": 46650 }, { "epoch": 3.4650230209416306, "grad_norm": 0.9544970989227295, "learning_rate": 9.209861874350215e-06, "loss": 0.0808, "step": 46660 }, { "epoch": 3.465765631961978, "grad_norm": 0.6240988373756409, "learning_rate": 9.20540620822813e-06, "loss": 0.0622, "step": 46670 }, { "epoch": 3.466508242982326, "grad_norm": 0.9956406950950623, "learning_rate": 9.200950542106045e-06, "loss": 0.0536, "step": 46680 }, { "epoch": 3.4672508540026734, "grad_norm": 1.983014702796936, "learning_rate": 9.19649487598396e-06, "loss": 0.0661, "step": 46690 }, { "epoch": 3.467993465023021, "grad_norm": 1.8986002206802368, "learning_rate": 9.192039209861875e-06, "loss": 0.0885, "step": 46700 }, { "epoch": 3.4687360760433683, "grad_norm": 1.382034182548523, "learning_rate": 9.187583543739788e-06, "loss": 0.0768, "step": 46710 }, { "epoch": 3.469478687063716, "grad_norm": 2.0056140422821045, "learning_rate": 9.183127877617705e-06, "loss": 0.0564, "step": 46720 }, { "epoch": 3.4702212980840637, "grad_norm": 0.6202702522277832, "learning_rate": 9.178672211495618e-06, "loss": 0.0412, "step": 46730 }, { "epoch": 3.470963909104411, "grad_norm": 2.408010959625244, "learning_rate": 9.174216545373533e-06, "loss": 0.0654, "step": 46740 }, { "epoch": 3.4717065201247586, "grad_norm": 1.9785070419311523, "learning_rate": 9.169760879251447e-06, "loss": 0.0879, "step": 46750 }, { "epoch": 3.472449131145106, "grad_norm": 1.066245675086975, "learning_rate": 9.165305213129363e-06, "loss": 0.0434, "step": 46760 }, { "epoch": 3.473191742165454, "grad_norm": 0.1916639357805252, "learning_rate": 9.160849547007278e-06, "loss": 0.0524, "step": 46770 }, { "epoch": 3.4739343531858013, "grad_norm": 2.890707492828369, "learning_rate": 9.156393880885192e-06, "loss": 0.081, "step": 46780 }, { "epoch": 3.474676964206149, "grad_norm": 1.435386061668396, "learning_rate": 9.151938214763108e-06, "loss": 0.0458, "step": 46790 }, { "epoch": 3.4754195752264962, "grad_norm": 1.8877719640731812, "learning_rate": 9.147482548641022e-06, "loss": 0.0704, "step": 46800 }, { "epoch": 3.476162186246844, "grad_norm": 2.4223110675811768, "learning_rate": 9.143026882518937e-06, "loss": 0.0615, "step": 46810 }, { "epoch": 3.4769047972671916, "grad_norm": 0.5216322541236877, "learning_rate": 9.138571216396852e-06, "loss": 0.0584, "step": 46820 }, { "epoch": 3.477647408287539, "grad_norm": 2.4628725051879883, "learning_rate": 9.134115550274767e-06, "loss": 0.0849, "step": 46830 }, { "epoch": 3.4783900193078865, "grad_norm": 1.662915825843811, "learning_rate": 9.129659884152682e-06, "loss": 0.0587, "step": 46840 }, { "epoch": 3.479132630328234, "grad_norm": 2.0761513710021973, "learning_rate": 9.125204218030595e-06, "loss": 0.0419, "step": 46850 }, { "epoch": 3.479875241348582, "grad_norm": 0.843273401260376, "learning_rate": 9.12074855190851e-06, "loss": 0.0633, "step": 46860 }, { "epoch": 3.4806178523689293, "grad_norm": 2.208324432373047, "learning_rate": 9.116292885786427e-06, "loss": 0.0575, "step": 46870 }, { "epoch": 3.4813604633892767, "grad_norm": 2.6127047538757324, "learning_rate": 9.11183721966434e-06, "loss": 0.0774, "step": 46880 }, { "epoch": 3.482103074409624, "grad_norm": 0.6786608099937439, "learning_rate": 9.107381553542255e-06, "loss": 0.0691, "step": 46890 }, { "epoch": 3.4828456854299716, "grad_norm": 3.497749090194702, "learning_rate": 9.102925887420168e-06, "loss": 0.0899, "step": 46900 }, { "epoch": 3.4835882964503195, "grad_norm": 0.41955363750457764, "learning_rate": 9.098470221298085e-06, "loss": 0.0617, "step": 46910 }, { "epoch": 3.484330907470667, "grad_norm": 0.7046768069267273, "learning_rate": 9.094014555175998e-06, "loss": 0.0789, "step": 46920 }, { "epoch": 3.4850735184910144, "grad_norm": 1.7392123937606812, "learning_rate": 9.089558889053913e-06, "loss": 0.0574, "step": 46930 }, { "epoch": 3.485816129511362, "grad_norm": 0.9724016189575195, "learning_rate": 9.08510322293183e-06, "loss": 0.0793, "step": 46940 }, { "epoch": 3.4865587405317093, "grad_norm": 1.3222392797470093, "learning_rate": 9.080647556809743e-06, "loss": 0.0675, "step": 46950 }, { "epoch": 3.4873013515520572, "grad_norm": 1.0925577878952026, "learning_rate": 9.076191890687658e-06, "loss": 0.0428, "step": 46960 }, { "epoch": 3.4880439625724047, "grad_norm": 0.7402594685554504, "learning_rate": 9.071736224565572e-06, "loss": 0.0559, "step": 46970 }, { "epoch": 3.488786573592752, "grad_norm": 2.0264675617218018, "learning_rate": 9.067280558443488e-06, "loss": 0.0425, "step": 46980 }, { "epoch": 3.4895291846130996, "grad_norm": 1.4667295217514038, "learning_rate": 9.062824892321403e-06, "loss": 0.0659, "step": 46990 }, { "epoch": 3.490271795633447, "grad_norm": 2.150097131729126, "learning_rate": 9.058369226199317e-06, "loss": 0.0656, "step": 47000 }, { "epoch": 3.491014406653795, "grad_norm": 0.9886521100997925, "learning_rate": 9.053913560077232e-06, "loss": 0.0598, "step": 47010 }, { "epoch": 3.4917570176741424, "grad_norm": 1.4608570337295532, "learning_rate": 9.049457893955147e-06, "loss": 0.1056, "step": 47020 }, { "epoch": 3.49249962869449, "grad_norm": 0.7766015529632568, "learning_rate": 9.045002227833061e-06, "loss": 0.0371, "step": 47030 }, { "epoch": 3.4932422397148373, "grad_norm": 0.7282046675682068, "learning_rate": 9.040546561710975e-06, "loss": 0.0582, "step": 47040 }, { "epoch": 3.4939848507351847, "grad_norm": 2.043895721435547, "learning_rate": 9.036090895588891e-06, "loss": 0.0516, "step": 47050 }, { "epoch": 3.4947274617555326, "grad_norm": 0.4283212721347809, "learning_rate": 9.031635229466806e-06, "loss": 0.0603, "step": 47060 }, { "epoch": 3.49547007277588, "grad_norm": 1.2481293678283691, "learning_rate": 9.02717956334472e-06, "loss": 0.0381, "step": 47070 }, { "epoch": 3.4962126837962275, "grad_norm": 1.1199532747268677, "learning_rate": 9.022723897222635e-06, "loss": 0.0837, "step": 47080 }, { "epoch": 3.496955294816575, "grad_norm": 1.4884730577468872, "learning_rate": 9.01826823110055e-06, "loss": 0.0265, "step": 47090 }, { "epoch": 3.4976979058369224, "grad_norm": 0.7303683757781982, "learning_rate": 9.013812564978465e-06, "loss": 0.0711, "step": 47100 }, { "epoch": 3.4984405168572703, "grad_norm": 1.0589395761489868, "learning_rate": 9.00935689885638e-06, "loss": 0.0711, "step": 47110 }, { "epoch": 3.4991831278776178, "grad_norm": 0.7774037718772888, "learning_rate": 9.004901232734293e-06, "loss": 0.0543, "step": 47120 }, { "epoch": 3.499925738897965, "grad_norm": 1.6601343154907227, "learning_rate": 9.00044556661221e-06, "loss": 0.0608, "step": 47130 }, { "epoch": 3.5006683499183127, "grad_norm": 0.5749710202217102, "learning_rate": 8.995989900490123e-06, "loss": 0.0565, "step": 47140 }, { "epoch": 3.50141096093866, "grad_norm": 1.1628215312957764, "learning_rate": 8.991534234368038e-06, "loss": 0.0382, "step": 47150 }, { "epoch": 3.502153571959008, "grad_norm": 1.716511607170105, "learning_rate": 8.987078568245953e-06, "loss": 0.0662, "step": 47160 }, { "epoch": 3.5028961829793555, "grad_norm": 1.7500982284545898, "learning_rate": 8.982622902123868e-06, "loss": 0.0447, "step": 47170 }, { "epoch": 3.503638793999703, "grad_norm": 1.2955238819122314, "learning_rate": 8.978167236001783e-06, "loss": 0.0674, "step": 47180 }, { "epoch": 3.5043814050200504, "grad_norm": 0.32319340109825134, "learning_rate": 8.973711569879696e-06, "loss": 0.0294, "step": 47190 }, { "epoch": 3.505124016040398, "grad_norm": 1.2012195587158203, "learning_rate": 8.969255903757613e-06, "loss": 0.085, "step": 47200 }, { "epoch": 3.5058666270607457, "grad_norm": 1.1960065364837646, "learning_rate": 8.964800237635526e-06, "loss": 0.0697, "step": 47210 }, { "epoch": 3.506609238081093, "grad_norm": 0.929478108882904, "learning_rate": 8.960344571513441e-06, "loss": 0.0367, "step": 47220 }, { "epoch": 3.5073518491014406, "grad_norm": 0.9691451787948608, "learning_rate": 8.955888905391356e-06, "loss": 0.0622, "step": 47230 }, { "epoch": 3.508094460121788, "grad_norm": 1.1364026069641113, "learning_rate": 8.951433239269271e-06, "loss": 0.0581, "step": 47240 }, { "epoch": 3.5088370711421355, "grad_norm": 1.0724977254867554, "learning_rate": 8.946977573147186e-06, "loss": 0.06, "step": 47250 }, { "epoch": 3.5095796821624834, "grad_norm": 2.1212868690490723, "learning_rate": 8.9425219070251e-06, "loss": 0.0663, "step": 47260 }, { "epoch": 3.510322293182831, "grad_norm": 0.43843552470207214, "learning_rate": 8.938066240903015e-06, "loss": 0.0727, "step": 47270 }, { "epoch": 3.5110649042031783, "grad_norm": 1.7042205333709717, "learning_rate": 8.933610574780931e-06, "loss": 0.0521, "step": 47280 }, { "epoch": 3.511807515223526, "grad_norm": 1.1995595693588257, "learning_rate": 8.929154908658845e-06, "loss": 0.0867, "step": 47290 }, { "epoch": 3.512550126243873, "grad_norm": 0.9248149394989014, "learning_rate": 8.92469924253676e-06, "loss": 0.0627, "step": 47300 }, { "epoch": 3.513292737264221, "grad_norm": 0.9813995957374573, "learning_rate": 8.920243576414675e-06, "loss": 0.0924, "step": 47310 }, { "epoch": 3.5140353482845685, "grad_norm": 1.9305483102798462, "learning_rate": 8.91578791029259e-06, "loss": 0.054, "step": 47320 }, { "epoch": 3.514777959304916, "grad_norm": 0.959563672542572, "learning_rate": 8.911332244170503e-06, "loss": 0.0512, "step": 47330 }, { "epoch": 3.515520570325264, "grad_norm": 0.4607963263988495, "learning_rate": 8.906876578048418e-06, "loss": 0.0822, "step": 47340 }, { "epoch": 3.5162631813456113, "grad_norm": 0.6391094923019409, "learning_rate": 8.902420911926335e-06, "loss": 0.0722, "step": 47350 }, { "epoch": 3.517005792365959, "grad_norm": 1.5800341367721558, "learning_rate": 8.897965245804248e-06, "loss": 0.0268, "step": 47360 }, { "epoch": 3.5177484033863062, "grad_norm": 0.609835147857666, "learning_rate": 8.893509579682163e-06, "loss": 0.0625, "step": 47370 }, { "epoch": 3.5184910144066537, "grad_norm": 2.046144723892212, "learning_rate": 8.889053913560076e-06, "loss": 0.0691, "step": 47380 }, { "epoch": 3.5192336254270016, "grad_norm": 1.5540525913238525, "learning_rate": 8.884598247437993e-06, "loss": 0.0637, "step": 47390 }, { "epoch": 3.519976236447349, "grad_norm": 0.6543061137199402, "learning_rate": 8.880142581315908e-06, "loss": 0.0655, "step": 47400 }, { "epoch": 3.5207188474676965, "grad_norm": 0.7677350640296936, "learning_rate": 8.875686915193821e-06, "loss": 0.0481, "step": 47410 }, { "epoch": 3.521461458488044, "grad_norm": 0.8277533054351807, "learning_rate": 8.871231249071736e-06, "loss": 0.0858, "step": 47420 }, { "epoch": 3.5222040695083914, "grad_norm": 0.4589194059371948, "learning_rate": 8.866775582949651e-06, "loss": 0.0333, "step": 47430 }, { "epoch": 3.5229466805287393, "grad_norm": 1.8496214151382446, "learning_rate": 8.862319916827566e-06, "loss": 0.0732, "step": 47440 }, { "epoch": 3.5236892915490867, "grad_norm": 2.326258659362793, "learning_rate": 8.85786425070548e-06, "loss": 0.0875, "step": 47450 }, { "epoch": 3.524431902569434, "grad_norm": 3.8180079460144043, "learning_rate": 8.853408584583396e-06, "loss": 0.0932, "step": 47460 }, { "epoch": 3.5251745135897816, "grad_norm": 2.513268232345581, "learning_rate": 8.848952918461311e-06, "loss": 0.065, "step": 47470 }, { "epoch": 3.525917124610129, "grad_norm": 0.91473788022995, "learning_rate": 8.844497252339224e-06, "loss": 0.068, "step": 47480 }, { "epoch": 3.526659735630477, "grad_norm": 1.168892502784729, "learning_rate": 8.84004158621714e-06, "loss": 0.0351, "step": 47490 }, { "epoch": 3.5274023466508244, "grad_norm": 0.857007622718811, "learning_rate": 8.835585920095054e-06, "loss": 0.08, "step": 47500 }, { "epoch": 3.528144957671172, "grad_norm": 1.0127304792404175, "learning_rate": 8.83113025397297e-06, "loss": 0.0736, "step": 47510 }, { "epoch": 3.5288875686915193, "grad_norm": 1.726643681526184, "learning_rate": 8.826674587850884e-06, "loss": 0.0578, "step": 47520 }, { "epoch": 3.5296301797118668, "grad_norm": 1.7665687799453735, "learning_rate": 8.822218921728798e-06, "loss": 0.061, "step": 47530 }, { "epoch": 3.5303727907322147, "grad_norm": 0.5315186977386475, "learning_rate": 8.817763255606714e-06, "loss": 0.0641, "step": 47540 }, { "epoch": 3.531115401752562, "grad_norm": 1.4870011806488037, "learning_rate": 8.813307589484628e-06, "loss": 0.0726, "step": 47550 }, { "epoch": 3.5318580127729096, "grad_norm": 1.0774098634719849, "learning_rate": 8.808851923362543e-06, "loss": 0.0726, "step": 47560 }, { "epoch": 3.532600623793257, "grad_norm": 2.8988194465637207, "learning_rate": 8.80439625724046e-06, "loss": 0.0568, "step": 47570 }, { "epoch": 3.5333432348136045, "grad_norm": 0.4730290472507477, "learning_rate": 8.799940591118373e-06, "loss": 0.053, "step": 47580 }, { "epoch": 3.5340858458339524, "grad_norm": 1.4914735555648804, "learning_rate": 8.795484924996288e-06, "loss": 0.0601, "step": 47590 }, { "epoch": 3.5348284568543, "grad_norm": 1.6329556703567505, "learning_rate": 8.791029258874201e-06, "loss": 0.0553, "step": 47600 }, { "epoch": 3.5355710678746473, "grad_norm": 2.3614673614501953, "learning_rate": 8.786573592752118e-06, "loss": 0.0744, "step": 47610 }, { "epoch": 3.5363136788949947, "grad_norm": 2.2033894062042236, "learning_rate": 8.782117926630031e-06, "loss": 0.0615, "step": 47620 }, { "epoch": 3.537056289915342, "grad_norm": 0.42361772060394287, "learning_rate": 8.777662260507946e-06, "loss": 0.0494, "step": 47630 }, { "epoch": 3.53779890093569, "grad_norm": 1.1795815229415894, "learning_rate": 8.773206594385861e-06, "loss": 0.0633, "step": 47640 }, { "epoch": 3.5385415119560375, "grad_norm": 1.4586540460586548, "learning_rate": 8.768750928263776e-06, "loss": 0.0525, "step": 47650 }, { "epoch": 3.539284122976385, "grad_norm": 0.6656326055526733, "learning_rate": 8.764295262141691e-06, "loss": 0.0831, "step": 47660 }, { "epoch": 3.5400267339967324, "grad_norm": 0.5945910215377808, "learning_rate": 8.759839596019604e-06, "loss": 0.0444, "step": 47670 }, { "epoch": 3.54076934501708, "grad_norm": 1.4914181232452393, "learning_rate": 8.75538392989752e-06, "loss": 0.0344, "step": 47680 }, { "epoch": 3.5415119560374277, "grad_norm": 1.5932813882827759, "learning_rate": 8.750928263775436e-06, "loss": 0.0624, "step": 47690 }, { "epoch": 3.542254567057775, "grad_norm": 1.4807531833648682, "learning_rate": 8.74647259765335e-06, "loss": 0.0671, "step": 47700 }, { "epoch": 3.5429971780781226, "grad_norm": 3.4547231197357178, "learning_rate": 8.742016931531264e-06, "loss": 0.0489, "step": 47710 }, { "epoch": 3.54373978909847, "grad_norm": 1.1375788450241089, "learning_rate": 8.73756126540918e-06, "loss": 0.0641, "step": 47720 }, { "epoch": 3.5444824001188175, "grad_norm": 1.481046199798584, "learning_rate": 8.733105599287094e-06, "loss": 0.0676, "step": 47730 }, { "epoch": 3.5452250111391654, "grad_norm": 0.4229665994644165, "learning_rate": 8.728649933165007e-06, "loss": 0.0647, "step": 47740 }, { "epoch": 3.545967622159513, "grad_norm": 1.467894196510315, "learning_rate": 8.724194267042922e-06, "loss": 0.0597, "step": 47750 }, { "epoch": 3.5467102331798603, "grad_norm": 0.5830600261688232, "learning_rate": 8.719738600920839e-06, "loss": 0.0649, "step": 47760 }, { "epoch": 3.547452844200208, "grad_norm": 1.7029845714569092, "learning_rate": 8.715282934798752e-06, "loss": 0.104, "step": 47770 }, { "epoch": 3.5481954552205552, "grad_norm": 0.429775208234787, "learning_rate": 8.710827268676667e-06, "loss": 0.0556, "step": 47780 }, { "epoch": 3.548938066240903, "grad_norm": 2.0149717330932617, "learning_rate": 8.70637160255458e-06, "loss": 0.0864, "step": 47790 }, { "epoch": 3.5496806772612506, "grad_norm": 3.289201498031616, "learning_rate": 8.701915936432497e-06, "loss": 0.077, "step": 47800 }, { "epoch": 3.550423288281598, "grad_norm": 0.7623452544212341, "learning_rate": 8.697460270310412e-06, "loss": 0.063, "step": 47810 }, { "epoch": 3.5511658993019455, "grad_norm": 1.59382164478302, "learning_rate": 8.693004604188326e-06, "loss": 0.0783, "step": 47820 }, { "epoch": 3.551908510322293, "grad_norm": 1.199479579925537, "learning_rate": 8.688548938066242e-06, "loss": 0.0654, "step": 47830 }, { "epoch": 3.552651121342641, "grad_norm": 0.7450114488601685, "learning_rate": 8.684093271944156e-06, "loss": 0.0612, "step": 47840 }, { "epoch": 3.5533937323629883, "grad_norm": 2.706019163131714, "learning_rate": 8.67963760582207e-06, "loss": 0.0717, "step": 47850 }, { "epoch": 3.5541363433833357, "grad_norm": 1.1404179334640503, "learning_rate": 8.675181939699986e-06, "loss": 0.0612, "step": 47860 }, { "epoch": 3.5548789544036836, "grad_norm": 4.893725872039795, "learning_rate": 8.6707262735779e-06, "loss": 0.0612, "step": 47870 }, { "epoch": 3.5556215654240306, "grad_norm": 2.2265067100524902, "learning_rate": 8.666270607455816e-06, "loss": 0.0517, "step": 47880 }, { "epoch": 3.5563641764443785, "grad_norm": 0.6842568516731262, "learning_rate": 8.661814941333729e-06, "loss": 0.0562, "step": 47890 }, { "epoch": 3.557106787464726, "grad_norm": 0.7433666586875916, "learning_rate": 8.657359275211644e-06, "loss": 0.0586, "step": 47900 }, { "epoch": 3.5578493984850734, "grad_norm": 1.5881272554397583, "learning_rate": 8.652903609089559e-06, "loss": 0.0667, "step": 47910 }, { "epoch": 3.5585920095054213, "grad_norm": 1.4796943664550781, "learning_rate": 8.648447942967474e-06, "loss": 0.0797, "step": 47920 }, { "epoch": 3.5593346205257688, "grad_norm": 1.468156337738037, "learning_rate": 8.643992276845389e-06, "loss": 0.0655, "step": 47930 }, { "epoch": 3.560077231546116, "grad_norm": 1.8605856895446777, "learning_rate": 8.639536610723302e-06, "loss": 0.0929, "step": 47940 }, { "epoch": 3.5608198425664637, "grad_norm": 0.9194366335868835, "learning_rate": 8.635080944601219e-06, "loss": 0.0498, "step": 47950 }, { "epoch": 3.561562453586811, "grad_norm": 1.5396569967269897, "learning_rate": 8.630625278479132e-06, "loss": 0.0706, "step": 47960 }, { "epoch": 3.562305064607159, "grad_norm": 2.437840700149536, "learning_rate": 8.626169612357047e-06, "loss": 0.0877, "step": 47970 }, { "epoch": 3.5630476756275065, "grad_norm": 0.9188141822814941, "learning_rate": 8.621713946234964e-06, "loss": 0.0793, "step": 47980 }, { "epoch": 3.563790286647854, "grad_norm": 1.7149858474731445, "learning_rate": 8.617258280112877e-06, "loss": 0.0615, "step": 47990 }, { "epoch": 3.5645328976682014, "grad_norm": 0.9212315082550049, "learning_rate": 8.612802613990792e-06, "loss": 0.0857, "step": 48000 }, { "epoch": 3.565275508688549, "grad_norm": 1.6391431093215942, "learning_rate": 8.608346947868706e-06, "loss": 0.0434, "step": 48010 }, { "epoch": 3.5660181197088967, "grad_norm": 0.9591582417488098, "learning_rate": 8.603891281746622e-06, "loss": 0.0665, "step": 48020 }, { "epoch": 3.566760730729244, "grad_norm": 0.856239914894104, "learning_rate": 8.599435615624536e-06, "loss": 0.0416, "step": 48030 }, { "epoch": 3.5675033417495916, "grad_norm": 0.6518556475639343, "learning_rate": 8.59497994950245e-06, "loss": 0.0264, "step": 48040 }, { "epoch": 3.568245952769939, "grad_norm": 1.1841713190078735, "learning_rate": 8.590524283380365e-06, "loss": 0.0806, "step": 48050 }, { "epoch": 3.5689885637902865, "grad_norm": 0.7646443247795105, "learning_rate": 8.58606861725828e-06, "loss": 0.0474, "step": 48060 }, { "epoch": 3.5697311748106344, "grad_norm": 2.25919246673584, "learning_rate": 8.581612951136195e-06, "loss": 0.0511, "step": 48070 }, { "epoch": 3.570473785830982, "grad_norm": 2.300975799560547, "learning_rate": 8.577157285014109e-06, "loss": 0.0626, "step": 48080 }, { "epoch": 3.5712163968513293, "grad_norm": 0.9677648544311523, "learning_rate": 8.572701618892024e-06, "loss": 0.0506, "step": 48090 }, { "epoch": 3.5719590078716768, "grad_norm": 2.2813456058502197, "learning_rate": 8.56824595276994e-06, "loss": 0.0981, "step": 48100 }, { "epoch": 3.572701618892024, "grad_norm": 1.1389786005020142, "learning_rate": 8.563790286647854e-06, "loss": 0.045, "step": 48110 }, { "epoch": 3.573444229912372, "grad_norm": 0.6446773409843445, "learning_rate": 8.559334620525769e-06, "loss": 0.0623, "step": 48120 }, { "epoch": 3.5741868409327195, "grad_norm": 0.8095260858535767, "learning_rate": 8.554878954403684e-06, "loss": 0.039, "step": 48130 }, { "epoch": 3.574929451953067, "grad_norm": 1.6221411228179932, "learning_rate": 8.550423288281599e-06, "loss": 0.0335, "step": 48140 }, { "epoch": 3.5756720629734144, "grad_norm": 2.590031385421753, "learning_rate": 8.545967622159512e-06, "loss": 0.0626, "step": 48150 }, { "epoch": 3.576414673993762, "grad_norm": 1.4469174146652222, "learning_rate": 8.541511956037427e-06, "loss": 0.0582, "step": 48160 }, { "epoch": 3.57715728501411, "grad_norm": 1.5746777057647705, "learning_rate": 8.537056289915344e-06, "loss": 0.0842, "step": 48170 }, { "epoch": 3.5778998960344572, "grad_norm": 1.6506327390670776, "learning_rate": 8.532600623793257e-06, "loss": 0.0394, "step": 48180 }, { "epoch": 3.5786425070548047, "grad_norm": 1.0401891469955444, "learning_rate": 8.528144957671172e-06, "loss": 0.0653, "step": 48190 }, { "epoch": 3.579385118075152, "grad_norm": 1.626905918121338, "learning_rate": 8.523689291549085e-06, "loss": 0.0546, "step": 48200 }, { "epoch": 3.5801277290954996, "grad_norm": 1.3999053239822388, "learning_rate": 8.519233625427002e-06, "loss": 0.0506, "step": 48210 }, { "epoch": 3.5808703401158475, "grad_norm": 1.21640944480896, "learning_rate": 8.514777959304917e-06, "loss": 0.0772, "step": 48220 }, { "epoch": 3.581612951136195, "grad_norm": 0.853153645992279, "learning_rate": 8.51032229318283e-06, "loss": 0.0399, "step": 48230 }, { "epoch": 3.5823555621565424, "grad_norm": 1.5424240827560425, "learning_rate": 8.505866627060747e-06, "loss": 0.0593, "step": 48240 }, { "epoch": 3.58309817317689, "grad_norm": 0.3190561830997467, "learning_rate": 8.50141096093866e-06, "loss": 0.0463, "step": 48250 }, { "epoch": 3.5838407841972373, "grad_norm": 1.145538568496704, "learning_rate": 8.496955294816575e-06, "loss": 0.044, "step": 48260 }, { "epoch": 3.584583395217585, "grad_norm": 1.7267332077026367, "learning_rate": 8.49249962869449e-06, "loss": 0.065, "step": 48270 }, { "epoch": 3.5853260062379326, "grad_norm": 0.45990875363349915, "learning_rate": 8.488043962572405e-06, "loss": 0.0485, "step": 48280 }, { "epoch": 3.58606861725828, "grad_norm": 0.5736406445503235, "learning_rate": 8.48358829645032e-06, "loss": 0.0597, "step": 48290 }, { "epoch": 3.5868112282786275, "grad_norm": 0.34725332260131836, "learning_rate": 8.479132630328234e-06, "loss": 0.0634, "step": 48300 }, { "epoch": 3.587553839298975, "grad_norm": 2.8280141353607178, "learning_rate": 8.474676964206149e-06, "loss": 0.0553, "step": 48310 }, { "epoch": 3.588296450319323, "grad_norm": 0.43756672739982605, "learning_rate": 8.470221298084064e-06, "loss": 0.0565, "step": 48320 }, { "epoch": 3.5890390613396703, "grad_norm": 4.297688961029053, "learning_rate": 8.465765631961979e-06, "loss": 0.0723, "step": 48330 }, { "epoch": 3.5897816723600178, "grad_norm": 1.2074358463287354, "learning_rate": 8.461309965839894e-06, "loss": 0.0601, "step": 48340 }, { "epoch": 3.5905242833803652, "grad_norm": 1.4937044382095337, "learning_rate": 8.456854299717807e-06, "loss": 0.0968, "step": 48350 }, { "epoch": 3.5912668944007127, "grad_norm": 1.5273339748382568, "learning_rate": 8.452398633595724e-06, "loss": 0.0736, "step": 48360 }, { "epoch": 3.5920095054210606, "grad_norm": 2.0641915798187256, "learning_rate": 8.447942967473637e-06, "loss": 0.0649, "step": 48370 }, { "epoch": 3.592752116441408, "grad_norm": 1.619729995727539, "learning_rate": 8.443487301351552e-06, "loss": 0.0689, "step": 48380 }, { "epoch": 3.5934947274617555, "grad_norm": 2.8663530349731445, "learning_rate": 8.439031635229469e-06, "loss": 0.0823, "step": 48390 }, { "epoch": 3.594237338482103, "grad_norm": 0.41677528619766235, "learning_rate": 8.434575969107382e-06, "loss": 0.0444, "step": 48400 }, { "epoch": 3.5949799495024504, "grad_norm": 1.2534587383270264, "learning_rate": 8.430120302985297e-06, "loss": 0.046, "step": 48410 }, { "epoch": 3.5957225605227983, "grad_norm": 1.1408332586288452, "learning_rate": 8.42566463686321e-06, "loss": 0.0431, "step": 48420 }, { "epoch": 3.5964651715431457, "grad_norm": 1.304916262626648, "learning_rate": 8.421208970741127e-06, "loss": 0.0525, "step": 48430 }, { "epoch": 3.597207782563493, "grad_norm": 1.2031378746032715, "learning_rate": 8.41675330461904e-06, "loss": 0.0647, "step": 48440 }, { "epoch": 3.597950393583841, "grad_norm": 1.5209531784057617, "learning_rate": 8.412297638496955e-06, "loss": 0.0614, "step": 48450 }, { "epoch": 3.598693004604188, "grad_norm": 1.3445301055908203, "learning_rate": 8.40784197237487e-06, "loss": 0.0793, "step": 48460 }, { "epoch": 3.599435615624536, "grad_norm": 0.48478415608406067, "learning_rate": 8.403386306252785e-06, "loss": 0.0781, "step": 48470 }, { "epoch": 3.6001782266448834, "grad_norm": 0.9039621353149414, "learning_rate": 8.3989306401307e-06, "loss": 0.0734, "step": 48480 }, { "epoch": 3.600920837665231, "grad_norm": 1.992552638053894, "learning_rate": 8.394474974008613e-06, "loss": 0.0582, "step": 48490 }, { "epoch": 3.6016634486855788, "grad_norm": 1.8207603693008423, "learning_rate": 8.39001930788653e-06, "loss": 0.0443, "step": 48500 }, { "epoch": 3.602406059705926, "grad_norm": 1.5029350519180298, "learning_rate": 8.385563641764445e-06, "loss": 0.0553, "step": 48510 }, { "epoch": 3.6031486707262737, "grad_norm": 1.8282309770584106, "learning_rate": 8.381107975642358e-06, "loss": 0.0408, "step": 48520 }, { "epoch": 3.603891281746621, "grad_norm": 1.2360031604766846, "learning_rate": 8.376652309520273e-06, "loss": 0.0606, "step": 48530 }, { "epoch": 3.6046338927669686, "grad_norm": 2.7400107383728027, "learning_rate": 8.372196643398188e-06, "loss": 0.0532, "step": 48540 }, { "epoch": 3.6053765037873164, "grad_norm": 1.5960508584976196, "learning_rate": 8.367740977276103e-06, "loss": 0.0641, "step": 48550 }, { "epoch": 3.606119114807664, "grad_norm": 1.4296334981918335, "learning_rate": 8.363285311154018e-06, "loss": 0.0749, "step": 48560 }, { "epoch": 3.6068617258280113, "grad_norm": 2.3336386680603027, "learning_rate": 8.358829645031932e-06, "loss": 0.0739, "step": 48570 }, { "epoch": 3.607604336848359, "grad_norm": 1.2750415802001953, "learning_rate": 8.354373978909848e-06, "loss": 0.0661, "step": 48580 }, { "epoch": 3.6083469478687062, "grad_norm": 1.1788580417633057, "learning_rate": 8.349918312787762e-06, "loss": 0.0629, "step": 48590 }, { "epoch": 3.609089558889054, "grad_norm": 1.36868155002594, "learning_rate": 8.345462646665677e-06, "loss": 0.0625, "step": 48600 }, { "epoch": 3.6098321699094016, "grad_norm": 1.373689889907837, "learning_rate": 8.34100698054359e-06, "loss": 0.0562, "step": 48610 }, { "epoch": 3.610574780929749, "grad_norm": 2.2986576557159424, "learning_rate": 8.336551314421507e-06, "loss": 0.0644, "step": 48620 }, { "epoch": 3.6113173919500965, "grad_norm": 0.9667069911956787, "learning_rate": 8.332095648299422e-06, "loss": 0.0964, "step": 48630 }, { "epoch": 3.612060002970444, "grad_norm": 1.5480561256408691, "learning_rate": 8.327639982177335e-06, "loss": 0.0456, "step": 48640 }, { "epoch": 3.612802613990792, "grad_norm": 2.361801862716675, "learning_rate": 8.323184316055252e-06, "loss": 0.0599, "step": 48650 }, { "epoch": 3.6135452250111393, "grad_norm": 0.5613052845001221, "learning_rate": 8.318728649933165e-06, "loss": 0.0428, "step": 48660 }, { "epoch": 3.6142878360314867, "grad_norm": 2.242196559906006, "learning_rate": 8.31427298381108e-06, "loss": 0.0627, "step": 48670 }, { "epoch": 3.615030447051834, "grad_norm": 0.8085373640060425, "learning_rate": 8.309817317688995e-06, "loss": 0.0531, "step": 48680 }, { "epoch": 3.6157730580721816, "grad_norm": 1.4560467004776, "learning_rate": 8.30536165156691e-06, "loss": 0.054, "step": 48690 }, { "epoch": 3.6165156690925295, "grad_norm": 1.3971903324127197, "learning_rate": 8.300905985444825e-06, "loss": 0.039, "step": 48700 }, { "epoch": 3.617258280112877, "grad_norm": 1.9902868270874023, "learning_rate": 8.296450319322738e-06, "loss": 0.0625, "step": 48710 }, { "epoch": 3.6180008911332244, "grad_norm": 2.5948617458343506, "learning_rate": 8.291994653200653e-06, "loss": 0.0437, "step": 48720 }, { "epoch": 3.618743502153572, "grad_norm": 1.1913443803787231, "learning_rate": 8.287538987078568e-06, "loss": 0.0699, "step": 48730 }, { "epoch": 3.6194861131739193, "grad_norm": 1.5906063318252563, "learning_rate": 8.283083320956483e-06, "loss": 0.0839, "step": 48740 }, { "epoch": 3.6202287241942672, "grad_norm": 1.980405569076538, "learning_rate": 8.278627654834398e-06, "loss": 0.0694, "step": 48750 }, { "epoch": 3.6209713352146147, "grad_norm": 0.5400698781013489, "learning_rate": 8.274171988712313e-06, "loss": 0.065, "step": 48760 }, { "epoch": 3.621713946234962, "grad_norm": 2.1848058700561523, "learning_rate": 8.269716322590228e-06, "loss": 0.0694, "step": 48770 }, { "epoch": 3.6224565572553096, "grad_norm": 1.3808951377868652, "learning_rate": 8.265260656468141e-06, "loss": 0.0496, "step": 48780 }, { "epoch": 3.623199168275657, "grad_norm": 1.0675551891326904, "learning_rate": 8.260804990346056e-06, "loss": 0.0807, "step": 48790 }, { "epoch": 3.623941779296005, "grad_norm": 0.58694988489151, "learning_rate": 8.256349324223973e-06, "loss": 0.0794, "step": 48800 }, { "epoch": 3.6246843903163524, "grad_norm": 1.7244460582733154, "learning_rate": 8.251893658101886e-06, "loss": 0.0631, "step": 48810 }, { "epoch": 3.6254270013367, "grad_norm": 0.8975712060928345, "learning_rate": 8.247437991979801e-06, "loss": 0.0395, "step": 48820 }, { "epoch": 3.6261696123570473, "grad_norm": 0.6970779299736023, "learning_rate": 8.242982325857715e-06, "loss": 0.0548, "step": 48830 }, { "epoch": 3.6269122233773947, "grad_norm": 3.2898783683776855, "learning_rate": 8.238526659735631e-06, "loss": 0.0594, "step": 48840 }, { "epoch": 3.6276548343977426, "grad_norm": 1.9671893119812012, "learning_rate": 8.234070993613545e-06, "loss": 0.0702, "step": 48850 }, { "epoch": 3.62839744541809, "grad_norm": 0.6699275970458984, "learning_rate": 8.22961532749146e-06, "loss": 0.0674, "step": 48860 }, { "epoch": 3.6291400564384375, "grad_norm": 1.7382103204727173, "learning_rate": 8.225159661369375e-06, "loss": 0.0578, "step": 48870 }, { "epoch": 3.629882667458785, "grad_norm": 3.2885706424713135, "learning_rate": 8.22070399524729e-06, "loss": 0.0565, "step": 48880 }, { "epoch": 3.6306252784791324, "grad_norm": 1.1703078746795654, "learning_rate": 8.216248329125205e-06, "loss": 0.0696, "step": 48890 }, { "epoch": 3.6313678894994803, "grad_norm": 0.3696410059928894, "learning_rate": 8.211792663003118e-06, "loss": 0.0693, "step": 48900 }, { "epoch": 3.6321105005198278, "grad_norm": 2.525710105895996, "learning_rate": 8.207336996881035e-06, "loss": 0.0648, "step": 48910 }, { "epoch": 3.632853111540175, "grad_norm": 1.2038122415542603, "learning_rate": 8.20288133075895e-06, "loss": 0.0612, "step": 48920 }, { "epoch": 3.6335957225605227, "grad_norm": 2.1405258178710938, "learning_rate": 8.198425664636863e-06, "loss": 0.0607, "step": 48930 }, { "epoch": 3.63433833358087, "grad_norm": 1.3878464698791504, "learning_rate": 8.193969998514778e-06, "loss": 0.0556, "step": 48940 }, { "epoch": 3.635080944601218, "grad_norm": 0.8806987404823303, "learning_rate": 8.189514332392693e-06, "loss": 0.0496, "step": 48950 }, { "epoch": 3.6358235556215655, "grad_norm": 0.7001392841339111, "learning_rate": 8.185058666270608e-06, "loss": 0.0623, "step": 48960 }, { "epoch": 3.636566166641913, "grad_norm": 1.2697521448135376, "learning_rate": 8.180603000148523e-06, "loss": 0.0804, "step": 48970 }, { "epoch": 3.6373087776622604, "grad_norm": 0.7916688323020935, "learning_rate": 8.176147334026436e-06, "loss": 0.0562, "step": 48980 }, { "epoch": 3.638051388682608, "grad_norm": 1.4177652597427368, "learning_rate": 8.171691667904353e-06, "loss": 0.0454, "step": 48990 }, { "epoch": 3.6387939997029557, "grad_norm": 2.105250120162964, "learning_rate": 8.167236001782266e-06, "loss": 0.0614, "step": 49000 }, { "epoch": 3.639536610723303, "grad_norm": 1.3366777896881104, "learning_rate": 8.162780335660181e-06, "loss": 0.0743, "step": 49010 }, { "epoch": 3.6402792217436506, "grad_norm": 0.6253573298454285, "learning_rate": 8.158324669538096e-06, "loss": 0.0649, "step": 49020 }, { "epoch": 3.6410218327639985, "grad_norm": 0.4928571879863739, "learning_rate": 8.153869003416011e-06, "loss": 0.069, "step": 49030 }, { "epoch": 3.6417644437843455, "grad_norm": 0.9008927345275879, "learning_rate": 8.149413337293926e-06, "loss": 0.0508, "step": 49040 }, { "epoch": 3.6425070548046934, "grad_norm": 0.6431163549423218, "learning_rate": 8.14495767117184e-06, "loss": 0.0758, "step": 49050 }, { "epoch": 3.643249665825041, "grad_norm": 3.3478305339813232, "learning_rate": 8.140502005049756e-06, "loss": 0.0692, "step": 49060 }, { "epoch": 3.6439922768453883, "grad_norm": 1.4182404279708862, "learning_rate": 8.13604633892767e-06, "loss": 0.0813, "step": 49070 }, { "epoch": 3.644734887865736, "grad_norm": 0.8223309516906738, "learning_rate": 8.131590672805584e-06, "loss": 0.0589, "step": 49080 }, { "epoch": 3.6454774988860836, "grad_norm": 3.398197889328003, "learning_rate": 8.1271350066835e-06, "loss": 0.049, "step": 49090 }, { "epoch": 3.646220109906431, "grad_norm": 0.7745069861412048, "learning_rate": 8.122679340561414e-06, "loss": 0.0341, "step": 49100 }, { "epoch": 3.6469627209267785, "grad_norm": 1.4048975706100464, "learning_rate": 8.11822367443933e-06, "loss": 0.0626, "step": 49110 }, { "epoch": 3.647705331947126, "grad_norm": 1.0643982887268066, "learning_rate": 8.113768008317243e-06, "loss": 0.0543, "step": 49120 }, { "epoch": 3.648447942967474, "grad_norm": 2.2764272689819336, "learning_rate": 8.109312342195158e-06, "loss": 0.0827, "step": 49130 }, { "epoch": 3.6491905539878213, "grad_norm": 1.6373050212860107, "learning_rate": 8.104856676073073e-06, "loss": 0.0593, "step": 49140 }, { "epoch": 3.649933165008169, "grad_norm": 0.9760125875473022, "learning_rate": 8.100401009950988e-06, "loss": 0.0482, "step": 49150 }, { "epoch": 3.6506757760285162, "grad_norm": 0.3532949388027191, "learning_rate": 8.095945343828903e-06, "loss": 0.0667, "step": 49160 }, { "epoch": 3.6514183870488637, "grad_norm": 1.1069635152816772, "learning_rate": 8.091489677706818e-06, "loss": 0.054, "step": 49170 }, { "epoch": 3.6521609980692116, "grad_norm": 2.3868649005889893, "learning_rate": 8.087034011584733e-06, "loss": 0.0543, "step": 49180 }, { "epoch": 3.652903609089559, "grad_norm": 2.724820375442505, "learning_rate": 8.082578345462646e-06, "loss": 0.0669, "step": 49190 }, { "epoch": 3.6536462201099065, "grad_norm": 1.579540491104126, "learning_rate": 8.078122679340561e-06, "loss": 0.0574, "step": 49200 }, { "epoch": 3.654388831130254, "grad_norm": 1.0479410886764526, "learning_rate": 8.073667013218478e-06, "loss": 0.0563, "step": 49210 }, { "epoch": 3.6551314421506014, "grad_norm": 0.6656275391578674, "learning_rate": 8.069211347096391e-06, "loss": 0.0449, "step": 49220 }, { "epoch": 3.6558740531709493, "grad_norm": 0.6518800258636475, "learning_rate": 8.064755680974306e-06, "loss": 0.0884, "step": 49230 }, { "epoch": 3.6566166641912967, "grad_norm": 0.2461584061384201, "learning_rate": 8.06030001485222e-06, "loss": 0.0426, "step": 49240 }, { "epoch": 3.657359275211644, "grad_norm": 1.2033755779266357, "learning_rate": 8.055844348730136e-06, "loss": 0.052, "step": 49250 }, { "epoch": 3.6581018862319916, "grad_norm": 2.62395977973938, "learning_rate": 8.051388682608051e-06, "loss": 0.1079, "step": 49260 }, { "epoch": 3.658844497252339, "grad_norm": 0.4178759455680847, "learning_rate": 8.046933016485964e-06, "loss": 0.0778, "step": 49270 }, { "epoch": 3.659587108272687, "grad_norm": 1.5172581672668457, "learning_rate": 8.042477350363881e-06, "loss": 0.1025, "step": 49280 }, { "epoch": 3.6603297192930344, "grad_norm": 2.6945109367370605, "learning_rate": 8.038021684241794e-06, "loss": 0.0959, "step": 49290 }, { "epoch": 3.661072330313382, "grad_norm": 1.2761311531066895, "learning_rate": 8.03356601811971e-06, "loss": 0.0477, "step": 49300 }, { "epoch": 3.6618149413337293, "grad_norm": 0.8883626461029053, "learning_rate": 8.029110351997623e-06, "loss": 0.0615, "step": 49310 }, { "epoch": 3.6625575523540768, "grad_norm": 0.7210109829902649, "learning_rate": 8.02465468587554e-06, "loss": 0.0618, "step": 49320 }, { "epoch": 3.6633001633744247, "grad_norm": 1.4874513149261475, "learning_rate": 8.020199019753454e-06, "loss": 0.0522, "step": 49330 }, { "epoch": 3.664042774394772, "grad_norm": 0.9390859603881836, "learning_rate": 8.015743353631368e-06, "loss": 0.0567, "step": 49340 }, { "epoch": 3.6647853854151196, "grad_norm": 0.896518349647522, "learning_rate": 8.011287687509283e-06, "loss": 0.0589, "step": 49350 }, { "epoch": 3.665527996435467, "grad_norm": 2.0531513690948486, "learning_rate": 8.006832021387198e-06, "loss": 0.0871, "step": 49360 }, { "epoch": 3.6662706074558145, "grad_norm": 1.1506352424621582, "learning_rate": 8.002376355265113e-06, "loss": 0.068, "step": 49370 }, { "epoch": 3.6670132184761624, "grad_norm": 0.7232264280319214, "learning_rate": 7.997920689143028e-06, "loss": 0.0859, "step": 49380 }, { "epoch": 3.66775582949651, "grad_norm": 2.220487594604492, "learning_rate": 7.99346502302094e-06, "loss": 0.0333, "step": 49390 }, { "epoch": 3.6684984405168573, "grad_norm": 0.4155381917953491, "learning_rate": 7.989009356898858e-06, "loss": 0.0488, "step": 49400 }, { "epoch": 3.6692410515372047, "grad_norm": 0.726426362991333, "learning_rate": 7.98455369077677e-06, "loss": 0.0712, "step": 49410 }, { "epoch": 3.669983662557552, "grad_norm": 1.152137041091919, "learning_rate": 7.980098024654686e-06, "loss": 0.0479, "step": 49420 }, { "epoch": 3.6707262735779, "grad_norm": 0.6122065782546997, "learning_rate": 7.9756423585326e-06, "loss": 0.0538, "step": 49430 }, { "epoch": 3.6714688845982475, "grad_norm": 1.2292253971099854, "learning_rate": 7.971186692410516e-06, "loss": 0.046, "step": 49440 }, { "epoch": 3.672211495618595, "grad_norm": 0.447689950466156, "learning_rate": 7.96673102628843e-06, "loss": 0.0483, "step": 49450 }, { "epoch": 3.6729541066389424, "grad_norm": 0.4693281352519989, "learning_rate": 7.962275360166344e-06, "loss": 0.0252, "step": 49460 }, { "epoch": 3.67369671765929, "grad_norm": 1.050255298614502, "learning_rate": 7.95781969404426e-06, "loss": 0.0934, "step": 49470 }, { "epoch": 3.6744393286796377, "grad_norm": 1.7105246782302856, "learning_rate": 7.953364027922174e-06, "loss": 0.0491, "step": 49480 }, { "epoch": 3.675181939699985, "grad_norm": 1.1339107751846313, "learning_rate": 7.948908361800089e-06, "loss": 0.0531, "step": 49490 }, { "epoch": 3.6759245507203326, "grad_norm": 2.340886116027832, "learning_rate": 7.944452695678004e-06, "loss": 0.0547, "step": 49500 }, { "epoch": 3.67666716174068, "grad_norm": 0.6771245002746582, "learning_rate": 7.939997029555919e-06, "loss": 0.0726, "step": 49510 }, { "epoch": 3.6774097727610275, "grad_norm": 1.6597684621810913, "learning_rate": 7.935541363433834e-06, "loss": 0.0894, "step": 49520 }, { "epoch": 3.6781523837813754, "grad_norm": 2.375394821166992, "learning_rate": 7.931085697311747e-06, "loss": 0.0645, "step": 49530 }, { "epoch": 3.678894994801723, "grad_norm": 1.4455935955047607, "learning_rate": 7.926630031189662e-06, "loss": 0.06, "step": 49540 }, { "epoch": 3.6796376058220703, "grad_norm": 0.4095980226993561, "learning_rate": 7.922174365067577e-06, "loss": 0.0667, "step": 49550 }, { "epoch": 3.680380216842418, "grad_norm": 0.5983967781066895, "learning_rate": 7.917718698945492e-06, "loss": 0.0564, "step": 49560 }, { "epoch": 3.6811228278627652, "grad_norm": 0.8008362054824829, "learning_rate": 7.913263032823407e-06, "loss": 0.0642, "step": 49570 }, { "epoch": 3.681865438883113, "grad_norm": 4.587621688842773, "learning_rate": 7.908807366701322e-06, "loss": 0.0828, "step": 49580 }, { "epoch": 3.6826080499034606, "grad_norm": 2.4655842781066895, "learning_rate": 7.904351700579237e-06, "loss": 0.0508, "step": 49590 }, { "epoch": 3.683350660923808, "grad_norm": 3.4618539810180664, "learning_rate": 7.89989603445715e-06, "loss": 0.0668, "step": 49600 }, { "epoch": 3.684093271944156, "grad_norm": 0.5393896698951721, "learning_rate": 7.895440368335066e-06, "loss": 0.0231, "step": 49610 }, { "epoch": 3.684835882964503, "grad_norm": 2.3107566833496094, "learning_rate": 7.890984702212982e-06, "loss": 0.0643, "step": 49620 }, { "epoch": 3.685578493984851, "grad_norm": 0.5587324500083923, "learning_rate": 7.886529036090896e-06, "loss": 0.0612, "step": 49630 }, { "epoch": 3.6863211050051983, "grad_norm": 3.5032103061676025, "learning_rate": 7.88207336996881e-06, "loss": 0.0796, "step": 49640 }, { "epoch": 3.6870637160255457, "grad_norm": 1.4119980335235596, "learning_rate": 7.877617703846724e-06, "loss": 0.0507, "step": 49650 }, { "epoch": 3.6878063270458936, "grad_norm": 2.8842852115631104, "learning_rate": 7.87316203772464e-06, "loss": 0.0847, "step": 49660 }, { "epoch": 3.688548938066241, "grad_norm": 2.1859562397003174, "learning_rate": 7.868706371602556e-06, "loss": 0.0462, "step": 49670 }, { "epoch": 3.6892915490865885, "grad_norm": 1.2712724208831787, "learning_rate": 7.864250705480469e-06, "loss": 0.0603, "step": 49680 }, { "epoch": 3.690034160106936, "grad_norm": 0.29489457607269287, "learning_rate": 7.859795039358386e-06, "loss": 0.0504, "step": 49690 }, { "epoch": 3.6907767711272834, "grad_norm": 1.2111194133758545, "learning_rate": 7.855339373236299e-06, "loss": 0.0466, "step": 49700 }, { "epoch": 3.6915193821476313, "grad_norm": 0.5714041590690613, "learning_rate": 7.850883707114214e-06, "loss": 0.0915, "step": 49710 }, { "epoch": 3.6922619931679788, "grad_norm": 1.720264196395874, "learning_rate": 7.846428040992127e-06, "loss": 0.0974, "step": 49720 }, { "epoch": 3.693004604188326, "grad_norm": 1.2143886089324951, "learning_rate": 7.841972374870044e-06, "loss": 0.047, "step": 49730 }, { "epoch": 3.6937472152086737, "grad_norm": 1.8155580759048462, "learning_rate": 7.837516708747959e-06, "loss": 0.0645, "step": 49740 }, { "epoch": 3.694489826229021, "grad_norm": 1.720137596130371, "learning_rate": 7.833061042625872e-06, "loss": 0.0603, "step": 49750 }, { "epoch": 3.695232437249369, "grad_norm": 1.3623203039169312, "learning_rate": 7.828605376503787e-06, "loss": 0.0717, "step": 49760 }, { "epoch": 3.6959750482697165, "grad_norm": 0.7780874967575073, "learning_rate": 7.824149710381702e-06, "loss": 0.0413, "step": 49770 }, { "epoch": 3.696717659290064, "grad_norm": 1.2925565242767334, "learning_rate": 7.819694044259617e-06, "loss": 0.0699, "step": 49780 }, { "epoch": 3.6974602703104114, "grad_norm": 1.491782546043396, "learning_rate": 7.815238378137532e-06, "loss": 0.0693, "step": 49790 }, { "epoch": 3.698202881330759, "grad_norm": 1.258483648300171, "learning_rate": 7.810782712015445e-06, "loss": 0.0523, "step": 49800 }, { "epoch": 3.6989454923511067, "grad_norm": 0.5992489457130432, "learning_rate": 7.806327045893362e-06, "loss": 0.0645, "step": 49810 }, { "epoch": 3.699688103371454, "grad_norm": 2.4607632160186768, "learning_rate": 7.801871379771275e-06, "loss": 0.0682, "step": 49820 }, { "epoch": 3.7004307143918016, "grad_norm": 1.1010299921035767, "learning_rate": 7.79741571364919e-06, "loss": 0.0794, "step": 49830 }, { "epoch": 3.701173325412149, "grad_norm": 3.313181161880493, "learning_rate": 7.792960047527105e-06, "loss": 0.0543, "step": 49840 }, { "epoch": 3.7019159364324965, "grad_norm": 2.4023070335388184, "learning_rate": 7.78850438140502e-06, "loss": 0.0673, "step": 49850 }, { "epoch": 3.7026585474528444, "grad_norm": 1.3617714643478394, "learning_rate": 7.784048715282935e-06, "loss": 0.0477, "step": 49860 }, { "epoch": 3.703401158473192, "grad_norm": 1.4105480909347534, "learning_rate": 7.779593049160849e-06, "loss": 0.0659, "step": 49870 }, { "epoch": 3.7041437694935393, "grad_norm": 2.565361261367798, "learning_rate": 7.775137383038765e-06, "loss": 0.0741, "step": 49880 }, { "epoch": 3.7048863805138867, "grad_norm": 1.3081820011138916, "learning_rate": 7.770681716916679e-06, "loss": 0.0527, "step": 49890 }, { "epoch": 3.705628991534234, "grad_norm": 0.5387427806854248, "learning_rate": 7.766226050794594e-06, "loss": 0.0848, "step": 49900 }, { "epoch": 3.706371602554582, "grad_norm": 1.5753438472747803, "learning_rate": 7.761770384672509e-06, "loss": 0.0789, "step": 49910 }, { "epoch": 3.7071142135749295, "grad_norm": 1.9173258543014526, "learning_rate": 7.757314718550424e-06, "loss": 0.0863, "step": 49920 }, { "epoch": 3.707856824595277, "grad_norm": 0.6267051696777344, "learning_rate": 7.752859052428339e-06, "loss": 0.0511, "step": 49930 }, { "epoch": 3.7085994356156244, "grad_norm": 0.7196197509765625, "learning_rate": 7.748403386306252e-06, "loss": 0.07, "step": 49940 }, { "epoch": 3.709342046635972, "grad_norm": 1.6515315771102905, "learning_rate": 7.743947720184169e-06, "loss": 0.0634, "step": 49950 }, { "epoch": 3.71008465765632, "grad_norm": 1.1136648654937744, "learning_rate": 7.739492054062084e-06, "loss": 0.0372, "step": 49960 }, { "epoch": 3.7108272686766672, "grad_norm": 1.6340572834014893, "learning_rate": 7.735036387939997e-06, "loss": 0.0599, "step": 49970 }, { "epoch": 3.7115698796970147, "grad_norm": 1.4177989959716797, "learning_rate": 7.730580721817912e-06, "loss": 0.077, "step": 49980 }, { "epoch": 3.712312490717362, "grad_norm": 1.4736772775650024, "learning_rate": 7.726125055695827e-06, "loss": 0.0872, "step": 49990 }, { "epoch": 3.7130551017377096, "grad_norm": 1.0160552263259888, "learning_rate": 7.721669389573742e-06, "loss": 0.0432, "step": 50000 }, { "epoch": 3.7137977127580575, "grad_norm": 2.048893451690674, "learning_rate": 7.717213723451655e-06, "loss": 0.0606, "step": 50010 }, { "epoch": 3.714540323778405, "grad_norm": 0.3452848494052887, "learning_rate": 7.71275805732957e-06, "loss": 0.0257, "step": 50020 }, { "epoch": 3.7152829347987524, "grad_norm": 2.9031543731689453, "learning_rate": 7.708302391207487e-06, "loss": 0.0525, "step": 50030 }, { "epoch": 3.7160255458191, "grad_norm": 1.9653599262237549, "learning_rate": 7.7038467250854e-06, "loss": 0.0811, "step": 50040 }, { "epoch": 3.7167681568394473, "grad_norm": 0.6306089758872986, "learning_rate": 7.699391058963315e-06, "loss": 0.0619, "step": 50050 }, { "epoch": 3.717510767859795, "grad_norm": 2.5835585594177246, "learning_rate": 7.694935392841229e-06, "loss": 0.0772, "step": 50060 }, { "epoch": 3.7182533788801426, "grad_norm": 1.6114825010299683, "learning_rate": 7.690479726719145e-06, "loss": 0.0758, "step": 50070 }, { "epoch": 3.71899598990049, "grad_norm": 2.363929271697998, "learning_rate": 7.68602406059706e-06, "loss": 0.0746, "step": 50080 }, { "epoch": 3.7197386009208375, "grad_norm": 0.5508888959884644, "learning_rate": 7.681568394474973e-06, "loss": 0.0874, "step": 50090 }, { "epoch": 3.720481211941185, "grad_norm": 0.6778455972671509, "learning_rate": 7.67711272835289e-06, "loss": 0.0444, "step": 50100 }, { "epoch": 3.721223822961533, "grad_norm": 0.6257083415985107, "learning_rate": 7.672657062230803e-06, "loss": 0.0501, "step": 50110 }, { "epoch": 3.7219664339818803, "grad_norm": 0.7296163439750671, "learning_rate": 7.668201396108718e-06, "loss": 0.0719, "step": 50120 }, { "epoch": 3.7227090450022278, "grad_norm": 1.6046028137207031, "learning_rate": 7.663745729986632e-06, "loss": 0.0848, "step": 50130 }, { "epoch": 3.723451656022575, "grad_norm": 2.0736193656921387, "learning_rate": 7.659290063864548e-06, "loss": 0.0585, "step": 50140 }, { "epoch": 3.7241942670429227, "grad_norm": 3.5360021591186523, "learning_rate": 7.654834397742463e-06, "loss": 0.0601, "step": 50150 }, { "epoch": 3.7249368780632706, "grad_norm": 1.6731319427490234, "learning_rate": 7.650378731620377e-06, "loss": 0.0783, "step": 50160 }, { "epoch": 3.725679489083618, "grad_norm": 1.434248685836792, "learning_rate": 7.645923065498292e-06, "loss": 0.0688, "step": 50170 }, { "epoch": 3.7264221001039655, "grad_norm": 2.1045992374420166, "learning_rate": 7.641467399376207e-06, "loss": 0.0679, "step": 50180 }, { "epoch": 3.7271647111243134, "grad_norm": 2.572453022003174, "learning_rate": 7.637011733254122e-06, "loss": 0.0378, "step": 50190 }, { "epoch": 3.7279073221446604, "grad_norm": 2.0661511421203613, "learning_rate": 7.632556067132037e-06, "loss": 0.0657, "step": 50200 }, { "epoch": 3.7286499331650083, "grad_norm": 4.1037278175354, "learning_rate": 7.628100401009952e-06, "loss": 0.0909, "step": 50210 }, { "epoch": 3.7293925441853557, "grad_norm": 2.651695728302002, "learning_rate": 7.623644734887866e-06, "loss": 0.07, "step": 50220 }, { "epoch": 3.730135155205703, "grad_norm": 1.7881807088851929, "learning_rate": 7.619189068765781e-06, "loss": 0.0622, "step": 50230 }, { "epoch": 3.730877766226051, "grad_norm": 0.9953715801239014, "learning_rate": 7.614733402643695e-06, "loss": 0.0471, "step": 50240 }, { "epoch": 3.7316203772463985, "grad_norm": 1.1658798456192017, "learning_rate": 7.610277736521611e-06, "loss": 0.0636, "step": 50250 }, { "epoch": 3.732362988266746, "grad_norm": 1.1256842613220215, "learning_rate": 7.605822070399525e-06, "loss": 0.0665, "step": 50260 }, { "epoch": 3.7331055992870934, "grad_norm": 0.6367254853248596, "learning_rate": 7.601366404277439e-06, "loss": 0.052, "step": 50270 }, { "epoch": 3.733848210307441, "grad_norm": 2.0500311851501465, "learning_rate": 7.596910738155354e-06, "loss": 0.0545, "step": 50280 }, { "epoch": 3.7345908213277887, "grad_norm": 0.8507987856864929, "learning_rate": 7.59245507203327e-06, "loss": 0.0459, "step": 50290 }, { "epoch": 3.735333432348136, "grad_norm": 1.231722354888916, "learning_rate": 7.587999405911184e-06, "loss": 0.0689, "step": 50300 }, { "epoch": 3.7360760433684836, "grad_norm": 1.071260929107666, "learning_rate": 7.583543739789098e-06, "loss": 0.0589, "step": 50310 }, { "epoch": 3.736818654388831, "grad_norm": 0.9924083948135376, "learning_rate": 7.5790880736670124e-06, "loss": 0.0662, "step": 50320 }, { "epoch": 3.7375612654091785, "grad_norm": 3.8950726985931396, "learning_rate": 7.574632407544928e-06, "loss": 0.0801, "step": 50330 }, { "epoch": 3.7383038764295264, "grad_norm": 1.2569303512573242, "learning_rate": 7.570176741422843e-06, "loss": 0.0549, "step": 50340 }, { "epoch": 3.739046487449874, "grad_norm": 2.5183095932006836, "learning_rate": 7.565721075300757e-06, "loss": 0.071, "step": 50350 }, { "epoch": 3.7397890984702213, "grad_norm": 0.9033598303794861, "learning_rate": 7.561265409178673e-06, "loss": 0.0671, "step": 50360 }, { "epoch": 3.740531709490569, "grad_norm": 3.6152873039245605, "learning_rate": 7.556809743056587e-06, "loss": 0.0498, "step": 50370 }, { "epoch": 3.7412743205109162, "grad_norm": 0.6265145540237427, "learning_rate": 7.5523540769345015e-06, "loss": 0.0791, "step": 50380 }, { "epoch": 3.742016931531264, "grad_norm": 0.47644293308258057, "learning_rate": 7.547898410812416e-06, "loss": 0.0719, "step": 50390 }, { "epoch": 3.7427595425516116, "grad_norm": 1.28927743434906, "learning_rate": 7.543442744690332e-06, "loss": 0.0471, "step": 50400 }, { "epoch": 3.743502153571959, "grad_norm": 0.5300150513648987, "learning_rate": 7.5389870785682465e-06, "loss": 0.0565, "step": 50410 }, { "epoch": 3.7442447645923065, "grad_norm": 0.5767967104911804, "learning_rate": 7.534531412446161e-06, "loss": 0.0767, "step": 50420 }, { "epoch": 3.744987375612654, "grad_norm": 1.1901088953018188, "learning_rate": 7.530075746324075e-06, "loss": 0.0639, "step": 50430 }, { "epoch": 3.745729986633002, "grad_norm": 0.5107831954956055, "learning_rate": 7.525620080201991e-06, "loss": 0.0423, "step": 50440 }, { "epoch": 3.7464725976533493, "grad_norm": 0.25454196333885193, "learning_rate": 7.521164414079905e-06, "loss": 0.0639, "step": 50450 }, { "epoch": 3.7472152086736967, "grad_norm": 0.2913890480995178, "learning_rate": 7.51670874795782e-06, "loss": 0.0553, "step": 50460 }, { "epoch": 3.747957819694044, "grad_norm": 1.8346703052520752, "learning_rate": 7.512253081835736e-06, "loss": 0.0751, "step": 50470 }, { "epoch": 3.7487004307143916, "grad_norm": 1.856366753578186, "learning_rate": 7.50779741571365e-06, "loss": 0.0852, "step": 50480 }, { "epoch": 3.7494430417347395, "grad_norm": 1.0152894258499146, "learning_rate": 7.503341749591564e-06, "loss": 0.0439, "step": 50490 }, { "epoch": 3.750185652755087, "grad_norm": 1.121036171913147, "learning_rate": 7.498886083469479e-06, "loss": 0.0781, "step": 50500 }, { "epoch": 3.7509282637754344, "grad_norm": 3.682908535003662, "learning_rate": 7.494430417347393e-06, "loss": 0.0624, "step": 50510 }, { "epoch": 3.751670874795782, "grad_norm": 1.5728288888931274, "learning_rate": 7.489974751225309e-06, "loss": 0.0812, "step": 50520 }, { "epoch": 3.7524134858161293, "grad_norm": 1.3011523485183716, "learning_rate": 7.485519085103223e-06, "loss": 0.1094, "step": 50530 }, { "epoch": 3.753156096836477, "grad_norm": 1.683307409286499, "learning_rate": 7.481063418981138e-06, "loss": 0.0715, "step": 50540 }, { "epoch": 3.7538987078568247, "grad_norm": 0.9677258729934692, "learning_rate": 7.476607752859052e-06, "loss": 0.0619, "step": 50550 }, { "epoch": 3.754641318877172, "grad_norm": 1.786702036857605, "learning_rate": 7.472152086736967e-06, "loss": 0.0721, "step": 50560 }, { "epoch": 3.7553839298975196, "grad_norm": 4.346724987030029, "learning_rate": 7.467696420614882e-06, "loss": 0.0477, "step": 50570 }, { "epoch": 3.756126540917867, "grad_norm": 0.49064943194389343, "learning_rate": 7.463240754492797e-06, "loss": 0.073, "step": 50580 }, { "epoch": 3.756869151938215, "grad_norm": 0.6625070571899414, "learning_rate": 7.458785088370711e-06, "loss": 0.0626, "step": 50590 }, { "epoch": 3.7576117629585624, "grad_norm": 3.1530327796936035, "learning_rate": 7.454329422248626e-06, "loss": 0.0642, "step": 50600 }, { "epoch": 3.75835437397891, "grad_norm": 0.38084813952445984, "learning_rate": 7.449873756126541e-06, "loss": 0.0687, "step": 50610 }, { "epoch": 3.7590969849992573, "grad_norm": 0.8041538000106812, "learning_rate": 7.4454180900044555e-06, "loss": 0.0677, "step": 50620 }, { "epoch": 3.7598395960196047, "grad_norm": 1.4357954263687134, "learning_rate": 7.440962423882371e-06, "loss": 0.0461, "step": 50630 }, { "epoch": 3.7605822070399526, "grad_norm": 0.9471798539161682, "learning_rate": 7.4365067577602855e-06, "loss": 0.0871, "step": 50640 }, { "epoch": 3.7613248180603, "grad_norm": 1.356444001197815, "learning_rate": 7.4320510916382005e-06, "loss": 0.0514, "step": 50650 }, { "epoch": 3.7620674290806475, "grad_norm": 0.9252959489822388, "learning_rate": 7.427595425516115e-06, "loss": 0.0675, "step": 50660 }, { "epoch": 3.762810040100995, "grad_norm": 1.9478005170822144, "learning_rate": 7.42313975939403e-06, "loss": 0.0473, "step": 50670 }, { "epoch": 3.7635526511213424, "grad_norm": 0.6486461162567139, "learning_rate": 7.418684093271944e-06, "loss": 0.0632, "step": 50680 }, { "epoch": 3.7642952621416903, "grad_norm": 1.0158225297927856, "learning_rate": 7.41422842714986e-06, "loss": 0.0945, "step": 50690 }, { "epoch": 3.7650378731620378, "grad_norm": 1.4446204900741577, "learning_rate": 7.409772761027774e-06, "loss": 0.0441, "step": 50700 }, { "epoch": 3.765780484182385, "grad_norm": 1.2535960674285889, "learning_rate": 7.405317094905689e-06, "loss": 0.0414, "step": 50710 }, { "epoch": 3.7665230952027327, "grad_norm": 0.49216005206108093, "learning_rate": 7.400861428783603e-06, "loss": 0.0505, "step": 50720 }, { "epoch": 3.76726570622308, "grad_norm": 1.1775668859481812, "learning_rate": 7.396405762661518e-06, "loss": 0.0713, "step": 50730 }, { "epoch": 3.768008317243428, "grad_norm": 1.1207523345947266, "learning_rate": 7.391950096539433e-06, "loss": 0.0645, "step": 50740 }, { "epoch": 3.7687509282637754, "grad_norm": 0.7334256172180176, "learning_rate": 7.387494430417348e-06, "loss": 0.0407, "step": 50750 }, { "epoch": 3.769493539284123, "grad_norm": 0.34325599670410156, "learning_rate": 7.383038764295263e-06, "loss": 0.0618, "step": 50760 }, { "epoch": 3.770236150304471, "grad_norm": 1.9858282804489136, "learning_rate": 7.378583098173177e-06, "loss": 0.0502, "step": 50770 }, { "epoch": 3.770978761324818, "grad_norm": 0.8460551500320435, "learning_rate": 7.374127432051092e-06, "loss": 0.0662, "step": 50780 }, { "epoch": 3.7717213723451657, "grad_norm": 1.374432921409607, "learning_rate": 7.369671765929006e-06, "loss": 0.0294, "step": 50790 }, { "epoch": 3.772463983365513, "grad_norm": 0.547275722026825, "learning_rate": 7.365216099806921e-06, "loss": 0.0512, "step": 50800 }, { "epoch": 3.7732065943858606, "grad_norm": 2.171943426132202, "learning_rate": 7.360760433684836e-06, "loss": 0.074, "step": 50810 }, { "epoch": 3.7739492054062085, "grad_norm": 5.715026378631592, "learning_rate": 7.356304767562751e-06, "loss": 0.0664, "step": 50820 }, { "epoch": 3.774691816426556, "grad_norm": 0.8222048282623291, "learning_rate": 7.351849101440665e-06, "loss": 0.0573, "step": 50830 }, { "epoch": 3.7754344274469034, "grad_norm": 0.7894913554191589, "learning_rate": 7.34739343531858e-06, "loss": 0.0447, "step": 50840 }, { "epoch": 3.776177038467251, "grad_norm": 0.9093634486198425, "learning_rate": 7.342937769196494e-06, "loss": 0.0685, "step": 50850 }, { "epoch": 3.7769196494875983, "grad_norm": 0.49111273884773254, "learning_rate": 7.338482103074409e-06, "loss": 0.0421, "step": 50860 }, { "epoch": 3.777662260507946, "grad_norm": 2.105043411254883, "learning_rate": 7.334026436952325e-06, "loss": 0.0791, "step": 50870 }, { "epoch": 3.7784048715282936, "grad_norm": 0.9441326856613159, "learning_rate": 7.329570770830239e-06, "loss": 0.0506, "step": 50880 }, { "epoch": 3.779147482548641, "grad_norm": 0.6536591053009033, "learning_rate": 7.325115104708154e-06, "loss": 0.0574, "step": 50890 }, { "epoch": 3.7798900935689885, "grad_norm": 0.7419950366020203, "learning_rate": 7.3206594385860685e-06, "loss": 0.0576, "step": 50900 }, { "epoch": 3.780632704589336, "grad_norm": 0.9035283923149109, "learning_rate": 7.3162037724639835e-06, "loss": 0.0522, "step": 50910 }, { "epoch": 3.781375315609684, "grad_norm": 1.0308187007904053, "learning_rate": 7.3117481063418985e-06, "loss": 0.0575, "step": 50920 }, { "epoch": 3.7821179266300313, "grad_norm": 1.2720097303390503, "learning_rate": 7.3072924402198135e-06, "loss": 0.0663, "step": 50930 }, { "epoch": 3.7828605376503788, "grad_norm": 2.122695207595825, "learning_rate": 7.302836774097728e-06, "loss": 0.0483, "step": 50940 }, { "epoch": 3.7836031486707262, "grad_norm": 1.060192346572876, "learning_rate": 7.298381107975643e-06, "loss": 0.0663, "step": 50950 }, { "epoch": 3.7843457596910737, "grad_norm": 2.148590087890625, "learning_rate": 7.293925441853557e-06, "loss": 0.068, "step": 50960 }, { "epoch": 3.7850883707114216, "grad_norm": 3.7318813800811768, "learning_rate": 7.289469775731472e-06, "loss": 0.0743, "step": 50970 }, { "epoch": 3.785830981731769, "grad_norm": 0.6228942275047302, "learning_rate": 7.285014109609387e-06, "loss": 0.0545, "step": 50980 }, { "epoch": 3.7865735927521165, "grad_norm": 0.46231064200401306, "learning_rate": 7.280558443487302e-06, "loss": 0.0374, "step": 50990 }, { "epoch": 3.787316203772464, "grad_norm": 1.6403611898422241, "learning_rate": 7.276102777365217e-06, "loss": 0.0674, "step": 51000 }, { "epoch": 3.7880588147928114, "grad_norm": 0.5807299613952637, "learning_rate": 7.271647111243131e-06, "loss": 0.0653, "step": 51010 }, { "epoch": 3.7888014258131593, "grad_norm": 2.2317705154418945, "learning_rate": 7.267191445121046e-06, "loss": 0.0464, "step": 51020 }, { "epoch": 3.7895440368335067, "grad_norm": 1.0599946975708008, "learning_rate": 7.26273577899896e-06, "loss": 0.0787, "step": 51030 }, { "epoch": 3.790286647853854, "grad_norm": 0.7271379232406616, "learning_rate": 7.258280112876876e-06, "loss": 0.0682, "step": 51040 }, { "epoch": 3.7910292588742016, "grad_norm": 1.6912349462509155, "learning_rate": 7.25382444675479e-06, "loss": 0.0537, "step": 51050 }, { "epoch": 3.791771869894549, "grad_norm": 1.233393907546997, "learning_rate": 7.249368780632705e-06, "loss": 0.0698, "step": 51060 }, { "epoch": 3.792514480914897, "grad_norm": 1.0007754564285278, "learning_rate": 7.244913114510619e-06, "loss": 0.0515, "step": 51070 }, { "epoch": 3.7932570919352444, "grad_norm": 1.4426878690719604, "learning_rate": 7.240457448388534e-06, "loss": 0.0731, "step": 51080 }, { "epoch": 3.793999702955592, "grad_norm": 0.7018789649009705, "learning_rate": 7.236001782266448e-06, "loss": 0.0422, "step": 51090 }, { "epoch": 3.7947423139759393, "grad_norm": 0.8804644346237183, "learning_rate": 7.231546116144364e-06, "loss": 0.0557, "step": 51100 }, { "epoch": 3.7954849249962868, "grad_norm": 0.588465690612793, "learning_rate": 7.227090450022278e-06, "loss": 0.0795, "step": 51110 }, { "epoch": 3.7962275360166347, "grad_norm": 1.5997480154037476, "learning_rate": 7.222634783900193e-06, "loss": 0.0358, "step": 51120 }, { "epoch": 3.796970147036982, "grad_norm": 0.4901600182056427, "learning_rate": 7.218179117778108e-06, "loss": 0.0743, "step": 51130 }, { "epoch": 3.7977127580573296, "grad_norm": 1.299644947052002, "learning_rate": 7.2137234516560225e-06, "loss": 0.0445, "step": 51140 }, { "epoch": 3.798455369077677, "grad_norm": 2.474388599395752, "learning_rate": 7.2092677855339375e-06, "loss": 0.0676, "step": 51150 }, { "epoch": 3.7991979800980245, "grad_norm": 0.26905447244644165, "learning_rate": 7.2048121194118525e-06, "loss": 0.0655, "step": 51160 }, { "epoch": 3.7999405911183723, "grad_norm": 1.1446030139923096, "learning_rate": 7.2003564532897675e-06, "loss": 0.056, "step": 51170 }, { "epoch": 3.80068320213872, "grad_norm": 0.9644284844398499, "learning_rate": 7.195900787167682e-06, "loss": 0.0443, "step": 51180 }, { "epoch": 3.8014258131590672, "grad_norm": 1.8860034942626953, "learning_rate": 7.191445121045597e-06, "loss": 0.0591, "step": 51190 }, { "epoch": 3.8021684241794147, "grad_norm": 1.3141664266586304, "learning_rate": 7.186989454923511e-06, "loss": 0.0319, "step": 51200 }, { "epoch": 3.802911035199762, "grad_norm": 1.2933363914489746, "learning_rate": 7.182533788801426e-06, "loss": 0.0596, "step": 51210 }, { "epoch": 3.80365364622011, "grad_norm": 1.3324415683746338, "learning_rate": 7.178078122679341e-06, "loss": 0.0632, "step": 51220 }, { "epoch": 3.8043962572404575, "grad_norm": 1.7264912128448486, "learning_rate": 7.173622456557256e-06, "loss": 0.0858, "step": 51230 }, { "epoch": 3.805138868260805, "grad_norm": 1.394806146621704, "learning_rate": 7.16916679043517e-06, "loss": 0.0673, "step": 51240 }, { "epoch": 3.8058814792811524, "grad_norm": 0.5998439788818359, "learning_rate": 7.164711124313085e-06, "loss": 0.0434, "step": 51250 }, { "epoch": 3.8066240903015, "grad_norm": 1.282382607460022, "learning_rate": 7.160255458191e-06, "loss": 0.0696, "step": 51260 }, { "epoch": 3.8073667013218477, "grad_norm": 1.0203378200531006, "learning_rate": 7.155799792068915e-06, "loss": 0.0709, "step": 51270 }, { "epoch": 3.808109312342195, "grad_norm": 1.8482102155685425, "learning_rate": 7.15134412594683e-06, "loss": 0.0499, "step": 51280 }, { "epoch": 3.8088519233625426, "grad_norm": 1.2708896398544312, "learning_rate": 7.146888459824744e-06, "loss": 0.0467, "step": 51290 }, { "epoch": 3.80959453438289, "grad_norm": 2.240234375, "learning_rate": 7.142432793702659e-06, "loss": 0.0742, "step": 51300 }, { "epoch": 3.8103371454032375, "grad_norm": 1.0520896911621094, "learning_rate": 7.137977127580573e-06, "loss": 0.0631, "step": 51310 }, { "epoch": 3.8110797564235854, "grad_norm": 0.6658138036727905, "learning_rate": 7.133521461458488e-06, "loss": 0.0522, "step": 51320 }, { "epoch": 3.811822367443933, "grad_norm": 1.1442534923553467, "learning_rate": 7.129065795336403e-06, "loss": 0.0711, "step": 51330 }, { "epoch": 3.8125649784642803, "grad_norm": 2.0070974826812744, "learning_rate": 7.124610129214318e-06, "loss": 0.0479, "step": 51340 }, { "epoch": 3.8133075894846282, "grad_norm": 1.5337635278701782, "learning_rate": 7.120154463092232e-06, "loss": 0.0552, "step": 51350 }, { "epoch": 3.8140502005049752, "grad_norm": 2.758072853088379, "learning_rate": 7.115698796970147e-06, "loss": 0.0632, "step": 51360 }, { "epoch": 3.814792811525323, "grad_norm": 1.1585407257080078, "learning_rate": 7.111243130848061e-06, "loss": 0.0559, "step": 51370 }, { "epoch": 3.8155354225456706, "grad_norm": 0.7389517426490784, "learning_rate": 7.106787464725976e-06, "loss": 0.0523, "step": 51380 }, { "epoch": 3.816278033566018, "grad_norm": 1.0852411985397339, "learning_rate": 7.102331798603892e-06, "loss": 0.0459, "step": 51390 }, { "epoch": 3.817020644586366, "grad_norm": 1.4274524450302124, "learning_rate": 7.097876132481806e-06, "loss": 0.0707, "step": 51400 }, { "epoch": 3.8177632556067134, "grad_norm": 2.2135133743286133, "learning_rate": 7.093420466359721e-06, "loss": 0.0488, "step": 51410 }, { "epoch": 3.818505866627061, "grad_norm": 2.0411906242370605, "learning_rate": 7.0889648002376355e-06, "loss": 0.0572, "step": 51420 }, { "epoch": 3.8192484776474083, "grad_norm": 1.3621501922607422, "learning_rate": 7.0845091341155505e-06, "loss": 0.0438, "step": 51430 }, { "epoch": 3.8199910886677557, "grad_norm": 2.786686658859253, "learning_rate": 7.080053467993465e-06, "loss": 0.0702, "step": 51440 }, { "epoch": 3.8207336996881036, "grad_norm": 0.6535211801528931, "learning_rate": 7.0755978018713805e-06, "loss": 0.0564, "step": 51450 }, { "epoch": 3.821476310708451, "grad_norm": 0.9478582143783569, "learning_rate": 7.071142135749295e-06, "loss": 0.0522, "step": 51460 }, { "epoch": 3.8222189217287985, "grad_norm": 1.7308098077774048, "learning_rate": 7.06668646962721e-06, "loss": 0.0388, "step": 51470 }, { "epoch": 3.822961532749146, "grad_norm": 1.9273875951766968, "learning_rate": 7.062230803505124e-06, "loss": 0.0684, "step": 51480 }, { "epoch": 3.8237041437694934, "grad_norm": 0.6402938961982727, "learning_rate": 7.057775137383039e-06, "loss": 0.0933, "step": 51490 }, { "epoch": 3.8244467547898413, "grad_norm": 1.3616961240768433, "learning_rate": 7.053319471260953e-06, "loss": 0.0663, "step": 51500 }, { "epoch": 3.8251893658101888, "grad_norm": 1.957161784172058, "learning_rate": 7.048863805138869e-06, "loss": 0.0756, "step": 51510 }, { "epoch": 3.825931976830536, "grad_norm": 2.335097074508667, "learning_rate": 7.044408139016784e-06, "loss": 0.0619, "step": 51520 }, { "epoch": 3.8266745878508837, "grad_norm": 0.9050447940826416, "learning_rate": 7.039952472894698e-06, "loss": 0.0537, "step": 51530 }, { "epoch": 3.827417198871231, "grad_norm": 0.2950853705406189, "learning_rate": 7.035496806772613e-06, "loss": 0.0598, "step": 51540 }, { "epoch": 3.828159809891579, "grad_norm": 2.684269428253174, "learning_rate": 7.031041140650527e-06, "loss": 0.0896, "step": 51550 }, { "epoch": 3.8289024209119265, "grad_norm": 2.0332887172698975, "learning_rate": 7.026585474528442e-06, "loss": 0.0801, "step": 51560 }, { "epoch": 3.829645031932274, "grad_norm": 2.3198342323303223, "learning_rate": 7.022129808406357e-06, "loss": 0.0645, "step": 51570 }, { "epoch": 3.8303876429526214, "grad_norm": 1.091840386390686, "learning_rate": 7.017674142284272e-06, "loss": 0.0553, "step": 51580 }, { "epoch": 3.831130253972969, "grad_norm": 1.6161314249038696, "learning_rate": 7.013218476162186e-06, "loss": 0.0612, "step": 51590 }, { "epoch": 3.8318728649933167, "grad_norm": 0.7637621164321899, "learning_rate": 7.008762810040101e-06, "loss": 0.0349, "step": 51600 }, { "epoch": 3.832615476013664, "grad_norm": 0.9852764010429382, "learning_rate": 7.004307143918015e-06, "loss": 0.0476, "step": 51610 }, { "epoch": 3.8333580870340116, "grad_norm": 2.236307144165039, "learning_rate": 6.999851477795931e-06, "loss": 0.0687, "step": 51620 }, { "epoch": 3.834100698054359, "grad_norm": 0.9051099419593811, "learning_rate": 6.995395811673845e-06, "loss": 0.0861, "step": 51630 }, { "epoch": 3.8348433090747065, "grad_norm": 0.784583568572998, "learning_rate": 6.99094014555176e-06, "loss": 0.0594, "step": 51640 }, { "epoch": 3.8355859200950544, "grad_norm": 1.5118728876113892, "learning_rate": 6.9864844794296745e-06, "loss": 0.0769, "step": 51650 }, { "epoch": 3.836328531115402, "grad_norm": 1.1480764150619507, "learning_rate": 6.9820288133075895e-06, "loss": 0.0678, "step": 51660 }, { "epoch": 3.8370711421357493, "grad_norm": 1.8852975368499756, "learning_rate": 6.9775731471855045e-06, "loss": 0.0714, "step": 51670 }, { "epoch": 3.8378137531560967, "grad_norm": 0.9265196323394775, "learning_rate": 6.9731174810634195e-06, "loss": 0.0424, "step": 51680 }, { "epoch": 3.838556364176444, "grad_norm": 0.8514029383659363, "learning_rate": 6.9686618149413345e-06, "loss": 0.0565, "step": 51690 }, { "epoch": 3.839298975196792, "grad_norm": 3.038891077041626, "learning_rate": 6.964206148819249e-06, "loss": 0.0627, "step": 51700 }, { "epoch": 3.8400415862171395, "grad_norm": 2.795001268386841, "learning_rate": 6.959750482697164e-06, "loss": 0.0808, "step": 51710 }, { "epoch": 3.840784197237487, "grad_norm": 0.8033546805381775, "learning_rate": 6.955294816575078e-06, "loss": 0.0667, "step": 51720 }, { "epoch": 3.8415268082578344, "grad_norm": 3.8095524311065674, "learning_rate": 6.950839150452993e-06, "loss": 0.0525, "step": 51730 }, { "epoch": 3.842269419278182, "grad_norm": 1.2857472896575928, "learning_rate": 6.946383484330908e-06, "loss": 0.0707, "step": 51740 }, { "epoch": 3.84301203029853, "grad_norm": 0.5481483340263367, "learning_rate": 6.941927818208823e-06, "loss": 0.0624, "step": 51750 }, { "epoch": 3.8437546413188772, "grad_norm": 2.4594411849975586, "learning_rate": 6.937472152086737e-06, "loss": 0.0699, "step": 51760 }, { "epoch": 3.8444972523392247, "grad_norm": 2.338852882385254, "learning_rate": 6.933016485964652e-06, "loss": 0.0485, "step": 51770 }, { "epoch": 3.845239863359572, "grad_norm": 0.8134142160415649, "learning_rate": 6.928560819842566e-06, "loss": 0.0581, "step": 51780 }, { "epoch": 3.8459824743799196, "grad_norm": 2.4749584197998047, "learning_rate": 6.924105153720481e-06, "loss": 0.077, "step": 51790 }, { "epoch": 3.8467250854002675, "grad_norm": 0.5151814222335815, "learning_rate": 6.919649487598397e-06, "loss": 0.0554, "step": 51800 }, { "epoch": 3.847467696420615, "grad_norm": 2.5601933002471924, "learning_rate": 6.915193821476311e-06, "loss": 0.0677, "step": 51810 }, { "epoch": 3.8482103074409624, "grad_norm": 1.637519121170044, "learning_rate": 6.910738155354226e-06, "loss": 0.1058, "step": 51820 }, { "epoch": 3.84895291846131, "grad_norm": 0.6686950325965881, "learning_rate": 6.90628248923214e-06, "loss": 0.0781, "step": 51830 }, { "epoch": 3.8496955294816573, "grad_norm": 1.9050847291946411, "learning_rate": 6.901826823110055e-06, "loss": 0.0675, "step": 51840 }, { "epoch": 3.850438140502005, "grad_norm": 1.7424448728561401, "learning_rate": 6.897371156987969e-06, "loss": 0.0585, "step": 51850 }, { "epoch": 3.8511807515223526, "grad_norm": 0.6480499505996704, "learning_rate": 6.892915490865885e-06, "loss": 0.0579, "step": 51860 }, { "epoch": 3.8519233625427, "grad_norm": 0.8286868929862976, "learning_rate": 6.888459824743799e-06, "loss": 0.0416, "step": 51870 }, { "epoch": 3.8526659735630475, "grad_norm": 0.9048423171043396, "learning_rate": 6.884004158621714e-06, "loss": 0.0521, "step": 51880 }, { "epoch": 3.853408584583395, "grad_norm": 0.8243032097816467, "learning_rate": 6.879548492499628e-06, "loss": 0.0405, "step": 51890 }, { "epoch": 3.854151195603743, "grad_norm": 0.618739128112793, "learning_rate": 6.875092826377543e-06, "loss": 0.0427, "step": 51900 }, { "epoch": 3.8548938066240903, "grad_norm": 0.4855763912200928, "learning_rate": 6.8706371602554575e-06, "loss": 0.054, "step": 51910 }, { "epoch": 3.8556364176444378, "grad_norm": 0.3211023211479187, "learning_rate": 6.866181494133373e-06, "loss": 0.0877, "step": 51920 }, { "epoch": 3.8563790286647857, "grad_norm": 0.4299314320087433, "learning_rate": 6.861725828011288e-06, "loss": 0.0633, "step": 51930 }, { "epoch": 3.8571216396851327, "grad_norm": 1.2965507507324219, "learning_rate": 6.8572701618892025e-06, "loss": 0.0749, "step": 51940 }, { "epoch": 3.8578642507054806, "grad_norm": 0.9099006056785583, "learning_rate": 6.8528144957671175e-06, "loss": 0.0495, "step": 51950 }, { "epoch": 3.858606861725828, "grad_norm": 2.0867326259613037, "learning_rate": 6.848358829645032e-06, "loss": 0.0509, "step": 51960 }, { "epoch": 3.8593494727461755, "grad_norm": 3.871561050415039, "learning_rate": 6.8439031635229475e-06, "loss": 0.0922, "step": 51970 }, { "epoch": 3.8600920837665234, "grad_norm": 0.6323529481887817, "learning_rate": 6.839447497400862e-06, "loss": 0.0609, "step": 51980 }, { "epoch": 3.860834694786871, "grad_norm": 2.84472393989563, "learning_rate": 6.834991831278777e-06, "loss": 0.0602, "step": 51990 }, { "epoch": 3.8615773058072183, "grad_norm": 0.9858630895614624, "learning_rate": 6.830536165156691e-06, "loss": 0.035, "step": 52000 }, { "epoch": 3.8623199168275657, "grad_norm": 1.1265889406204224, "learning_rate": 6.826080499034606e-06, "loss": 0.0581, "step": 52010 }, { "epoch": 3.863062527847913, "grad_norm": 1.8120416402816772, "learning_rate": 6.82162483291252e-06, "loss": 0.0919, "step": 52020 }, { "epoch": 3.863805138868261, "grad_norm": 0.340191513299942, "learning_rate": 6.817169166790436e-06, "loss": 0.0482, "step": 52030 }, { "epoch": 3.8645477498886085, "grad_norm": 2.8395814895629883, "learning_rate": 6.81271350066835e-06, "loss": 0.0593, "step": 52040 }, { "epoch": 3.865290360908956, "grad_norm": 1.4814605712890625, "learning_rate": 6.808257834546265e-06, "loss": 0.0911, "step": 52050 }, { "epoch": 3.8660329719293034, "grad_norm": 1.4447940587997437, "learning_rate": 6.80380216842418e-06, "loss": 0.0592, "step": 52060 }, { "epoch": 3.866775582949651, "grad_norm": 1.7888092994689941, "learning_rate": 6.799346502302094e-06, "loss": 0.0895, "step": 52070 }, { "epoch": 3.8675181939699987, "grad_norm": 1.2364249229431152, "learning_rate": 6.794890836180009e-06, "loss": 0.0805, "step": 52080 }, { "epoch": 3.868260804990346, "grad_norm": 1.0468952655792236, "learning_rate": 6.790435170057924e-06, "loss": 0.0674, "step": 52090 }, { "epoch": 3.8690034160106936, "grad_norm": 1.610219120979309, "learning_rate": 6.785979503935839e-06, "loss": 0.0458, "step": 52100 }, { "epoch": 3.869746027031041, "grad_norm": 0.4736784100532532, "learning_rate": 6.781523837813753e-06, "loss": 0.0512, "step": 52110 }, { "epoch": 3.8704886380513885, "grad_norm": 1.5783486366271973, "learning_rate": 6.777068171691668e-06, "loss": 0.089, "step": 52120 }, { "epoch": 3.8712312490717364, "grad_norm": 1.8247394561767578, "learning_rate": 6.772612505569582e-06, "loss": 0.0809, "step": 52130 }, { "epoch": 3.871973860092084, "grad_norm": 1.473613977432251, "learning_rate": 6.768156839447497e-06, "loss": 0.0699, "step": 52140 }, { "epoch": 3.8727164711124313, "grad_norm": 0.9467633962631226, "learning_rate": 6.763701173325412e-06, "loss": 0.0398, "step": 52150 }, { "epoch": 3.873459082132779, "grad_norm": 0.7337872982025146, "learning_rate": 6.759245507203327e-06, "loss": 0.0428, "step": 52160 }, { "epoch": 3.8742016931531262, "grad_norm": 1.1722044944763184, "learning_rate": 6.7547898410812415e-06, "loss": 0.0658, "step": 52170 }, { "epoch": 3.874944304173474, "grad_norm": 1.2315232753753662, "learning_rate": 6.7503341749591565e-06, "loss": 0.0714, "step": 52180 }, { "epoch": 3.8756869151938216, "grad_norm": 1.0262686014175415, "learning_rate": 6.7458785088370715e-06, "loss": 0.0493, "step": 52190 }, { "epoch": 3.876429526214169, "grad_norm": 2.3308000564575195, "learning_rate": 6.741422842714986e-06, "loss": 0.0469, "step": 52200 }, { "epoch": 3.8771721372345165, "grad_norm": 0.9160858392715454, "learning_rate": 6.7369671765929014e-06, "loss": 0.0738, "step": 52210 }, { "epoch": 3.877914748254864, "grad_norm": 0.8617852926254272, "learning_rate": 6.732511510470816e-06, "loss": 0.0674, "step": 52220 }, { "epoch": 3.878657359275212, "grad_norm": 0.7218712568283081, "learning_rate": 6.728055844348731e-06, "loss": 0.0494, "step": 52230 }, { "epoch": 3.8793999702955593, "grad_norm": 2.578873872756958, "learning_rate": 6.723600178226645e-06, "loss": 0.0685, "step": 52240 }, { "epoch": 3.8801425813159067, "grad_norm": 0.5948718786239624, "learning_rate": 6.71914451210456e-06, "loss": 0.0504, "step": 52250 }, { "epoch": 3.880885192336254, "grad_norm": 3.3981211185455322, "learning_rate": 6.714688845982474e-06, "loss": 0.0623, "step": 52260 }, { "epoch": 3.8816278033566016, "grad_norm": 1.991584300994873, "learning_rate": 6.71023317986039e-06, "loss": 0.0566, "step": 52270 }, { "epoch": 3.8823704143769495, "grad_norm": 0.5625025629997253, "learning_rate": 6.705777513738304e-06, "loss": 0.043, "step": 52280 }, { "epoch": 3.883113025397297, "grad_norm": 0.8824495673179626, "learning_rate": 6.701321847616219e-06, "loss": 0.0552, "step": 52290 }, { "epoch": 3.8838556364176444, "grad_norm": 0.5333496928215027, "learning_rate": 6.696866181494133e-06, "loss": 0.061, "step": 52300 }, { "epoch": 3.884598247437992, "grad_norm": 0.932508647441864, "learning_rate": 6.692410515372048e-06, "loss": 0.044, "step": 52310 }, { "epoch": 3.8853408584583393, "grad_norm": 0.28375789523124695, "learning_rate": 6.687954849249964e-06, "loss": 0.0445, "step": 52320 }, { "epoch": 3.886083469478687, "grad_norm": 0.8071836829185486, "learning_rate": 6.683499183127878e-06, "loss": 0.1011, "step": 52330 }, { "epoch": 3.8868260804990347, "grad_norm": 1.178012490272522, "learning_rate": 6.679043517005793e-06, "loss": 0.0465, "step": 52340 }, { "epoch": 3.887568691519382, "grad_norm": 0.8823719620704651, "learning_rate": 6.674587850883707e-06, "loss": 0.045, "step": 52350 }, { "epoch": 3.8883113025397296, "grad_norm": 1.4806567430496216, "learning_rate": 6.670132184761622e-06, "loss": 0.0486, "step": 52360 }, { "epoch": 3.889053913560077, "grad_norm": 1.835952877998352, "learning_rate": 6.665676518639536e-06, "loss": 0.0594, "step": 52370 }, { "epoch": 3.889796524580425, "grad_norm": 1.964036226272583, "learning_rate": 6.661220852517452e-06, "loss": 0.0783, "step": 52380 }, { "epoch": 3.8905391356007724, "grad_norm": 0.7686440944671631, "learning_rate": 6.656765186395366e-06, "loss": 0.0448, "step": 52390 }, { "epoch": 3.89128174662112, "grad_norm": 0.5452464818954468, "learning_rate": 6.652309520273281e-06, "loss": 0.0475, "step": 52400 }, { "epoch": 3.8920243576414673, "grad_norm": 1.948075532913208, "learning_rate": 6.647853854151195e-06, "loss": 0.0631, "step": 52410 }, { "epoch": 3.8927669686618147, "grad_norm": 1.347221851348877, "learning_rate": 6.64339818802911e-06, "loss": 0.0502, "step": 52420 }, { "epoch": 3.8935095796821626, "grad_norm": 0.37357455492019653, "learning_rate": 6.6389425219070245e-06, "loss": 0.0583, "step": 52430 }, { "epoch": 3.89425219070251, "grad_norm": 1.4739619493484497, "learning_rate": 6.63448685578494e-06, "loss": 0.0433, "step": 52440 }, { "epoch": 3.8949948017228575, "grad_norm": 0.9696072936058044, "learning_rate": 6.630031189662855e-06, "loss": 0.0642, "step": 52450 }, { "epoch": 3.895737412743205, "grad_norm": 2.1969690322875977, "learning_rate": 6.6255755235407695e-06, "loss": 0.042, "step": 52460 }, { "epoch": 3.8964800237635524, "grad_norm": 1.8756181001663208, "learning_rate": 6.6211198574186845e-06, "loss": 0.0537, "step": 52470 }, { "epoch": 3.8972226347839003, "grad_norm": 0.8761207461357117, "learning_rate": 6.616664191296599e-06, "loss": 0.0534, "step": 52480 }, { "epoch": 3.8979652458042477, "grad_norm": 1.1076685190200806, "learning_rate": 6.612208525174514e-06, "loss": 0.0536, "step": 52490 }, { "epoch": 3.898707856824595, "grad_norm": 0.5765098333358765, "learning_rate": 6.607752859052429e-06, "loss": 0.0318, "step": 52500 }, { "epoch": 3.899450467844943, "grad_norm": 1.015160322189331, "learning_rate": 6.603297192930344e-06, "loss": 0.0931, "step": 52510 }, { "epoch": 3.90019307886529, "grad_norm": 1.242473840713501, "learning_rate": 6.598841526808258e-06, "loss": 0.0466, "step": 52520 }, { "epoch": 3.900935689885638, "grad_norm": 0.7415289878845215, "learning_rate": 6.594385860686173e-06, "loss": 0.0643, "step": 52530 }, { "epoch": 3.9016783009059854, "grad_norm": 1.0019993782043457, "learning_rate": 6.589930194564087e-06, "loss": 0.0633, "step": 52540 }, { "epoch": 3.902420911926333, "grad_norm": 3.3935482501983643, "learning_rate": 6.585474528442002e-06, "loss": 0.071, "step": 52550 }, { "epoch": 3.903163522946681, "grad_norm": 2.050471067428589, "learning_rate": 6.581018862319917e-06, "loss": 0.0605, "step": 52560 }, { "epoch": 3.9039061339670282, "grad_norm": 1.7919012308120728, "learning_rate": 6.576563196197832e-06, "loss": 0.0388, "step": 52570 }, { "epoch": 3.9046487449873757, "grad_norm": 2.5567381381988525, "learning_rate": 6.572107530075747e-06, "loss": 0.072, "step": 52580 }, { "epoch": 3.905391356007723, "grad_norm": 1.0397082567214966, "learning_rate": 6.567651863953661e-06, "loss": 0.0877, "step": 52590 }, { "epoch": 3.9061339670280706, "grad_norm": 0.6068091988563538, "learning_rate": 6.563196197831576e-06, "loss": 0.0714, "step": 52600 }, { "epoch": 3.9068765780484185, "grad_norm": 1.3227723836898804, "learning_rate": 6.55874053170949e-06, "loss": 0.0669, "step": 52610 }, { "epoch": 3.907619189068766, "grad_norm": 1.6749495267868042, "learning_rate": 6.554284865587406e-06, "loss": 0.0689, "step": 52620 }, { "epoch": 3.9083618000891134, "grad_norm": 1.0209654569625854, "learning_rate": 6.54982919946532e-06, "loss": 0.0847, "step": 52630 }, { "epoch": 3.909104411109461, "grad_norm": 1.9074591398239136, "learning_rate": 6.545373533343235e-06, "loss": 0.0379, "step": 52640 }, { "epoch": 3.9098470221298083, "grad_norm": 0.3264058828353882, "learning_rate": 6.540917867221149e-06, "loss": 0.0683, "step": 52650 }, { "epoch": 3.910589633150156, "grad_norm": 1.0117005109786987, "learning_rate": 6.536462201099064e-06, "loss": 0.0578, "step": 52660 }, { "epoch": 3.9113322441705036, "grad_norm": 2.336249589920044, "learning_rate": 6.532006534976979e-06, "loss": 0.0476, "step": 52670 }, { "epoch": 3.912074855190851, "grad_norm": 0.9599561095237732, "learning_rate": 6.527550868854894e-06, "loss": 0.0638, "step": 52680 }, { "epoch": 3.9128174662111985, "grad_norm": 1.9813097715377808, "learning_rate": 6.5230952027328085e-06, "loss": 0.0477, "step": 52690 }, { "epoch": 3.913560077231546, "grad_norm": 1.9416104555130005, "learning_rate": 6.5186395366107235e-06, "loss": 0.1075, "step": 52700 }, { "epoch": 3.914302688251894, "grad_norm": 2.747821569442749, "learning_rate": 6.5141838704886384e-06, "loss": 0.0997, "step": 52710 }, { "epoch": 3.9150452992722413, "grad_norm": 0.8139704465866089, "learning_rate": 6.509728204366553e-06, "loss": 0.0475, "step": 52720 }, { "epoch": 3.9157879102925888, "grad_norm": 3.535797357559204, "learning_rate": 6.5052725382444684e-06, "loss": 0.074, "step": 52730 }, { "epoch": 3.916530521312936, "grad_norm": 1.303336501121521, "learning_rate": 6.500816872122383e-06, "loss": 0.0383, "step": 52740 }, { "epoch": 3.9172731323332837, "grad_norm": 1.1751950979232788, "learning_rate": 6.496361206000298e-06, "loss": 0.0735, "step": 52750 }, { "epoch": 3.9180157433536316, "grad_norm": 0.4327644407749176, "learning_rate": 6.491905539878212e-06, "loss": 0.0379, "step": 52760 }, { "epoch": 3.918758354373979, "grad_norm": 1.1452745199203491, "learning_rate": 6.487449873756127e-06, "loss": 0.0403, "step": 52770 }, { "epoch": 3.9195009653943265, "grad_norm": 0.25140857696533203, "learning_rate": 6.482994207634041e-06, "loss": 0.0589, "step": 52780 }, { "epoch": 3.920243576414674, "grad_norm": 2.8998043537139893, "learning_rate": 6.478538541511957e-06, "loss": 0.0906, "step": 52790 }, { "epoch": 3.9209861874350214, "grad_norm": 0.8305198550224304, "learning_rate": 6.474082875389871e-06, "loss": 0.0389, "step": 52800 }, { "epoch": 3.9217287984553693, "grad_norm": 1.9127947092056274, "learning_rate": 6.469627209267786e-06, "loss": 0.0678, "step": 52810 }, { "epoch": 3.9224714094757167, "grad_norm": 3.7456071376800537, "learning_rate": 6.4651715431457e-06, "loss": 0.0809, "step": 52820 }, { "epoch": 3.923214020496064, "grad_norm": 2.8850257396698, "learning_rate": 6.460715877023615e-06, "loss": 0.0541, "step": 52830 }, { "epoch": 3.9239566315164116, "grad_norm": 0.6639874577522278, "learning_rate": 6.45626021090153e-06, "loss": 0.0512, "step": 52840 }, { "epoch": 3.924699242536759, "grad_norm": 1.1795817613601685, "learning_rate": 6.451804544779445e-06, "loss": 0.0508, "step": 52850 }, { "epoch": 3.925441853557107, "grad_norm": 0.710468590259552, "learning_rate": 6.44734887865736e-06, "loss": 0.0463, "step": 52860 }, { "epoch": 3.9261844645774544, "grad_norm": 1.7391959428787231, "learning_rate": 6.442893212535274e-06, "loss": 0.0873, "step": 52870 }, { "epoch": 3.926927075597802, "grad_norm": 0.5861713290214539, "learning_rate": 6.438437546413189e-06, "loss": 0.0452, "step": 52880 }, { "epoch": 3.9276696866181493, "grad_norm": 0.8153517842292786, "learning_rate": 6.433981880291103e-06, "loss": 0.0403, "step": 52890 }, { "epoch": 3.9284122976384968, "grad_norm": 3.0007896423339844, "learning_rate": 6.429526214169018e-06, "loss": 0.0681, "step": 52900 }, { "epoch": 3.9291549086588446, "grad_norm": 2.487295627593994, "learning_rate": 6.425070548046933e-06, "loss": 0.0734, "step": 52910 }, { "epoch": 3.929897519679192, "grad_norm": 1.7912089824676514, "learning_rate": 6.420614881924848e-06, "loss": 0.0914, "step": 52920 }, { "epoch": 3.9306401306995395, "grad_norm": 3.3400092124938965, "learning_rate": 6.416159215802762e-06, "loss": 0.0571, "step": 52930 }, { "epoch": 3.931382741719887, "grad_norm": 2.8643431663513184, "learning_rate": 6.411703549680677e-06, "loss": 0.0645, "step": 52940 }, { "epoch": 3.9321253527402344, "grad_norm": 0.3890113830566406, "learning_rate": 6.4072478835585915e-06, "loss": 0.0606, "step": 52950 }, { "epoch": 3.9328679637605823, "grad_norm": 0.5684772729873657, "learning_rate": 6.4027922174365065e-06, "loss": 0.0604, "step": 52960 }, { "epoch": 3.93361057478093, "grad_norm": 2.360518217086792, "learning_rate": 6.398336551314422e-06, "loss": 0.063, "step": 52970 }, { "epoch": 3.9343531858012772, "grad_norm": 1.0846983194351196, "learning_rate": 6.3938808851923365e-06, "loss": 0.0421, "step": 52980 }, { "epoch": 3.9350957968216247, "grad_norm": 0.8970616459846497, "learning_rate": 6.3894252190702515e-06, "loss": 0.0558, "step": 52990 }, { "epoch": 3.935838407841972, "grad_norm": 1.57839834690094, "learning_rate": 6.384969552948166e-06, "loss": 0.0796, "step": 53000 }, { "epoch": 3.93658101886232, "grad_norm": 1.5346392393112183, "learning_rate": 6.380513886826081e-06, "loss": 0.0477, "step": 53010 }, { "epoch": 3.9373236298826675, "grad_norm": 1.0439153909683228, "learning_rate": 6.376058220703996e-06, "loss": 0.0641, "step": 53020 }, { "epoch": 3.938066240903015, "grad_norm": 0.8135676383972168, "learning_rate": 6.371602554581911e-06, "loss": 0.0671, "step": 53030 }, { "epoch": 3.9388088519233624, "grad_norm": 2.3363993167877197, "learning_rate": 6.367146888459825e-06, "loss": 0.0387, "step": 53040 }, { "epoch": 3.93955146294371, "grad_norm": 1.2489509582519531, "learning_rate": 6.36269122233774e-06, "loss": 0.0375, "step": 53050 }, { "epoch": 3.9402940739640577, "grad_norm": 1.8481662273406982, "learning_rate": 6.358235556215654e-06, "loss": 0.0492, "step": 53060 }, { "epoch": 3.941036684984405, "grad_norm": 2.6093268394470215, "learning_rate": 6.353779890093569e-06, "loss": 0.0814, "step": 53070 }, { "epoch": 3.9417792960047526, "grad_norm": 1.122709035873413, "learning_rate": 6.349324223971484e-06, "loss": 0.0497, "step": 53080 }, { "epoch": 3.9425219070251005, "grad_norm": 0.5797234177589417, "learning_rate": 6.344868557849399e-06, "loss": 0.0727, "step": 53090 }, { "epoch": 3.9432645180454475, "grad_norm": 0.6840558648109436, "learning_rate": 6.340412891727314e-06, "loss": 0.0617, "step": 53100 }, { "epoch": 3.9440071290657954, "grad_norm": 0.6512880325317383, "learning_rate": 6.335957225605228e-06, "loss": 0.0461, "step": 53110 }, { "epoch": 3.944749740086143, "grad_norm": 0.3409847319126129, "learning_rate": 6.331501559483143e-06, "loss": 0.0355, "step": 53120 }, { "epoch": 3.9454923511064903, "grad_norm": 1.1444979906082153, "learning_rate": 6.327045893361057e-06, "loss": 0.0465, "step": 53130 }, { "epoch": 3.946234962126838, "grad_norm": 2.209327459335327, "learning_rate": 6.322590227238973e-06, "loss": 0.0566, "step": 53140 }, { "epoch": 3.9469775731471857, "grad_norm": 0.7463454604148865, "learning_rate": 6.318134561116887e-06, "loss": 0.0632, "step": 53150 }, { "epoch": 3.947720184167533, "grad_norm": 0.4572658836841583, "learning_rate": 6.313678894994802e-06, "loss": 0.0592, "step": 53160 }, { "epoch": 3.9484627951878806, "grad_norm": 0.3454363942146301, "learning_rate": 6.309223228872716e-06, "loss": 0.0801, "step": 53170 }, { "epoch": 3.949205406208228, "grad_norm": 1.0574936866760254, "learning_rate": 6.304767562750631e-06, "loss": 0.0735, "step": 53180 }, { "epoch": 3.949948017228576, "grad_norm": 1.7936334609985352, "learning_rate": 6.3003118966285455e-06, "loss": 0.0525, "step": 53190 }, { "epoch": 3.9506906282489234, "grad_norm": 1.0691609382629395, "learning_rate": 6.295856230506461e-06, "loss": 0.0766, "step": 53200 }, { "epoch": 3.951433239269271, "grad_norm": 2.9121530055999756, "learning_rate": 6.2914005643843755e-06, "loss": 0.0436, "step": 53210 }, { "epoch": 3.9521758502896183, "grad_norm": 1.2134883403778076, "learning_rate": 6.2869448982622904e-06, "loss": 0.0619, "step": 53220 }, { "epoch": 3.9529184613099657, "grad_norm": 3.7583858966827393, "learning_rate": 6.282489232140205e-06, "loss": 0.0716, "step": 53230 }, { "epoch": 3.9536610723303136, "grad_norm": 0.6128131151199341, "learning_rate": 6.27803356601812e-06, "loss": 0.0474, "step": 53240 }, { "epoch": 3.954403683350661, "grad_norm": 2.04298734664917, "learning_rate": 6.273577899896035e-06, "loss": 0.0499, "step": 53250 }, { "epoch": 3.9551462943710085, "grad_norm": 2.507197618484497, "learning_rate": 6.26912223377395e-06, "loss": 0.0556, "step": 53260 }, { "epoch": 3.955888905391356, "grad_norm": 2.27921986579895, "learning_rate": 6.2646665676518646e-06, "loss": 0.0311, "step": 53270 }, { "epoch": 3.9566315164117034, "grad_norm": 1.2935408353805542, "learning_rate": 6.260210901529779e-06, "loss": 0.0436, "step": 53280 }, { "epoch": 3.9573741274320513, "grad_norm": 1.6734613180160522, "learning_rate": 6.255755235407694e-06, "loss": 0.0638, "step": 53290 }, { "epoch": 3.9581167384523988, "grad_norm": 2.51885724067688, "learning_rate": 6.251299569285608e-06, "loss": 0.063, "step": 53300 }, { "epoch": 3.958859349472746, "grad_norm": 1.2505862712860107, "learning_rate": 6.246843903163523e-06, "loss": 0.0401, "step": 53310 }, { "epoch": 3.9596019604930937, "grad_norm": 0.8314433097839355, "learning_rate": 6.242388237041438e-06, "loss": 0.0514, "step": 53320 }, { "epoch": 3.960344571513441, "grad_norm": 1.3497314453125, "learning_rate": 6.237932570919353e-06, "loss": 0.0438, "step": 53330 }, { "epoch": 3.961087182533789, "grad_norm": 0.9097846746444702, "learning_rate": 6.233476904797267e-06, "loss": 0.0561, "step": 53340 }, { "epoch": 3.9618297935541364, "grad_norm": 2.9179575443267822, "learning_rate": 6.229021238675182e-06, "loss": 0.0492, "step": 53350 }, { "epoch": 3.962572404574484, "grad_norm": 1.4877798557281494, "learning_rate": 6.224565572553096e-06, "loss": 0.0712, "step": 53360 }, { "epoch": 3.9633150155948313, "grad_norm": 0.9369436502456665, "learning_rate": 6.220109906431012e-06, "loss": 0.075, "step": 53370 }, { "epoch": 3.964057626615179, "grad_norm": 1.126839280128479, "learning_rate": 6.215654240308927e-06, "loss": 0.0503, "step": 53380 }, { "epoch": 3.9648002376355267, "grad_norm": 1.7786030769348145, "learning_rate": 6.211198574186841e-06, "loss": 0.0628, "step": 53390 }, { "epoch": 3.965542848655874, "grad_norm": 1.3987665176391602, "learning_rate": 6.206742908064756e-06, "loss": 0.0942, "step": 53400 }, { "epoch": 3.9662854596762216, "grad_norm": 0.4520890712738037, "learning_rate": 6.20228724194267e-06, "loss": 0.0451, "step": 53410 }, { "epoch": 3.967028070696569, "grad_norm": 1.6644339561462402, "learning_rate": 6.197831575820585e-06, "loss": 0.0483, "step": 53420 }, { "epoch": 3.9677706817169165, "grad_norm": 1.37549889087677, "learning_rate": 6.1933759096985e-06, "loss": 0.0521, "step": 53430 }, { "epoch": 3.9685132927372644, "grad_norm": 2.7035961151123047, "learning_rate": 6.188920243576415e-06, "loss": 0.0649, "step": 53440 }, { "epoch": 3.969255903757612, "grad_norm": 1.5665085315704346, "learning_rate": 6.184464577454329e-06, "loss": 0.0313, "step": 53450 }, { "epoch": 3.9699985147779593, "grad_norm": 1.6840803623199463, "learning_rate": 6.180008911332244e-06, "loss": 0.0966, "step": 53460 }, { "epoch": 3.9707411257983067, "grad_norm": 2.5111865997314453, "learning_rate": 6.1755532452101585e-06, "loss": 0.0306, "step": 53470 }, { "epoch": 3.971483736818654, "grad_norm": 1.9524160623550415, "learning_rate": 6.1710975790880735e-06, "loss": 0.0649, "step": 53480 }, { "epoch": 3.972226347839002, "grad_norm": 1.0792032480239868, "learning_rate": 6.1666419129659885e-06, "loss": 0.07, "step": 53490 }, { "epoch": 3.9729689588593495, "grad_norm": 1.4523978233337402, "learning_rate": 6.1621862468439035e-06, "loss": 0.0678, "step": 53500 }, { "epoch": 3.973711569879697, "grad_norm": 2.432481527328491, "learning_rate": 6.1577305807218185e-06, "loss": 0.0852, "step": 53510 }, { "epoch": 3.9744541809000444, "grad_norm": 1.813653588294983, "learning_rate": 6.153274914599733e-06, "loss": 0.0837, "step": 53520 }, { "epoch": 3.975196791920392, "grad_norm": 0.7046365141868591, "learning_rate": 6.148819248477648e-06, "loss": 0.056, "step": 53530 }, { "epoch": 3.9759394029407398, "grad_norm": 2.6574409008026123, "learning_rate": 6.144363582355562e-06, "loss": 0.0592, "step": 53540 }, { "epoch": 3.9766820139610872, "grad_norm": 2.38148832321167, "learning_rate": 6.139907916233478e-06, "loss": 0.0468, "step": 53550 }, { "epoch": 3.9774246249814347, "grad_norm": 1.9013030529022217, "learning_rate": 6.135452250111392e-06, "loss": 0.0815, "step": 53560 }, { "epoch": 3.978167236001782, "grad_norm": 0.9605785608291626, "learning_rate": 6.130996583989307e-06, "loss": 0.0733, "step": 53570 }, { "epoch": 3.9789098470221296, "grad_norm": 0.8357670903205872, "learning_rate": 6.126540917867221e-06, "loss": 0.0657, "step": 53580 }, { "epoch": 3.9796524580424775, "grad_norm": 2.5137290954589844, "learning_rate": 6.122085251745136e-06, "loss": 0.0539, "step": 53590 }, { "epoch": 3.980395069062825, "grad_norm": 0.9419310688972473, "learning_rate": 6.11762958562305e-06, "loss": 0.0589, "step": 53600 }, { "epoch": 3.9811376800831724, "grad_norm": 1.140599012374878, "learning_rate": 6.113173919500966e-06, "loss": 0.0427, "step": 53610 }, { "epoch": 3.98188029110352, "grad_norm": 1.1350241899490356, "learning_rate": 6.10871825337888e-06, "loss": 0.0699, "step": 53620 }, { "epoch": 3.9826229021238673, "grad_norm": 1.3582016229629517, "learning_rate": 6.104262587256795e-06, "loss": 0.0442, "step": 53630 }, { "epoch": 3.983365513144215, "grad_norm": 1.5812486410140991, "learning_rate": 6.09980692113471e-06, "loss": 0.0749, "step": 53640 }, { "epoch": 3.9841081241645626, "grad_norm": 0.9100233912467957, "learning_rate": 6.095351255012624e-06, "loss": 0.0737, "step": 53650 }, { "epoch": 3.98485073518491, "grad_norm": 0.8678179383277893, "learning_rate": 6.090895588890539e-06, "loss": 0.0367, "step": 53660 }, { "epoch": 3.985593346205258, "grad_norm": 2.191160202026367, "learning_rate": 6.086439922768454e-06, "loss": 0.0931, "step": 53670 }, { "epoch": 3.986335957225605, "grad_norm": 1.9324291944503784, "learning_rate": 6.081984256646369e-06, "loss": 0.0731, "step": 53680 }, { "epoch": 3.987078568245953, "grad_norm": 0.9478018879890442, "learning_rate": 6.077528590524283e-06, "loss": 0.0615, "step": 53690 }, { "epoch": 3.9878211792663003, "grad_norm": 1.2666985988616943, "learning_rate": 6.073072924402198e-06, "loss": 0.0543, "step": 53700 }, { "epoch": 3.9885637902866478, "grad_norm": 1.0880041122436523, "learning_rate": 6.0686172582801125e-06, "loss": 0.0707, "step": 53710 }, { "epoch": 3.9893064013069957, "grad_norm": 0.5105621814727783, "learning_rate": 6.064161592158028e-06, "loss": 0.0686, "step": 53720 }, { "epoch": 3.990049012327343, "grad_norm": 1.3909556865692139, "learning_rate": 6.0597059260359424e-06, "loss": 0.0491, "step": 53730 }, { "epoch": 3.9907916233476906, "grad_norm": 3.0516223907470703, "learning_rate": 6.0552502599138574e-06, "loss": 0.0681, "step": 53740 }, { "epoch": 3.991534234368038, "grad_norm": 0.9059837460517883, "learning_rate": 6.050794593791772e-06, "loss": 0.034, "step": 53750 }, { "epoch": 3.9922768453883855, "grad_norm": 2.0982439517974854, "learning_rate": 6.046338927669687e-06, "loss": 0.0533, "step": 53760 }, { "epoch": 3.9930194564087333, "grad_norm": 1.3619189262390137, "learning_rate": 6.041883261547602e-06, "loss": 0.053, "step": 53770 }, { "epoch": 3.993762067429081, "grad_norm": 0.7172215580940247, "learning_rate": 6.0374275954255166e-06, "loss": 0.0728, "step": 53780 }, { "epoch": 3.9945046784494282, "grad_norm": 1.9413235187530518, "learning_rate": 6.0329719293034316e-06, "loss": 0.0779, "step": 53790 }, { "epoch": 3.9952472894697757, "grad_norm": 2.7654523849487305, "learning_rate": 6.028516263181346e-06, "loss": 0.0537, "step": 53800 }, { "epoch": 3.995989900490123, "grad_norm": 1.1179438829421997, "learning_rate": 6.024060597059261e-06, "loss": 0.0586, "step": 53810 }, { "epoch": 3.996732511510471, "grad_norm": 2.428178071975708, "learning_rate": 6.019604930937175e-06, "loss": 0.0714, "step": 53820 }, { "epoch": 3.9974751225308185, "grad_norm": 1.2658203840255737, "learning_rate": 6.01514926481509e-06, "loss": 0.0462, "step": 53830 }, { "epoch": 3.998217733551166, "grad_norm": 0.42739203572273254, "learning_rate": 6.010693598693005e-06, "loss": 0.0538, "step": 53840 }, { "epoch": 3.9989603445715134, "grad_norm": 0.2762404978275299, "learning_rate": 6.00623793257092e-06, "loss": 0.0637, "step": 53850 }, { "epoch": 3.999702955591861, "grad_norm": 0.41970357298851013, "learning_rate": 6.001782266448834e-06, "loss": 0.0443, "step": 53860 }, { "epoch": 4.0, "eval_f1": 0.0, "eval_loss": 0.05353143438696861, "eval_runtime": 798.3212, "eval_samples_per_second": 47.624, "eval_steps_per_second": 2.977, "step": 53864 }, { "epoch": 4.000445566612209, "grad_norm": 1.3657422065734863, "learning_rate": 5.997326600326749e-06, "loss": 0.0483, "step": 53870 }, { "epoch": 4.001188177632556, "grad_norm": 1.7870548963546753, "learning_rate": 5.992870934204663e-06, "loss": 0.0755, "step": 53880 }, { "epoch": 4.001930788652904, "grad_norm": 1.3454580307006836, "learning_rate": 5.988415268082578e-06, "loss": 0.0706, "step": 53890 }, { "epoch": 4.0026733996732515, "grad_norm": 1.4166697263717651, "learning_rate": 5.983959601960494e-06, "loss": 0.0645, "step": 53900 }, { "epoch": 4.0034160106935985, "grad_norm": 1.9602590799331665, "learning_rate": 5.979503935838408e-06, "loss": 0.0707, "step": 53910 }, { "epoch": 4.004158621713946, "grad_norm": 3.5693228244781494, "learning_rate": 5.975048269716323e-06, "loss": 0.0585, "step": 53920 }, { "epoch": 4.004901232734293, "grad_norm": 3.0163803100585938, "learning_rate": 5.970592603594237e-06, "loss": 0.0689, "step": 53930 }, { "epoch": 4.005643843754641, "grad_norm": 0.6391358375549316, "learning_rate": 5.966136937472152e-06, "loss": 0.0442, "step": 53940 }, { "epoch": 4.006386454774989, "grad_norm": 0.6875801086425781, "learning_rate": 5.961681271350066e-06, "loss": 0.0441, "step": 53950 }, { "epoch": 4.007129065795336, "grad_norm": 2.397939443588257, "learning_rate": 5.957225605227982e-06, "loss": 0.067, "step": 53960 }, { "epoch": 4.007871676815684, "grad_norm": 1.6307581663131714, "learning_rate": 5.952769939105896e-06, "loss": 0.0316, "step": 53970 }, { "epoch": 4.008614287836031, "grad_norm": 1.195626974105835, "learning_rate": 5.948314272983811e-06, "loss": 0.0583, "step": 53980 }, { "epoch": 4.009356898856379, "grad_norm": 2.5990028381347656, "learning_rate": 5.9438586068617255e-06, "loss": 0.079, "step": 53990 }, { "epoch": 4.010099509876727, "grad_norm": 1.327807068824768, "learning_rate": 5.9394029407396405e-06, "loss": 0.0702, "step": 54000 }, { "epoch": 4.010842120897074, "grad_norm": 2.4777891635894775, "learning_rate": 5.9349472746175555e-06, "loss": 0.049, "step": 54010 }, { "epoch": 4.011584731917422, "grad_norm": 1.1965991258621216, "learning_rate": 5.9304916084954705e-06, "loss": 0.0718, "step": 54020 }, { "epoch": 4.012327342937769, "grad_norm": 0.5554696321487427, "learning_rate": 5.9260359423733855e-06, "loss": 0.0433, "step": 54030 }, { "epoch": 4.013069953958117, "grad_norm": 1.6719565391540527, "learning_rate": 5.9215802762513e-06, "loss": 0.074, "step": 54040 }, { "epoch": 4.013812564978465, "grad_norm": 0.40224915742874146, "learning_rate": 5.917124610129215e-06, "loss": 0.0552, "step": 54050 }, { "epoch": 4.014555175998812, "grad_norm": 0.9646987915039062, "learning_rate": 5.912668944007129e-06, "loss": 0.0513, "step": 54060 }, { "epoch": 4.0152977870191595, "grad_norm": 2.081406593322754, "learning_rate": 5.908213277885045e-06, "loss": 0.0644, "step": 54070 }, { "epoch": 4.0160403980395065, "grad_norm": 1.1674538850784302, "learning_rate": 5.903757611762959e-06, "loss": 0.048, "step": 54080 }, { "epoch": 4.016783009059854, "grad_norm": 0.48358213901519775, "learning_rate": 5.899301945640874e-06, "loss": 0.0558, "step": 54090 }, { "epoch": 4.017525620080202, "grad_norm": 0.6902279257774353, "learning_rate": 5.894846279518788e-06, "loss": 0.0671, "step": 54100 }, { "epoch": 4.018268231100549, "grad_norm": 1.8209834098815918, "learning_rate": 5.890390613396703e-06, "loss": 0.0673, "step": 54110 }, { "epoch": 4.019010842120897, "grad_norm": 0.6729385256767273, "learning_rate": 5.885934947274617e-06, "loss": 0.0378, "step": 54120 }, { "epoch": 4.019753453141244, "grad_norm": 0.9727711081504822, "learning_rate": 5.881479281152533e-06, "loss": 0.0405, "step": 54130 }, { "epoch": 4.020496064161592, "grad_norm": 0.18303215503692627, "learning_rate": 5.877023615030447e-06, "loss": 0.0852, "step": 54140 }, { "epoch": 4.02123867518194, "grad_norm": 1.9007530212402344, "learning_rate": 5.872567948908362e-06, "loss": 0.0734, "step": 54150 }, { "epoch": 4.021981286202287, "grad_norm": 1.277612566947937, "learning_rate": 5.868112282786277e-06, "loss": 0.055, "step": 54160 }, { "epoch": 4.022723897222635, "grad_norm": 0.40179601311683655, "learning_rate": 5.863656616664191e-06, "loss": 0.0591, "step": 54170 }, { "epoch": 4.023466508242982, "grad_norm": 0.6786103248596191, "learning_rate": 5.859200950542106e-06, "loss": 0.0633, "step": 54180 }, { "epoch": 4.02420911926333, "grad_norm": 2.203526496887207, "learning_rate": 5.854745284420021e-06, "loss": 0.0759, "step": 54190 }, { "epoch": 4.024951730283678, "grad_norm": 1.4047437906265259, "learning_rate": 5.850289618297936e-06, "loss": 0.0761, "step": 54200 }, { "epoch": 4.025694341304025, "grad_norm": 1.809567928314209, "learning_rate": 5.84583395217585e-06, "loss": 0.0656, "step": 54210 }, { "epoch": 4.026436952324373, "grad_norm": 1.946758508682251, "learning_rate": 5.841378286053765e-06, "loss": 0.0509, "step": 54220 }, { "epoch": 4.02717956334472, "grad_norm": 1.9087995290756226, "learning_rate": 5.8369226199316795e-06, "loss": 0.0697, "step": 54230 }, { "epoch": 4.0279221743650675, "grad_norm": 0.6901232600212097, "learning_rate": 5.8324669538095944e-06, "loss": 0.0574, "step": 54240 }, { "epoch": 4.028664785385415, "grad_norm": 1.0720127820968628, "learning_rate": 5.8280112876875094e-06, "loss": 0.0663, "step": 54250 }, { "epoch": 4.029407396405762, "grad_norm": 1.3058216571807861, "learning_rate": 5.8235556215654244e-06, "loss": 0.046, "step": 54260 }, { "epoch": 4.03015000742611, "grad_norm": 0.79048752784729, "learning_rate": 5.819099955443339e-06, "loss": 0.0757, "step": 54270 }, { "epoch": 4.030892618446458, "grad_norm": 2.9405195713043213, "learning_rate": 5.814644289321254e-06, "loss": 0.0786, "step": 54280 }, { "epoch": 4.031635229466805, "grad_norm": 2.264370918273926, "learning_rate": 5.8101886231991686e-06, "loss": 0.0659, "step": 54290 }, { "epoch": 4.032377840487153, "grad_norm": 0.7332669496536255, "learning_rate": 5.805732957077083e-06, "loss": 0.0595, "step": 54300 }, { "epoch": 4.0331204515075, "grad_norm": 2.585928440093994, "learning_rate": 5.8012772909549986e-06, "loss": 0.0937, "step": 54310 }, { "epoch": 4.033863062527848, "grad_norm": 0.5826427340507507, "learning_rate": 5.796821624832913e-06, "loss": 0.0509, "step": 54320 }, { "epoch": 4.034605673548196, "grad_norm": 1.9038110971450806, "learning_rate": 5.792365958710828e-06, "loss": 0.0811, "step": 54330 }, { "epoch": 4.035348284568543, "grad_norm": 0.9601470232009888, "learning_rate": 5.787910292588742e-06, "loss": 0.0506, "step": 54340 }, { "epoch": 4.036090895588891, "grad_norm": 0.9888731241226196, "learning_rate": 5.783454626466657e-06, "loss": 0.0703, "step": 54350 }, { "epoch": 4.036833506609238, "grad_norm": 0.9209118485450745, "learning_rate": 5.778998960344572e-06, "loss": 0.0736, "step": 54360 }, { "epoch": 4.037576117629586, "grad_norm": 2.1321849822998047, "learning_rate": 5.774543294222487e-06, "loss": 0.058, "step": 54370 }, { "epoch": 4.038318728649934, "grad_norm": 0.4780378043651581, "learning_rate": 5.770087628100401e-06, "loss": 0.0451, "step": 54380 }, { "epoch": 4.039061339670281, "grad_norm": 0.6175082921981812, "learning_rate": 5.765631961978316e-06, "loss": 0.0457, "step": 54390 }, { "epoch": 4.0398039506906285, "grad_norm": 0.8569179773330688, "learning_rate": 5.76117629585623e-06, "loss": 0.063, "step": 54400 }, { "epoch": 4.0405465617109755, "grad_norm": 0.5725436210632324, "learning_rate": 5.756720629734145e-06, "loss": 0.0555, "step": 54410 }, { "epoch": 4.041289172731323, "grad_norm": 1.1730387210845947, "learning_rate": 5.752264963612061e-06, "loss": 0.0586, "step": 54420 }, { "epoch": 4.042031783751671, "grad_norm": 1.527571678161621, "learning_rate": 5.747809297489975e-06, "loss": 0.0381, "step": 54430 }, { "epoch": 4.042774394772018, "grad_norm": 1.1510217189788818, "learning_rate": 5.74335363136789e-06, "loss": 0.0494, "step": 54440 }, { "epoch": 4.043517005792366, "grad_norm": 2.92785382270813, "learning_rate": 5.738897965245804e-06, "loss": 0.0484, "step": 54450 }, { "epoch": 4.044259616812713, "grad_norm": 1.290939211845398, "learning_rate": 5.734442299123719e-06, "loss": 0.0585, "step": 54460 }, { "epoch": 4.045002227833061, "grad_norm": 1.4122437238693237, "learning_rate": 5.729986633001633e-06, "loss": 0.0517, "step": 54470 }, { "epoch": 4.045744838853409, "grad_norm": 1.2548693418502808, "learning_rate": 5.725530966879549e-06, "loss": 0.0642, "step": 54480 }, { "epoch": 4.046487449873756, "grad_norm": 2.637096405029297, "learning_rate": 5.721075300757463e-06, "loss": 0.0761, "step": 54490 }, { "epoch": 4.047230060894104, "grad_norm": 1.5033966302871704, "learning_rate": 5.716619634635378e-06, "loss": 0.0705, "step": 54500 }, { "epoch": 4.047972671914451, "grad_norm": 0.40369749069213867, "learning_rate": 5.7121639685132925e-06, "loss": 0.0678, "step": 54510 }, { "epoch": 4.048715282934799, "grad_norm": 2.240159511566162, "learning_rate": 5.7077083023912075e-06, "loss": 0.0721, "step": 54520 }, { "epoch": 4.049457893955147, "grad_norm": 2.353022336959839, "learning_rate": 5.703252636269122e-06, "loss": 0.0597, "step": 54530 }, { "epoch": 4.050200504975494, "grad_norm": 0.9646291732788086, "learning_rate": 5.6987969701470375e-06, "loss": 0.049, "step": 54540 }, { "epoch": 4.050943115995842, "grad_norm": 2.1946423053741455, "learning_rate": 5.6943413040249525e-06, "loss": 0.062, "step": 54550 }, { "epoch": 4.051685727016189, "grad_norm": 2.869677782058716, "learning_rate": 5.689885637902867e-06, "loss": 0.0553, "step": 54560 }, { "epoch": 4.0524283380365365, "grad_norm": 3.2481913566589355, "learning_rate": 5.685429971780782e-06, "loss": 0.0373, "step": 54570 }, { "epoch": 4.053170949056884, "grad_norm": 1.6126598119735718, "learning_rate": 5.680974305658696e-06, "loss": 0.0596, "step": 54580 }, { "epoch": 4.053913560077231, "grad_norm": 1.128563404083252, "learning_rate": 5.676518639536611e-06, "loss": 0.0334, "step": 54590 }, { "epoch": 4.054656171097579, "grad_norm": 2.0686044692993164, "learning_rate": 5.672062973414526e-06, "loss": 0.0672, "step": 54600 }, { "epoch": 4.055398782117926, "grad_norm": 0.4722823202610016, "learning_rate": 5.667607307292441e-06, "loss": 0.0368, "step": 54610 }, { "epoch": 4.056141393138274, "grad_norm": 1.7301435470581055, "learning_rate": 5.663151641170355e-06, "loss": 0.0736, "step": 54620 }, { "epoch": 4.056884004158622, "grad_norm": 0.7229349613189697, "learning_rate": 5.65869597504827e-06, "loss": 0.0901, "step": 54630 }, { "epoch": 4.057626615178969, "grad_norm": 2.0059196949005127, "learning_rate": 5.654240308926184e-06, "loss": 0.0608, "step": 54640 }, { "epoch": 4.058369226199317, "grad_norm": 0.5810075402259827, "learning_rate": 5.649784642804099e-06, "loss": 0.0438, "step": 54650 }, { "epoch": 4.059111837219664, "grad_norm": 1.114890456199646, "learning_rate": 5.645328976682014e-06, "loss": 0.0457, "step": 54660 }, { "epoch": 4.059854448240012, "grad_norm": 1.2894550561904907, "learning_rate": 5.640873310559929e-06, "loss": 0.0744, "step": 54670 }, { "epoch": 4.06059705926036, "grad_norm": 0.7137770652770996, "learning_rate": 5.636417644437843e-06, "loss": 0.0785, "step": 54680 }, { "epoch": 4.061339670280707, "grad_norm": 2.466371774673462, "learning_rate": 5.631961978315758e-06, "loss": 0.0536, "step": 54690 }, { "epoch": 4.062082281301055, "grad_norm": 0.39749160408973694, "learning_rate": 5.627506312193673e-06, "loss": 0.0582, "step": 54700 }, { "epoch": 4.062824892321402, "grad_norm": 1.1517406702041626, "learning_rate": 5.623050646071588e-06, "loss": 0.0545, "step": 54710 }, { "epoch": 4.0635675033417495, "grad_norm": 0.7695569396018982, "learning_rate": 5.618594979949503e-06, "loss": 0.0449, "step": 54720 }, { "epoch": 4.064310114362097, "grad_norm": 0.4918759763240814, "learning_rate": 5.614139313827417e-06, "loss": 0.0566, "step": 54730 }, { "epoch": 4.065052725382444, "grad_norm": 1.6468467712402344, "learning_rate": 5.609683647705332e-06, "loss": 0.052, "step": 54740 }, { "epoch": 4.065795336402792, "grad_norm": 1.0345146656036377, "learning_rate": 5.6052279815832464e-06, "loss": 0.0394, "step": 54750 }, { "epoch": 4.066537947423139, "grad_norm": 3.2710659503936768, "learning_rate": 5.6007723154611614e-06, "loss": 0.071, "step": 54760 }, { "epoch": 4.067280558443487, "grad_norm": 2.8637590408325195, "learning_rate": 5.5963166493390764e-06, "loss": 0.0476, "step": 54770 }, { "epoch": 4.068023169463835, "grad_norm": 0.5226951837539673, "learning_rate": 5.5918609832169914e-06, "loss": 0.0442, "step": 54780 }, { "epoch": 4.068765780484182, "grad_norm": 0.48805347084999084, "learning_rate": 5.587405317094906e-06, "loss": 0.044, "step": 54790 }, { "epoch": 4.06950839150453, "grad_norm": 0.6936028599739075, "learning_rate": 5.5829496509728206e-06, "loss": 0.0593, "step": 54800 }, { "epoch": 4.070251002524877, "grad_norm": 2.0860705375671387, "learning_rate": 5.578493984850735e-06, "loss": 0.0653, "step": 54810 }, { "epoch": 4.070993613545225, "grad_norm": 1.1754940748214722, "learning_rate": 5.57403831872865e-06, "loss": 0.0628, "step": 54820 }, { "epoch": 4.071736224565573, "grad_norm": 1.029878854751587, "learning_rate": 5.5695826526065656e-06, "loss": 0.0553, "step": 54830 }, { "epoch": 4.07247883558592, "grad_norm": 0.6767436861991882, "learning_rate": 5.56512698648448e-06, "loss": 0.0671, "step": 54840 }, { "epoch": 4.073221446606268, "grad_norm": 1.4418343305587769, "learning_rate": 5.560671320362395e-06, "loss": 0.0582, "step": 54850 }, { "epoch": 4.073964057626615, "grad_norm": 0.44620373845100403, "learning_rate": 5.556215654240309e-06, "loss": 0.0845, "step": 54860 }, { "epoch": 4.074706668646963, "grad_norm": 1.5117290019989014, "learning_rate": 5.551759988118224e-06, "loss": 0.0614, "step": 54870 }, { "epoch": 4.0754492796673105, "grad_norm": 1.3769358396530151, "learning_rate": 5.547304321996138e-06, "loss": 0.0551, "step": 54880 }, { "epoch": 4.0761918906876575, "grad_norm": 1.5998951196670532, "learning_rate": 5.542848655874054e-06, "loss": 0.0743, "step": 54890 }, { "epoch": 4.076934501708005, "grad_norm": 1.0898070335388184, "learning_rate": 5.538392989751968e-06, "loss": 0.0667, "step": 54900 }, { "epoch": 4.077677112728353, "grad_norm": 1.2157825231552124, "learning_rate": 5.533937323629883e-06, "loss": 0.0511, "step": 54910 }, { "epoch": 4.0784197237487, "grad_norm": 2.0952835083007812, "learning_rate": 5.529481657507797e-06, "loss": 0.0447, "step": 54920 }, { "epoch": 4.079162334769048, "grad_norm": 1.1729780435562134, "learning_rate": 5.525025991385712e-06, "loss": 0.1045, "step": 54930 }, { "epoch": 4.079904945789395, "grad_norm": 1.0873030424118042, "learning_rate": 5.520570325263626e-06, "loss": 0.062, "step": 54940 }, { "epoch": 4.080647556809743, "grad_norm": 0.5280413627624512, "learning_rate": 5.516114659141542e-06, "loss": 0.0579, "step": 54950 }, { "epoch": 4.081390167830091, "grad_norm": 1.2786023616790771, "learning_rate": 5.511658993019457e-06, "loss": 0.0453, "step": 54960 }, { "epoch": 4.082132778850438, "grad_norm": 1.6670242547988892, "learning_rate": 5.507203326897371e-06, "loss": 0.0718, "step": 54970 }, { "epoch": 4.082875389870786, "grad_norm": 0.9172728657722473, "learning_rate": 5.502747660775286e-06, "loss": 0.0472, "step": 54980 }, { "epoch": 4.083618000891133, "grad_norm": 0.3648128807544708, "learning_rate": 5.4982919946532e-06, "loss": 0.0824, "step": 54990 }, { "epoch": 4.084360611911481, "grad_norm": 3.186699151992798, "learning_rate": 5.493836328531115e-06, "loss": 0.0656, "step": 55000 }, { "epoch": 4.085103222931829, "grad_norm": 1.5202915668487549, "learning_rate": 5.48938066240903e-06, "loss": 0.0586, "step": 55010 }, { "epoch": 4.085845833952176, "grad_norm": 1.0740153789520264, "learning_rate": 5.484924996286945e-06, "loss": 0.0612, "step": 55020 }, { "epoch": 4.086588444972524, "grad_norm": 1.715743899345398, "learning_rate": 5.4804693301648595e-06, "loss": 0.0613, "step": 55030 }, { "epoch": 4.087331055992871, "grad_norm": 2.118396520614624, "learning_rate": 5.4760136640427745e-06, "loss": 0.0698, "step": 55040 }, { "epoch": 4.0880736670132185, "grad_norm": 0.9983229637145996, "learning_rate": 5.471557997920689e-06, "loss": 0.0509, "step": 55050 }, { "epoch": 4.088816278033566, "grad_norm": 2.4664018154144287, "learning_rate": 5.4671023317986045e-06, "loss": 0.0678, "step": 55060 }, { "epoch": 4.089558889053913, "grad_norm": 0.848099410533905, "learning_rate": 5.462646665676519e-06, "loss": 0.0499, "step": 55070 }, { "epoch": 4.090301500074261, "grad_norm": 1.8434141874313354, "learning_rate": 5.458190999554434e-06, "loss": 0.0476, "step": 55080 }, { "epoch": 4.091044111094608, "grad_norm": 1.2754813432693481, "learning_rate": 5.453735333432349e-06, "loss": 0.0586, "step": 55090 }, { "epoch": 4.091786722114956, "grad_norm": 0.441261887550354, "learning_rate": 5.449279667310263e-06, "loss": 0.0583, "step": 55100 }, { "epoch": 4.092529333135304, "grad_norm": 1.066699743270874, "learning_rate": 5.444824001188178e-06, "loss": 0.0658, "step": 55110 }, { "epoch": 4.093271944155651, "grad_norm": 0.8867554664611816, "learning_rate": 5.440368335066093e-06, "loss": 0.0909, "step": 55120 }, { "epoch": 4.094014555175999, "grad_norm": 0.7380247116088867, "learning_rate": 5.435912668944008e-06, "loss": 0.0538, "step": 55130 }, { "epoch": 4.094757166196346, "grad_norm": 1.3226486444473267, "learning_rate": 5.431457002821922e-06, "loss": 0.0688, "step": 55140 }, { "epoch": 4.095499777216694, "grad_norm": 1.1807446479797363, "learning_rate": 5.427001336699837e-06, "loss": 0.0498, "step": 55150 }, { "epoch": 4.096242388237042, "grad_norm": 1.0485824346542358, "learning_rate": 5.422545670577751e-06, "loss": 0.0652, "step": 55160 }, { "epoch": 4.096984999257389, "grad_norm": 0.35923972725868225, "learning_rate": 5.418090004455666e-06, "loss": 0.0483, "step": 55170 }, { "epoch": 4.097727610277737, "grad_norm": 0.8954094648361206, "learning_rate": 5.413634338333581e-06, "loss": 0.0722, "step": 55180 }, { "epoch": 4.098470221298084, "grad_norm": 2.760606288909912, "learning_rate": 5.409178672211496e-06, "loss": 0.0826, "step": 55190 }, { "epoch": 4.099212832318432, "grad_norm": 1.2169206142425537, "learning_rate": 5.40472300608941e-06, "loss": 0.043, "step": 55200 }, { "epoch": 4.0999554433387795, "grad_norm": 0.5677967667579651, "learning_rate": 5.400267339967325e-06, "loss": 0.0508, "step": 55210 }, { "epoch": 4.1006980543591265, "grad_norm": 0.8861594200134277, "learning_rate": 5.39581167384524e-06, "loss": 0.0754, "step": 55220 }, { "epoch": 4.101440665379474, "grad_norm": 1.1130675077438354, "learning_rate": 5.391356007723154e-06, "loss": 0.0487, "step": 55230 }, { "epoch": 4.102183276399821, "grad_norm": 1.4723432064056396, "learning_rate": 5.38690034160107e-06, "loss": 0.0622, "step": 55240 }, { "epoch": 4.102925887420169, "grad_norm": 2.4365248680114746, "learning_rate": 5.382444675478984e-06, "loss": 0.0429, "step": 55250 }, { "epoch": 4.103668498440517, "grad_norm": 2.329158067703247, "learning_rate": 5.377989009356899e-06, "loss": 0.0546, "step": 55260 }, { "epoch": 4.104411109460864, "grad_norm": 0.6031007170677185, "learning_rate": 5.3735333432348134e-06, "loss": 0.0434, "step": 55270 }, { "epoch": 4.105153720481212, "grad_norm": 1.8432621955871582, "learning_rate": 5.3690776771127284e-06, "loss": 0.0504, "step": 55280 }, { "epoch": 4.105896331501559, "grad_norm": 4.149932384490967, "learning_rate": 5.364622010990643e-06, "loss": 0.0487, "step": 55290 }, { "epoch": 4.106638942521907, "grad_norm": 0.8511655926704407, "learning_rate": 5.360166344868558e-06, "loss": 0.0491, "step": 55300 }, { "epoch": 4.107381553542255, "grad_norm": 0.9837514162063599, "learning_rate": 5.3557106787464726e-06, "loss": 0.0698, "step": 55310 }, { "epoch": 4.108124164562602, "grad_norm": 3.7656993865966797, "learning_rate": 5.3512550126243876e-06, "loss": 0.0776, "step": 55320 }, { "epoch": 4.10886677558295, "grad_norm": 3.2895989418029785, "learning_rate": 5.346799346502302e-06, "loss": 0.0607, "step": 55330 }, { "epoch": 4.109609386603297, "grad_norm": 1.3208965063095093, "learning_rate": 5.342343680380217e-06, "loss": 0.0559, "step": 55340 }, { "epoch": 4.110351997623645, "grad_norm": 0.915107250213623, "learning_rate": 5.337888014258132e-06, "loss": 0.0958, "step": 55350 }, { "epoch": 4.111094608643993, "grad_norm": 3.8144876956939697, "learning_rate": 5.333432348136047e-06, "loss": 0.0531, "step": 55360 }, { "epoch": 4.11183721966434, "grad_norm": 0.789252519607544, "learning_rate": 5.328976682013962e-06, "loss": 0.0589, "step": 55370 }, { "epoch": 4.1125798306846875, "grad_norm": 0.5822809934616089, "learning_rate": 5.324521015891876e-06, "loss": 0.0394, "step": 55380 }, { "epoch": 4.1133224417050345, "grad_norm": 1.0521644353866577, "learning_rate": 5.320065349769791e-06, "loss": 0.0813, "step": 55390 }, { "epoch": 4.114065052725382, "grad_norm": 3.525383710861206, "learning_rate": 5.315609683647705e-06, "loss": 0.0696, "step": 55400 }, { "epoch": 4.11480766374573, "grad_norm": 2.288709878921509, "learning_rate": 5.311154017525621e-06, "loss": 0.0517, "step": 55410 }, { "epoch": 4.115550274766077, "grad_norm": 1.1933070421218872, "learning_rate": 5.306698351403535e-06, "loss": 0.0642, "step": 55420 }, { "epoch": 4.116292885786425, "grad_norm": 0.5231961011886597, "learning_rate": 5.30224268528145e-06, "loss": 0.0373, "step": 55430 }, { "epoch": 4.117035496806773, "grad_norm": 2.135310173034668, "learning_rate": 5.297787019159364e-06, "loss": 0.0701, "step": 55440 }, { "epoch": 4.11777810782712, "grad_norm": 1.2622560262680054, "learning_rate": 5.293331353037279e-06, "loss": 0.0359, "step": 55450 }, { "epoch": 4.118520718847468, "grad_norm": 0.9313594102859497, "learning_rate": 5.288875686915193e-06, "loss": 0.0639, "step": 55460 }, { "epoch": 4.119263329867815, "grad_norm": 1.2040377855300903, "learning_rate": 5.284420020793109e-06, "loss": 0.0436, "step": 55470 }, { "epoch": 4.120005940888163, "grad_norm": 1.6361944675445557, "learning_rate": 5.279964354671024e-06, "loss": 0.045, "step": 55480 }, { "epoch": 4.120748551908511, "grad_norm": 0.6171565651893616, "learning_rate": 5.275508688548938e-06, "loss": 0.0487, "step": 55490 }, { "epoch": 4.121491162928858, "grad_norm": 1.9398356676101685, "learning_rate": 5.271053022426853e-06, "loss": 0.0551, "step": 55500 }, { "epoch": 4.122233773949206, "grad_norm": 0.6700782775878906, "learning_rate": 5.266597356304767e-06, "loss": 0.0695, "step": 55510 }, { "epoch": 4.122976384969553, "grad_norm": 1.5354483127593994, "learning_rate": 5.262141690182682e-06, "loss": 0.0606, "step": 55520 }, { "epoch": 4.1237189959899005, "grad_norm": 2.600698947906494, "learning_rate": 5.257686024060597e-06, "loss": 0.0734, "step": 55530 }, { "epoch": 4.124461607010248, "grad_norm": 1.9559639692306519, "learning_rate": 5.253230357938512e-06, "loss": 0.0569, "step": 55540 }, { "epoch": 4.1252042180305954, "grad_norm": 2.100451946258545, "learning_rate": 5.2487746918164265e-06, "loss": 0.0729, "step": 55550 }, { "epoch": 4.125946829050943, "grad_norm": 2.451205253601074, "learning_rate": 5.2443190256943415e-06, "loss": 0.0457, "step": 55560 }, { "epoch": 4.12668944007129, "grad_norm": 1.9854660034179688, "learning_rate": 5.239863359572256e-06, "loss": 0.0379, "step": 55570 }, { "epoch": 4.127432051091638, "grad_norm": 1.1762291193008423, "learning_rate": 5.235407693450171e-06, "loss": 0.0425, "step": 55580 }, { "epoch": 4.128174662111986, "grad_norm": 1.3475638628005981, "learning_rate": 5.230952027328086e-06, "loss": 0.0654, "step": 55590 }, { "epoch": 4.128917273132333, "grad_norm": 1.0399333238601685, "learning_rate": 5.226496361206001e-06, "loss": 0.0595, "step": 55600 }, { "epoch": 4.129659884152681, "grad_norm": 0.3056696355342865, "learning_rate": 5.222040695083916e-06, "loss": 0.0424, "step": 55610 }, { "epoch": 4.130402495173028, "grad_norm": 2.0627856254577637, "learning_rate": 5.21758502896183e-06, "loss": 0.0635, "step": 55620 }, { "epoch": 4.131145106193376, "grad_norm": 2.32353138923645, "learning_rate": 5.213129362839745e-06, "loss": 0.0512, "step": 55630 }, { "epoch": 4.131887717213724, "grad_norm": 1.2325079441070557, "learning_rate": 5.208673696717659e-06, "loss": 0.027, "step": 55640 }, { "epoch": 4.132630328234071, "grad_norm": 1.5056791305541992, "learning_rate": 5.204218030595575e-06, "loss": 0.0549, "step": 55650 }, { "epoch": 4.133372939254419, "grad_norm": 2.7190983295440674, "learning_rate": 5.199762364473489e-06, "loss": 0.0631, "step": 55660 }, { "epoch": 4.134115550274766, "grad_norm": 1.026847243309021, "learning_rate": 5.195306698351404e-06, "loss": 0.0541, "step": 55670 }, { "epoch": 4.134858161295114, "grad_norm": 1.6611924171447754, "learning_rate": 5.190851032229318e-06, "loss": 0.0663, "step": 55680 }, { "epoch": 4.1356007723154615, "grad_norm": 0.3207695186138153, "learning_rate": 5.186395366107233e-06, "loss": 0.0528, "step": 55690 }, { "epoch": 4.1363433833358085, "grad_norm": 1.145107388496399, "learning_rate": 5.181939699985148e-06, "loss": 0.0617, "step": 55700 }, { "epoch": 4.137085994356156, "grad_norm": 2.4018893241882324, "learning_rate": 5.177484033863063e-06, "loss": 0.092, "step": 55710 }, { "epoch": 4.137828605376503, "grad_norm": 0.642469048500061, "learning_rate": 5.173028367740977e-06, "loss": 0.0801, "step": 55720 }, { "epoch": 4.138571216396851, "grad_norm": 0.8119722008705139, "learning_rate": 5.168572701618892e-06, "loss": 0.0502, "step": 55730 }, { "epoch": 4.139313827417199, "grad_norm": 0.8802699446678162, "learning_rate": 5.164117035496807e-06, "loss": 0.058, "step": 55740 }, { "epoch": 4.140056438437546, "grad_norm": 2.3792264461517334, "learning_rate": 5.159661369374721e-06, "loss": 0.0601, "step": 55750 }, { "epoch": 4.140799049457894, "grad_norm": 0.80853670835495, "learning_rate": 5.155205703252637e-06, "loss": 0.0554, "step": 55760 }, { "epoch": 4.141541660478241, "grad_norm": 1.0470110177993774, "learning_rate": 5.150750037130551e-06, "loss": 0.0569, "step": 55770 }, { "epoch": 4.142284271498589, "grad_norm": 3.2647767066955566, "learning_rate": 5.146294371008466e-06, "loss": 0.0529, "step": 55780 }, { "epoch": 4.143026882518937, "grad_norm": 1.5302366018295288, "learning_rate": 5.1418387048863804e-06, "loss": 0.0634, "step": 55790 }, { "epoch": 4.143769493539284, "grad_norm": 3.8410422801971436, "learning_rate": 5.1373830387642954e-06, "loss": 0.0607, "step": 55800 }, { "epoch": 4.144512104559632, "grad_norm": 1.0163367986679077, "learning_rate": 5.1329273726422096e-06, "loss": 0.0323, "step": 55810 }, { "epoch": 4.145254715579979, "grad_norm": 1.0663138628005981, "learning_rate": 5.128471706520125e-06, "loss": 0.0599, "step": 55820 }, { "epoch": 4.145997326600327, "grad_norm": 1.1490668058395386, "learning_rate": 5.1240160403980396e-06, "loss": 0.0542, "step": 55830 }, { "epoch": 4.146739937620675, "grad_norm": 2.020784616470337, "learning_rate": 5.1195603742759546e-06, "loss": 0.0645, "step": 55840 }, { "epoch": 4.147482548641022, "grad_norm": 0.6550213098526001, "learning_rate": 5.115104708153869e-06, "loss": 0.0588, "step": 55850 }, { "epoch": 4.1482251596613695, "grad_norm": 1.2832306623458862, "learning_rate": 5.110649042031784e-06, "loss": 0.0504, "step": 55860 }, { "epoch": 4.1489677706817165, "grad_norm": 0.4903786778450012, "learning_rate": 5.106193375909699e-06, "loss": 0.0826, "step": 55870 }, { "epoch": 4.149710381702064, "grad_norm": 4.058075904846191, "learning_rate": 5.101737709787614e-06, "loss": 0.0807, "step": 55880 }, { "epoch": 4.150452992722412, "grad_norm": 0.9241071939468384, "learning_rate": 5.097282043665529e-06, "loss": 0.0651, "step": 55890 }, { "epoch": 4.151195603742759, "grad_norm": 0.6750563979148865, "learning_rate": 5.092826377543443e-06, "loss": 0.049, "step": 55900 }, { "epoch": 4.151938214763107, "grad_norm": 2.1607372760772705, "learning_rate": 5.088370711421358e-06, "loss": 0.092, "step": 55910 }, { "epoch": 4.152680825783454, "grad_norm": 0.9965745210647583, "learning_rate": 5.083915045299272e-06, "loss": 0.0498, "step": 55920 }, { "epoch": 4.153423436803802, "grad_norm": 2.7033541202545166, "learning_rate": 5.079459379177187e-06, "loss": 0.0601, "step": 55930 }, { "epoch": 4.15416604782415, "grad_norm": 0.4900580644607544, "learning_rate": 5.075003713055102e-06, "loss": 0.0497, "step": 55940 }, { "epoch": 4.154908658844497, "grad_norm": 2.328214406967163, "learning_rate": 5.070548046933017e-06, "loss": 0.047, "step": 55950 }, { "epoch": 4.155651269864845, "grad_norm": 1.3433852195739746, "learning_rate": 5.066092380810931e-06, "loss": 0.0311, "step": 55960 }, { "epoch": 4.156393880885192, "grad_norm": 2.1777939796447754, "learning_rate": 5.061636714688846e-06, "loss": 0.0605, "step": 55970 }, { "epoch": 4.15713649190554, "grad_norm": 0.2275165617465973, "learning_rate": 5.05718104856676e-06, "loss": 0.047, "step": 55980 }, { "epoch": 4.157879102925888, "grad_norm": 0.16543416678905487, "learning_rate": 5.052725382444675e-06, "loss": 0.0714, "step": 55990 }, { "epoch": 4.158621713946235, "grad_norm": 1.2253612279891968, "learning_rate": 5.048269716322591e-06, "loss": 0.0605, "step": 56000 }, { "epoch": 4.159364324966583, "grad_norm": 2.219952344894409, "learning_rate": 5.043814050200505e-06, "loss": 0.0681, "step": 56010 }, { "epoch": 4.16010693598693, "grad_norm": 1.6367051601409912, "learning_rate": 5.03935838407842e-06, "loss": 0.038, "step": 56020 }, { "epoch": 4.1608495470072775, "grad_norm": 1.207080602645874, "learning_rate": 5.034902717956334e-06, "loss": 0.0787, "step": 56030 }, { "epoch": 4.161592158027625, "grad_norm": 2.1484475135803223, "learning_rate": 5.030447051834249e-06, "loss": 0.0528, "step": 56040 }, { "epoch": 4.162334769047972, "grad_norm": 0.8749150633811951, "learning_rate": 5.025991385712164e-06, "loss": 0.0848, "step": 56050 }, { "epoch": 4.16307738006832, "grad_norm": 0.868646502494812, "learning_rate": 5.021535719590079e-06, "loss": 0.0662, "step": 56060 }, { "epoch": 4.163819991088668, "grad_norm": 0.94334876537323, "learning_rate": 5.0170800534679935e-06, "loss": 0.058, "step": 56070 }, { "epoch": 4.164562602109015, "grad_norm": 0.9084467887878418, "learning_rate": 5.0126243873459085e-06, "loss": 0.0357, "step": 56080 }, { "epoch": 4.165305213129363, "grad_norm": 1.5573078393936157, "learning_rate": 5.008168721223823e-06, "loss": 0.0886, "step": 56090 }, { "epoch": 4.16604782414971, "grad_norm": 2.178457260131836, "learning_rate": 5.003713055101738e-06, "loss": 0.0442, "step": 56100 }, { "epoch": 4.166790435170058, "grad_norm": 1.7466106414794922, "learning_rate": 4.999257388979653e-06, "loss": 0.051, "step": 56110 }, { "epoch": 4.167533046190406, "grad_norm": 1.0477418899536133, "learning_rate": 4.994801722857568e-06, "loss": 0.0392, "step": 56120 }, { "epoch": 4.168275657210753, "grad_norm": 2.2968533039093018, "learning_rate": 4.990346056735482e-06, "loss": 0.0433, "step": 56130 }, { "epoch": 4.169018268231101, "grad_norm": 0.44317713379859924, "learning_rate": 4.985890390613397e-06, "loss": 0.0537, "step": 56140 }, { "epoch": 4.169760879251448, "grad_norm": 1.9365060329437256, "learning_rate": 4.981434724491312e-06, "loss": 0.0842, "step": 56150 }, { "epoch": 4.170503490271796, "grad_norm": 0.4268725514411926, "learning_rate": 4.976979058369226e-06, "loss": 0.064, "step": 56160 }, { "epoch": 4.171246101292144, "grad_norm": 0.8040658235549927, "learning_rate": 4.972523392247142e-06, "loss": 0.0691, "step": 56170 }, { "epoch": 4.171988712312491, "grad_norm": 1.0074480772018433, "learning_rate": 4.968067726125056e-06, "loss": 0.0687, "step": 56180 }, { "epoch": 4.1727313233328385, "grad_norm": 0.6365683078765869, "learning_rate": 4.963612060002971e-06, "loss": 0.0461, "step": 56190 }, { "epoch": 4.1734739343531855, "grad_norm": 2.264573812484741, "learning_rate": 4.959156393880885e-06, "loss": 0.0618, "step": 56200 }, { "epoch": 4.174216545373533, "grad_norm": 1.5489435195922852, "learning_rate": 4.9547007277588e-06, "loss": 0.0888, "step": 56210 }, { "epoch": 4.174959156393881, "grad_norm": 0.8037591576576233, "learning_rate": 4.950245061636714e-06, "loss": 0.0793, "step": 56220 }, { "epoch": 4.175701767414228, "grad_norm": 1.1610863208770752, "learning_rate": 4.94578939551463e-06, "loss": 0.0623, "step": 56230 }, { "epoch": 4.176444378434576, "grad_norm": 1.3831819295883179, "learning_rate": 4.941333729392544e-06, "loss": 0.059, "step": 56240 }, { "epoch": 4.177186989454923, "grad_norm": 0.5763561725616455, "learning_rate": 4.936878063270459e-06, "loss": 0.0341, "step": 56250 }, { "epoch": 4.177929600475271, "grad_norm": 1.123581051826477, "learning_rate": 4.932422397148373e-06, "loss": 0.0658, "step": 56260 }, { "epoch": 4.178672211495619, "grad_norm": 1.3264278173446655, "learning_rate": 4.927966731026288e-06, "loss": 0.0623, "step": 56270 }, { "epoch": 4.179414822515966, "grad_norm": 1.4704315662384033, "learning_rate": 4.923511064904203e-06, "loss": 0.0734, "step": 56280 }, { "epoch": 4.180157433536314, "grad_norm": 1.4368115663528442, "learning_rate": 4.919055398782118e-06, "loss": 0.0561, "step": 56290 }, { "epoch": 4.180900044556661, "grad_norm": 0.8675275444984436, "learning_rate": 4.914599732660033e-06, "loss": 0.08, "step": 56300 }, { "epoch": 4.181642655577009, "grad_norm": 1.2973276376724243, "learning_rate": 4.9101440665379474e-06, "loss": 0.0725, "step": 56310 }, { "epoch": 4.182385266597357, "grad_norm": 0.6267523169517517, "learning_rate": 4.905688400415862e-06, "loss": 0.0461, "step": 56320 }, { "epoch": 4.183127877617704, "grad_norm": 2.129601001739502, "learning_rate": 4.9012327342937766e-06, "loss": 0.0556, "step": 56330 }, { "epoch": 4.1838704886380516, "grad_norm": 2.3097314834594727, "learning_rate": 4.8967770681716916e-06, "loss": 0.0503, "step": 56340 }, { "epoch": 4.184613099658399, "grad_norm": 1.3276792764663696, "learning_rate": 4.8923214020496066e-06, "loss": 0.076, "step": 56350 }, { "epoch": 4.1853557106787465, "grad_norm": 0.35334932804107666, "learning_rate": 4.8878657359275216e-06, "loss": 0.0596, "step": 56360 }, { "epoch": 4.186098321699094, "grad_norm": 2.6858012676239014, "learning_rate": 4.883410069805436e-06, "loss": 0.0484, "step": 56370 }, { "epoch": 4.186840932719441, "grad_norm": 2.2050089836120605, "learning_rate": 4.878954403683351e-06, "loss": 0.067, "step": 56380 }, { "epoch": 4.187583543739789, "grad_norm": 0.7625178694725037, "learning_rate": 4.874498737561265e-06, "loss": 0.0654, "step": 56390 }, { "epoch": 4.188326154760136, "grad_norm": 1.7497589588165283, "learning_rate": 4.870043071439181e-06, "loss": 0.0781, "step": 56400 }, { "epoch": 4.189068765780484, "grad_norm": 0.6290327906608582, "learning_rate": 4.865587405317096e-06, "loss": 0.045, "step": 56410 }, { "epoch": 4.189811376800832, "grad_norm": 2.0656166076660156, "learning_rate": 4.86113173919501e-06, "loss": 0.068, "step": 56420 }, { "epoch": 4.190553987821179, "grad_norm": 1.3763740062713623, "learning_rate": 4.856676073072925e-06, "loss": 0.0746, "step": 56430 }, { "epoch": 4.191296598841527, "grad_norm": 1.222730278968811, "learning_rate": 4.852220406950839e-06, "loss": 0.0773, "step": 56440 }, { "epoch": 4.192039209861874, "grad_norm": 0.5212209820747375, "learning_rate": 4.847764740828754e-06, "loss": 0.0392, "step": 56450 }, { "epoch": 4.192781820882222, "grad_norm": 0.5401204824447632, "learning_rate": 4.843309074706669e-06, "loss": 0.0801, "step": 56460 }, { "epoch": 4.19352443190257, "grad_norm": 0.9797640442848206, "learning_rate": 4.838853408584584e-06, "loss": 0.0541, "step": 56470 }, { "epoch": 4.194267042922917, "grad_norm": 0.41546717286109924, "learning_rate": 4.834397742462498e-06, "loss": 0.0495, "step": 56480 }, { "epoch": 4.195009653943265, "grad_norm": 1.290562391281128, "learning_rate": 4.829942076340413e-06, "loss": 0.0371, "step": 56490 }, { "epoch": 4.195752264963612, "grad_norm": 0.5906802415847778, "learning_rate": 4.825486410218327e-06, "loss": 0.064, "step": 56500 }, { "epoch": 4.1964948759839595, "grad_norm": 1.46896493434906, "learning_rate": 4.821030744096242e-06, "loss": 0.0478, "step": 56510 }, { "epoch": 4.197237487004307, "grad_norm": 1.2562772035598755, "learning_rate": 4.816575077974157e-06, "loss": 0.0531, "step": 56520 }, { "epoch": 4.197980098024654, "grad_norm": 1.5765583515167236, "learning_rate": 4.812119411852072e-06, "loss": 0.0586, "step": 56530 }, { "epoch": 4.198722709045002, "grad_norm": 1.9203531742095947, "learning_rate": 4.807663745729987e-06, "loss": 0.0759, "step": 56540 }, { "epoch": 4.19946532006535, "grad_norm": 0.2275390923023224, "learning_rate": 4.803208079607901e-06, "loss": 0.0347, "step": 56550 }, { "epoch": 4.200207931085697, "grad_norm": 0.8825653791427612, "learning_rate": 4.798752413485816e-06, "loss": 0.0288, "step": 56560 }, { "epoch": 4.200950542106045, "grad_norm": 0.9257974624633789, "learning_rate": 4.7942967473637305e-06, "loss": 0.0421, "step": 56570 }, { "epoch": 4.201693153126392, "grad_norm": 0.9080449938774109, "learning_rate": 4.789841081241646e-06, "loss": 0.0569, "step": 56580 }, { "epoch": 4.20243576414674, "grad_norm": 0.7005447149276733, "learning_rate": 4.7853854151195605e-06, "loss": 0.1045, "step": 56590 }, { "epoch": 4.203178375167088, "grad_norm": 0.9929253458976746, "learning_rate": 4.7809297489974755e-06, "loss": 0.053, "step": 56600 }, { "epoch": 4.203920986187435, "grad_norm": 2.272272825241089, "learning_rate": 4.77647408287539e-06, "loss": 0.076, "step": 56610 }, { "epoch": 4.204663597207783, "grad_norm": 1.6657465696334839, "learning_rate": 4.772018416753305e-06, "loss": 0.0705, "step": 56620 }, { "epoch": 4.20540620822813, "grad_norm": 4.0360589027404785, "learning_rate": 4.767562750631219e-06, "loss": 0.0637, "step": 56630 }, { "epoch": 4.206148819248478, "grad_norm": 1.8561382293701172, "learning_rate": 4.763107084509135e-06, "loss": 0.0368, "step": 56640 }, { "epoch": 4.206891430268826, "grad_norm": 0.5723518133163452, "learning_rate": 4.758651418387049e-06, "loss": 0.0841, "step": 56650 }, { "epoch": 4.207634041289173, "grad_norm": 0.5894990563392639, "learning_rate": 4.754195752264964e-06, "loss": 0.0622, "step": 56660 }, { "epoch": 4.2083766523095205, "grad_norm": 0.48831412196159363, "learning_rate": 4.749740086142879e-06, "loss": 0.1112, "step": 56670 }, { "epoch": 4.2091192633298675, "grad_norm": 1.655131220817566, "learning_rate": 4.745284420020793e-06, "loss": 0.0583, "step": 56680 }, { "epoch": 4.209861874350215, "grad_norm": 0.4988187849521637, "learning_rate": 4.740828753898708e-06, "loss": 0.0436, "step": 56690 }, { "epoch": 4.210604485370563, "grad_norm": 6.94892692565918, "learning_rate": 4.736373087776623e-06, "loss": 0.0508, "step": 56700 }, { "epoch": 4.21134709639091, "grad_norm": 0.6588563323020935, "learning_rate": 4.731917421654538e-06, "loss": 0.0307, "step": 56710 }, { "epoch": 4.212089707411258, "grad_norm": 1.032076358795166, "learning_rate": 4.727461755532452e-06, "loss": 0.0532, "step": 56720 }, { "epoch": 4.212832318431605, "grad_norm": 0.7659674882888794, "learning_rate": 4.723006089410367e-06, "loss": 0.0798, "step": 56730 }, { "epoch": 4.213574929451953, "grad_norm": 1.151066780090332, "learning_rate": 4.718550423288281e-06, "loss": 0.0603, "step": 56740 }, { "epoch": 4.214317540472301, "grad_norm": 3.9358067512512207, "learning_rate": 4.714094757166197e-06, "loss": 0.0505, "step": 56750 }, { "epoch": 4.215060151492648, "grad_norm": 0.6404879093170166, "learning_rate": 4.709639091044111e-06, "loss": 0.0577, "step": 56760 }, { "epoch": 4.215802762512996, "grad_norm": 2.104989767074585, "learning_rate": 4.705183424922026e-06, "loss": 0.0566, "step": 56770 }, { "epoch": 4.216545373533343, "grad_norm": 0.7882208228111267, "learning_rate": 4.70072775879994e-06, "loss": 0.058, "step": 56780 }, { "epoch": 4.217287984553691, "grad_norm": 1.2608188390731812, "learning_rate": 4.696272092677855e-06, "loss": 0.0692, "step": 56790 }, { "epoch": 4.218030595574039, "grad_norm": 1.273901343345642, "learning_rate": 4.69181642655577e-06, "loss": 0.067, "step": 56800 }, { "epoch": 4.218773206594386, "grad_norm": 2.1798079013824463, "learning_rate": 4.687360760433685e-06, "loss": 0.0807, "step": 56810 }, { "epoch": 4.219515817614734, "grad_norm": 1.1464588642120361, "learning_rate": 4.6829050943116e-06, "loss": 0.0356, "step": 56820 }, { "epoch": 4.220258428635081, "grad_norm": 2.1322357654571533, "learning_rate": 4.678449428189514e-06, "loss": 0.0811, "step": 56830 }, { "epoch": 4.2210010396554285, "grad_norm": 1.9702597856521606, "learning_rate": 4.673993762067429e-06, "loss": 0.0382, "step": 56840 }, { "epoch": 4.221743650675776, "grad_norm": 1.0465569496154785, "learning_rate": 4.6695380959453436e-06, "loss": 0.0481, "step": 56850 }, { "epoch": 4.222486261696123, "grad_norm": 1.8960071802139282, "learning_rate": 4.6650824298232586e-06, "loss": 0.0823, "step": 56860 }, { "epoch": 4.223228872716471, "grad_norm": 1.5519205331802368, "learning_rate": 4.6606267637011736e-06, "loss": 0.0807, "step": 56870 }, { "epoch": 4.223971483736818, "grad_norm": 2.4255125522613525, "learning_rate": 4.6561710975790885e-06, "loss": 0.0666, "step": 56880 }, { "epoch": 4.224714094757166, "grad_norm": 1.1080626249313354, "learning_rate": 4.651715431457003e-06, "loss": 0.081, "step": 56890 }, { "epoch": 4.225456705777514, "grad_norm": 2.251464605331421, "learning_rate": 4.647259765334918e-06, "loss": 0.0743, "step": 56900 }, { "epoch": 4.226199316797861, "grad_norm": 0.8593553900718689, "learning_rate": 4.642804099212832e-06, "loss": 0.0523, "step": 56910 }, { "epoch": 4.226941927818209, "grad_norm": 0.543804407119751, "learning_rate": 4.638348433090747e-06, "loss": 0.0748, "step": 56920 }, { "epoch": 4.227684538838556, "grad_norm": 2.7038323879241943, "learning_rate": 4.633892766968663e-06, "loss": 0.059, "step": 56930 }, { "epoch": 4.228427149858904, "grad_norm": 3.215402364730835, "learning_rate": 4.629437100846577e-06, "loss": 0.0626, "step": 56940 }, { "epoch": 4.229169760879252, "grad_norm": 0.7325242161750793, "learning_rate": 4.624981434724492e-06, "loss": 0.055, "step": 56950 }, { "epoch": 4.229912371899599, "grad_norm": 1.3402073383331299, "learning_rate": 4.620525768602406e-06, "loss": 0.0661, "step": 56960 }, { "epoch": 4.230654982919947, "grad_norm": 2.4132492542266846, "learning_rate": 4.616070102480321e-06, "loss": 0.0618, "step": 56970 }, { "epoch": 4.231397593940294, "grad_norm": 2.361948251724243, "learning_rate": 4.611614436358235e-06, "loss": 0.029, "step": 56980 }, { "epoch": 4.232140204960642, "grad_norm": 1.8688277006149292, "learning_rate": 4.607158770236151e-06, "loss": 0.0675, "step": 56990 }, { "epoch": 4.2328828159809895, "grad_norm": 1.2990567684173584, "learning_rate": 4.602703104114065e-06, "loss": 0.0553, "step": 57000 }, { "epoch": 4.2336254270013365, "grad_norm": 1.1148401498794556, "learning_rate": 4.59824743799198e-06, "loss": 0.0565, "step": 57010 }, { "epoch": 4.234368038021684, "grad_norm": 1.900498867034912, "learning_rate": 4.593791771869894e-06, "loss": 0.0724, "step": 57020 }, { "epoch": 4.235110649042031, "grad_norm": 1.3050068616867065, "learning_rate": 4.589336105747809e-06, "loss": 0.059, "step": 57030 }, { "epoch": 4.235853260062379, "grad_norm": 1.2359297275543213, "learning_rate": 4.584880439625723e-06, "loss": 0.0581, "step": 57040 }, { "epoch": 4.236595871082727, "grad_norm": 0.6790696382522583, "learning_rate": 4.580424773503639e-06, "loss": 0.0591, "step": 57050 }, { "epoch": 4.237338482103074, "grad_norm": 1.258554220199585, "learning_rate": 4.575969107381554e-06, "loss": 0.0604, "step": 57060 }, { "epoch": 4.238081093123422, "grad_norm": 0.6367031335830688, "learning_rate": 4.571513441259468e-06, "loss": 0.0627, "step": 57070 }, { "epoch": 4.238823704143769, "grad_norm": 1.3916947841644287, "learning_rate": 4.567057775137383e-06, "loss": 0.0727, "step": 57080 }, { "epoch": 4.239566315164117, "grad_norm": 1.1736927032470703, "learning_rate": 4.5626021090152975e-06, "loss": 0.0492, "step": 57090 }, { "epoch": 4.240308926184465, "grad_norm": 2.185208320617676, "learning_rate": 4.558146442893213e-06, "loss": 0.0618, "step": 57100 }, { "epoch": 4.241051537204812, "grad_norm": 0.6165786385536194, "learning_rate": 4.5536907767711275e-06, "loss": 0.061, "step": 57110 }, { "epoch": 4.24179414822516, "grad_norm": 0.5800535678863525, "learning_rate": 4.5492351106490425e-06, "loss": 0.0724, "step": 57120 }, { "epoch": 4.242536759245507, "grad_norm": 0.8701562881469727, "learning_rate": 4.544779444526957e-06, "loss": 0.0333, "step": 57130 }, { "epoch": 4.243279370265855, "grad_norm": 0.8652254343032837, "learning_rate": 4.540323778404872e-06, "loss": 0.0258, "step": 57140 }, { "epoch": 4.244021981286203, "grad_norm": 3.373532772064209, "learning_rate": 4.535868112282786e-06, "loss": 0.066, "step": 57150 }, { "epoch": 4.24476459230655, "grad_norm": 0.9906690120697021, "learning_rate": 4.531412446160702e-06, "loss": 0.0716, "step": 57160 }, { "epoch": 4.2455072033268975, "grad_norm": 2.1584084033966064, "learning_rate": 4.526956780038616e-06, "loss": 0.0605, "step": 57170 }, { "epoch": 4.2462498143472445, "grad_norm": 1.1581281423568726, "learning_rate": 4.522501113916531e-06, "loss": 0.0437, "step": 57180 }, { "epoch": 4.246992425367592, "grad_norm": 0.5750119686126709, "learning_rate": 4.518045447794446e-06, "loss": 0.0542, "step": 57190 }, { "epoch": 4.24773503638794, "grad_norm": 0.9591627717018127, "learning_rate": 4.51358978167236e-06, "loss": 0.0591, "step": 57200 }, { "epoch": 4.248477647408287, "grad_norm": 1.943953037261963, "learning_rate": 4.509134115550275e-06, "loss": 0.0455, "step": 57210 }, { "epoch": 4.249220258428635, "grad_norm": 1.54121732711792, "learning_rate": 4.50467844942819e-06, "loss": 0.052, "step": 57220 }, { "epoch": 4.249962869448983, "grad_norm": 1.1736338138580322, "learning_rate": 4.500222783306105e-06, "loss": 0.0741, "step": 57230 }, { "epoch": 4.25070548046933, "grad_norm": 1.9899519681930542, "learning_rate": 4.495767117184019e-06, "loss": 0.0944, "step": 57240 }, { "epoch": 4.251448091489678, "grad_norm": 1.6793705224990845, "learning_rate": 4.491311451061934e-06, "loss": 0.0749, "step": 57250 }, { "epoch": 4.252190702510025, "grad_norm": 2.0780696868896484, "learning_rate": 4.486855784939848e-06, "loss": 0.0529, "step": 57260 }, { "epoch": 4.252933313530373, "grad_norm": 0.6456514000892639, "learning_rate": 4.482400118817763e-06, "loss": 0.0516, "step": 57270 }, { "epoch": 4.253675924550721, "grad_norm": 2.047966241836548, "learning_rate": 4.477944452695678e-06, "loss": 0.0564, "step": 57280 }, { "epoch": 4.254418535571068, "grad_norm": 1.8581652641296387, "learning_rate": 4.473488786573593e-06, "loss": 0.0645, "step": 57290 }, { "epoch": 4.255161146591416, "grad_norm": 0.8233331441879272, "learning_rate": 4.469033120451507e-06, "loss": 0.0339, "step": 57300 }, { "epoch": 4.255903757611763, "grad_norm": 1.5012775659561157, "learning_rate": 4.464577454329422e-06, "loss": 0.0678, "step": 57310 }, { "epoch": 4.2566463686321105, "grad_norm": 1.7684147357940674, "learning_rate": 4.460121788207337e-06, "loss": 0.0655, "step": 57320 }, { "epoch": 4.257388979652458, "grad_norm": 1.061919927597046, "learning_rate": 4.4556661220852514e-06, "loss": 0.0663, "step": 57330 }, { "epoch": 4.2581315906728054, "grad_norm": 3.2770798206329346, "learning_rate": 4.451210455963167e-06, "loss": 0.0863, "step": 57340 }, { "epoch": 4.258874201693153, "grad_norm": 0.6535485982894897, "learning_rate": 4.446754789841081e-06, "loss": 0.0603, "step": 57350 }, { "epoch": 4.2596168127135, "grad_norm": 0.8265778422355652, "learning_rate": 4.442299123718996e-06, "loss": 0.0573, "step": 57360 }, { "epoch": 4.260359423733848, "grad_norm": 0.3725847005844116, "learning_rate": 4.4378434575969106e-06, "loss": 0.0543, "step": 57370 }, { "epoch": 4.261102034754196, "grad_norm": 1.1065174341201782, "learning_rate": 4.4333877914748256e-06, "loss": 0.0578, "step": 57380 }, { "epoch": 4.261844645774543, "grad_norm": 3.13139009475708, "learning_rate": 4.42893212535274e-06, "loss": 0.0632, "step": 57390 }, { "epoch": 4.262587256794891, "grad_norm": 1.8046602010726929, "learning_rate": 4.4244764592306555e-06, "loss": 0.0508, "step": 57400 }, { "epoch": 4.263329867815238, "grad_norm": 0.9874463677406311, "learning_rate": 4.42002079310857e-06, "loss": 0.0546, "step": 57410 }, { "epoch": 4.264072478835586, "grad_norm": 2.401059865951538, "learning_rate": 4.415565126986485e-06, "loss": 0.0502, "step": 57420 }, { "epoch": 4.264815089855934, "grad_norm": 1.2862542867660522, "learning_rate": 4.411109460864399e-06, "loss": 0.0722, "step": 57430 }, { "epoch": 4.265557700876281, "grad_norm": 0.6845186948776245, "learning_rate": 4.406653794742314e-06, "loss": 0.053, "step": 57440 }, { "epoch": 4.266300311896629, "grad_norm": 0.5840150117874146, "learning_rate": 4.40219812862023e-06, "loss": 0.0395, "step": 57450 }, { "epoch": 4.267042922916976, "grad_norm": 0.8561588525772095, "learning_rate": 4.397742462498144e-06, "loss": 0.076, "step": 57460 }, { "epoch": 4.267785533937324, "grad_norm": 1.2348568439483643, "learning_rate": 4.393286796376059e-06, "loss": 0.0679, "step": 57470 }, { "epoch": 4.2685281449576715, "grad_norm": 1.4345803260803223, "learning_rate": 4.388831130253973e-06, "loss": 0.0542, "step": 57480 }, { "epoch": 4.2692707559780185, "grad_norm": 0.30238887667655945, "learning_rate": 4.384375464131888e-06, "loss": 0.0568, "step": 57490 }, { "epoch": 4.270013366998366, "grad_norm": 1.476379156112671, "learning_rate": 4.379919798009802e-06, "loss": 0.0448, "step": 57500 }, { "epoch": 4.270755978018713, "grad_norm": 0.6961525678634644, "learning_rate": 4.375464131887718e-06, "loss": 0.053, "step": 57510 }, { "epoch": 4.271498589039061, "grad_norm": 1.5048184394836426, "learning_rate": 4.371008465765632e-06, "loss": 0.0572, "step": 57520 }, { "epoch": 4.272241200059409, "grad_norm": 1.902750849723816, "learning_rate": 4.366552799643547e-06, "loss": 0.0564, "step": 57530 }, { "epoch": 4.272983811079756, "grad_norm": 2.073760747909546, "learning_rate": 4.362097133521461e-06, "loss": 0.052, "step": 57540 }, { "epoch": 4.273726422100104, "grad_norm": 0.6217005252838135, "learning_rate": 4.357641467399376e-06, "loss": 0.0794, "step": 57550 }, { "epoch": 4.274469033120451, "grad_norm": 1.285564661026001, "learning_rate": 4.35318580127729e-06, "loss": 0.0667, "step": 57560 }, { "epoch": 4.275211644140799, "grad_norm": 0.4138168692588806, "learning_rate": 4.348730135155206e-06, "loss": 0.0688, "step": 57570 }, { "epoch": 4.275954255161147, "grad_norm": 2.414457321166992, "learning_rate": 4.344274469033121e-06, "loss": 0.0496, "step": 57580 }, { "epoch": 4.276696866181494, "grad_norm": 1.193533182144165, "learning_rate": 4.339818802911035e-06, "loss": 0.0693, "step": 57590 }, { "epoch": 4.277439477201842, "grad_norm": 2.959575891494751, "learning_rate": 4.33536313678895e-06, "loss": 0.0625, "step": 57600 }, { "epoch": 4.278182088222189, "grad_norm": 4.037361145019531, "learning_rate": 4.3309074706668645e-06, "loss": 0.0413, "step": 57610 }, { "epoch": 4.278924699242537, "grad_norm": 2.123981475830078, "learning_rate": 4.3264518045447795e-06, "loss": 0.0575, "step": 57620 }, { "epoch": 4.279667310262885, "grad_norm": 1.8399453163146973, "learning_rate": 4.3219961384226945e-06, "loss": 0.0643, "step": 57630 }, { "epoch": 4.280409921283232, "grad_norm": 0.43379709124565125, "learning_rate": 4.3175404723006095e-06, "loss": 0.0514, "step": 57640 }, { "epoch": 4.2811525323035795, "grad_norm": 0.5563368797302246, "learning_rate": 4.313084806178524e-06, "loss": 0.0698, "step": 57650 }, { "epoch": 4.2818951433239265, "grad_norm": 4.087998867034912, "learning_rate": 4.308629140056439e-06, "loss": 0.0505, "step": 57660 }, { "epoch": 4.282637754344274, "grad_norm": 3.5374698638916016, "learning_rate": 4.304173473934353e-06, "loss": 0.0892, "step": 57670 }, { "epoch": 4.283380365364622, "grad_norm": 1.2814463376998901, "learning_rate": 4.299717807812268e-06, "loss": 0.0633, "step": 57680 }, { "epoch": 4.284122976384969, "grad_norm": 0.6887046098709106, "learning_rate": 4.295262141690183e-06, "loss": 0.0323, "step": 57690 }, { "epoch": 4.284865587405317, "grad_norm": 1.5045709609985352, "learning_rate": 4.290806475568098e-06, "loss": 0.0363, "step": 57700 }, { "epoch": 4.285608198425665, "grad_norm": 0.7762113213539124, "learning_rate": 4.286350809446012e-06, "loss": 0.0575, "step": 57710 }, { "epoch": 4.286350809446012, "grad_norm": 0.4139329195022583, "learning_rate": 4.281895143323927e-06, "loss": 0.0461, "step": 57720 }, { "epoch": 4.28709342046636, "grad_norm": 0.7771281599998474, "learning_rate": 4.277439477201842e-06, "loss": 0.043, "step": 57730 }, { "epoch": 4.287836031486707, "grad_norm": 1.1789294481277466, "learning_rate": 4.272983811079756e-06, "loss": 0.0563, "step": 57740 }, { "epoch": 4.288578642507055, "grad_norm": 2.2451627254486084, "learning_rate": 4.268528144957672e-06, "loss": 0.0681, "step": 57750 }, { "epoch": 4.289321253527403, "grad_norm": 0.40132951736450195, "learning_rate": 4.264072478835586e-06, "loss": 0.0559, "step": 57760 }, { "epoch": 4.29006386454775, "grad_norm": 1.868285059928894, "learning_rate": 4.259616812713501e-06, "loss": 0.0517, "step": 57770 }, { "epoch": 4.290806475568098, "grad_norm": 1.2447259426116943, "learning_rate": 4.255161146591415e-06, "loss": 0.0629, "step": 57780 }, { "epoch": 4.291549086588445, "grad_norm": 0.3125772774219513, "learning_rate": 4.25070548046933e-06, "loss": 0.0589, "step": 57790 }, { "epoch": 4.292291697608793, "grad_norm": 1.9337828159332275, "learning_rate": 4.246249814347245e-06, "loss": 0.0771, "step": 57800 }, { "epoch": 4.2930343086291405, "grad_norm": 1.6286951303482056, "learning_rate": 4.24179414822516e-06, "loss": 0.0577, "step": 57810 }, { "epoch": 4.2937769196494875, "grad_norm": 3.2661020755767822, "learning_rate": 4.237338482103074e-06, "loss": 0.0565, "step": 57820 }, { "epoch": 4.294519530669835, "grad_norm": 2.473935604095459, "learning_rate": 4.232882815980989e-06, "loss": 0.0672, "step": 57830 }, { "epoch": 4.295262141690182, "grad_norm": 1.5696542263031006, "learning_rate": 4.2284271498589034e-06, "loss": 0.0682, "step": 57840 }, { "epoch": 4.29600475271053, "grad_norm": 0.4458481967449188, "learning_rate": 4.223971483736818e-06, "loss": 0.0444, "step": 57850 }, { "epoch": 4.296747363730878, "grad_norm": 2.461646556854248, "learning_rate": 4.219515817614734e-06, "loss": 0.0548, "step": 57860 }, { "epoch": 4.297489974751225, "grad_norm": 3.5632870197296143, "learning_rate": 4.215060151492648e-06, "loss": 0.0551, "step": 57870 }, { "epoch": 4.298232585771573, "grad_norm": 0.38127386569976807, "learning_rate": 4.210604485370563e-06, "loss": 0.0746, "step": 57880 }, { "epoch": 4.29897519679192, "grad_norm": 0.4123340845108032, "learning_rate": 4.2061488192484775e-06, "loss": 0.052, "step": 57890 }, { "epoch": 4.299717807812268, "grad_norm": 1.4147988557815552, "learning_rate": 4.2016931531263925e-06, "loss": 0.0534, "step": 57900 }, { "epoch": 4.300460418832616, "grad_norm": 1.1483192443847656, "learning_rate": 4.197237487004307e-06, "loss": 0.0762, "step": 57910 }, { "epoch": 4.301203029852963, "grad_norm": 1.2446470260620117, "learning_rate": 4.1927818208822225e-06, "loss": 0.0827, "step": 57920 }, { "epoch": 4.301945640873311, "grad_norm": 1.6271134614944458, "learning_rate": 4.188326154760137e-06, "loss": 0.0517, "step": 57930 }, { "epoch": 4.302688251893658, "grad_norm": 0.6140291094779968, "learning_rate": 4.183870488638052e-06, "loss": 0.045, "step": 57940 }, { "epoch": 4.303430862914006, "grad_norm": 2.260127067565918, "learning_rate": 4.179414822515966e-06, "loss": 0.0545, "step": 57950 }, { "epoch": 4.304173473934354, "grad_norm": 1.3899246454238892, "learning_rate": 4.174959156393881e-06, "loss": 0.0625, "step": 57960 }, { "epoch": 4.304916084954701, "grad_norm": 3.2012217044830322, "learning_rate": 4.170503490271795e-06, "loss": 0.0644, "step": 57970 }, { "epoch": 4.3056586959750485, "grad_norm": 1.8873796463012695, "learning_rate": 4.166047824149711e-06, "loss": 0.0446, "step": 57980 }, { "epoch": 4.3064013069953955, "grad_norm": 1.7632603645324707, "learning_rate": 4.161592158027626e-06, "loss": 0.058, "step": 57990 }, { "epoch": 4.307143918015743, "grad_norm": 2.116173028945923, "learning_rate": 4.15713649190554e-06, "loss": 0.0564, "step": 58000 }, { "epoch": 4.307886529036091, "grad_norm": 1.2623026371002197, "learning_rate": 4.152680825783455e-06, "loss": 0.0515, "step": 58010 }, { "epoch": 4.308629140056438, "grad_norm": 0.5496644377708435, "learning_rate": 4.148225159661369e-06, "loss": 0.0891, "step": 58020 }, { "epoch": 4.309371751076786, "grad_norm": 1.1486482620239258, "learning_rate": 4.143769493539284e-06, "loss": 0.0526, "step": 58030 }, { "epoch": 4.310114362097133, "grad_norm": 1.4145158529281616, "learning_rate": 4.139313827417199e-06, "loss": 0.0745, "step": 58040 }, { "epoch": 4.310856973117481, "grad_norm": 2.3882088661193848, "learning_rate": 4.134858161295114e-06, "loss": 0.0881, "step": 58050 }, { "epoch": 4.311599584137829, "grad_norm": 1.1562554836273193, "learning_rate": 4.130402495173028e-06, "loss": 0.0556, "step": 58060 }, { "epoch": 4.312342195158176, "grad_norm": 2.3691024780273438, "learning_rate": 4.125946829050943e-06, "loss": 0.0798, "step": 58070 }, { "epoch": 4.313084806178524, "grad_norm": 1.2392774820327759, "learning_rate": 4.121491162928857e-06, "loss": 0.0689, "step": 58080 }, { "epoch": 4.313827417198871, "grad_norm": 1.4039784669876099, "learning_rate": 4.117035496806772e-06, "loss": 0.0513, "step": 58090 }, { "epoch": 4.314570028219219, "grad_norm": 0.6202054619789124, "learning_rate": 4.112579830684687e-06, "loss": 0.0448, "step": 58100 }, { "epoch": 4.315312639239567, "grad_norm": 0.5338848829269409, "learning_rate": 4.108124164562602e-06, "loss": 0.0619, "step": 58110 }, { "epoch": 4.316055250259914, "grad_norm": 0.5443835258483887, "learning_rate": 4.103668498440517e-06, "loss": 0.0665, "step": 58120 }, { "epoch": 4.3167978612802616, "grad_norm": 2.086144208908081, "learning_rate": 4.0992128323184315e-06, "loss": 0.0561, "step": 58130 }, { "epoch": 4.317540472300609, "grad_norm": 0.8639087677001953, "learning_rate": 4.0947571661963465e-06, "loss": 0.0538, "step": 58140 }, { "epoch": 4.3182830833209565, "grad_norm": 2.400470495223999, "learning_rate": 4.0903015000742615e-06, "loss": 0.0504, "step": 58150 }, { "epoch": 4.319025694341304, "grad_norm": 0.8947586417198181, "learning_rate": 4.0858458339521765e-06, "loss": 0.0413, "step": 58160 }, { "epoch": 4.319768305361651, "grad_norm": 1.5876610279083252, "learning_rate": 4.081390167830091e-06, "loss": 0.0721, "step": 58170 }, { "epoch": 4.320510916381999, "grad_norm": 2.025843858718872, "learning_rate": 4.076934501708006e-06, "loss": 0.0559, "step": 58180 }, { "epoch": 4.321253527402346, "grad_norm": 1.2650307416915894, "learning_rate": 4.07247883558592e-06, "loss": 0.0469, "step": 58190 }, { "epoch": 4.321996138422694, "grad_norm": 0.4236902892589569, "learning_rate": 4.068023169463835e-06, "loss": 0.0319, "step": 58200 }, { "epoch": 4.322738749443042, "grad_norm": 1.2939475774765015, "learning_rate": 4.06356750334175e-06, "loss": 0.0748, "step": 58210 }, { "epoch": 4.323481360463389, "grad_norm": 0.4584546685218811, "learning_rate": 4.059111837219665e-06, "loss": 0.0305, "step": 58220 }, { "epoch": 4.324223971483737, "grad_norm": 1.0066841840744019, "learning_rate": 4.054656171097579e-06, "loss": 0.0638, "step": 58230 }, { "epoch": 4.324966582504084, "grad_norm": 1.1847662925720215, "learning_rate": 4.050200504975494e-06, "loss": 0.0383, "step": 58240 }, { "epoch": 4.325709193524432, "grad_norm": 1.5462864637374878, "learning_rate": 4.045744838853409e-06, "loss": 0.0557, "step": 58250 }, { "epoch": 4.32645180454478, "grad_norm": 1.220777153968811, "learning_rate": 4.041289172731323e-06, "loss": 0.054, "step": 58260 }, { "epoch": 4.327194415565127, "grad_norm": 2.4415807723999023, "learning_rate": 4.036833506609239e-06, "loss": 0.062, "step": 58270 }, { "epoch": 4.327937026585475, "grad_norm": 1.2786998748779297, "learning_rate": 4.032377840487153e-06, "loss": 0.0608, "step": 58280 }, { "epoch": 4.328679637605822, "grad_norm": 0.7902323007583618, "learning_rate": 4.027922174365068e-06, "loss": 0.0624, "step": 58290 }, { "epoch": 4.3294222486261695, "grad_norm": 1.4461417198181152, "learning_rate": 4.023466508242982e-06, "loss": 0.0546, "step": 58300 }, { "epoch": 4.330164859646517, "grad_norm": 0.4553472697734833, "learning_rate": 4.019010842120897e-06, "loss": 0.0413, "step": 58310 }, { "epoch": 4.330907470666864, "grad_norm": 1.363437294960022, "learning_rate": 4.014555175998811e-06, "loss": 0.0512, "step": 58320 }, { "epoch": 4.331650081687212, "grad_norm": 1.6967765092849731, "learning_rate": 4.010099509876727e-06, "loss": 0.0611, "step": 58330 }, { "epoch": 4.332392692707559, "grad_norm": 1.917182207107544, "learning_rate": 4.005643843754641e-06, "loss": 0.0887, "step": 58340 }, { "epoch": 4.333135303727907, "grad_norm": 2.6857664585113525, "learning_rate": 4.001188177632556e-06, "loss": 0.06, "step": 58350 }, { "epoch": 4.333877914748255, "grad_norm": 2.381786346435547, "learning_rate": 3.99673251151047e-06, "loss": 0.0701, "step": 58360 }, { "epoch": 4.334620525768602, "grad_norm": 1.848760724067688, "learning_rate": 3.992276845388385e-06, "loss": 0.0649, "step": 58370 }, { "epoch": 4.33536313678895, "grad_norm": 1.4879848957061768, "learning_rate": 3.9878211792663e-06, "loss": 0.0502, "step": 58380 }, { "epoch": 4.336105747809297, "grad_norm": 1.29238760471344, "learning_rate": 3.983365513144215e-06, "loss": 0.0527, "step": 58390 }, { "epoch": 4.336848358829645, "grad_norm": 0.42699211835861206, "learning_rate": 3.97890984702213e-06, "loss": 0.047, "step": 58400 }, { "epoch": 4.337590969849993, "grad_norm": 2.1831252574920654, "learning_rate": 3.9744541809000445e-06, "loss": 0.0746, "step": 58410 }, { "epoch": 4.33833358087034, "grad_norm": 0.27628546953201294, "learning_rate": 3.9699985147779595e-06, "loss": 0.0367, "step": 58420 }, { "epoch": 4.339076191890688, "grad_norm": 1.2963409423828125, "learning_rate": 3.965542848655874e-06, "loss": 0.0693, "step": 58430 }, { "epoch": 4.339818802911036, "grad_norm": 3.566678762435913, "learning_rate": 3.961087182533789e-06, "loss": 0.0523, "step": 58440 }, { "epoch": 4.340561413931383, "grad_norm": 0.6280962228775024, "learning_rate": 3.956631516411704e-06, "loss": 0.0329, "step": 58450 }, { "epoch": 4.3413040249517305, "grad_norm": 0.4241829812526703, "learning_rate": 3.952175850289619e-06, "loss": 0.0304, "step": 58460 }, { "epoch": 4.3420466359720775, "grad_norm": 0.3502853214740753, "learning_rate": 3.947720184167533e-06, "loss": 0.0429, "step": 58470 }, { "epoch": 4.342789246992425, "grad_norm": 3.029207229614258, "learning_rate": 3.943264518045448e-06, "loss": 0.0859, "step": 58480 }, { "epoch": 4.343531858012773, "grad_norm": 0.4693982005119324, "learning_rate": 3.938808851923362e-06, "loss": 0.0478, "step": 58490 }, { "epoch": 4.34427446903312, "grad_norm": 1.9650620222091675, "learning_rate": 3.934353185801278e-06, "loss": 0.0574, "step": 58500 }, { "epoch": 4.345017080053468, "grad_norm": 2.096945285797119, "learning_rate": 3.929897519679193e-06, "loss": 0.0566, "step": 58510 }, { "epoch": 4.345759691073815, "grad_norm": 0.6621731519699097, "learning_rate": 3.925441853557107e-06, "loss": 0.0532, "step": 58520 }, { "epoch": 4.346502302094163, "grad_norm": 3.3194026947021484, "learning_rate": 3.920986187435022e-06, "loss": 0.0901, "step": 58530 }, { "epoch": 4.347244913114511, "grad_norm": 0.2626116871833801, "learning_rate": 3.916530521312936e-06, "loss": 0.05, "step": 58540 }, { "epoch": 4.347987524134858, "grad_norm": 1.0745980739593506, "learning_rate": 3.912074855190851e-06, "loss": 0.0672, "step": 58550 }, { "epoch": 4.348730135155206, "grad_norm": 1.1821939945220947, "learning_rate": 3.907619189068766e-06, "loss": 0.0755, "step": 58560 }, { "epoch": 4.349472746175553, "grad_norm": 3.788940668106079, "learning_rate": 3.903163522946681e-06, "loss": 0.0683, "step": 58570 }, { "epoch": 4.350215357195901, "grad_norm": 2.9081804752349854, "learning_rate": 3.898707856824595e-06, "loss": 0.0677, "step": 58580 }, { "epoch": 4.350957968216249, "grad_norm": 1.8848897218704224, "learning_rate": 3.89425219070251e-06, "loss": 0.0605, "step": 58590 }, { "epoch": 4.351700579236596, "grad_norm": 0.5404842495918274, "learning_rate": 3.889796524580424e-06, "loss": 0.04, "step": 58600 }, { "epoch": 4.352443190256944, "grad_norm": 1.7580265998840332, "learning_rate": 3.885340858458339e-06, "loss": 0.0307, "step": 58610 }, { "epoch": 4.353185801277291, "grad_norm": 1.8819963932037354, "learning_rate": 3.880885192336254e-06, "loss": 0.0704, "step": 58620 }, { "epoch": 4.3539284122976385, "grad_norm": 0.9093202948570251, "learning_rate": 3.876429526214169e-06, "loss": 0.0915, "step": 58630 }, { "epoch": 4.354671023317986, "grad_norm": 2.8272292613983154, "learning_rate": 3.871973860092084e-06, "loss": 0.0886, "step": 58640 }, { "epoch": 4.355413634338333, "grad_norm": 2.371199131011963, "learning_rate": 3.8675181939699985e-06, "loss": 0.0583, "step": 58650 }, { "epoch": 4.356156245358681, "grad_norm": 0.21469803154468536, "learning_rate": 3.8630625278479135e-06, "loss": 0.0405, "step": 58660 }, { "epoch": 4.356898856379028, "grad_norm": 1.9610093832015991, "learning_rate": 3.858606861725828e-06, "loss": 0.031, "step": 58670 }, { "epoch": 4.357641467399376, "grad_norm": 3.707371950149536, "learning_rate": 3.8541511956037435e-06, "loss": 0.0643, "step": 58680 }, { "epoch": 4.358384078419724, "grad_norm": 0.8655148148536682, "learning_rate": 3.849695529481658e-06, "loss": 0.0624, "step": 58690 }, { "epoch": 4.359126689440071, "grad_norm": 2.9356181621551514, "learning_rate": 3.845239863359573e-06, "loss": 0.081, "step": 58700 }, { "epoch": 4.359869300460419, "grad_norm": 1.3285295963287354, "learning_rate": 3.840784197237487e-06, "loss": 0.0463, "step": 58710 }, { "epoch": 4.360611911480766, "grad_norm": 0.44809725880622864, "learning_rate": 3.836328531115402e-06, "loss": 0.0769, "step": 58720 }, { "epoch": 4.361354522501114, "grad_norm": 1.0158475637435913, "learning_rate": 3.831872864993316e-06, "loss": 0.0721, "step": 58730 }, { "epoch": 4.362097133521462, "grad_norm": 1.1209678649902344, "learning_rate": 3.827417198871232e-06, "loss": 0.0601, "step": 58740 }, { "epoch": 4.362839744541809, "grad_norm": 1.2492932081222534, "learning_rate": 3.822961532749146e-06, "loss": 0.0776, "step": 58750 }, { "epoch": 4.363582355562157, "grad_norm": 0.5209352374076843, "learning_rate": 3.818505866627061e-06, "loss": 0.0311, "step": 58760 }, { "epoch": 4.364324966582504, "grad_norm": 0.8227794766426086, "learning_rate": 3.814050200504976e-06, "loss": 0.0282, "step": 58770 }, { "epoch": 4.365067577602852, "grad_norm": 1.6542755365371704, "learning_rate": 3.8095945343828904e-06, "loss": 0.0766, "step": 58780 }, { "epoch": 4.3658101886231995, "grad_norm": 2.0904018878936768, "learning_rate": 3.8051388682608054e-06, "loss": 0.0475, "step": 58790 }, { "epoch": 4.3665527996435465, "grad_norm": 1.682500958442688, "learning_rate": 3.8006832021387196e-06, "loss": 0.0691, "step": 58800 }, { "epoch": 4.367295410663894, "grad_norm": 2.096959352493286, "learning_rate": 3.796227536016635e-06, "loss": 0.0475, "step": 58810 }, { "epoch": 4.368038021684241, "grad_norm": 2.7535927295684814, "learning_rate": 3.791771869894549e-06, "loss": 0.0791, "step": 58820 }, { "epoch": 4.368780632704589, "grad_norm": 0.5873667001724243, "learning_rate": 3.787316203772464e-06, "loss": 0.0359, "step": 58830 }, { "epoch": 4.369523243724937, "grad_norm": 3.4915168285369873, "learning_rate": 3.7828605376503787e-06, "loss": 0.0502, "step": 58840 }, { "epoch": 4.370265854745284, "grad_norm": 0.34856438636779785, "learning_rate": 3.7784048715282937e-06, "loss": 0.0606, "step": 58850 }, { "epoch": 4.371008465765632, "grad_norm": 0.42992135882377625, "learning_rate": 3.773949205406208e-06, "loss": 0.0645, "step": 58860 }, { "epoch": 4.37175107678598, "grad_norm": 1.6763559579849243, "learning_rate": 3.7694935392841233e-06, "loss": 0.0786, "step": 58870 }, { "epoch": 4.372493687806327, "grad_norm": 2.7361538410186768, "learning_rate": 3.7650378731620374e-06, "loss": 0.0677, "step": 58880 }, { "epoch": 4.373236298826675, "grad_norm": 2.2586610317230225, "learning_rate": 3.7605822070399524e-06, "loss": 0.0419, "step": 58890 }, { "epoch": 4.373978909847022, "grad_norm": 0.7950805425643921, "learning_rate": 3.756126540917868e-06, "loss": 0.0381, "step": 58900 }, { "epoch": 4.37472152086737, "grad_norm": 0.500672459602356, "learning_rate": 3.751670874795782e-06, "loss": 0.0576, "step": 58910 }, { "epoch": 4.375464131887718, "grad_norm": 1.354580044746399, "learning_rate": 3.7472152086736965e-06, "loss": 0.0514, "step": 58920 }, { "epoch": 4.376206742908065, "grad_norm": 2.166482925415039, "learning_rate": 3.7427595425516115e-06, "loss": 0.0508, "step": 58930 }, { "epoch": 4.376949353928413, "grad_norm": 0.8477782011032104, "learning_rate": 3.738303876429526e-06, "loss": 0.0472, "step": 58940 }, { "epoch": 4.37769196494876, "grad_norm": 0.9658443331718445, "learning_rate": 3.733848210307441e-06, "loss": 0.0729, "step": 58950 }, { "epoch": 4.3784345759691075, "grad_norm": 3.2751779556274414, "learning_rate": 3.7293925441853557e-06, "loss": 0.0389, "step": 58960 }, { "epoch": 4.379177186989455, "grad_norm": 0.7033012509346008, "learning_rate": 3.7249368780632707e-06, "loss": 0.051, "step": 58970 }, { "epoch": 4.379919798009802, "grad_norm": 0.37652888894081116, "learning_rate": 3.7204812119411857e-06, "loss": 0.0488, "step": 58980 }, { "epoch": 4.38066240903015, "grad_norm": 1.1736301183700562, "learning_rate": 3.7160255458191002e-06, "loss": 0.0742, "step": 58990 }, { "epoch": 4.381405020050497, "grad_norm": 1.1174696683883667, "learning_rate": 3.711569879697015e-06, "loss": 0.0505, "step": 59000 }, { "epoch": 4.382147631070845, "grad_norm": 2.9306743144989014, "learning_rate": 3.70711421357493e-06, "loss": 0.051, "step": 59010 }, { "epoch": 4.382890242091193, "grad_norm": 1.5580140352249146, "learning_rate": 3.7026585474528444e-06, "loss": 0.0598, "step": 59020 }, { "epoch": 4.38363285311154, "grad_norm": 1.074127435684204, "learning_rate": 3.698202881330759e-06, "loss": 0.053, "step": 59030 }, { "epoch": 4.384375464131888, "grad_norm": 0.9551728367805481, "learning_rate": 3.693747215208674e-06, "loss": 0.0438, "step": 59040 }, { "epoch": 4.385118075152235, "grad_norm": 1.77925705909729, "learning_rate": 3.6892915490865885e-06, "loss": 0.0489, "step": 59050 }, { "epoch": 4.385860686172583, "grad_norm": 0.7532749772071838, "learning_rate": 3.684835882964503e-06, "loss": 0.0431, "step": 59060 }, { "epoch": 4.386603297192931, "grad_norm": 1.1308987140655518, "learning_rate": 3.680380216842418e-06, "loss": 0.0617, "step": 59070 }, { "epoch": 4.387345908213278, "grad_norm": 3.694500684738159, "learning_rate": 3.6759245507203326e-06, "loss": 0.065, "step": 59080 }, { "epoch": 4.388088519233626, "grad_norm": 2.997882127761841, "learning_rate": 3.671468884598247e-06, "loss": 0.063, "step": 59090 }, { "epoch": 4.388831130253973, "grad_norm": 1.4791189432144165, "learning_rate": 3.6670132184761626e-06, "loss": 0.0603, "step": 59100 }, { "epoch": 4.3895737412743205, "grad_norm": 1.4636845588684082, "learning_rate": 3.662557552354077e-06, "loss": 0.0548, "step": 59110 }, { "epoch": 4.390316352294668, "grad_norm": 1.0181242227554321, "learning_rate": 3.6581018862319918e-06, "loss": 0.0424, "step": 59120 }, { "epoch": 4.391058963315015, "grad_norm": 0.7411527037620544, "learning_rate": 3.6536462201099068e-06, "loss": 0.0591, "step": 59130 }, { "epoch": 4.391801574335363, "grad_norm": 0.6368983387947083, "learning_rate": 3.6491905539878213e-06, "loss": 0.0731, "step": 59140 }, { "epoch": 4.39254418535571, "grad_norm": 0.9122620224952698, "learning_rate": 3.644734887865736e-06, "loss": 0.0579, "step": 59150 }, { "epoch": 4.393286796376058, "grad_norm": 1.5511554479599, "learning_rate": 3.640279221743651e-06, "loss": 0.0578, "step": 59160 }, { "epoch": 4.394029407396406, "grad_norm": 1.4701236486434937, "learning_rate": 3.6358235556215655e-06, "loss": 0.0377, "step": 59170 }, { "epoch": 4.394772018416753, "grad_norm": 0.35914507508277893, "learning_rate": 3.63136788949948e-06, "loss": 0.0586, "step": 59180 }, { "epoch": 4.395514629437101, "grad_norm": 1.4817779064178467, "learning_rate": 3.626912223377395e-06, "loss": 0.0664, "step": 59190 }, { "epoch": 4.396257240457448, "grad_norm": 2.5295608043670654, "learning_rate": 3.6224565572553096e-06, "loss": 0.0454, "step": 59200 }, { "epoch": 4.396999851477796, "grad_norm": 0.7852722406387329, "learning_rate": 3.618000891133224e-06, "loss": 0.0399, "step": 59210 }, { "epoch": 4.397742462498144, "grad_norm": 0.24262398481369019, "learning_rate": 3.613545225011139e-06, "loss": 0.0409, "step": 59220 }, { "epoch": 4.398485073518491, "grad_norm": 0.33007925748825073, "learning_rate": 3.609089558889054e-06, "loss": 0.0593, "step": 59230 }, { "epoch": 4.399227684538839, "grad_norm": 1.3206017017364502, "learning_rate": 3.6046338927669687e-06, "loss": 0.0687, "step": 59240 }, { "epoch": 4.399970295559186, "grad_norm": 2.052076578140259, "learning_rate": 3.6001782266448837e-06, "loss": 0.0808, "step": 59250 }, { "epoch": 4.400712906579534, "grad_norm": 3.8353078365325928, "learning_rate": 3.5957225605227983e-06, "loss": 0.0581, "step": 59260 }, { "epoch": 4.4014555175998815, "grad_norm": 0.6021201014518738, "learning_rate": 3.591266894400713e-06, "loss": 0.0714, "step": 59270 }, { "epoch": 4.4021981286202285, "grad_norm": 2.1962730884552, "learning_rate": 3.586811228278628e-06, "loss": 0.0501, "step": 59280 }, { "epoch": 4.402940739640576, "grad_norm": 0.2795027792453766, "learning_rate": 3.5823555621565424e-06, "loss": 0.0573, "step": 59290 }, { "epoch": 4.403683350660923, "grad_norm": 1.4487619400024414, "learning_rate": 3.5778998960344574e-06, "loss": 0.0312, "step": 59300 }, { "epoch": 4.404425961681271, "grad_norm": 2.9178457260131836, "learning_rate": 3.573444229912372e-06, "loss": 0.0564, "step": 59310 }, { "epoch": 4.405168572701619, "grad_norm": 1.1441450119018555, "learning_rate": 3.5689885637902866e-06, "loss": 0.0618, "step": 59320 }, { "epoch": 4.405911183721966, "grad_norm": 2.4585134983062744, "learning_rate": 3.5645328976682016e-06, "loss": 0.044, "step": 59330 }, { "epoch": 4.406653794742314, "grad_norm": 1.8690593242645264, "learning_rate": 3.560077231546116e-06, "loss": 0.0638, "step": 59340 }, { "epoch": 4.407396405762661, "grad_norm": 1.3198564052581787, "learning_rate": 3.5556215654240307e-06, "loss": 0.0596, "step": 59350 }, { "epoch": 4.408139016783009, "grad_norm": 1.511386513710022, "learning_rate": 3.551165899301946e-06, "loss": 0.0775, "step": 59360 }, { "epoch": 4.408881627803357, "grad_norm": 1.489264965057373, "learning_rate": 3.5467102331798607e-06, "loss": 0.0824, "step": 59370 }, { "epoch": 4.409624238823704, "grad_norm": 0.6738532185554504, "learning_rate": 3.5422545670577753e-06, "loss": 0.1038, "step": 59380 }, { "epoch": 4.410366849844052, "grad_norm": 1.928895115852356, "learning_rate": 3.5377989009356903e-06, "loss": 0.0687, "step": 59390 }, { "epoch": 4.411109460864399, "grad_norm": 2.568911552429199, "learning_rate": 3.533343234813605e-06, "loss": 0.0677, "step": 59400 }, { "epoch": 4.411852071884747, "grad_norm": 3.7109265327453613, "learning_rate": 3.5288875686915194e-06, "loss": 0.0468, "step": 59410 }, { "epoch": 4.412594682905095, "grad_norm": 1.1439366340637207, "learning_rate": 3.5244319025694344e-06, "loss": 0.0471, "step": 59420 }, { "epoch": 4.413337293925442, "grad_norm": 1.701995849609375, "learning_rate": 3.519976236447349e-06, "loss": 0.0555, "step": 59430 }, { "epoch": 4.4140799049457895, "grad_norm": 3.7509143352508545, "learning_rate": 3.5155205703252635e-06, "loss": 0.0486, "step": 59440 }, { "epoch": 4.4148225159661365, "grad_norm": 2.0457451343536377, "learning_rate": 3.5110649042031785e-06, "loss": 0.0599, "step": 59450 }, { "epoch": 4.415565126986484, "grad_norm": 1.8995429277420044, "learning_rate": 3.506609238081093e-06, "loss": 0.0546, "step": 59460 }, { "epoch": 4.416307738006832, "grad_norm": 1.6824947595596313, "learning_rate": 3.5021535719590077e-06, "loss": 0.0679, "step": 59470 }, { "epoch": 4.417050349027179, "grad_norm": 2.0449635982513428, "learning_rate": 3.4976979058369227e-06, "loss": 0.0552, "step": 59480 }, { "epoch": 4.417792960047527, "grad_norm": 0.3595518469810486, "learning_rate": 3.4932422397148372e-06, "loss": 0.0452, "step": 59490 }, { "epoch": 4.418535571067874, "grad_norm": 0.20933711528778076, "learning_rate": 3.4887865735927522e-06, "loss": 0.0662, "step": 59500 }, { "epoch": 4.419278182088222, "grad_norm": 2.1247775554656982, "learning_rate": 3.4843309074706672e-06, "loss": 0.0577, "step": 59510 }, { "epoch": 4.42002079310857, "grad_norm": 1.3232066631317139, "learning_rate": 3.479875241348582e-06, "loss": 0.0739, "step": 59520 }, { "epoch": 4.420763404128917, "grad_norm": 1.2847281694412231, "learning_rate": 3.4754195752264964e-06, "loss": 0.1052, "step": 59530 }, { "epoch": 4.421506015149265, "grad_norm": 2.9592535495758057, "learning_rate": 3.4709639091044114e-06, "loss": 0.0837, "step": 59540 }, { "epoch": 4.422248626169612, "grad_norm": 3.9245753288269043, "learning_rate": 3.466508242982326e-06, "loss": 0.0602, "step": 59550 }, { "epoch": 4.42299123718996, "grad_norm": 0.8036717176437378, "learning_rate": 3.4620525768602405e-06, "loss": 0.0609, "step": 59560 }, { "epoch": 4.423733848210308, "grad_norm": 0.5781071782112122, "learning_rate": 3.4575969107381555e-06, "loss": 0.056, "step": 59570 }, { "epoch": 4.424476459230655, "grad_norm": 2.8381593227386475, "learning_rate": 3.45314124461607e-06, "loss": 0.0667, "step": 59580 }, { "epoch": 4.425219070251003, "grad_norm": 2.3946638107299805, "learning_rate": 3.4486855784939846e-06, "loss": 0.0354, "step": 59590 }, { "epoch": 4.4259616812713505, "grad_norm": 1.2118726968765259, "learning_rate": 3.4442299123718996e-06, "loss": 0.0549, "step": 59600 }, { "epoch": 4.4267042922916975, "grad_norm": 2.8117597103118896, "learning_rate": 3.439774246249814e-06, "loss": 0.0692, "step": 59610 }, { "epoch": 4.427446903312045, "grad_norm": 2.231323719024658, "learning_rate": 3.4353185801277288e-06, "loss": 0.0931, "step": 59620 }, { "epoch": 4.428189514332392, "grad_norm": 2.092609405517578, "learning_rate": 3.430862914005644e-06, "loss": 0.089, "step": 59630 }, { "epoch": 4.42893212535274, "grad_norm": 1.9209469556808472, "learning_rate": 3.4264072478835588e-06, "loss": 0.0671, "step": 59640 }, { "epoch": 4.429674736373088, "grad_norm": 2.280712842941284, "learning_rate": 3.4219515817614738e-06, "loss": 0.0598, "step": 59650 }, { "epoch": 4.430417347393435, "grad_norm": 1.3931483030319214, "learning_rate": 3.4174959156393883e-06, "loss": 0.063, "step": 59660 }, { "epoch": 4.431159958413783, "grad_norm": 1.0616021156311035, "learning_rate": 3.413040249517303e-06, "loss": 0.0531, "step": 59670 }, { "epoch": 4.43190256943413, "grad_norm": 1.928513765335083, "learning_rate": 3.408584583395218e-06, "loss": 0.0707, "step": 59680 }, { "epoch": 4.432645180454478, "grad_norm": 0.7961556315422058, "learning_rate": 3.4041289172731325e-06, "loss": 0.0584, "step": 59690 }, { "epoch": 4.433387791474826, "grad_norm": 1.220015525817871, "learning_rate": 3.399673251151047e-06, "loss": 0.0569, "step": 59700 }, { "epoch": 4.434130402495173, "grad_norm": 0.7394031882286072, "learning_rate": 3.395217585028962e-06, "loss": 0.0611, "step": 59710 }, { "epoch": 4.434873013515521, "grad_norm": 2.8178136348724365, "learning_rate": 3.3907619189068766e-06, "loss": 0.082, "step": 59720 }, { "epoch": 4.435615624535868, "grad_norm": 0.8858090043067932, "learning_rate": 3.386306252784791e-06, "loss": 0.064, "step": 59730 }, { "epoch": 4.436358235556216, "grad_norm": 1.7345311641693115, "learning_rate": 3.381850586662706e-06, "loss": 0.0571, "step": 59740 }, { "epoch": 4.437100846576564, "grad_norm": 1.1461231708526611, "learning_rate": 3.3773949205406207e-06, "loss": 0.0449, "step": 59750 }, { "epoch": 4.437843457596911, "grad_norm": 5.428303241729736, "learning_rate": 3.3729392544185357e-06, "loss": 0.0415, "step": 59760 }, { "epoch": 4.4385860686172585, "grad_norm": 3.327192544937134, "learning_rate": 3.3684835882964507e-06, "loss": 0.0834, "step": 59770 }, { "epoch": 4.4393286796376055, "grad_norm": 3.348003387451172, "learning_rate": 3.3640279221743653e-06, "loss": 0.0855, "step": 59780 }, { "epoch": 4.440071290657953, "grad_norm": 0.42248573899269104, "learning_rate": 3.35957225605228e-06, "loss": 0.0454, "step": 59790 }, { "epoch": 4.440813901678301, "grad_norm": 0.9625753164291382, "learning_rate": 3.355116589930195e-06, "loss": 0.0628, "step": 59800 }, { "epoch": 4.441556512698648, "grad_norm": 0.7581101059913635, "learning_rate": 3.3506609238081094e-06, "loss": 0.0483, "step": 59810 }, { "epoch": 4.442299123718996, "grad_norm": 1.2970561981201172, "learning_rate": 3.346205257686024e-06, "loss": 0.039, "step": 59820 }, { "epoch": 4.443041734739343, "grad_norm": 2.935488700866699, "learning_rate": 3.341749591563939e-06, "loss": 0.0532, "step": 59830 }, { "epoch": 4.443784345759691, "grad_norm": 2.0861330032348633, "learning_rate": 3.3372939254418536e-06, "loss": 0.0597, "step": 59840 }, { "epoch": 4.444526956780039, "grad_norm": 3.6742630004882812, "learning_rate": 3.332838259319768e-06, "loss": 0.0661, "step": 59850 }, { "epoch": 4.445269567800386, "grad_norm": 2.0556039810180664, "learning_rate": 3.328382593197683e-06, "loss": 0.036, "step": 59860 }, { "epoch": 4.446012178820734, "grad_norm": 0.8471649289131165, "learning_rate": 3.3239269270755977e-06, "loss": 0.0508, "step": 59870 }, { "epoch": 4.446754789841081, "grad_norm": 4.926666736602783, "learning_rate": 3.3194712609535123e-06, "loss": 0.0731, "step": 59880 }, { "epoch": 4.447497400861429, "grad_norm": 0.624416708946228, "learning_rate": 3.3150155948314277e-06, "loss": 0.0655, "step": 59890 }, { "epoch": 4.448240011881777, "grad_norm": 0.27737393975257874, "learning_rate": 3.3105599287093423e-06, "loss": 0.044, "step": 59900 }, { "epoch": 4.448982622902124, "grad_norm": 0.9491309523582458, "learning_rate": 3.306104262587257e-06, "loss": 0.0543, "step": 59910 }, { "epoch": 4.4497252339224715, "grad_norm": 3.4223947525024414, "learning_rate": 3.301648596465172e-06, "loss": 0.0775, "step": 59920 }, { "epoch": 4.4504678449428186, "grad_norm": 1.4844588041305542, "learning_rate": 3.2971929303430864e-06, "loss": 0.0869, "step": 59930 }, { "epoch": 4.4512104559631664, "grad_norm": 0.4859474003314972, "learning_rate": 3.292737264221001e-06, "loss": 0.0733, "step": 59940 }, { "epoch": 4.451953066983514, "grad_norm": 3.157867670059204, "learning_rate": 3.288281598098916e-06, "loss": 0.0344, "step": 59950 }, { "epoch": 4.452695678003861, "grad_norm": 1.859097957611084, "learning_rate": 3.2838259319768305e-06, "loss": 0.0689, "step": 59960 }, { "epoch": 4.453438289024209, "grad_norm": 1.4190930128097534, "learning_rate": 3.279370265854745e-06, "loss": 0.0691, "step": 59970 }, { "epoch": 4.454180900044556, "grad_norm": 1.7417631149291992, "learning_rate": 3.27491459973266e-06, "loss": 0.0664, "step": 59980 }, { "epoch": 4.454923511064904, "grad_norm": 0.7751902937889099, "learning_rate": 3.2704589336105747e-06, "loss": 0.0361, "step": 59990 }, { "epoch": 4.455666122085252, "grad_norm": 0.1440928876399994, "learning_rate": 3.2660032674884897e-06, "loss": 0.0985, "step": 60000 }, { "epoch": 4.456408733105599, "grad_norm": 0.39282867312431335, "learning_rate": 3.2615476013664042e-06, "loss": 0.0667, "step": 60010 }, { "epoch": 4.457151344125947, "grad_norm": 1.775133728981018, "learning_rate": 3.2570919352443192e-06, "loss": 0.0448, "step": 60020 }, { "epoch": 4.457893955146295, "grad_norm": 0.5823416709899902, "learning_rate": 3.2526362691222342e-06, "loss": 0.0666, "step": 60030 }, { "epoch": 4.458636566166642, "grad_norm": 1.3295433521270752, "learning_rate": 3.248180603000149e-06, "loss": 0.0311, "step": 60040 }, { "epoch": 4.45937917718699, "grad_norm": 2.622669219970703, "learning_rate": 3.2437249368780634e-06, "loss": 0.0918, "step": 60050 }, { "epoch": 4.460121788207337, "grad_norm": 1.4052746295928955, "learning_rate": 3.2392692707559784e-06, "loss": 0.0562, "step": 60060 }, { "epoch": 4.460864399227685, "grad_norm": 0.7786046862602234, "learning_rate": 3.234813604633893e-06, "loss": 0.0816, "step": 60070 }, { "epoch": 4.4616070102480325, "grad_norm": 0.7435610294342041, "learning_rate": 3.2303579385118075e-06, "loss": 0.054, "step": 60080 }, { "epoch": 4.4623496212683795, "grad_norm": 1.1836934089660645, "learning_rate": 3.2259022723897225e-06, "loss": 0.0801, "step": 60090 }, { "epoch": 4.463092232288727, "grad_norm": 1.5763529539108276, "learning_rate": 3.221446606267637e-06, "loss": 0.0548, "step": 60100 }, { "epoch": 4.463834843309074, "grad_norm": 1.6699156761169434, "learning_rate": 3.2169909401455516e-06, "loss": 0.0634, "step": 60110 }, { "epoch": 4.464577454329422, "grad_norm": 0.8276064991950989, "learning_rate": 3.2125352740234666e-06, "loss": 0.0634, "step": 60120 }, { "epoch": 4.46532006534977, "grad_norm": 0.7230677008628845, "learning_rate": 3.208079607901381e-06, "loss": 0.0656, "step": 60130 }, { "epoch": 4.466062676370117, "grad_norm": 1.405157446861267, "learning_rate": 3.2036239417792958e-06, "loss": 0.0971, "step": 60140 }, { "epoch": 4.466805287390465, "grad_norm": 1.1781624555587769, "learning_rate": 3.199168275657211e-06, "loss": 0.0574, "step": 60150 }, { "epoch": 4.467547898410812, "grad_norm": 0.2808609902858734, "learning_rate": 3.1947126095351258e-06, "loss": 0.0415, "step": 60160 }, { "epoch": 4.46829050943116, "grad_norm": 1.525888204574585, "learning_rate": 3.1902569434130403e-06, "loss": 0.0791, "step": 60170 }, { "epoch": 4.469033120451508, "grad_norm": 0.37916257977485657, "learning_rate": 3.1858012772909553e-06, "loss": 0.0406, "step": 60180 }, { "epoch": 4.469775731471855, "grad_norm": 1.2917027473449707, "learning_rate": 3.18134561116887e-06, "loss": 0.0632, "step": 60190 }, { "epoch": 4.470518342492203, "grad_norm": 1.9711061716079712, "learning_rate": 3.1768899450467845e-06, "loss": 0.0628, "step": 60200 }, { "epoch": 4.47126095351255, "grad_norm": 0.7012740969657898, "learning_rate": 3.1724342789246995e-06, "loss": 0.0308, "step": 60210 }, { "epoch": 4.472003564532898, "grad_norm": 2.618312358856201, "learning_rate": 3.167978612802614e-06, "loss": 0.0659, "step": 60220 }, { "epoch": 4.472746175553246, "grad_norm": 0.5947886109352112, "learning_rate": 3.1635229466805286e-06, "loss": 0.0411, "step": 60230 }, { "epoch": 4.473488786573593, "grad_norm": 4.743587493896484, "learning_rate": 3.1590672805584436e-06, "loss": 0.0349, "step": 60240 }, { "epoch": 4.4742313975939405, "grad_norm": 1.2651278972625732, "learning_rate": 3.154611614436358e-06, "loss": 0.0567, "step": 60250 }, { "epoch": 4.4749740086142875, "grad_norm": 2.4540841579437256, "learning_rate": 3.1501559483142727e-06, "loss": 0.0446, "step": 60260 }, { "epoch": 4.475716619634635, "grad_norm": 3.072303295135498, "learning_rate": 3.1457002821921877e-06, "loss": 0.0747, "step": 60270 }, { "epoch": 4.476459230654983, "grad_norm": 3.0911707878112793, "learning_rate": 3.1412446160701023e-06, "loss": 0.0377, "step": 60280 }, { "epoch": 4.47720184167533, "grad_norm": 0.6322579979896545, "learning_rate": 3.1367889499480173e-06, "loss": 0.0727, "step": 60290 }, { "epoch": 4.477944452695678, "grad_norm": 0.2527351379394531, "learning_rate": 3.1323332838259323e-06, "loss": 0.0379, "step": 60300 }, { "epoch": 4.478687063716025, "grad_norm": 2.0621883869171143, "learning_rate": 3.127877617703847e-06, "loss": 0.0577, "step": 60310 }, { "epoch": 4.479429674736373, "grad_norm": 0.48624280095100403, "learning_rate": 3.1234219515817614e-06, "loss": 0.048, "step": 60320 }, { "epoch": 4.480172285756721, "grad_norm": 0.3645511269569397, "learning_rate": 3.1189662854596764e-06, "loss": 0.0462, "step": 60330 }, { "epoch": 4.480914896777068, "grad_norm": 0.7930494546890259, "learning_rate": 3.114510619337591e-06, "loss": 0.0435, "step": 60340 }, { "epoch": 4.481657507797416, "grad_norm": 1.6003566980361938, "learning_rate": 3.110054953215506e-06, "loss": 0.0577, "step": 60350 }, { "epoch": 4.482400118817763, "grad_norm": 0.8492751717567444, "learning_rate": 3.1055992870934206e-06, "loss": 0.049, "step": 60360 }, { "epoch": 4.483142729838111, "grad_norm": 2.092189311981201, "learning_rate": 3.101143620971335e-06, "loss": 0.057, "step": 60370 }, { "epoch": 4.483885340858459, "grad_norm": 0.5786078572273254, "learning_rate": 3.09668795484925e-06, "loss": 0.0569, "step": 60380 }, { "epoch": 4.484627951878806, "grad_norm": 0.4640141725540161, "learning_rate": 3.0922322887271647e-06, "loss": 0.0618, "step": 60390 }, { "epoch": 4.485370562899154, "grad_norm": 2.307746171951294, "learning_rate": 3.0877766226050793e-06, "loss": 0.0578, "step": 60400 }, { "epoch": 4.486113173919501, "grad_norm": 1.5179271697998047, "learning_rate": 3.0833209564829943e-06, "loss": 0.0363, "step": 60410 }, { "epoch": 4.4868557849398485, "grad_norm": 0.30121228098869324, "learning_rate": 3.0788652903609093e-06, "loss": 0.05, "step": 60420 }, { "epoch": 4.487598395960196, "grad_norm": 2.5715456008911133, "learning_rate": 3.074409624238824e-06, "loss": 0.0622, "step": 60430 }, { "epoch": 4.488341006980543, "grad_norm": 2.3509955406188965, "learning_rate": 3.069953958116739e-06, "loss": 0.0728, "step": 60440 }, { "epoch": 4.489083618000891, "grad_norm": 1.9906114339828491, "learning_rate": 3.0654982919946534e-06, "loss": 0.0598, "step": 60450 }, { "epoch": 4.489826229021238, "grad_norm": 1.7488093376159668, "learning_rate": 3.061042625872568e-06, "loss": 0.0452, "step": 60460 }, { "epoch": 4.490568840041586, "grad_norm": 0.5462767481803894, "learning_rate": 3.056586959750483e-06, "loss": 0.0505, "step": 60470 }, { "epoch": 4.491311451061934, "grad_norm": 0.6967370510101318, "learning_rate": 3.0521312936283975e-06, "loss": 0.0968, "step": 60480 }, { "epoch": 4.492054062082281, "grad_norm": 0.6573207378387451, "learning_rate": 3.047675627506312e-06, "loss": 0.045, "step": 60490 }, { "epoch": 4.492796673102629, "grad_norm": 0.9481235146522522, "learning_rate": 3.043219961384227e-06, "loss": 0.0377, "step": 60500 }, { "epoch": 4.493539284122976, "grad_norm": 0.8041633367538452, "learning_rate": 3.0387642952621417e-06, "loss": 0.0376, "step": 60510 }, { "epoch": 4.494281895143324, "grad_norm": 0.4451924264431, "learning_rate": 3.0343086291400562e-06, "loss": 0.0509, "step": 60520 }, { "epoch": 4.495024506163672, "grad_norm": 1.242018222808838, "learning_rate": 3.0298529630179712e-06, "loss": 0.0612, "step": 60530 }, { "epoch": 4.495767117184019, "grad_norm": 1.5927053689956665, "learning_rate": 3.025397296895886e-06, "loss": 0.0466, "step": 60540 }, { "epoch": 4.496509728204367, "grad_norm": 0.3729563355445862, "learning_rate": 3.020941630773801e-06, "loss": 0.0654, "step": 60550 }, { "epoch": 4.497252339224714, "grad_norm": 1.2885338068008423, "learning_rate": 3.0164859646517158e-06, "loss": 0.0394, "step": 60560 }, { "epoch": 4.497994950245062, "grad_norm": 0.8227145075798035, "learning_rate": 3.0120302985296304e-06, "loss": 0.0379, "step": 60570 }, { "epoch": 4.4987375612654095, "grad_norm": 1.220255970954895, "learning_rate": 3.007574632407545e-06, "loss": 0.0491, "step": 60580 }, { "epoch": 4.4994801722857565, "grad_norm": 1.4378726482391357, "learning_rate": 3.00311896628546e-06, "loss": 0.0348, "step": 60590 }, { "epoch": 4.500222783306104, "grad_norm": 1.8809531927108765, "learning_rate": 2.9986633001633745e-06, "loss": 0.0556, "step": 60600 }, { "epoch": 4.500965394326451, "grad_norm": 0.39262035489082336, "learning_rate": 2.994207634041289e-06, "loss": 0.0387, "step": 60610 }, { "epoch": 4.501708005346799, "grad_norm": 1.774888038635254, "learning_rate": 2.989751967919204e-06, "loss": 0.0463, "step": 60620 }, { "epoch": 4.502450616367147, "grad_norm": 1.312552809715271, "learning_rate": 2.9852963017971186e-06, "loss": 0.0963, "step": 60630 }, { "epoch": 4.503193227387494, "grad_norm": 1.1189274787902832, "learning_rate": 2.980840635675033e-06, "loss": 0.0553, "step": 60640 }, { "epoch": 4.503935838407842, "grad_norm": 1.8008004426956177, "learning_rate": 2.976384969552948e-06, "loss": 0.0713, "step": 60650 }, { "epoch": 4.504678449428189, "grad_norm": 0.9484963417053223, "learning_rate": 2.9719293034308628e-06, "loss": 0.0425, "step": 60660 }, { "epoch": 4.505421060448537, "grad_norm": 1.8093992471694946, "learning_rate": 2.9674736373087778e-06, "loss": 0.0491, "step": 60670 }, { "epoch": 4.506163671468885, "grad_norm": 0.9192953705787659, "learning_rate": 2.9630179711866927e-06, "loss": 0.0439, "step": 60680 }, { "epoch": 4.506906282489232, "grad_norm": 1.6268279552459717, "learning_rate": 2.9585623050646073e-06, "loss": 0.0575, "step": 60690 }, { "epoch": 4.50764889350958, "grad_norm": 0.32879000902175903, "learning_rate": 2.9541066389425223e-06, "loss": 0.0425, "step": 60700 }, { "epoch": 4.508391504529927, "grad_norm": 1.2514315843582153, "learning_rate": 2.949650972820437e-06, "loss": 0.0541, "step": 60710 }, { "epoch": 4.509134115550275, "grad_norm": 1.9278550148010254, "learning_rate": 2.9451953066983515e-06, "loss": 0.0818, "step": 60720 }, { "epoch": 4.5098767265706226, "grad_norm": 0.9422726035118103, "learning_rate": 2.9407396405762664e-06, "loss": 0.0572, "step": 60730 }, { "epoch": 4.51061933759097, "grad_norm": 2.648932695388794, "learning_rate": 2.936283974454181e-06, "loss": 0.0671, "step": 60740 }, { "epoch": 4.5113619486113175, "grad_norm": 2.699385166168213, "learning_rate": 2.9318283083320956e-06, "loss": 0.0477, "step": 60750 }, { "epoch": 4.5121045596316645, "grad_norm": 0.6302013397216797, "learning_rate": 2.9273726422100106e-06, "loss": 0.056, "step": 60760 }, { "epoch": 4.512847170652012, "grad_norm": 1.3976058959960938, "learning_rate": 2.922916976087925e-06, "loss": 0.0684, "step": 60770 }, { "epoch": 4.51358978167236, "grad_norm": 1.1172116994857788, "learning_rate": 2.9184613099658397e-06, "loss": 0.0705, "step": 60780 }, { "epoch": 4.514332392692707, "grad_norm": 0.6140624284744263, "learning_rate": 2.9140056438437547e-06, "loss": 0.0561, "step": 60790 }, { "epoch": 4.515075003713055, "grad_norm": 1.3316572904586792, "learning_rate": 2.9095499777216693e-06, "loss": 0.0861, "step": 60800 }, { "epoch": 4.515817614733402, "grad_norm": 0.5338196754455566, "learning_rate": 2.9050943115995843e-06, "loss": 0.0516, "step": 60810 }, { "epoch": 4.51656022575375, "grad_norm": 1.8989540338516235, "learning_rate": 2.9006386454774993e-06, "loss": 0.0635, "step": 60820 }, { "epoch": 4.517302836774098, "grad_norm": 0.7707126140594482, "learning_rate": 2.896182979355414e-06, "loss": 0.076, "step": 60830 }, { "epoch": 4.518045447794445, "grad_norm": 3.540522575378418, "learning_rate": 2.8917273132333284e-06, "loss": 0.0565, "step": 60840 }, { "epoch": 4.518788058814793, "grad_norm": 1.879492998123169, "learning_rate": 2.8872716471112434e-06, "loss": 0.0722, "step": 60850 }, { "epoch": 4.519530669835141, "grad_norm": 2.0404903888702393, "learning_rate": 2.882815980989158e-06, "loss": 0.0665, "step": 60860 }, { "epoch": 4.520273280855488, "grad_norm": 0.7342921495437622, "learning_rate": 2.8783603148670726e-06, "loss": 0.0549, "step": 60870 }, { "epoch": 4.521015891875836, "grad_norm": 2.278888463973999, "learning_rate": 2.8739046487449876e-06, "loss": 0.0472, "step": 60880 }, { "epoch": 4.521758502896183, "grad_norm": 0.994851291179657, "learning_rate": 2.869448982622902e-06, "loss": 0.0613, "step": 60890 }, { "epoch": 4.5225011139165305, "grad_norm": 1.3378137350082397, "learning_rate": 2.8649933165008167e-06, "loss": 0.0629, "step": 60900 }, { "epoch": 4.523243724936878, "grad_norm": 1.5899670124053955, "learning_rate": 2.8605376503787317e-06, "loss": 0.0526, "step": 60910 }, { "epoch": 4.523986335957225, "grad_norm": 0.4787708520889282, "learning_rate": 2.8560819842566463e-06, "loss": 0.0414, "step": 60920 }, { "epoch": 4.524728946977573, "grad_norm": 1.1397485733032227, "learning_rate": 2.851626318134561e-06, "loss": 0.0578, "step": 60930 }, { "epoch": 4.52547155799792, "grad_norm": 1.5434584617614746, "learning_rate": 2.8471706520124762e-06, "loss": 0.0783, "step": 60940 }, { "epoch": 4.526214169018268, "grad_norm": 0.4989255666732788, "learning_rate": 2.842714985890391e-06, "loss": 0.0374, "step": 60950 }, { "epoch": 4.526956780038616, "grad_norm": 1.2678778171539307, "learning_rate": 2.8382593197683054e-06, "loss": 0.0509, "step": 60960 }, { "epoch": 4.527699391058963, "grad_norm": 1.7154262065887451, "learning_rate": 2.8338036536462204e-06, "loss": 0.0404, "step": 60970 }, { "epoch": 4.528442002079311, "grad_norm": 0.7340508103370667, "learning_rate": 2.829347987524135e-06, "loss": 0.0711, "step": 60980 }, { "epoch": 4.529184613099658, "grad_norm": 1.5638376474380493, "learning_rate": 2.8248923214020495e-06, "loss": 0.0575, "step": 60990 }, { "epoch": 4.529927224120006, "grad_norm": 1.4016845226287842, "learning_rate": 2.8204366552799645e-06, "loss": 0.0277, "step": 61000 }, { "epoch": 4.530669835140354, "grad_norm": 0.4884537160396576, "learning_rate": 2.815980989157879e-06, "loss": 0.0537, "step": 61010 }, { "epoch": 4.531412446160701, "grad_norm": 1.8895937204360962, "learning_rate": 2.811525323035794e-06, "loss": 0.08, "step": 61020 }, { "epoch": 4.532155057181049, "grad_norm": 0.5907909274101257, "learning_rate": 2.8070696569137087e-06, "loss": 0.0555, "step": 61030 }, { "epoch": 4.532897668201396, "grad_norm": 3.3188436031341553, "learning_rate": 2.8026139907916232e-06, "loss": 0.0834, "step": 61040 }, { "epoch": 4.533640279221744, "grad_norm": 1.3350552320480347, "learning_rate": 2.7981583246695382e-06, "loss": 0.0544, "step": 61050 }, { "epoch": 4.5343828902420915, "grad_norm": 1.2445580959320068, "learning_rate": 2.793702658547453e-06, "loss": 0.0694, "step": 61060 }, { "epoch": 4.5351255012624385, "grad_norm": 2.0901408195495605, "learning_rate": 2.7892469924253674e-06, "loss": 0.0721, "step": 61070 }, { "epoch": 4.535868112282786, "grad_norm": 1.3586374521255493, "learning_rate": 2.7847913263032828e-06, "loss": 0.0749, "step": 61080 }, { "epoch": 4.536610723303134, "grad_norm": 0.6412113904953003, "learning_rate": 2.7803356601811973e-06, "loss": 0.0538, "step": 61090 }, { "epoch": 4.537353334323481, "grad_norm": 0.7717999815940857, "learning_rate": 2.775879994059112e-06, "loss": 0.0578, "step": 61100 }, { "epoch": 4.538095945343829, "grad_norm": 0.7013200521469116, "learning_rate": 2.771424327937027e-06, "loss": 0.0765, "step": 61110 }, { "epoch": 4.538838556364176, "grad_norm": 1.3296678066253662, "learning_rate": 2.7669686618149415e-06, "loss": 0.0531, "step": 61120 }, { "epoch": 4.539581167384524, "grad_norm": 1.7592175006866455, "learning_rate": 2.762512995692856e-06, "loss": 0.051, "step": 61130 }, { "epoch": 4.540323778404872, "grad_norm": 1.7775698900222778, "learning_rate": 2.758057329570771e-06, "loss": 0.05, "step": 61140 }, { "epoch": 4.541066389425219, "grad_norm": 1.649628758430481, "learning_rate": 2.7536016634486856e-06, "loss": 0.0464, "step": 61150 }, { "epoch": 4.541809000445567, "grad_norm": 1.310793399810791, "learning_rate": 2.7491459973266e-06, "loss": 0.0686, "step": 61160 }, { "epoch": 4.542551611465914, "grad_norm": 1.5081290006637573, "learning_rate": 2.744690331204515e-06, "loss": 0.0675, "step": 61170 }, { "epoch": 4.543294222486262, "grad_norm": 1.5002572536468506, "learning_rate": 2.7402346650824298e-06, "loss": 0.0595, "step": 61180 }, { "epoch": 4.54403683350661, "grad_norm": 1.7009143829345703, "learning_rate": 2.7357789989603443e-06, "loss": 0.0824, "step": 61190 }, { "epoch": 4.544779444526957, "grad_norm": 2.5938093662261963, "learning_rate": 2.7313233328382593e-06, "loss": 0.052, "step": 61200 }, { "epoch": 4.545522055547305, "grad_norm": 1.4305498600006104, "learning_rate": 2.7268676667161743e-06, "loss": 0.0621, "step": 61210 }, { "epoch": 4.546264666567652, "grad_norm": 1.7838150262832642, "learning_rate": 2.722412000594089e-06, "loss": 0.0711, "step": 61220 }, { "epoch": 4.5470072775879995, "grad_norm": 0.737729549407959, "learning_rate": 2.717956334472004e-06, "loss": 0.0374, "step": 61230 }, { "epoch": 4.547749888608347, "grad_norm": 1.021944522857666, "learning_rate": 2.7135006683499184e-06, "loss": 0.0631, "step": 61240 }, { "epoch": 4.548492499628694, "grad_norm": 0.5011008977890015, "learning_rate": 2.709045002227833e-06, "loss": 0.0448, "step": 61250 }, { "epoch": 4.549235110649042, "grad_norm": 3.05027437210083, "learning_rate": 2.704589336105748e-06, "loss": 0.0467, "step": 61260 }, { "epoch": 4.549977721669389, "grad_norm": 3.195746660232544, "learning_rate": 2.7001336699836626e-06, "loss": 0.0584, "step": 61270 }, { "epoch": 4.550720332689737, "grad_norm": 1.1807307004928589, "learning_rate": 2.695678003861577e-06, "loss": 0.0474, "step": 61280 }, { "epoch": 4.551462943710085, "grad_norm": 2.7736690044403076, "learning_rate": 2.691222337739492e-06, "loss": 0.0686, "step": 61290 }, { "epoch": 4.552205554730432, "grad_norm": 1.5270766019821167, "learning_rate": 2.6867666716174067e-06, "loss": 0.0425, "step": 61300 }, { "epoch": 4.55294816575078, "grad_norm": 1.2185275554656982, "learning_rate": 2.6823110054953213e-06, "loss": 0.0443, "step": 61310 }, { "epoch": 4.553690776771127, "grad_norm": 1.2609001398086548, "learning_rate": 2.6778553393732363e-06, "loss": 0.0793, "step": 61320 }, { "epoch": 4.554433387791475, "grad_norm": 2.8986945152282715, "learning_rate": 2.673399673251151e-06, "loss": 0.0704, "step": 61330 }, { "epoch": 4.555175998811823, "grad_norm": 1.5010956525802612, "learning_rate": 2.668944007129066e-06, "loss": 0.0629, "step": 61340 }, { "epoch": 4.55591860983217, "grad_norm": 0.4704782962799072, "learning_rate": 2.664488341006981e-06, "loss": 0.0613, "step": 61350 }, { "epoch": 4.556661220852518, "grad_norm": 0.9099952578544617, "learning_rate": 2.6600326748848954e-06, "loss": 0.0481, "step": 61360 }, { "epoch": 4.557403831872865, "grad_norm": 3.606095790863037, "learning_rate": 2.6555770087628104e-06, "loss": 0.0792, "step": 61370 }, { "epoch": 4.558146442893213, "grad_norm": 1.9132657051086426, "learning_rate": 2.651121342640725e-06, "loss": 0.0379, "step": 61380 }, { "epoch": 4.5588890539135605, "grad_norm": 1.5124993324279785, "learning_rate": 2.6466656765186396e-06, "loss": 0.0654, "step": 61390 }, { "epoch": 4.5596316649339075, "grad_norm": 0.876136839389801, "learning_rate": 2.6422100103965545e-06, "loss": 0.0769, "step": 61400 }, { "epoch": 4.560374275954255, "grad_norm": 2.7477357387542725, "learning_rate": 2.637754344274469e-06, "loss": 0.0912, "step": 61410 }, { "epoch": 4.561116886974602, "grad_norm": 0.6993147730827332, "learning_rate": 2.6332986781523837e-06, "loss": 0.0432, "step": 61420 }, { "epoch": 4.56185949799495, "grad_norm": 2.7787704467773438, "learning_rate": 2.6288430120302987e-06, "loss": 0.0649, "step": 61430 }, { "epoch": 4.562602109015298, "grad_norm": 1.603108286857605, "learning_rate": 2.6243873459082133e-06, "loss": 0.0468, "step": 61440 }, { "epoch": 4.563344720035645, "grad_norm": 2.7773287296295166, "learning_rate": 2.619931679786128e-06, "loss": 0.0923, "step": 61450 }, { "epoch": 4.564087331055993, "grad_norm": 0.5554677844047546, "learning_rate": 2.615476013664043e-06, "loss": 0.1035, "step": 61460 }, { "epoch": 4.56482994207634, "grad_norm": 0.5234212279319763, "learning_rate": 2.611020347541958e-06, "loss": 0.0597, "step": 61470 }, { "epoch": 4.565572553096688, "grad_norm": 0.9597153067588806, "learning_rate": 2.6065646814198724e-06, "loss": 0.0679, "step": 61480 }, { "epoch": 4.566315164117036, "grad_norm": 3.1372969150543213, "learning_rate": 2.6021090152977874e-06, "loss": 0.0755, "step": 61490 }, { "epoch": 4.567057775137383, "grad_norm": 1.275407314300537, "learning_rate": 2.597653349175702e-06, "loss": 0.0494, "step": 61500 }, { "epoch": 4.567800386157731, "grad_norm": 1.3001893758773804, "learning_rate": 2.5931976830536165e-06, "loss": 0.0581, "step": 61510 }, { "epoch": 4.568542997178078, "grad_norm": 1.2099862098693848, "learning_rate": 2.5887420169315315e-06, "loss": 0.1197, "step": 61520 }, { "epoch": 4.569285608198426, "grad_norm": 1.1752756834030151, "learning_rate": 2.584286350809446e-06, "loss": 0.055, "step": 61530 }, { "epoch": 4.570028219218774, "grad_norm": 1.800934076309204, "learning_rate": 2.5798306846873607e-06, "loss": 0.0495, "step": 61540 }, { "epoch": 4.570770830239121, "grad_norm": 1.4622337818145752, "learning_rate": 2.5753750185652756e-06, "loss": 0.0713, "step": 61550 }, { "epoch": 4.5715134412594685, "grad_norm": 3.0324299335479736, "learning_rate": 2.5709193524431902e-06, "loss": 0.0626, "step": 61560 }, { "epoch": 4.5722560522798155, "grad_norm": 3.204300880432129, "learning_rate": 2.5664636863211048e-06, "loss": 0.05, "step": 61570 }, { "epoch": 4.572998663300163, "grad_norm": 0.8802090883255005, "learning_rate": 2.5620080201990198e-06, "loss": 0.0416, "step": 61580 }, { "epoch": 4.573741274320511, "grad_norm": 1.4025449752807617, "learning_rate": 2.5575523540769344e-06, "loss": 0.0891, "step": 61590 }, { "epoch": 4.574483885340858, "grad_norm": 2.5383615493774414, "learning_rate": 2.5530966879548493e-06, "loss": 0.0421, "step": 61600 }, { "epoch": 4.575226496361206, "grad_norm": 0.5844364166259766, "learning_rate": 2.5486410218327643e-06, "loss": 0.0615, "step": 61610 }, { "epoch": 4.575969107381553, "grad_norm": 1.9807744026184082, "learning_rate": 2.544185355710679e-06, "loss": 0.0521, "step": 61620 }, { "epoch": 4.576711718401901, "grad_norm": 2.7843782901763916, "learning_rate": 2.5397296895885935e-06, "loss": 0.0369, "step": 61630 }, { "epoch": 4.577454329422249, "grad_norm": 2.3830394744873047, "learning_rate": 2.5352740234665085e-06, "loss": 0.031, "step": 61640 }, { "epoch": 4.578196940442596, "grad_norm": 3.3283472061157227, "learning_rate": 2.530818357344423e-06, "loss": 0.0679, "step": 61650 }, { "epoch": 4.578939551462944, "grad_norm": 0.3360592722892761, "learning_rate": 2.5263626912223376e-06, "loss": 0.0799, "step": 61660 }, { "epoch": 4.579682162483291, "grad_norm": 1.3740484714508057, "learning_rate": 2.5219070251002526e-06, "loss": 0.0593, "step": 61670 }, { "epoch": 4.580424773503639, "grad_norm": 1.121627926826477, "learning_rate": 2.517451358978167e-06, "loss": 0.0485, "step": 61680 }, { "epoch": 4.581167384523987, "grad_norm": 1.13548743724823, "learning_rate": 2.512995692856082e-06, "loss": 0.0602, "step": 61690 }, { "epoch": 4.581909995544334, "grad_norm": 1.0693154335021973, "learning_rate": 2.5085400267339967e-06, "loss": 0.0607, "step": 61700 }, { "epoch": 4.5826526065646815, "grad_norm": 0.1895827353000641, "learning_rate": 2.5040843606119113e-06, "loss": 0.052, "step": 61710 }, { "epoch": 4.5833952175850285, "grad_norm": 1.7912211418151855, "learning_rate": 2.4996286944898263e-06, "loss": 0.0385, "step": 61720 }, { "epoch": 4.584137828605376, "grad_norm": 1.460799217224121, "learning_rate": 2.495173028367741e-06, "loss": 0.0519, "step": 61730 }, { "epoch": 4.584880439625724, "grad_norm": 0.9446871280670166, "learning_rate": 2.490717362245656e-06, "loss": 0.0334, "step": 61740 }, { "epoch": 4.585623050646071, "grad_norm": 2.315859317779541, "learning_rate": 2.486261696123571e-06, "loss": 0.0522, "step": 61750 }, { "epoch": 4.586365661666419, "grad_norm": 1.3986417055130005, "learning_rate": 2.4818060300014854e-06, "loss": 0.0468, "step": 61760 }, { "epoch": 4.587108272686766, "grad_norm": 0.41333481669425964, "learning_rate": 2.4773503638794e-06, "loss": 0.0342, "step": 61770 }, { "epoch": 4.587850883707114, "grad_norm": 1.8333367109298706, "learning_rate": 2.472894697757315e-06, "loss": 0.085, "step": 61780 }, { "epoch": 4.588593494727462, "grad_norm": 2.2030510902404785, "learning_rate": 2.4684390316352296e-06, "loss": 0.0629, "step": 61790 }, { "epoch": 4.589336105747809, "grad_norm": 1.6054326295852661, "learning_rate": 2.463983365513144e-06, "loss": 0.0628, "step": 61800 }, { "epoch": 4.590078716768157, "grad_norm": 0.6966524720191956, "learning_rate": 2.459527699391059e-06, "loss": 0.0585, "step": 61810 }, { "epoch": 4.590821327788504, "grad_norm": 0.6496719717979431, "learning_rate": 2.4550720332689737e-06, "loss": 0.0488, "step": 61820 }, { "epoch": 4.591563938808852, "grad_norm": 1.0793461799621582, "learning_rate": 2.4506163671468883e-06, "loss": 0.0609, "step": 61830 }, { "epoch": 4.5923065498292, "grad_norm": 2.4242899417877197, "learning_rate": 2.4461607010248033e-06, "loss": 0.0906, "step": 61840 }, { "epoch": 4.593049160849547, "grad_norm": 1.3996238708496094, "learning_rate": 2.441705034902718e-06, "loss": 0.07, "step": 61850 }, { "epoch": 4.593791771869895, "grad_norm": 1.6079001426696777, "learning_rate": 2.4372493687806324e-06, "loss": 0.0418, "step": 61860 }, { "epoch": 4.594534382890242, "grad_norm": 0.46042919158935547, "learning_rate": 2.432793702658548e-06, "loss": 0.0495, "step": 61870 }, { "epoch": 4.5952769939105895, "grad_norm": 0.7299936413764954, "learning_rate": 2.4283380365364624e-06, "loss": 0.0731, "step": 61880 }, { "epoch": 4.596019604930937, "grad_norm": 0.81382817029953, "learning_rate": 2.423882370414377e-06, "loss": 0.0321, "step": 61890 }, { "epoch": 4.596762215951284, "grad_norm": 1.3235479593276978, "learning_rate": 2.419426704292292e-06, "loss": 0.0521, "step": 61900 }, { "epoch": 4.597504826971632, "grad_norm": 2.247610330581665, "learning_rate": 2.4149710381702065e-06, "loss": 0.0735, "step": 61910 }, { "epoch": 4.598247437991979, "grad_norm": 0.9639174938201904, "learning_rate": 2.410515372048121e-06, "loss": 0.1068, "step": 61920 }, { "epoch": 4.598990049012327, "grad_norm": 2.1627914905548096, "learning_rate": 2.406059705926036e-06, "loss": 0.0592, "step": 61930 }, { "epoch": 4.599732660032675, "grad_norm": 0.5631018877029419, "learning_rate": 2.4016040398039507e-06, "loss": 0.0608, "step": 61940 }, { "epoch": 4.600475271053022, "grad_norm": 1.5326188802719116, "learning_rate": 2.3971483736818653e-06, "loss": 0.0571, "step": 61950 }, { "epoch": 4.60121788207337, "grad_norm": 1.1006548404693604, "learning_rate": 2.3926927075597802e-06, "loss": 0.0602, "step": 61960 }, { "epoch": 4.601960493093717, "grad_norm": 2.1133296489715576, "learning_rate": 2.388237041437695e-06, "loss": 0.0895, "step": 61970 }, { "epoch": 4.602703104114065, "grad_norm": 0.42553678154945374, "learning_rate": 2.3837813753156094e-06, "loss": 0.049, "step": 61980 }, { "epoch": 4.603445715134413, "grad_norm": 1.3893156051635742, "learning_rate": 2.3793257091935244e-06, "loss": 0.0459, "step": 61990 }, { "epoch": 4.60418832615476, "grad_norm": 0.9664128422737122, "learning_rate": 2.3748700430714394e-06, "loss": 0.0587, "step": 62000 }, { "epoch": 4.604930937175108, "grad_norm": 0.9554176330566406, "learning_rate": 2.370414376949354e-06, "loss": 0.0503, "step": 62010 }, { "epoch": 4.605673548195456, "grad_norm": 1.3135801553726196, "learning_rate": 2.365958710827269e-06, "loss": 0.07, "step": 62020 }, { "epoch": 4.606416159215803, "grad_norm": 4.117845058441162, "learning_rate": 2.3615030447051835e-06, "loss": 0.0766, "step": 62030 }, { "epoch": 4.6071587702361505, "grad_norm": 1.4412838220596313, "learning_rate": 2.3570473785830985e-06, "loss": 0.0376, "step": 62040 }, { "epoch": 4.6079013812564975, "grad_norm": 2.425473690032959, "learning_rate": 2.352591712461013e-06, "loss": 0.0399, "step": 62050 }, { "epoch": 4.608643992276845, "grad_norm": 0.8025254011154175, "learning_rate": 2.3481360463389276e-06, "loss": 0.0716, "step": 62060 }, { "epoch": 4.609386603297193, "grad_norm": 1.2991340160369873, "learning_rate": 2.3436803802168426e-06, "loss": 0.0495, "step": 62070 }, { "epoch": 4.61012921431754, "grad_norm": 0.7377578020095825, "learning_rate": 2.339224714094757e-06, "loss": 0.0436, "step": 62080 }, { "epoch": 4.610871825337888, "grad_norm": 0.37221047282218933, "learning_rate": 2.3347690479726718e-06, "loss": 0.0418, "step": 62090 }, { "epoch": 4.611614436358235, "grad_norm": 1.457234501838684, "learning_rate": 2.3303133818505868e-06, "loss": 0.0741, "step": 62100 }, { "epoch": 4.612357047378583, "grad_norm": 1.0528523921966553, "learning_rate": 2.3258577157285013e-06, "loss": 0.0518, "step": 62110 }, { "epoch": 4.613099658398931, "grad_norm": 1.0995347499847412, "learning_rate": 2.321402049606416e-06, "loss": 0.0827, "step": 62120 }, { "epoch": 4.613842269419278, "grad_norm": 2.883272409439087, "learning_rate": 2.3169463834843313e-06, "loss": 0.0476, "step": 62130 }, { "epoch": 4.614584880439626, "grad_norm": 1.5344513654708862, "learning_rate": 2.312490717362246e-06, "loss": 0.0426, "step": 62140 }, { "epoch": 4.615327491459973, "grad_norm": 2.136598587036133, "learning_rate": 2.3080350512401605e-06, "loss": 0.0948, "step": 62150 }, { "epoch": 4.616070102480321, "grad_norm": 1.1575771570205688, "learning_rate": 2.3035793851180755e-06, "loss": 0.0708, "step": 62160 }, { "epoch": 4.616812713500669, "grad_norm": 0.935723066329956, "learning_rate": 2.29912371899599e-06, "loss": 0.0624, "step": 62170 }, { "epoch": 4.617555324521016, "grad_norm": 2.6540639400482178, "learning_rate": 2.2946680528739046e-06, "loss": 0.0716, "step": 62180 }, { "epoch": 4.618297935541364, "grad_norm": 0.7408058047294617, "learning_rate": 2.2902123867518196e-06, "loss": 0.0612, "step": 62190 }, { "epoch": 4.619040546561711, "grad_norm": 1.5161961317062378, "learning_rate": 2.285756720629734e-06, "loss": 0.0565, "step": 62200 }, { "epoch": 4.6197831575820585, "grad_norm": 1.0326826572418213, "learning_rate": 2.2813010545076487e-06, "loss": 0.0633, "step": 62210 }, { "epoch": 4.620525768602406, "grad_norm": 1.720488429069519, "learning_rate": 2.2768453883855637e-06, "loss": 0.0651, "step": 62220 }, { "epoch": 4.621268379622753, "grad_norm": 0.7917996048927307, "learning_rate": 2.2723897222634783e-06, "loss": 0.0629, "step": 62230 }, { "epoch": 4.622010990643101, "grad_norm": 1.6765140295028687, "learning_rate": 2.267934056141393e-06, "loss": 0.0662, "step": 62240 }, { "epoch": 4.622753601663449, "grad_norm": 0.8106739521026611, "learning_rate": 2.263478390019308e-06, "loss": 0.0489, "step": 62250 }, { "epoch": 4.623496212683796, "grad_norm": 0.6915829181671143, "learning_rate": 2.259022723897223e-06, "loss": 0.0594, "step": 62260 }, { "epoch": 4.624238823704144, "grad_norm": 0.5459581613540649, "learning_rate": 2.2545670577751374e-06, "loss": 0.0536, "step": 62270 }, { "epoch": 4.624981434724491, "grad_norm": 1.8974658250808716, "learning_rate": 2.2501113916530524e-06, "loss": 0.0486, "step": 62280 }, { "epoch": 4.625724045744839, "grad_norm": 2.0770936012268066, "learning_rate": 2.245655725530967e-06, "loss": 0.072, "step": 62290 }, { "epoch": 4.626466656765187, "grad_norm": 1.3229902982711792, "learning_rate": 2.2412000594088816e-06, "loss": 0.0599, "step": 62300 }, { "epoch": 4.627209267785534, "grad_norm": 0.6747040748596191, "learning_rate": 2.2367443932867966e-06, "loss": 0.0734, "step": 62310 }, { "epoch": 4.627951878805882, "grad_norm": 2.171114921569824, "learning_rate": 2.232288727164711e-06, "loss": 0.0809, "step": 62320 }, { "epoch": 4.628694489826229, "grad_norm": 1.1108207702636719, "learning_rate": 2.2278330610426257e-06, "loss": 0.0447, "step": 62330 }, { "epoch": 4.629437100846577, "grad_norm": 1.5883184671401978, "learning_rate": 2.2233773949205407e-06, "loss": 0.0599, "step": 62340 }, { "epoch": 4.630179711866925, "grad_norm": 1.2950087785720825, "learning_rate": 2.2189217287984553e-06, "loss": 0.0346, "step": 62350 }, { "epoch": 4.630922322887272, "grad_norm": 2.8710503578186035, "learning_rate": 2.21446606267637e-06, "loss": 0.0433, "step": 62360 }, { "epoch": 4.6316649339076195, "grad_norm": 0.5784642696380615, "learning_rate": 2.210010396554285e-06, "loss": 0.0717, "step": 62370 }, { "epoch": 4.6324075449279665, "grad_norm": 1.163934350013733, "learning_rate": 2.2055547304321994e-06, "loss": 0.0449, "step": 62380 }, { "epoch": 4.633150155948314, "grad_norm": 2.270219564437866, "learning_rate": 2.201099064310115e-06, "loss": 0.0608, "step": 62390 }, { "epoch": 4.633892766968662, "grad_norm": 2.067028045654297, "learning_rate": 2.1966433981880294e-06, "loss": 0.0698, "step": 62400 }, { "epoch": 4.634635377989009, "grad_norm": 1.985038161277771, "learning_rate": 2.192187732065944e-06, "loss": 0.0819, "step": 62410 }, { "epoch": 4.635377989009357, "grad_norm": 3.1525087356567383, "learning_rate": 2.187732065943859e-06, "loss": 0.0757, "step": 62420 }, { "epoch": 4.636120600029704, "grad_norm": 2.7132487297058105, "learning_rate": 2.1832763998217735e-06, "loss": 0.046, "step": 62430 }, { "epoch": 4.636863211050052, "grad_norm": 1.3874857425689697, "learning_rate": 2.178820733699688e-06, "loss": 0.0706, "step": 62440 }, { "epoch": 4.6376058220704, "grad_norm": 1.2080367803573608, "learning_rate": 2.174365067577603e-06, "loss": 0.0525, "step": 62450 }, { "epoch": 4.638348433090747, "grad_norm": 1.1241955757141113, "learning_rate": 2.1699094014555177e-06, "loss": 0.0514, "step": 62460 }, { "epoch": 4.639091044111095, "grad_norm": 1.7691078186035156, "learning_rate": 2.1654537353334322e-06, "loss": 0.0997, "step": 62470 }, { "epoch": 4.639833655131442, "grad_norm": 1.5121815204620361, "learning_rate": 2.1609980692113472e-06, "loss": 0.0648, "step": 62480 }, { "epoch": 4.64057626615179, "grad_norm": 0.8992467522621155, "learning_rate": 2.156542403089262e-06, "loss": 0.0425, "step": 62490 }, { "epoch": 4.641318877172138, "grad_norm": 0.8572467565536499, "learning_rate": 2.1520867369671764e-06, "loss": 0.064, "step": 62500 }, { "epoch": 4.642061488192485, "grad_norm": 0.3243890106678009, "learning_rate": 2.1476310708450914e-06, "loss": 0.0469, "step": 62510 }, { "epoch": 4.6428040992128325, "grad_norm": 0.47468477487564087, "learning_rate": 2.143175404723006e-06, "loss": 0.0805, "step": 62520 }, { "epoch": 4.6435467102331796, "grad_norm": 0.49177148938179016, "learning_rate": 2.138719738600921e-06, "loss": 0.0608, "step": 62530 }, { "epoch": 4.6442893212535274, "grad_norm": 0.7259443402290344, "learning_rate": 2.134264072478836e-06, "loss": 0.0658, "step": 62540 }, { "epoch": 4.645031932273875, "grad_norm": 1.287712812423706, "learning_rate": 2.1298084063567505e-06, "loss": 0.0708, "step": 62550 }, { "epoch": 4.645774543294222, "grad_norm": 0.7995294332504272, "learning_rate": 2.125352740234665e-06, "loss": 0.0502, "step": 62560 }, { "epoch": 4.64651715431457, "grad_norm": 0.879058837890625, "learning_rate": 2.12089707411258e-06, "loss": 0.0922, "step": 62570 }, { "epoch": 4.647259765334917, "grad_norm": 1.0613620281219482, "learning_rate": 2.1164414079904946e-06, "loss": 0.0492, "step": 62580 }, { "epoch": 4.648002376355265, "grad_norm": 0.7447329759597778, "learning_rate": 2.111985741868409e-06, "loss": 0.0548, "step": 62590 }, { "epoch": 4.648744987375613, "grad_norm": 1.1669635772705078, "learning_rate": 2.107530075746324e-06, "loss": 0.062, "step": 62600 }, { "epoch": 4.64948759839596, "grad_norm": 0.3157268464565277, "learning_rate": 2.1030744096242388e-06, "loss": 0.0661, "step": 62610 }, { "epoch": 4.650230209416308, "grad_norm": 0.468450665473938, "learning_rate": 2.0986187435021533e-06, "loss": 0.0502, "step": 62620 }, { "epoch": 4.650972820436655, "grad_norm": 0.6913983821868896, "learning_rate": 2.0941630773800683e-06, "loss": 0.0732, "step": 62630 }, { "epoch": 4.651715431457003, "grad_norm": 0.5963155031204224, "learning_rate": 2.089707411257983e-06, "loss": 0.0588, "step": 62640 }, { "epoch": 4.652458042477351, "grad_norm": 0.6475550532341003, "learning_rate": 2.0852517451358975e-06, "loss": 0.0436, "step": 62650 }, { "epoch": 4.653200653497698, "grad_norm": 0.8035712242126465, "learning_rate": 2.080796079013813e-06, "loss": 0.0555, "step": 62660 }, { "epoch": 4.653943264518046, "grad_norm": 0.43951982259750366, "learning_rate": 2.0763404128917275e-06, "loss": 0.0567, "step": 62670 }, { "epoch": 4.654685875538393, "grad_norm": 2.8251731395721436, "learning_rate": 2.071884746769642e-06, "loss": 0.0697, "step": 62680 }, { "epoch": 4.6554284865587405, "grad_norm": 2.3607585430145264, "learning_rate": 2.067429080647557e-06, "loss": 0.0586, "step": 62690 }, { "epoch": 4.656171097579088, "grad_norm": 1.1953966617584229, "learning_rate": 2.0629734145254716e-06, "loss": 0.0645, "step": 62700 }, { "epoch": 4.656913708599435, "grad_norm": 2.366037130355835, "learning_rate": 2.058517748403386e-06, "loss": 0.0635, "step": 62710 }, { "epoch": 4.657656319619783, "grad_norm": 0.9777756333351135, "learning_rate": 2.054062082281301e-06, "loss": 0.0642, "step": 62720 }, { "epoch": 4.65839893064013, "grad_norm": 2.669673442840576, "learning_rate": 2.0496064161592157e-06, "loss": 0.0608, "step": 62730 }, { "epoch": 4.659141541660478, "grad_norm": 2.5014798641204834, "learning_rate": 2.0451507500371307e-06, "loss": 0.0718, "step": 62740 }, { "epoch": 4.659884152680826, "grad_norm": 2.0952281951904297, "learning_rate": 2.0406950839150453e-06, "loss": 0.0731, "step": 62750 }, { "epoch": 4.660626763701173, "grad_norm": 1.361879587173462, "learning_rate": 2.03623941779296e-06, "loss": 0.0482, "step": 62760 }, { "epoch": 4.661369374721521, "grad_norm": 0.5971656441688538, "learning_rate": 2.031783751670875e-06, "loss": 0.0543, "step": 62770 }, { "epoch": 4.662111985741868, "grad_norm": 2.2407009601593018, "learning_rate": 2.0273280855487894e-06, "loss": 0.0915, "step": 62780 }, { "epoch": 4.662854596762216, "grad_norm": 0.24493630230426788, "learning_rate": 2.0228724194267044e-06, "loss": 0.046, "step": 62790 }, { "epoch": 4.663597207782564, "grad_norm": 1.5833337306976318, "learning_rate": 2.0184167533046194e-06, "loss": 0.0539, "step": 62800 }, { "epoch": 4.664339818802911, "grad_norm": 0.7038244605064392, "learning_rate": 2.013961087182534e-06, "loss": 0.0618, "step": 62810 }, { "epoch": 4.665082429823259, "grad_norm": 0.6795600652694702, "learning_rate": 2.0095054210604486e-06, "loss": 0.0821, "step": 62820 }, { "epoch": 4.665825040843606, "grad_norm": 1.3816779851913452, "learning_rate": 2.0050497549383636e-06, "loss": 0.047, "step": 62830 }, { "epoch": 4.666567651863954, "grad_norm": 3.097158193588257, "learning_rate": 2.000594088816278e-06, "loss": 0.0707, "step": 62840 }, { "epoch": 4.6673102628843015, "grad_norm": 3.206883192062378, "learning_rate": 1.9961384226941927e-06, "loss": 0.0426, "step": 62850 }, { "epoch": 4.6680528739046485, "grad_norm": 3.3937978744506836, "learning_rate": 1.9916827565721077e-06, "loss": 0.0737, "step": 62860 }, { "epoch": 4.668795484924996, "grad_norm": 0.8779681921005249, "learning_rate": 1.9872270904500223e-06, "loss": 0.039, "step": 62870 }, { "epoch": 4.669538095945343, "grad_norm": 2.1197397708892822, "learning_rate": 1.982771424327937e-06, "loss": 0.0465, "step": 62880 }, { "epoch": 4.670280706965691, "grad_norm": 3.1833298206329346, "learning_rate": 1.978315758205852e-06, "loss": 0.0657, "step": 62890 }, { "epoch": 4.671023317986039, "grad_norm": 0.6295514702796936, "learning_rate": 1.9738600920837664e-06, "loss": 0.0515, "step": 62900 }, { "epoch": 4.671765929006386, "grad_norm": 0.9463853240013123, "learning_rate": 1.969404425961681e-06, "loss": 0.0732, "step": 62910 }, { "epoch": 4.672508540026734, "grad_norm": 2.7689971923828125, "learning_rate": 1.9649487598395964e-06, "loss": 0.0551, "step": 62920 }, { "epoch": 4.673251151047081, "grad_norm": 1.0542500019073486, "learning_rate": 1.960493093717511e-06, "loss": 0.078, "step": 62930 }, { "epoch": 4.673993762067429, "grad_norm": 6.436036586761475, "learning_rate": 1.9560374275954255e-06, "loss": 0.0641, "step": 62940 }, { "epoch": 4.674736373087777, "grad_norm": 0.5600406527519226, "learning_rate": 1.9515817614733405e-06, "loss": 0.0637, "step": 62950 }, { "epoch": 4.675478984108124, "grad_norm": 1.567610263824463, "learning_rate": 1.947126095351255e-06, "loss": 0.0449, "step": 62960 }, { "epoch": 4.676221595128472, "grad_norm": 2.6825263500213623, "learning_rate": 1.9426704292291697e-06, "loss": 0.0558, "step": 62970 }, { "epoch": 4.676964206148819, "grad_norm": 0.2970806658267975, "learning_rate": 1.9382147631070847e-06, "loss": 0.0522, "step": 62980 }, { "epoch": 4.677706817169167, "grad_norm": 1.4133620262145996, "learning_rate": 1.9337590969849992e-06, "loss": 0.0462, "step": 62990 }, { "epoch": 4.678449428189515, "grad_norm": 1.4297685623168945, "learning_rate": 1.929303430862914e-06, "loss": 0.0552, "step": 63000 }, { "epoch": 4.679192039209862, "grad_norm": 2.5894458293914795, "learning_rate": 1.924847764740829e-06, "loss": 0.0763, "step": 63010 }, { "epoch": 4.6799346502302095, "grad_norm": 2.202799081802368, "learning_rate": 1.9203920986187434e-06, "loss": 0.0616, "step": 63020 }, { "epoch": 4.6806772612505565, "grad_norm": 1.195757269859314, "learning_rate": 1.915936432496658e-06, "loss": 0.0802, "step": 63030 }, { "epoch": 4.681419872270904, "grad_norm": 0.8509438633918762, "learning_rate": 1.911480766374573e-06, "loss": 0.0413, "step": 63040 }, { "epoch": 4.682162483291252, "grad_norm": 2.7623627185821533, "learning_rate": 1.907025100252488e-06, "loss": 0.0487, "step": 63050 }, { "epoch": 4.682905094311599, "grad_norm": 1.9826782941818237, "learning_rate": 1.9025694341304027e-06, "loss": 0.0715, "step": 63060 }, { "epoch": 4.683647705331947, "grad_norm": 0.5094510316848755, "learning_rate": 1.8981137680083175e-06, "loss": 0.0596, "step": 63070 }, { "epoch": 4.684390316352294, "grad_norm": 1.9838165044784546, "learning_rate": 1.893658101886232e-06, "loss": 0.0584, "step": 63080 }, { "epoch": 4.685132927372642, "grad_norm": 0.8991755247116089, "learning_rate": 1.8892024357641469e-06, "loss": 0.0556, "step": 63090 }, { "epoch": 4.68587553839299, "grad_norm": 1.3264687061309814, "learning_rate": 1.8847467696420616e-06, "loss": 0.0679, "step": 63100 }, { "epoch": 4.686618149413337, "grad_norm": 1.1737868785858154, "learning_rate": 1.8802911035199762e-06, "loss": 0.079, "step": 63110 }, { "epoch": 4.687360760433685, "grad_norm": 1.7225794792175293, "learning_rate": 1.875835437397891e-06, "loss": 0.0473, "step": 63120 }, { "epoch": 4.688103371454032, "grad_norm": 0.6222664713859558, "learning_rate": 1.8713797712758058e-06, "loss": 0.0525, "step": 63130 }, { "epoch": 4.68884598247438, "grad_norm": 2.4344029426574707, "learning_rate": 1.8669241051537206e-06, "loss": 0.0497, "step": 63140 }, { "epoch": 4.689588593494728, "grad_norm": 0.5827934741973877, "learning_rate": 1.8624684390316353e-06, "loss": 0.0458, "step": 63150 }, { "epoch": 4.690331204515075, "grad_norm": 4.944514274597168, "learning_rate": 1.8580127729095501e-06, "loss": 0.0326, "step": 63160 }, { "epoch": 4.691073815535423, "grad_norm": 1.2443019151687622, "learning_rate": 1.853557106787465e-06, "loss": 0.0435, "step": 63170 }, { "epoch": 4.6918164265557705, "grad_norm": 0.31377652287483215, "learning_rate": 1.8491014406653795e-06, "loss": 0.0562, "step": 63180 }, { "epoch": 4.6925590375761175, "grad_norm": 1.3072824478149414, "learning_rate": 1.8446457745432943e-06, "loss": 0.0721, "step": 63190 }, { "epoch": 4.693301648596465, "grad_norm": 1.9341398477554321, "learning_rate": 1.840190108421209e-06, "loss": 0.0605, "step": 63200 }, { "epoch": 4.694044259616812, "grad_norm": 0.47264960408210754, "learning_rate": 1.8357344422991236e-06, "loss": 0.0632, "step": 63210 }, { "epoch": 4.69478687063716, "grad_norm": 2.389784574508667, "learning_rate": 1.8312787761770386e-06, "loss": 0.0605, "step": 63220 }, { "epoch": 4.695529481657508, "grad_norm": 2.27815842628479, "learning_rate": 1.8268231100549534e-06, "loss": 0.0653, "step": 63230 }, { "epoch": 4.696272092677855, "grad_norm": 1.2528233528137207, "learning_rate": 1.822367443932868e-06, "loss": 0.0568, "step": 63240 }, { "epoch": 4.697014703698203, "grad_norm": 2.5911924839019775, "learning_rate": 1.8179117778107827e-06, "loss": 0.0576, "step": 63250 }, { "epoch": 4.69775731471855, "grad_norm": 1.9090697765350342, "learning_rate": 1.8134561116886975e-06, "loss": 0.0368, "step": 63260 }, { "epoch": 4.698499925738898, "grad_norm": 0.29426848888397217, "learning_rate": 1.809000445566612e-06, "loss": 0.0387, "step": 63270 }, { "epoch": 4.699242536759246, "grad_norm": 0.4976974129676819, "learning_rate": 1.804544779444527e-06, "loss": 0.0605, "step": 63280 }, { "epoch": 4.699985147779593, "grad_norm": 2.4460465908050537, "learning_rate": 1.8000891133224419e-06, "loss": 0.0946, "step": 63290 }, { "epoch": 4.700727758799941, "grad_norm": 1.7145735025405884, "learning_rate": 1.7956334472003564e-06, "loss": 0.1029, "step": 63300 }, { "epoch": 4.701470369820288, "grad_norm": 3.410822868347168, "learning_rate": 1.7911777810782712e-06, "loss": 0.0448, "step": 63310 }, { "epoch": 4.702212980840636, "grad_norm": 2.253828525543213, "learning_rate": 1.786722114956186e-06, "loss": 0.0709, "step": 63320 }, { "epoch": 4.7029555918609836, "grad_norm": 0.8834261894226074, "learning_rate": 1.7822664488341008e-06, "loss": 0.0709, "step": 63330 }, { "epoch": 4.703698202881331, "grad_norm": 0.5737817883491516, "learning_rate": 1.7778107827120154e-06, "loss": 0.0317, "step": 63340 }, { "epoch": 4.7044408139016785, "grad_norm": 2.2672598361968994, "learning_rate": 1.7733551165899303e-06, "loss": 0.0572, "step": 63350 }, { "epoch": 4.7051834249220255, "grad_norm": 0.7184361219406128, "learning_rate": 1.7688994504678451e-06, "loss": 0.0242, "step": 63360 }, { "epoch": 4.705926035942373, "grad_norm": 2.6737210750579834, "learning_rate": 1.7644437843457597e-06, "loss": 0.063, "step": 63370 }, { "epoch": 4.706668646962721, "grad_norm": 1.496285080909729, "learning_rate": 1.7599881182236745e-06, "loss": 0.0321, "step": 63380 }, { "epoch": 4.707411257983068, "grad_norm": 1.187997579574585, "learning_rate": 1.7555324521015893e-06, "loss": 0.0588, "step": 63390 }, { "epoch": 4.708153869003416, "grad_norm": 0.6710416078567505, "learning_rate": 1.7510767859795038e-06, "loss": 0.04, "step": 63400 }, { "epoch": 4.708896480023764, "grad_norm": 1.344570279121399, "learning_rate": 1.7466211198574186e-06, "loss": 0.0616, "step": 63410 }, { "epoch": 4.709639091044111, "grad_norm": 1.7704460620880127, "learning_rate": 1.7421654537353336e-06, "loss": 0.0577, "step": 63420 }, { "epoch": 4.710381702064459, "grad_norm": 1.4288161993026733, "learning_rate": 1.7377097876132482e-06, "loss": 0.0438, "step": 63430 }, { "epoch": 4.711124313084806, "grad_norm": 0.8680292367935181, "learning_rate": 1.733254121491163e-06, "loss": 0.0569, "step": 63440 }, { "epoch": 4.711866924105154, "grad_norm": 0.42400112748146057, "learning_rate": 1.7287984553690777e-06, "loss": 0.0361, "step": 63450 }, { "epoch": 4.712609535125502, "grad_norm": 0.3633495271205902, "learning_rate": 1.7243427892469923e-06, "loss": 0.0364, "step": 63460 }, { "epoch": 4.713352146145849, "grad_norm": 0.9491689205169678, "learning_rate": 1.719887123124907e-06, "loss": 0.0824, "step": 63470 }, { "epoch": 4.714094757166197, "grad_norm": 2.717560052871704, "learning_rate": 1.715431457002822e-06, "loss": 0.0632, "step": 63480 }, { "epoch": 4.714837368186544, "grad_norm": 2.0803704261779785, "learning_rate": 1.7109757908807369e-06, "loss": 0.05, "step": 63490 }, { "epoch": 4.7155799792068915, "grad_norm": 1.6950163841247559, "learning_rate": 1.7065201247586514e-06, "loss": 0.0636, "step": 63500 }, { "epoch": 4.716322590227239, "grad_norm": 0.8007744550704956, "learning_rate": 1.7020644586365662e-06, "loss": 0.0632, "step": 63510 }, { "epoch": 4.717065201247586, "grad_norm": 0.830875039100647, "learning_rate": 1.697608792514481e-06, "loss": 0.0272, "step": 63520 }, { "epoch": 4.717807812267934, "grad_norm": 0.48793575167655945, "learning_rate": 1.6931531263923956e-06, "loss": 0.043, "step": 63530 }, { "epoch": 4.718550423288281, "grad_norm": 1.9636634588241577, "learning_rate": 1.6886974602703104e-06, "loss": 0.0448, "step": 63540 }, { "epoch": 4.719293034308629, "grad_norm": 1.2597278356552124, "learning_rate": 1.6842417941482254e-06, "loss": 0.0393, "step": 63550 }, { "epoch": 4.720035645328977, "grad_norm": 1.140265941619873, "learning_rate": 1.67978612802614e-06, "loss": 0.0668, "step": 63560 }, { "epoch": 4.720778256349324, "grad_norm": 1.5204119682312012, "learning_rate": 1.6753304619040547e-06, "loss": 0.0372, "step": 63570 }, { "epoch": 4.721520867369672, "grad_norm": 1.4788594245910645, "learning_rate": 1.6708747957819695e-06, "loss": 0.0902, "step": 63580 }, { "epoch": 4.722263478390019, "grad_norm": 1.6881049871444702, "learning_rate": 1.666419129659884e-06, "loss": 0.0413, "step": 63590 }, { "epoch": 4.723006089410367, "grad_norm": 2.068535327911377, "learning_rate": 1.6619634635377988e-06, "loss": 0.0732, "step": 63600 }, { "epoch": 4.723748700430715, "grad_norm": 2.630728006362915, "learning_rate": 1.6575077974157138e-06, "loss": 0.0431, "step": 63610 }, { "epoch": 4.724491311451062, "grad_norm": 0.42184069752693176, "learning_rate": 1.6530521312936284e-06, "loss": 0.0364, "step": 63620 }, { "epoch": 4.72523392247141, "grad_norm": 2.413302421569824, "learning_rate": 1.6485964651715432e-06, "loss": 0.0655, "step": 63630 }, { "epoch": 4.725976533491757, "grad_norm": 0.9263471364974976, "learning_rate": 1.644140799049458e-06, "loss": 0.0456, "step": 63640 }, { "epoch": 4.726719144512105, "grad_norm": 1.1987792253494263, "learning_rate": 1.6396851329273726e-06, "loss": 0.0405, "step": 63650 }, { "epoch": 4.7274617555324525, "grad_norm": 3.2439706325531006, "learning_rate": 1.6352294668052873e-06, "loss": 0.0617, "step": 63660 }, { "epoch": 4.7282043665527995, "grad_norm": 0.45291805267333984, "learning_rate": 1.6307738006832021e-06, "loss": 0.0573, "step": 63670 }, { "epoch": 4.728946977573147, "grad_norm": 0.45011815428733826, "learning_rate": 1.6263181345611171e-06, "loss": 0.0522, "step": 63680 }, { "epoch": 4.729689588593494, "grad_norm": 0.6326998472213745, "learning_rate": 1.6218624684390317e-06, "loss": 0.0621, "step": 63690 }, { "epoch": 4.730432199613842, "grad_norm": 0.5624150633811951, "learning_rate": 1.6174068023169465e-06, "loss": 0.0394, "step": 63700 }, { "epoch": 4.73117481063419, "grad_norm": 0.9579383134841919, "learning_rate": 1.6129511361948612e-06, "loss": 0.0856, "step": 63710 }, { "epoch": 4.731917421654537, "grad_norm": 1.3412039279937744, "learning_rate": 1.6084954700727758e-06, "loss": 0.0584, "step": 63720 }, { "epoch": 4.732660032674885, "grad_norm": 2.0105788707733154, "learning_rate": 1.6040398039506906e-06, "loss": 0.0573, "step": 63730 }, { "epoch": 4.733402643695232, "grad_norm": 1.5913349390029907, "learning_rate": 1.5995841378286056e-06, "loss": 0.0634, "step": 63740 }, { "epoch": 4.73414525471558, "grad_norm": 0.57387375831604, "learning_rate": 1.5951284717065202e-06, "loss": 0.037, "step": 63750 }, { "epoch": 4.734887865735928, "grad_norm": 0.6416048407554626, "learning_rate": 1.590672805584435e-06, "loss": 0.0565, "step": 63760 }, { "epoch": 4.735630476756275, "grad_norm": 0.8845180869102478, "learning_rate": 1.5862171394623497e-06, "loss": 0.0461, "step": 63770 }, { "epoch": 4.736373087776623, "grad_norm": 2.478346347808838, "learning_rate": 1.5817614733402643e-06, "loss": 0.0446, "step": 63780 }, { "epoch": 4.73711569879697, "grad_norm": 0.9896259307861328, "learning_rate": 1.577305807218179e-06, "loss": 0.085, "step": 63790 }, { "epoch": 4.737858309817318, "grad_norm": 1.2818790674209595, "learning_rate": 1.5728501410960939e-06, "loss": 0.0503, "step": 63800 }, { "epoch": 4.738600920837666, "grad_norm": 1.4775575399398804, "learning_rate": 1.5683944749740086e-06, "loss": 0.0388, "step": 63810 }, { "epoch": 4.739343531858013, "grad_norm": 0.6616837978363037, "learning_rate": 1.5639388088519234e-06, "loss": 0.0482, "step": 63820 }, { "epoch": 4.7400861428783605, "grad_norm": 0.6766378879547119, "learning_rate": 1.5594831427298382e-06, "loss": 0.0487, "step": 63830 }, { "epoch": 4.7408287538987075, "grad_norm": 4.399589538574219, "learning_rate": 1.555027476607753e-06, "loss": 0.0676, "step": 63840 }, { "epoch": 4.741571364919055, "grad_norm": 0.8377204537391663, "learning_rate": 1.5505718104856676e-06, "loss": 0.0473, "step": 63850 }, { "epoch": 4.742313975939403, "grad_norm": 0.36241453886032104, "learning_rate": 1.5461161443635823e-06, "loss": 0.0365, "step": 63860 }, { "epoch": 4.74305658695975, "grad_norm": 3.0188138484954834, "learning_rate": 1.5416604782414971e-06, "loss": 0.0621, "step": 63870 }, { "epoch": 4.743799197980098, "grad_norm": 1.3264600038528442, "learning_rate": 1.537204812119412e-06, "loss": 0.0571, "step": 63880 }, { "epoch": 4.744541809000445, "grad_norm": 2.425934076309204, "learning_rate": 1.5327491459973267e-06, "loss": 0.0562, "step": 63890 }, { "epoch": 4.745284420020793, "grad_norm": 1.697583794593811, "learning_rate": 1.5282934798752415e-06, "loss": 0.0656, "step": 63900 }, { "epoch": 4.746027031041141, "grad_norm": 1.6972935199737549, "learning_rate": 1.523837813753156e-06, "loss": 0.0846, "step": 63910 }, { "epoch": 4.746769642061488, "grad_norm": 0.5245199203491211, "learning_rate": 1.5193821476310708e-06, "loss": 0.048, "step": 63920 }, { "epoch": 4.747512253081836, "grad_norm": 1.2065707445144653, "learning_rate": 1.5149264815089856e-06, "loss": 0.0582, "step": 63930 }, { "epoch": 4.748254864102183, "grad_norm": 2.5891146659851074, "learning_rate": 1.5104708153869004e-06, "loss": 0.0675, "step": 63940 }, { "epoch": 4.748997475122531, "grad_norm": 1.8636987209320068, "learning_rate": 1.5060151492648152e-06, "loss": 0.0364, "step": 63950 }, { "epoch": 4.749740086142879, "grad_norm": 0.759530246257782, "learning_rate": 1.50155948314273e-06, "loss": 0.0409, "step": 63960 }, { "epoch": 4.750482697163226, "grad_norm": 1.0763570070266724, "learning_rate": 1.4971038170206445e-06, "loss": 0.0513, "step": 63970 }, { "epoch": 4.751225308183574, "grad_norm": 0.6795147657394409, "learning_rate": 1.4926481508985593e-06, "loss": 0.0513, "step": 63980 }, { "epoch": 4.751967919203921, "grad_norm": 1.966013789176941, "learning_rate": 1.488192484776474e-06, "loss": 0.078, "step": 63990 }, { "epoch": 4.7527105302242685, "grad_norm": 1.7441941499710083, "learning_rate": 1.4837368186543889e-06, "loss": 0.0454, "step": 64000 }, { "epoch": 4.753453141244616, "grad_norm": 0.6811540722846985, "learning_rate": 1.4792811525323037e-06, "loss": 0.0395, "step": 64010 }, { "epoch": 4.754195752264963, "grad_norm": 0.5677528381347656, "learning_rate": 1.4748254864102184e-06, "loss": 0.0369, "step": 64020 }, { "epoch": 4.754938363285311, "grad_norm": 0.3127375841140747, "learning_rate": 1.4703698202881332e-06, "loss": 0.0519, "step": 64030 }, { "epoch": 4.755680974305658, "grad_norm": 1.2747459411621094, "learning_rate": 1.4659141541660478e-06, "loss": 0.0507, "step": 64040 }, { "epoch": 4.756423585326006, "grad_norm": 1.9824333190917969, "learning_rate": 1.4614584880439626e-06, "loss": 0.0485, "step": 64050 }, { "epoch": 4.757166196346354, "grad_norm": 0.8089881539344788, "learning_rate": 1.4570028219218774e-06, "loss": 0.0495, "step": 64060 }, { "epoch": 4.757908807366701, "grad_norm": 0.6729184985160828, "learning_rate": 1.4525471557997921e-06, "loss": 0.0291, "step": 64070 }, { "epoch": 4.758651418387049, "grad_norm": 1.3998851776123047, "learning_rate": 1.448091489677707e-06, "loss": 0.0749, "step": 64080 }, { "epoch": 4.759394029407396, "grad_norm": 1.4854328632354736, "learning_rate": 1.4436358235556217e-06, "loss": 0.0579, "step": 64090 }, { "epoch": 4.760136640427744, "grad_norm": 0.500187873840332, "learning_rate": 1.4391801574335363e-06, "loss": 0.0491, "step": 64100 }, { "epoch": 4.760879251448092, "grad_norm": 2.0688111782073975, "learning_rate": 1.434724491311451e-06, "loss": 0.0442, "step": 64110 }, { "epoch": 4.761621862468439, "grad_norm": 0.8344265222549438, "learning_rate": 1.4302688251893658e-06, "loss": 0.0608, "step": 64120 }, { "epoch": 4.762364473488787, "grad_norm": 0.7157622575759888, "learning_rate": 1.4258131590672804e-06, "loss": 0.0717, "step": 64130 }, { "epoch": 4.763107084509134, "grad_norm": 2.2929582595825195, "learning_rate": 1.4213574929451954e-06, "loss": 0.0681, "step": 64140 }, { "epoch": 4.763849695529482, "grad_norm": 1.3758022785186768, "learning_rate": 1.4169018268231102e-06, "loss": 0.0542, "step": 64150 }, { "epoch": 4.7645923065498295, "grad_norm": 1.4729808568954468, "learning_rate": 1.4124461607010248e-06, "loss": 0.0578, "step": 64160 }, { "epoch": 4.7653349175701765, "grad_norm": 3.680039882659912, "learning_rate": 1.4079904945789395e-06, "loss": 0.0774, "step": 64170 }, { "epoch": 4.766077528590524, "grad_norm": 0.58192378282547, "learning_rate": 1.4035348284568543e-06, "loss": 0.0501, "step": 64180 }, { "epoch": 4.766820139610871, "grad_norm": 0.3413379192352295, "learning_rate": 1.3990791623347691e-06, "loss": 0.0271, "step": 64190 }, { "epoch": 4.767562750631219, "grad_norm": 1.4650648832321167, "learning_rate": 1.3946234962126837e-06, "loss": 0.0451, "step": 64200 }, { "epoch": 4.768305361651567, "grad_norm": 0.2223232537508011, "learning_rate": 1.3901678300905987e-06, "loss": 0.0705, "step": 64210 }, { "epoch": 4.769047972671914, "grad_norm": 3.3965070247650146, "learning_rate": 1.3857121639685135e-06, "loss": 0.0701, "step": 64220 }, { "epoch": 4.769790583692262, "grad_norm": 1.7762930393218994, "learning_rate": 1.381256497846428e-06, "loss": 0.0888, "step": 64230 }, { "epoch": 4.770533194712609, "grad_norm": 0.9293942451477051, "learning_rate": 1.3768008317243428e-06, "loss": 0.0653, "step": 64240 }, { "epoch": 4.771275805732957, "grad_norm": 1.2319601774215698, "learning_rate": 1.3723451656022576e-06, "loss": 0.0603, "step": 64250 }, { "epoch": 4.772018416753305, "grad_norm": 0.7350060343742371, "learning_rate": 1.3678894994801722e-06, "loss": 0.0535, "step": 64260 }, { "epoch": 4.772761027773652, "grad_norm": 0.3854759633541107, "learning_rate": 1.3634338333580872e-06, "loss": 0.041, "step": 64270 }, { "epoch": 4.773503638794, "grad_norm": 1.7290388345718384, "learning_rate": 1.358978167236002e-06, "loss": 0.0524, "step": 64280 }, { "epoch": 4.774246249814347, "grad_norm": 1.4731237888336182, "learning_rate": 1.3545225011139165e-06, "loss": 0.0818, "step": 64290 }, { "epoch": 4.774988860834695, "grad_norm": 1.8884638547897339, "learning_rate": 1.3500668349918313e-06, "loss": 0.0443, "step": 64300 }, { "epoch": 4.7757314718550425, "grad_norm": 1.970607042312622, "learning_rate": 1.345611168869746e-06, "loss": 0.0766, "step": 64310 }, { "epoch": 4.7764740828753895, "grad_norm": 0.458402544260025, "learning_rate": 1.3411555027476606e-06, "loss": 0.067, "step": 64320 }, { "epoch": 4.777216693895737, "grad_norm": 1.3864773511886597, "learning_rate": 1.3366998366255754e-06, "loss": 0.0479, "step": 64330 }, { "epoch": 4.777959304916085, "grad_norm": 3.7125790119171143, "learning_rate": 1.3322441705034904e-06, "loss": 0.088, "step": 64340 }, { "epoch": 4.778701915936432, "grad_norm": 0.38081493973731995, "learning_rate": 1.3277885043814052e-06, "loss": 0.0677, "step": 64350 }, { "epoch": 4.77944452695678, "grad_norm": 1.2474788427352905, "learning_rate": 1.3233328382593198e-06, "loss": 0.0681, "step": 64360 }, { "epoch": 4.780187137977127, "grad_norm": 0.9167222380638123, "learning_rate": 1.3188771721372346e-06, "loss": 0.0884, "step": 64370 }, { "epoch": 4.780929748997475, "grad_norm": 1.0856194496154785, "learning_rate": 1.3144215060151493e-06, "loss": 0.0562, "step": 64380 }, { "epoch": 4.781672360017823, "grad_norm": 1.6708109378814697, "learning_rate": 1.309965839893064e-06, "loss": 0.1026, "step": 64390 }, { "epoch": 4.78241497103817, "grad_norm": 1.4255729913711548, "learning_rate": 1.305510173770979e-06, "loss": 0.064, "step": 64400 }, { "epoch": 4.783157582058518, "grad_norm": 0.8202357292175293, "learning_rate": 1.3010545076488937e-06, "loss": 0.0501, "step": 64410 }, { "epoch": 4.783900193078865, "grad_norm": 1.9395742416381836, "learning_rate": 1.2965988415268083e-06, "loss": 0.0641, "step": 64420 }, { "epoch": 4.784642804099213, "grad_norm": 0.4814720153808594, "learning_rate": 1.292143175404723e-06, "loss": 0.0567, "step": 64430 }, { "epoch": 4.785385415119561, "grad_norm": 1.5214014053344727, "learning_rate": 1.2876875092826378e-06, "loss": 0.0261, "step": 64440 }, { "epoch": 4.786128026139908, "grad_norm": 1.4035849571228027, "learning_rate": 1.2832318431605524e-06, "loss": 0.0424, "step": 64450 }, { "epoch": 4.786870637160256, "grad_norm": 1.3969433307647705, "learning_rate": 1.2787761770384672e-06, "loss": 0.0411, "step": 64460 }, { "epoch": 4.787613248180603, "grad_norm": 0.7165231704711914, "learning_rate": 1.2743205109163822e-06, "loss": 0.0555, "step": 64470 }, { "epoch": 4.7883558592009505, "grad_norm": 2.941373825073242, "learning_rate": 1.2698648447942967e-06, "loss": 0.0755, "step": 64480 }, { "epoch": 4.789098470221298, "grad_norm": 0.6511389017105103, "learning_rate": 1.2654091786722115e-06, "loss": 0.0216, "step": 64490 }, { "epoch": 4.789841081241645, "grad_norm": 0.8992854952812195, "learning_rate": 1.2609535125501263e-06, "loss": 0.0711, "step": 64500 }, { "epoch": 4.790583692261993, "grad_norm": 1.678450107574463, "learning_rate": 1.256497846428041e-06, "loss": 0.0658, "step": 64510 }, { "epoch": 4.79132630328234, "grad_norm": 1.1826390027999878, "learning_rate": 1.2520421803059557e-06, "loss": 0.041, "step": 64520 }, { "epoch": 4.792068914302688, "grad_norm": 1.712506651878357, "learning_rate": 1.2475865141838704e-06, "loss": 0.0582, "step": 64530 }, { "epoch": 4.792811525323036, "grad_norm": 1.0903819799423218, "learning_rate": 1.2431308480617854e-06, "loss": 0.0657, "step": 64540 }, { "epoch": 4.793554136343383, "grad_norm": 1.190956950187683, "learning_rate": 1.2386751819397e-06, "loss": 0.0733, "step": 64550 }, { "epoch": 4.794296747363731, "grad_norm": 3.9811456203460693, "learning_rate": 1.2342195158176148e-06, "loss": 0.0726, "step": 64560 }, { "epoch": 4.795039358384079, "grad_norm": 0.5482442378997803, "learning_rate": 1.2297638496955296e-06, "loss": 0.0801, "step": 64570 }, { "epoch": 4.795781969404426, "grad_norm": 0.27074581384658813, "learning_rate": 1.2253081835734441e-06, "loss": 0.0592, "step": 64580 }, { "epoch": 4.796524580424774, "grad_norm": 1.4098689556121826, "learning_rate": 1.220852517451359e-06, "loss": 0.0702, "step": 64590 }, { "epoch": 4.797267191445121, "grad_norm": 1.838658332824707, "learning_rate": 1.216396851329274e-06, "loss": 0.0521, "step": 64600 }, { "epoch": 4.798009802465469, "grad_norm": 1.1644221544265747, "learning_rate": 1.2119411852071885e-06, "loss": 0.0518, "step": 64610 }, { "epoch": 4.798752413485817, "grad_norm": 1.8020392656326294, "learning_rate": 1.2074855190851033e-06, "loss": 0.0675, "step": 64620 }, { "epoch": 4.799495024506164, "grad_norm": 1.1796648502349854, "learning_rate": 1.203029852963018e-06, "loss": 0.0479, "step": 64630 }, { "epoch": 4.8002376355265115, "grad_norm": 2.24526047706604, "learning_rate": 1.1985741868409326e-06, "loss": 0.0374, "step": 64640 }, { "epoch": 4.8009802465468585, "grad_norm": 0.3404271900653839, "learning_rate": 1.1941185207188474e-06, "loss": 0.0318, "step": 64650 }, { "epoch": 4.801722857567206, "grad_norm": 0.40079450607299805, "learning_rate": 1.1896628545967622e-06, "loss": 0.0397, "step": 64660 }, { "epoch": 4.802465468587554, "grad_norm": 1.3549119234085083, "learning_rate": 1.185207188474677e-06, "loss": 0.0597, "step": 64670 }, { "epoch": 4.803208079607901, "grad_norm": 0.6947437524795532, "learning_rate": 1.1807515223525918e-06, "loss": 0.0653, "step": 64680 }, { "epoch": 4.803950690628249, "grad_norm": 0.9724387526512146, "learning_rate": 1.1762958562305065e-06, "loss": 0.0544, "step": 64690 }, { "epoch": 4.804693301648596, "grad_norm": 0.9956182241439819, "learning_rate": 1.1718401901084213e-06, "loss": 0.0393, "step": 64700 }, { "epoch": 4.805435912668944, "grad_norm": 0.9624569416046143, "learning_rate": 1.1673845239863359e-06, "loss": 0.0534, "step": 64710 }, { "epoch": 4.806178523689292, "grad_norm": 2.9609172344207764, "learning_rate": 1.1629288578642507e-06, "loss": 0.054, "step": 64720 }, { "epoch": 4.806921134709639, "grad_norm": 1.744933009147644, "learning_rate": 1.1584731917421657e-06, "loss": 0.0621, "step": 64730 }, { "epoch": 4.807663745729987, "grad_norm": 0.723945677280426, "learning_rate": 1.1540175256200802e-06, "loss": 0.042, "step": 64740 }, { "epoch": 4.808406356750334, "grad_norm": 1.1456420421600342, "learning_rate": 1.149561859497995e-06, "loss": 0.0395, "step": 64750 }, { "epoch": 4.809148967770682, "grad_norm": 2.3802542686462402, "learning_rate": 1.1451061933759098e-06, "loss": 0.0922, "step": 64760 }, { "epoch": 4.80989157879103, "grad_norm": 0.3124806880950928, "learning_rate": 1.1406505272538244e-06, "loss": 0.0576, "step": 64770 }, { "epoch": 4.810634189811377, "grad_norm": 0.9946483373641968, "learning_rate": 1.1361948611317392e-06, "loss": 0.0601, "step": 64780 }, { "epoch": 4.811376800831725, "grad_norm": 2.7371938228607178, "learning_rate": 1.131739195009654e-06, "loss": 0.0527, "step": 64790 }, { "epoch": 4.812119411852072, "grad_norm": 1.2650768756866455, "learning_rate": 1.1272835288875687e-06, "loss": 0.0535, "step": 64800 }, { "epoch": 4.8128620228724195, "grad_norm": 1.1405729055404663, "learning_rate": 1.1228278627654835e-06, "loss": 0.0361, "step": 64810 }, { "epoch": 4.813604633892767, "grad_norm": 1.991651177406311, "learning_rate": 1.1183721966433983e-06, "loss": 0.0721, "step": 64820 }, { "epoch": 4.814347244913114, "grad_norm": 2.725224018096924, "learning_rate": 1.1139165305213129e-06, "loss": 0.0446, "step": 64830 }, { "epoch": 4.815089855933462, "grad_norm": 0.7116602063179016, "learning_rate": 1.1094608643992276e-06, "loss": 0.0459, "step": 64840 }, { "epoch": 4.815832466953809, "grad_norm": 1.70388925075531, "learning_rate": 1.1050051982771424e-06, "loss": 0.07, "step": 64850 }, { "epoch": 4.816575077974157, "grad_norm": 0.45246338844299316, "learning_rate": 1.1005495321550574e-06, "loss": 0.0477, "step": 64860 }, { "epoch": 4.817317688994505, "grad_norm": 2.529362678527832, "learning_rate": 1.096093866032972e-06, "loss": 0.0773, "step": 64870 }, { "epoch": 4.818060300014852, "grad_norm": 2.60267972946167, "learning_rate": 1.0916381999108868e-06, "loss": 0.0683, "step": 64880 }, { "epoch": 4.8188029110352, "grad_norm": 2.9916200637817383, "learning_rate": 1.0871825337888016e-06, "loss": 0.0413, "step": 64890 }, { "epoch": 4.819545522055547, "grad_norm": 1.2470331192016602, "learning_rate": 1.0827268676667161e-06, "loss": 0.0669, "step": 64900 }, { "epoch": 4.820288133075895, "grad_norm": 1.2445175647735596, "learning_rate": 1.078271201544631e-06, "loss": 0.0672, "step": 64910 }, { "epoch": 4.821030744096243, "grad_norm": 0.964076042175293, "learning_rate": 1.0738155354225457e-06, "loss": 0.0642, "step": 64920 }, { "epoch": 4.82177335511659, "grad_norm": 0.8829526901245117, "learning_rate": 1.0693598693004605e-06, "loss": 0.0695, "step": 64930 }, { "epoch": 4.822515966136938, "grad_norm": 1.5874435901641846, "learning_rate": 1.0649042031783753e-06, "loss": 0.052, "step": 64940 }, { "epoch": 4.823258577157285, "grad_norm": 1.9532781839370728, "learning_rate": 1.06044853705629e-06, "loss": 0.0823, "step": 64950 }, { "epoch": 4.824001188177633, "grad_norm": 1.2501765489578247, "learning_rate": 1.0559928709342046e-06, "loss": 0.076, "step": 64960 }, { "epoch": 4.8247437991979805, "grad_norm": 1.186736822128296, "learning_rate": 1.0515372048121194e-06, "loss": 0.0524, "step": 64970 }, { "epoch": 4.8254864102183275, "grad_norm": 1.532196044921875, "learning_rate": 1.0470815386900342e-06, "loss": 0.0546, "step": 64980 }, { "epoch": 4.826229021238675, "grad_norm": 0.5501788854598999, "learning_rate": 1.0426258725679487e-06, "loss": 0.0407, "step": 64990 }, { "epoch": 4.826971632259022, "grad_norm": 0.5295414328575134, "learning_rate": 1.0381702064458637e-06, "loss": 0.0698, "step": 65000 }, { "epoch": 4.82771424327937, "grad_norm": 2.7445316314697266, "learning_rate": 1.0337145403237785e-06, "loss": 0.0864, "step": 65010 }, { "epoch": 4.828456854299718, "grad_norm": 2.2121715545654297, "learning_rate": 1.029258874201693e-06, "loss": 0.0606, "step": 65020 }, { "epoch": 4.829199465320065, "grad_norm": 1.5435631275177002, "learning_rate": 1.0248032080796079e-06, "loss": 0.0572, "step": 65030 }, { "epoch": 4.829942076340413, "grad_norm": 1.4504753351211548, "learning_rate": 1.0203475419575227e-06, "loss": 0.0387, "step": 65040 }, { "epoch": 4.83068468736076, "grad_norm": 2.8633036613464355, "learning_rate": 1.0158918758354374e-06, "loss": 0.0621, "step": 65050 }, { "epoch": 4.831427298381108, "grad_norm": 2.3440101146698, "learning_rate": 1.0114362097133522e-06, "loss": 0.0797, "step": 65060 }, { "epoch": 4.832169909401456, "grad_norm": 0.962746262550354, "learning_rate": 1.006980543591267e-06, "loss": 0.0636, "step": 65070 }, { "epoch": 4.832912520421803, "grad_norm": 0.8248947262763977, "learning_rate": 1.0025248774691818e-06, "loss": 0.0721, "step": 65080 }, { "epoch": 4.833655131442151, "grad_norm": 1.0095185041427612, "learning_rate": 9.980692113470964e-07, "loss": 0.036, "step": 65090 }, { "epoch": 4.834397742462498, "grad_norm": 1.019490122795105, "learning_rate": 9.936135452250111e-07, "loss": 0.0576, "step": 65100 }, { "epoch": 4.835140353482846, "grad_norm": 0.30174383521080017, "learning_rate": 9.89157879102926e-07, "loss": 0.0483, "step": 65110 }, { "epoch": 4.8358829645031935, "grad_norm": 1.3941235542297363, "learning_rate": 9.847022129808405e-07, "loss": 0.0886, "step": 65120 }, { "epoch": 4.8366255755235406, "grad_norm": 2.1527700424194336, "learning_rate": 9.802465468587555e-07, "loss": 0.039, "step": 65130 }, { "epoch": 4.8373681865438884, "grad_norm": 1.5553241968154907, "learning_rate": 9.757908807366703e-07, "loss": 0.0417, "step": 65140 }, { "epoch": 4.8381107975642355, "grad_norm": 1.5481144189834595, "learning_rate": 9.713352146145848e-07, "loss": 0.0638, "step": 65150 }, { "epoch": 4.838853408584583, "grad_norm": 2.373396873474121, "learning_rate": 9.668795484924996e-07, "loss": 0.0551, "step": 65160 }, { "epoch": 4.839596019604931, "grad_norm": 1.2677528858184814, "learning_rate": 9.624238823704144e-07, "loss": 0.0624, "step": 65170 }, { "epoch": 4.840338630625278, "grad_norm": 3.198784589767456, "learning_rate": 9.57968216248329e-07, "loss": 0.0483, "step": 65180 }, { "epoch": 4.841081241645626, "grad_norm": 0.36906376481056213, "learning_rate": 9.53512550126244e-07, "loss": 0.0587, "step": 65190 }, { "epoch": 4.841823852665973, "grad_norm": 1.4597629308700562, "learning_rate": 9.490568840041587e-07, "loss": 0.0588, "step": 65200 }, { "epoch": 4.842566463686321, "grad_norm": 1.0740894079208374, "learning_rate": 9.446012178820734e-07, "loss": 0.0312, "step": 65210 }, { "epoch": 4.843309074706669, "grad_norm": 2.1893422603607178, "learning_rate": 9.401455517599881e-07, "loss": 0.088, "step": 65220 }, { "epoch": 4.844051685727016, "grad_norm": 2.3881919384002686, "learning_rate": 9.356898856379029e-07, "loss": 0.0389, "step": 65230 }, { "epoch": 4.844794296747364, "grad_norm": 0.9090191721916199, "learning_rate": 9.312342195158177e-07, "loss": 0.0574, "step": 65240 }, { "epoch": 4.845536907767711, "grad_norm": 2.5130159854888916, "learning_rate": 9.267785533937324e-07, "loss": 0.0519, "step": 65250 }, { "epoch": 4.846279518788059, "grad_norm": 0.5649994611740112, "learning_rate": 9.223228872716471e-07, "loss": 0.0314, "step": 65260 }, { "epoch": 4.847022129808407, "grad_norm": 1.371864914894104, "learning_rate": 9.178672211495618e-07, "loss": 0.0652, "step": 65270 }, { "epoch": 4.847764740828754, "grad_norm": 0.3888418972492218, "learning_rate": 9.134115550274767e-07, "loss": 0.0693, "step": 65280 }, { "epoch": 4.8485073518491015, "grad_norm": 2.402367115020752, "learning_rate": 9.089558889053914e-07, "loss": 0.0578, "step": 65290 }, { "epoch": 4.8492499628694485, "grad_norm": 3.145559310913086, "learning_rate": 9.04500222783306e-07, "loss": 0.0581, "step": 65300 }, { "epoch": 4.849992573889796, "grad_norm": 0.6334661245346069, "learning_rate": 9.000445566612209e-07, "loss": 0.07, "step": 65310 }, { "epoch": 4.850735184910144, "grad_norm": 1.6239862442016602, "learning_rate": 8.955888905391356e-07, "loss": 0.0437, "step": 65320 }, { "epoch": 4.851477795930491, "grad_norm": 2.169163942337036, "learning_rate": 8.911332244170504e-07, "loss": 0.0579, "step": 65330 }, { "epoch": 4.852220406950839, "grad_norm": 0.9393569827079773, "learning_rate": 8.866775582949652e-07, "loss": 0.0424, "step": 65340 }, { "epoch": 4.852963017971186, "grad_norm": 0.3016482889652252, "learning_rate": 8.822218921728799e-07, "loss": 0.0443, "step": 65350 }, { "epoch": 4.853705628991534, "grad_norm": 0.676237165927887, "learning_rate": 8.777662260507946e-07, "loss": 0.0511, "step": 65360 }, { "epoch": 4.854448240011882, "grad_norm": 0.3696301579475403, "learning_rate": 8.733105599287093e-07, "loss": 0.0598, "step": 65370 }, { "epoch": 4.855190851032229, "grad_norm": 4.8189921379089355, "learning_rate": 8.688548938066241e-07, "loss": 0.0632, "step": 65380 }, { "epoch": 4.855933462052577, "grad_norm": 0.5955801010131836, "learning_rate": 8.643992276845389e-07, "loss": 0.029, "step": 65390 }, { "epoch": 4.856676073072924, "grad_norm": 1.868445634841919, "learning_rate": 8.599435615624536e-07, "loss": 0.0496, "step": 65400 }, { "epoch": 4.857418684093272, "grad_norm": 0.6771336197853088, "learning_rate": 8.554878954403684e-07, "loss": 0.0484, "step": 65410 }, { "epoch": 4.85816129511362, "grad_norm": 1.803080677986145, "learning_rate": 8.510322293182831e-07, "loss": 0.0599, "step": 65420 }, { "epoch": 4.858903906133967, "grad_norm": 2.175628900527954, "learning_rate": 8.465765631961978e-07, "loss": 0.0448, "step": 65430 }, { "epoch": 4.859646517154315, "grad_norm": 1.489719033241272, "learning_rate": 8.421208970741127e-07, "loss": 0.0435, "step": 65440 }, { "epoch": 4.860389128174662, "grad_norm": 0.7035216689109802, "learning_rate": 8.376652309520274e-07, "loss": 0.0551, "step": 65450 }, { "epoch": 4.8611317391950095, "grad_norm": 1.251814603805542, "learning_rate": 8.33209564829942e-07, "loss": 0.0537, "step": 65460 }, { "epoch": 4.861874350215357, "grad_norm": 0.5528387427330017, "learning_rate": 8.287538987078569e-07, "loss": 0.0571, "step": 65470 }, { "epoch": 4.862616961235704, "grad_norm": 2.852384328842163, "learning_rate": 8.242982325857716e-07, "loss": 0.0791, "step": 65480 }, { "epoch": 4.863359572256052, "grad_norm": 1.3060245513916016, "learning_rate": 8.198425664636863e-07, "loss": 0.0547, "step": 65490 }, { "epoch": 4.8641021832764, "grad_norm": 2.287452220916748, "learning_rate": 8.153869003416011e-07, "loss": 0.056, "step": 65500 }, { "epoch": 4.864844794296747, "grad_norm": 1.873404860496521, "learning_rate": 8.109312342195158e-07, "loss": 0.0859, "step": 65510 }, { "epoch": 4.865587405317095, "grad_norm": 1.0122098922729492, "learning_rate": 8.064755680974306e-07, "loss": 0.0531, "step": 65520 }, { "epoch": 4.866330016337442, "grad_norm": 1.163620948791504, "learning_rate": 8.020199019753453e-07, "loss": 0.0633, "step": 65530 }, { "epoch": 4.86707262735779, "grad_norm": 2.736693859100342, "learning_rate": 7.975642358532601e-07, "loss": 0.0622, "step": 65540 }, { "epoch": 4.867815238378138, "grad_norm": 0.43964171409606934, "learning_rate": 7.931085697311749e-07, "loss": 0.027, "step": 65550 }, { "epoch": 4.868557849398485, "grad_norm": 0.9539849162101746, "learning_rate": 7.886529036090895e-07, "loss": 0.043, "step": 65560 }, { "epoch": 4.869300460418833, "grad_norm": 0.3715505003929138, "learning_rate": 7.841972374870043e-07, "loss": 0.0631, "step": 65570 }, { "epoch": 4.87004307143918, "grad_norm": 1.210249423980713, "learning_rate": 7.797415713649191e-07, "loss": 0.0706, "step": 65580 }, { "epoch": 4.870785682459528, "grad_norm": 2.480217456817627, "learning_rate": 7.752859052428338e-07, "loss": 0.0576, "step": 65590 }, { "epoch": 4.871528293479876, "grad_norm": 2.4984142780303955, "learning_rate": 7.708302391207486e-07, "loss": 0.0893, "step": 65600 }, { "epoch": 4.872270904500223, "grad_norm": 0.5741029381752014, "learning_rate": 7.663745729986633e-07, "loss": 0.057, "step": 65610 }, { "epoch": 4.8730135155205705, "grad_norm": 2.359872341156006, "learning_rate": 7.61918906876578e-07, "loss": 0.0451, "step": 65620 }, { "epoch": 4.8737561265409175, "grad_norm": 1.7714812755584717, "learning_rate": 7.574632407544928e-07, "loss": 0.083, "step": 65630 }, { "epoch": 4.874498737561265, "grad_norm": 3.7563037872314453, "learning_rate": 7.530075746324076e-07, "loss": 0.0763, "step": 65640 }, { "epoch": 4.875241348581613, "grad_norm": 0.9975684285163879, "learning_rate": 7.485519085103223e-07, "loss": 0.0359, "step": 65650 }, { "epoch": 4.87598395960196, "grad_norm": 1.6247881650924683, "learning_rate": 7.44096242388237e-07, "loss": 0.0349, "step": 65660 }, { "epoch": 4.876726570622308, "grad_norm": 2.389453887939453, "learning_rate": 7.396405762661518e-07, "loss": 0.0537, "step": 65670 }, { "epoch": 4.877469181642655, "grad_norm": 2.3572044372558594, "learning_rate": 7.351849101440666e-07, "loss": 0.0356, "step": 65680 }, { "epoch": 4.878211792663003, "grad_norm": 2.2939834594726562, "learning_rate": 7.307292440219813e-07, "loss": 0.0741, "step": 65690 }, { "epoch": 4.878954403683351, "grad_norm": 1.0709495544433594, "learning_rate": 7.262735778998961e-07, "loss": 0.0741, "step": 65700 }, { "epoch": 4.879697014703698, "grad_norm": 1.768277645111084, "learning_rate": 7.218179117778109e-07, "loss": 0.0616, "step": 65710 }, { "epoch": 4.880439625724046, "grad_norm": 1.0159682035446167, "learning_rate": 7.173622456557255e-07, "loss": 0.0614, "step": 65720 }, { "epoch": 4.881182236744394, "grad_norm": 0.6093847751617432, "learning_rate": 7.129065795336402e-07, "loss": 0.0293, "step": 65730 }, { "epoch": 4.881924847764741, "grad_norm": 0.5181077718734741, "learning_rate": 7.084509134115551e-07, "loss": 0.0546, "step": 65740 }, { "epoch": 4.882667458785089, "grad_norm": 0.47307220101356506, "learning_rate": 7.039952472894698e-07, "loss": 0.0434, "step": 65750 }, { "epoch": 4.883410069805436, "grad_norm": 1.3281551599502563, "learning_rate": 6.995395811673846e-07, "loss": 0.0622, "step": 65760 }, { "epoch": 4.884152680825784, "grad_norm": 1.8027498722076416, "learning_rate": 6.950839150452993e-07, "loss": 0.057, "step": 65770 }, { "epoch": 4.8848952918461315, "grad_norm": 1.0578224658966064, "learning_rate": 6.90628248923214e-07, "loss": 0.0633, "step": 65780 }, { "epoch": 4.8856379028664785, "grad_norm": 1.2955671548843384, "learning_rate": 6.861725828011288e-07, "loss": 0.0574, "step": 65790 }, { "epoch": 4.886380513886826, "grad_norm": 2.2496156692504883, "learning_rate": 6.817169166790436e-07, "loss": 0.0629, "step": 65800 }, { "epoch": 4.887123124907173, "grad_norm": 1.6367658376693726, "learning_rate": 6.772612505569583e-07, "loss": 0.0392, "step": 65810 }, { "epoch": 4.887865735927521, "grad_norm": 0.9300660490989685, "learning_rate": 6.72805584434873e-07, "loss": 0.0769, "step": 65820 }, { "epoch": 4.888608346947869, "grad_norm": 0.5818589329719543, "learning_rate": 6.683499183127877e-07, "loss": 0.0558, "step": 65830 }, { "epoch": 4.889350957968216, "grad_norm": 2.5883166790008545, "learning_rate": 6.638942521907026e-07, "loss": 0.0754, "step": 65840 }, { "epoch": 4.890093568988564, "grad_norm": 2.193958282470703, "learning_rate": 6.594385860686173e-07, "loss": 0.0583, "step": 65850 }, { "epoch": 4.890836180008911, "grad_norm": 1.7549993991851807, "learning_rate": 6.54982919946532e-07, "loss": 0.0608, "step": 65860 }, { "epoch": 4.891578791029259, "grad_norm": 1.5535106658935547, "learning_rate": 6.505272538244468e-07, "loss": 0.0469, "step": 65870 }, { "epoch": 4.892321402049607, "grad_norm": 4.540996074676514, "learning_rate": 6.460715877023615e-07, "loss": 0.0707, "step": 65880 }, { "epoch": 4.893064013069954, "grad_norm": 0.570190966129303, "learning_rate": 6.416159215802762e-07, "loss": 0.0562, "step": 65890 }, { "epoch": 4.893806624090302, "grad_norm": 2.189643621444702, "learning_rate": 6.371602554581911e-07, "loss": 0.0557, "step": 65900 }, { "epoch": 4.894549235110649, "grad_norm": 1.114089846611023, "learning_rate": 6.327045893361058e-07, "loss": 0.0915, "step": 65910 }, { "epoch": 4.895291846130997, "grad_norm": 1.200714349746704, "learning_rate": 6.282489232140205e-07, "loss": 0.037, "step": 65920 }, { "epoch": 4.8960344571513446, "grad_norm": 1.7422734498977661, "learning_rate": 6.237932570919352e-07, "loss": 0.0929, "step": 65930 }, { "epoch": 4.896777068171692, "grad_norm": 1.0818594694137573, "learning_rate": 6.1933759096985e-07, "loss": 0.0399, "step": 65940 }, { "epoch": 4.8975196791920395, "grad_norm": 0.6319842338562012, "learning_rate": 6.148819248477648e-07, "loss": 0.0436, "step": 65950 }, { "epoch": 4.8982622902123865, "grad_norm": 1.0906407833099365, "learning_rate": 6.104262587256795e-07, "loss": 0.0474, "step": 65960 }, { "epoch": 4.899004901232734, "grad_norm": 0.6139366626739502, "learning_rate": 6.059705926035942e-07, "loss": 0.052, "step": 65970 }, { "epoch": 4.899747512253082, "grad_norm": 1.4248707294464111, "learning_rate": 6.01514926481509e-07, "loss": 0.0459, "step": 65980 }, { "epoch": 4.900490123273429, "grad_norm": 0.5179479122161865, "learning_rate": 5.970592603594237e-07, "loss": 0.0582, "step": 65990 }, { "epoch": 4.901232734293777, "grad_norm": 4.057821273803711, "learning_rate": 5.926035942373385e-07, "loss": 0.0822, "step": 66000 }, { "epoch": 4.901975345314124, "grad_norm": 0.7307072877883911, "learning_rate": 5.881479281152533e-07, "loss": 0.0771, "step": 66010 }, { "epoch": 4.902717956334472, "grad_norm": 1.7318345308303833, "learning_rate": 5.836922619931679e-07, "loss": 0.0519, "step": 66020 }, { "epoch": 4.90346056735482, "grad_norm": 0.3162935972213745, "learning_rate": 5.792365958710828e-07, "loss": 0.0516, "step": 66030 }, { "epoch": 4.904203178375167, "grad_norm": 1.487998127937317, "learning_rate": 5.747809297489975e-07, "loss": 0.0676, "step": 66040 }, { "epoch": 4.904945789395515, "grad_norm": 1.0163052082061768, "learning_rate": 5.703252636269122e-07, "loss": 0.0582, "step": 66050 }, { "epoch": 4.905688400415862, "grad_norm": 2.9083852767944336, "learning_rate": 5.65869597504827e-07, "loss": 0.0487, "step": 66060 }, { "epoch": 4.90643101143621, "grad_norm": 1.4686026573181152, "learning_rate": 5.614139313827418e-07, "loss": 0.0548, "step": 66070 }, { "epoch": 4.907173622456558, "grad_norm": 2.3733696937561035, "learning_rate": 5.569582652606564e-07, "loss": 0.0789, "step": 66080 }, { "epoch": 4.907916233476905, "grad_norm": 0.551871120929718, "learning_rate": 5.525025991385712e-07, "loss": 0.044, "step": 66090 }, { "epoch": 4.9086588444972525, "grad_norm": 0.9300947189331055, "learning_rate": 5.48046933016486e-07, "loss": 0.0611, "step": 66100 }, { "epoch": 4.9094014555175995, "grad_norm": 0.5513383746147156, "learning_rate": 5.435912668944008e-07, "loss": 0.0764, "step": 66110 }, { "epoch": 4.910144066537947, "grad_norm": 1.1287389993667603, "learning_rate": 5.391356007723155e-07, "loss": 0.0416, "step": 66120 }, { "epoch": 4.910886677558295, "grad_norm": 0.9969607591629028, "learning_rate": 5.346799346502302e-07, "loss": 0.0528, "step": 66130 }, { "epoch": 4.911629288578642, "grad_norm": 1.272377371788025, "learning_rate": 5.30224268528145e-07, "loss": 0.0528, "step": 66140 }, { "epoch": 4.91237189959899, "grad_norm": 0.5094819068908691, "learning_rate": 5.257686024060597e-07, "loss": 0.0744, "step": 66150 }, { "epoch": 4.913114510619337, "grad_norm": 1.5360733270645142, "learning_rate": 5.213129362839744e-07, "loss": 0.0431, "step": 66160 }, { "epoch": 4.913857121639685, "grad_norm": 2.1293842792510986, "learning_rate": 5.168572701618893e-07, "loss": 0.0791, "step": 66170 }, { "epoch": 4.914599732660033, "grad_norm": 2.370560646057129, "learning_rate": 5.124016040398039e-07, "loss": 0.0784, "step": 66180 }, { "epoch": 4.91534234368038, "grad_norm": 0.7912160158157349, "learning_rate": 5.079459379177187e-07, "loss": 0.0877, "step": 66190 }, { "epoch": 4.916084954700728, "grad_norm": 2.006499767303467, "learning_rate": 5.034902717956335e-07, "loss": 0.0906, "step": 66200 }, { "epoch": 4.916827565721075, "grad_norm": 2.1745078563690186, "learning_rate": 4.990346056735482e-07, "loss": 0.0558, "step": 66210 }, { "epoch": 4.917570176741423, "grad_norm": 1.09943425655365, "learning_rate": 4.94578939551463e-07, "loss": 0.041, "step": 66220 }, { "epoch": 4.918312787761771, "grad_norm": 1.0442047119140625, "learning_rate": 4.901232734293777e-07, "loss": 0.0659, "step": 66230 }, { "epoch": 4.919055398782118, "grad_norm": 4.601158618927002, "learning_rate": 4.856676073072924e-07, "loss": 0.0584, "step": 66240 }, { "epoch": 4.919798009802466, "grad_norm": 0.6347880363464355, "learning_rate": 4.812119411852072e-07, "loss": 0.0551, "step": 66250 }, { "epoch": 4.920540620822813, "grad_norm": 1.1013524532318115, "learning_rate": 4.76756275063122e-07, "loss": 0.0558, "step": 66260 }, { "epoch": 4.9212832318431605, "grad_norm": 1.1002392768859863, "learning_rate": 4.723006089410367e-07, "loss": 0.0531, "step": 66270 }, { "epoch": 4.922025842863508, "grad_norm": 1.1361026763916016, "learning_rate": 4.6784494281895144e-07, "loss": 0.0227, "step": 66280 }, { "epoch": 4.922768453883855, "grad_norm": 0.28624916076660156, "learning_rate": 4.633892766968662e-07, "loss": 0.0618, "step": 66290 }, { "epoch": 4.923511064904203, "grad_norm": 0.3221977651119232, "learning_rate": 4.589336105747809e-07, "loss": 0.0617, "step": 66300 }, { "epoch": 4.92425367592455, "grad_norm": 2.581843137741089, "learning_rate": 4.544779444526957e-07, "loss": 0.046, "step": 66310 }, { "epoch": 4.924996286944898, "grad_norm": 1.0967390537261963, "learning_rate": 4.5002227833061047e-07, "loss": 0.0466, "step": 66320 }, { "epoch": 4.925738897965246, "grad_norm": 2.719881296157837, "learning_rate": 4.455666122085252e-07, "loss": 0.0515, "step": 66330 }, { "epoch": 4.926481508985593, "grad_norm": 1.5159376859664917, "learning_rate": 4.411109460864399e-07, "loss": 0.0527, "step": 66340 }, { "epoch": 4.927224120005941, "grad_norm": 1.2129600048065186, "learning_rate": 4.3665527996435465e-07, "loss": 0.0918, "step": 66350 }, { "epoch": 4.927966731026288, "grad_norm": 3.8259670734405518, "learning_rate": 4.3219961384226944e-07, "loss": 0.0785, "step": 66360 }, { "epoch": 4.928709342046636, "grad_norm": 0.3441977798938751, "learning_rate": 4.277439477201842e-07, "loss": 0.0542, "step": 66370 }, { "epoch": 4.929451953066984, "grad_norm": 0.8108426332473755, "learning_rate": 4.232882815980989e-07, "loss": 0.0517, "step": 66380 }, { "epoch": 4.930194564087331, "grad_norm": 3.1225900650024414, "learning_rate": 4.188326154760137e-07, "loss": 0.0525, "step": 66390 }, { "epoch": 4.930937175107679, "grad_norm": 2.4837539196014404, "learning_rate": 4.1437694935392846e-07, "loss": 0.0445, "step": 66400 }, { "epoch": 4.931679786128026, "grad_norm": 0.961150050163269, "learning_rate": 4.0992128323184314e-07, "loss": 0.0613, "step": 66410 }, { "epoch": 4.932422397148374, "grad_norm": 0.4817768931388855, "learning_rate": 4.054656171097579e-07, "loss": 0.0348, "step": 66420 }, { "epoch": 4.9331650081687215, "grad_norm": 3.031409502029419, "learning_rate": 4.0100995098767265e-07, "loss": 0.0548, "step": 66430 }, { "epoch": 4.9339076191890685, "grad_norm": 0.8737612366676331, "learning_rate": 3.9655428486558743e-07, "loss": 0.0305, "step": 66440 }, { "epoch": 4.934650230209416, "grad_norm": 1.2402093410491943, "learning_rate": 3.9209861874350216e-07, "loss": 0.0571, "step": 66450 }, { "epoch": 4.935392841229763, "grad_norm": 1.8115334510803223, "learning_rate": 3.876429526214169e-07, "loss": 0.037, "step": 66460 }, { "epoch": 4.936135452250111, "grad_norm": 0.969933271408081, "learning_rate": 3.8318728649933167e-07, "loss": 0.0609, "step": 66470 }, { "epoch": 4.936878063270459, "grad_norm": 1.2570370435714722, "learning_rate": 3.787316203772464e-07, "loss": 0.0489, "step": 66480 }, { "epoch": 4.937620674290806, "grad_norm": 0.9259024262428284, "learning_rate": 3.7427595425516113e-07, "loss": 0.0624, "step": 66490 }, { "epoch": 4.938363285311154, "grad_norm": 1.2812902927398682, "learning_rate": 3.698202881330759e-07, "loss": 0.0582, "step": 66500 }, { "epoch": 4.939105896331501, "grad_norm": 1.1236120462417603, "learning_rate": 3.6536462201099064e-07, "loss": 0.0588, "step": 66510 }, { "epoch": 4.939848507351849, "grad_norm": 2.4268791675567627, "learning_rate": 3.6090895588890543e-07, "loss": 0.0463, "step": 66520 }, { "epoch": 4.940591118372197, "grad_norm": 1.1556020975112915, "learning_rate": 3.564532897668201e-07, "loss": 0.0474, "step": 66530 }, { "epoch": 4.941333729392544, "grad_norm": 0.47208043932914734, "learning_rate": 3.519976236447349e-07, "loss": 0.054, "step": 66540 }, { "epoch": 4.942076340412892, "grad_norm": 2.0012338161468506, "learning_rate": 3.4754195752264967e-07, "loss": 0.0446, "step": 66550 }, { "epoch": 4.942818951433239, "grad_norm": 6.049137592315674, "learning_rate": 3.430862914005644e-07, "loss": 0.0627, "step": 66560 }, { "epoch": 4.943561562453587, "grad_norm": 1.165906548500061, "learning_rate": 3.3863062527847913e-07, "loss": 0.067, "step": 66570 }, { "epoch": 4.944304173473935, "grad_norm": 2.933342218399048, "learning_rate": 3.3417495915639386e-07, "loss": 0.0588, "step": 66580 }, { "epoch": 4.945046784494282, "grad_norm": 0.5011084675788879, "learning_rate": 3.2971929303430864e-07, "loss": 0.0507, "step": 66590 }, { "epoch": 4.9457893955146295, "grad_norm": 0.8596967458724976, "learning_rate": 3.252636269122234e-07, "loss": 0.0707, "step": 66600 }, { "epoch": 4.9465320065349765, "grad_norm": 2.5871288776397705, "learning_rate": 3.208079607901381e-07, "loss": 0.0587, "step": 66610 }, { "epoch": 4.947274617555324, "grad_norm": 0.5067526698112488, "learning_rate": 3.163522946680529e-07, "loss": 0.0519, "step": 66620 }, { "epoch": 4.948017228575672, "grad_norm": 0.439694344997406, "learning_rate": 3.118966285459676e-07, "loss": 0.0436, "step": 66630 }, { "epoch": 4.948759839596019, "grad_norm": 2.3128857612609863, "learning_rate": 3.074409624238824e-07, "loss": 0.0436, "step": 66640 }, { "epoch": 4.949502450616367, "grad_norm": 1.841361403465271, "learning_rate": 3.029852963017971e-07, "loss": 0.0492, "step": 66650 }, { "epoch": 4.950245061636715, "grad_norm": 1.778883457183838, "learning_rate": 2.9852963017971185e-07, "loss": 0.038, "step": 66660 }, { "epoch": 4.950987672657062, "grad_norm": 1.8489772081375122, "learning_rate": 2.9407396405762663e-07, "loss": 0.0729, "step": 66670 }, { "epoch": 4.95173028367741, "grad_norm": 2.0135104656219482, "learning_rate": 2.896182979355414e-07, "loss": 0.0562, "step": 66680 }, { "epoch": 4.952472894697757, "grad_norm": 0.24336954951286316, "learning_rate": 2.851626318134561e-07, "loss": 0.0569, "step": 66690 }, { "epoch": 4.953215505718105, "grad_norm": 0.4141107201576233, "learning_rate": 2.807069656913709e-07, "loss": 0.0319, "step": 66700 }, { "epoch": 4.953958116738453, "grad_norm": 0.26303309202194214, "learning_rate": 2.762512995692856e-07, "loss": 0.0629, "step": 66710 }, { "epoch": 4.9547007277588, "grad_norm": 0.37212514877319336, "learning_rate": 2.717956334472004e-07, "loss": 0.0382, "step": 66720 }, { "epoch": 4.955443338779148, "grad_norm": 2.371757745742798, "learning_rate": 2.673399673251151e-07, "loss": 0.093, "step": 66730 }, { "epoch": 4.956185949799495, "grad_norm": 2.2808210849761963, "learning_rate": 2.6288430120302985e-07, "loss": 0.0723, "step": 66740 }, { "epoch": 4.956928560819843, "grad_norm": 2.8832616806030273, "learning_rate": 2.5842863508094463e-07, "loss": 0.0392, "step": 66750 }, { "epoch": 4.9576711718401905, "grad_norm": 0.5029025077819824, "learning_rate": 2.5397296895885936e-07, "loss": 0.0486, "step": 66760 }, { "epoch": 4.9584137828605375, "grad_norm": 2.287649154663086, "learning_rate": 2.495173028367741e-07, "loss": 0.0458, "step": 66770 }, { "epoch": 4.959156393880885, "grad_norm": 2.1999528408050537, "learning_rate": 2.4506163671468887e-07, "loss": 0.0567, "step": 66780 }, { "epoch": 4.959899004901232, "grad_norm": 3.0611720085144043, "learning_rate": 2.406059705926036e-07, "loss": 0.0937, "step": 66790 }, { "epoch": 4.96064161592158, "grad_norm": 2.270627737045288, "learning_rate": 2.3615030447051836e-07, "loss": 0.0653, "step": 66800 }, { "epoch": 4.961384226941928, "grad_norm": 0.9296445250511169, "learning_rate": 2.316946383484331e-07, "loss": 0.0645, "step": 66810 }, { "epoch": 4.962126837962275, "grad_norm": 3.700183868408203, "learning_rate": 2.2723897222634784e-07, "loss": 0.0964, "step": 66820 }, { "epoch": 4.962869448982623, "grad_norm": 1.576913595199585, "learning_rate": 2.227833061042626e-07, "loss": 0.0419, "step": 66830 }, { "epoch": 4.96361206000297, "grad_norm": 3.2986557483673096, "learning_rate": 2.1832763998217733e-07, "loss": 0.0588, "step": 66840 }, { "epoch": 4.964354671023318, "grad_norm": 1.3685272932052612, "learning_rate": 2.138719738600921e-07, "loss": 0.0492, "step": 66850 }, { "epoch": 4.965097282043666, "grad_norm": 0.872087836265564, "learning_rate": 2.0941630773800684e-07, "loss": 0.0318, "step": 66860 }, { "epoch": 4.965839893064013, "grad_norm": 0.4379057288169861, "learning_rate": 2.0496064161592157e-07, "loss": 0.0529, "step": 66870 }, { "epoch": 4.966582504084361, "grad_norm": 1.2920769453048706, "learning_rate": 2.0050497549383632e-07, "loss": 0.0887, "step": 66880 }, { "epoch": 4.967325115104709, "grad_norm": 1.0842275619506836, "learning_rate": 1.9604930937175108e-07, "loss": 0.0444, "step": 66890 }, { "epoch": 4.968067726125056, "grad_norm": 1.154530644416809, "learning_rate": 1.9159364324966584e-07, "loss": 0.0532, "step": 66900 }, { "epoch": 4.9688103371454035, "grad_norm": 0.3080504834651947, "learning_rate": 1.8713797712758057e-07, "loss": 0.0344, "step": 66910 }, { "epoch": 4.9695529481657505, "grad_norm": 2.1118993759155273, "learning_rate": 1.8268231100549532e-07, "loss": 0.063, "step": 66920 }, { "epoch": 4.9702955591860984, "grad_norm": 3.921952486038208, "learning_rate": 1.7822664488341005e-07, "loss": 0.0543, "step": 66930 }, { "epoch": 4.971038170206446, "grad_norm": 1.0275546312332153, "learning_rate": 1.7377097876132483e-07, "loss": 0.0334, "step": 66940 }, { "epoch": 4.971780781226793, "grad_norm": 3.7615277767181396, "learning_rate": 1.6931531263923956e-07, "loss": 0.0489, "step": 66950 }, { "epoch": 4.972523392247141, "grad_norm": 2.601187229156494, "learning_rate": 1.6485964651715432e-07, "loss": 0.0696, "step": 66960 }, { "epoch": 4.973266003267488, "grad_norm": 2.8414504528045654, "learning_rate": 1.6040398039506905e-07, "loss": 0.0683, "step": 66970 }, { "epoch": 4.974008614287836, "grad_norm": 0.3260228633880615, "learning_rate": 1.559483142729838e-07, "loss": 0.0238, "step": 66980 }, { "epoch": 4.974751225308184, "grad_norm": 0.6493045091629028, "learning_rate": 1.5149264815089856e-07, "loss": 0.0541, "step": 66990 }, { "epoch": 4.975493836328531, "grad_norm": 1.2345525026321411, "learning_rate": 1.4703698202881332e-07, "loss": 0.0544, "step": 67000 }, { "epoch": 4.976236447348879, "grad_norm": 1.9710088968276978, "learning_rate": 1.4258131590672805e-07, "loss": 0.0466, "step": 67010 }, { "epoch": 4.976979058369226, "grad_norm": 0.2602188289165497, "learning_rate": 1.381256497846428e-07, "loss": 0.0513, "step": 67020 }, { "epoch": 4.977721669389574, "grad_norm": 2.1740708351135254, "learning_rate": 1.3366998366255756e-07, "loss": 0.0645, "step": 67030 }, { "epoch": 4.978464280409922, "grad_norm": 2.4682114124298096, "learning_rate": 1.2921431754047231e-07, "loss": 0.0687, "step": 67040 }, { "epoch": 4.979206891430269, "grad_norm": 0.23082710802555084, "learning_rate": 1.2475865141838704e-07, "loss": 0.0425, "step": 67050 }, { "epoch": 4.979949502450617, "grad_norm": 0.2882836163043976, "learning_rate": 1.203029852963018e-07, "loss": 0.0423, "step": 67060 }, { "epoch": 4.980692113470964, "grad_norm": 1.1012338399887085, "learning_rate": 1.1584731917421656e-07, "loss": 0.0695, "step": 67070 }, { "epoch": 4.9814347244913115, "grad_norm": 1.7664963006973267, "learning_rate": 1.113916530521313e-07, "loss": 0.051, "step": 67080 }, { "epoch": 4.982177335511659, "grad_norm": 3.3410887718200684, "learning_rate": 1.0693598693004605e-07, "loss": 0.0547, "step": 67090 }, { "epoch": 4.982919946532006, "grad_norm": 1.7740800380706787, "learning_rate": 1.0248032080796078e-07, "loss": 0.0407, "step": 67100 }, { "epoch": 4.983662557552354, "grad_norm": 1.093974232673645, "learning_rate": 9.802465468587554e-08, "loss": 0.0456, "step": 67110 }, { "epoch": 4.984405168572701, "grad_norm": 1.4569469690322876, "learning_rate": 9.356898856379028e-08, "loss": 0.0691, "step": 67120 }, { "epoch": 4.985147779593049, "grad_norm": 2.364649534225464, "learning_rate": 8.911332244170503e-08, "loss": 0.0769, "step": 67130 }, { "epoch": 4.985890390613397, "grad_norm": 0.6497974991798401, "learning_rate": 8.465765631961978e-08, "loss": 0.0882, "step": 67140 }, { "epoch": 4.986633001633744, "grad_norm": 1.16084623336792, "learning_rate": 8.020199019753452e-08, "loss": 0.0376, "step": 67150 }, { "epoch": 4.987375612654092, "grad_norm": 2.3974056243896484, "learning_rate": 7.574632407544928e-08, "loss": 0.03, "step": 67160 }, { "epoch": 4.988118223674439, "grad_norm": 0.8052327036857605, "learning_rate": 7.129065795336402e-08, "loss": 0.0542, "step": 67170 }, { "epoch": 4.988860834694787, "grad_norm": 0.5777415037155151, "learning_rate": 6.683499183127878e-08, "loss": 0.0494, "step": 67180 }, { "epoch": 4.989603445715135, "grad_norm": 1.4238885641098022, "learning_rate": 6.237932570919352e-08, "loss": 0.0537, "step": 67190 }, { "epoch": 4.990346056735482, "grad_norm": 3.352360963821411, "learning_rate": 5.792365958710828e-08, "loss": 0.0523, "step": 67200 }, { "epoch": 4.99108866775583, "grad_norm": 1.1126995086669922, "learning_rate": 5.346799346502303e-08, "loss": 0.05, "step": 67210 }, { "epoch": 4.991831278776177, "grad_norm": 1.2887812852859497, "learning_rate": 4.901232734293777e-08, "loss": 0.0496, "step": 67220 }, { "epoch": 4.992573889796525, "grad_norm": 4.443135738372803, "learning_rate": 4.455666122085251e-08, "loss": 0.0526, "step": 67230 }, { "epoch": 4.9933165008168725, "grad_norm": 1.4799822568893433, "learning_rate": 4.010099509876726e-08, "loss": 0.0601, "step": 67240 }, { "epoch": 4.9940591118372195, "grad_norm": 0.5479670166969299, "learning_rate": 3.564532897668201e-08, "loss": 0.0607, "step": 67250 }, { "epoch": 4.994801722857567, "grad_norm": 2.0314948558807373, "learning_rate": 3.118966285459676e-08, "loss": 0.0603, "step": 67260 }, { "epoch": 4.995544333877914, "grad_norm": 0.9696687459945679, "learning_rate": 2.6733996732511514e-08, "loss": 0.0572, "step": 67270 }, { "epoch": 4.996286944898262, "grad_norm": 1.398507833480835, "learning_rate": 2.2278330610426256e-08, "loss": 0.0886, "step": 67280 }, { "epoch": 4.99702955591861, "grad_norm": 1.3897038698196411, "learning_rate": 1.7822664488341006e-08, "loss": 0.0742, "step": 67290 }, { "epoch": 4.997772166938957, "grad_norm": 1.0769445896148682, "learning_rate": 1.3366998366255757e-08, "loss": 0.0252, "step": 67300 }, { "epoch": 4.998514777959305, "grad_norm": 2.8286406993865967, "learning_rate": 8.911332244170503e-09, "loss": 0.05, "step": 67310 }, { "epoch": 4.999257388979652, "grad_norm": 2.435650110244751, "learning_rate": 4.4556661220852515e-09, "loss": 0.0417, "step": 67320 }, { "epoch": 5.0, "grad_norm": 6.435393333435059, "learning_rate": 0.0, "loss": 0.0683, "step": 67330 }, { "epoch": 5.0, "eval_f1": 0.0, "eval_loss": 0.05167969688773155, "eval_runtime": 794.3074, "eval_samples_per_second": 47.864, "eval_steps_per_second": 2.993, "step": 67330 }, { "epoch": 5.0, "step": 67330, "total_flos": 8.34383324330106e+19, "train_loss": 0.0714715420367653, "train_runtime": 35752.3153, "train_samples_per_second": 30.13, "train_steps_per_second": 1.883 } ], "logging_steps": 10, "max_steps": 67330, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.34383324330106e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }