{ "best_metric": 0.04290741682052612, "best_model_checkpoint": "./test_default_model/checkpoint-19805", "epoch": 5.0, "eval_steps": 500, "global_step": 19805, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00297000297000297, "grad_norm": 1.7466098070144653, "learning_rate": 2.9982179982179983e-05, "loss": 0.2849, "step": 10 }, { "epoch": 0.00594000594000594, "grad_norm": 1.1830675601959229, "learning_rate": 2.9964359964359965e-05, "loss": 0.165, "step": 20 }, { "epoch": 0.00891000891000891, "grad_norm": 1.1286729574203491, "learning_rate": 2.9946539946539947e-05, "loss": 0.1477, "step": 30 }, { "epoch": 0.01188001188001188, "grad_norm": 1.1905314922332764, "learning_rate": 2.992871992871993e-05, "loss": 0.1291, "step": 40 }, { "epoch": 0.01485001485001485, "grad_norm": 1.1131917238235474, "learning_rate": 2.991089991089991e-05, "loss": 0.1261, "step": 50 }, { "epoch": 0.01782001782001782, "grad_norm": 1.267146348953247, "learning_rate": 2.9893079893079894e-05, "loss": 0.1202, "step": 60 }, { "epoch": 0.02079002079002079, "grad_norm": 1.0151565074920654, "learning_rate": 2.9875259875259876e-05, "loss": 0.1209, "step": 70 }, { "epoch": 0.02376002376002376, "grad_norm": 0.8143087029457092, "learning_rate": 2.9857439857439858e-05, "loss": 0.1142, "step": 80 }, { "epoch": 0.02673002673002673, "grad_norm": 1.4103150367736816, "learning_rate": 2.983961983961984e-05, "loss": 0.1123, "step": 90 }, { "epoch": 0.0297000297000297, "grad_norm": 1.6572177410125732, "learning_rate": 2.9821799821799822e-05, "loss": 0.1077, "step": 100 }, { "epoch": 0.03267003267003267, "grad_norm": 0.7510947585105896, "learning_rate": 2.9803979803979805e-05, "loss": 0.1209, "step": 110 }, { "epoch": 0.03564003564003564, "grad_norm": 1.0185679197311401, "learning_rate": 2.9786159786159787e-05, "loss": 0.1073, "step": 120 }, { "epoch": 0.03861003861003861, "grad_norm": 1.214509129524231, "learning_rate": 2.976833976833977e-05, "loss": 0.1243, "step": 130 }, { "epoch": 0.04158004158004158, "grad_norm": 0.7015527486801147, "learning_rate": 2.975051975051975e-05, "loss": 0.1073, "step": 140 }, { "epoch": 0.04455004455004455, "grad_norm": 0.7993521690368652, "learning_rate": 2.9732699732699733e-05, "loss": 0.1162, "step": 150 }, { "epoch": 0.04752004752004752, "grad_norm": 1.4627108573913574, "learning_rate": 2.9714879714879715e-05, "loss": 0.1094, "step": 160 }, { "epoch": 0.05049005049005049, "grad_norm": 0.5512596368789673, "learning_rate": 2.9697059697059698e-05, "loss": 0.1105, "step": 170 }, { "epoch": 0.05346005346005346, "grad_norm": 0.6437152028083801, "learning_rate": 2.967923967923968e-05, "loss": 0.1165, "step": 180 }, { "epoch": 0.05643005643005643, "grad_norm": 0.9050448536872864, "learning_rate": 2.9661419661419662e-05, "loss": 0.1158, "step": 190 }, { "epoch": 0.0594000594000594, "grad_norm": 0.7845998406410217, "learning_rate": 2.9643599643599644e-05, "loss": 0.1046, "step": 200 }, { "epoch": 0.062370062370062374, "grad_norm": 1.2148687839508057, "learning_rate": 2.9625779625779626e-05, "loss": 0.1048, "step": 210 }, { "epoch": 0.06534006534006534, "grad_norm": 0.5540338754653931, "learning_rate": 2.960795960795961e-05, "loss": 0.0989, "step": 220 }, { "epoch": 0.0683100683100683, "grad_norm": 1.147627830505371, "learning_rate": 2.959013959013959e-05, "loss": 0.0948, "step": 230 }, { "epoch": 0.07128007128007129, "grad_norm": 0.6427733898162842, "learning_rate": 2.9572319572319573e-05, "loss": 0.1132, "step": 240 }, { "epoch": 0.07425007425007425, "grad_norm": 0.48930391669273376, "learning_rate": 2.9554499554499555e-05, "loss": 0.1077, "step": 250 }, { "epoch": 0.07722007722007722, "grad_norm": 0.5915262699127197, "learning_rate": 2.9536679536679537e-05, "loss": 0.0988, "step": 260 }, { "epoch": 0.08019008019008018, "grad_norm": 0.9437302350997925, "learning_rate": 2.951885951885952e-05, "loss": 0.0943, "step": 270 }, { "epoch": 0.08316008316008316, "grad_norm": 0.8880864381790161, "learning_rate": 2.95010395010395e-05, "loss": 0.0874, "step": 280 }, { "epoch": 0.08613008613008613, "grad_norm": 0.5144450068473816, "learning_rate": 2.9483219483219484e-05, "loss": 0.1055, "step": 290 }, { "epoch": 0.0891000891000891, "grad_norm": 0.718809187412262, "learning_rate": 2.9465399465399466e-05, "loss": 0.1052, "step": 300 }, { "epoch": 0.09207009207009206, "grad_norm": 1.3571438789367676, "learning_rate": 2.9447579447579448e-05, "loss": 0.1063, "step": 310 }, { "epoch": 0.09504009504009504, "grad_norm": 1.6258764266967773, "learning_rate": 2.942975942975943e-05, "loss": 0.1162, "step": 320 }, { "epoch": 0.09801009801009801, "grad_norm": 0.46299999952316284, "learning_rate": 2.9411939411939412e-05, "loss": 0.0883, "step": 330 }, { "epoch": 0.10098010098010098, "grad_norm": 0.6494696140289307, "learning_rate": 2.9394119394119395e-05, "loss": 0.112, "step": 340 }, { "epoch": 0.10395010395010396, "grad_norm": 0.6922862529754639, "learning_rate": 2.9376299376299377e-05, "loss": 0.099, "step": 350 }, { "epoch": 0.10692010692010692, "grad_norm": 0.6345096230506897, "learning_rate": 2.935847935847936e-05, "loss": 0.0879, "step": 360 }, { "epoch": 0.10989010989010989, "grad_norm": 1.3332024812698364, "learning_rate": 2.934065934065934e-05, "loss": 0.1001, "step": 370 }, { "epoch": 0.11286011286011285, "grad_norm": 1.0127153396606445, "learning_rate": 2.9322839322839323e-05, "loss": 0.0997, "step": 380 }, { "epoch": 0.11583011583011583, "grad_norm": 0.48219984769821167, "learning_rate": 2.930501930501931e-05, "loss": 0.0875, "step": 390 }, { "epoch": 0.1188001188001188, "grad_norm": 0.8579444289207458, "learning_rate": 2.9287199287199288e-05, "loss": 0.1045, "step": 400 }, { "epoch": 0.12177012177012177, "grad_norm": 0.5488039255142212, "learning_rate": 2.926937926937927e-05, "loss": 0.1084, "step": 410 }, { "epoch": 0.12474012474012475, "grad_norm": 1.2597718238830566, "learning_rate": 2.9251559251559252e-05, "loss": 0.0853, "step": 420 }, { "epoch": 0.1277101277101277, "grad_norm": 1.077631950378418, "learning_rate": 2.9233739233739234e-05, "loss": 0.0962, "step": 430 }, { "epoch": 0.13068013068013068, "grad_norm": 0.5581513047218323, "learning_rate": 2.9215919215919216e-05, "loss": 0.0922, "step": 440 }, { "epoch": 0.13365013365013365, "grad_norm": 0.6805756092071533, "learning_rate": 2.91980991980992e-05, "loss": 0.1083, "step": 450 }, { "epoch": 0.1366201366201366, "grad_norm": 0.860261857509613, "learning_rate": 2.9180279180279184e-05, "loss": 0.0852, "step": 460 }, { "epoch": 0.13959013959013958, "grad_norm": 1.9232168197631836, "learning_rate": 2.9162459162459163e-05, "loss": 0.0933, "step": 470 }, { "epoch": 0.14256014256014257, "grad_norm": 0.8232311606407166, "learning_rate": 2.9144639144639145e-05, "loss": 0.0928, "step": 480 }, { "epoch": 0.14553014553014554, "grad_norm": 0.8007870316505432, "learning_rate": 2.9126819126819127e-05, "loss": 0.0906, "step": 490 }, { "epoch": 0.1485001485001485, "grad_norm": 1.1848207712173462, "learning_rate": 2.910899910899911e-05, "loss": 0.1052, "step": 500 }, { "epoch": 0.15147015147015147, "grad_norm": 0.5605499744415283, "learning_rate": 2.909117909117909e-05, "loss": 0.093, "step": 510 }, { "epoch": 0.15444015444015444, "grad_norm": 0.6382190585136414, "learning_rate": 2.9073359073359074e-05, "loss": 0.0997, "step": 520 }, { "epoch": 0.1574101574101574, "grad_norm": 0.5192627310752869, "learning_rate": 2.905553905553906e-05, "loss": 0.1098, "step": 530 }, { "epoch": 0.16038016038016037, "grad_norm": 0.5898168683052063, "learning_rate": 2.9037719037719038e-05, "loss": 0.0951, "step": 540 }, { "epoch": 0.16335016335016336, "grad_norm": 0.465077668428421, "learning_rate": 2.901989901989902e-05, "loss": 0.0929, "step": 550 }, { "epoch": 0.16632016632016633, "grad_norm": 0.6358753442764282, "learning_rate": 2.9002079002079002e-05, "loss": 0.0999, "step": 560 }, { "epoch": 0.1692901692901693, "grad_norm": 0.7714558839797974, "learning_rate": 2.8984258984258984e-05, "loss": 0.1031, "step": 570 }, { "epoch": 0.17226017226017226, "grad_norm": 0.865616500377655, "learning_rate": 2.8966438966438967e-05, "loss": 0.0932, "step": 580 }, { "epoch": 0.17523017523017523, "grad_norm": 0.621036171913147, "learning_rate": 2.894861894861895e-05, "loss": 0.094, "step": 590 }, { "epoch": 0.1782001782001782, "grad_norm": 0.5007760524749756, "learning_rate": 2.8930798930798934e-05, "loss": 0.099, "step": 600 }, { "epoch": 0.18117018117018116, "grad_norm": 0.47733157873153687, "learning_rate": 2.8912978912978913e-05, "loss": 0.0795, "step": 610 }, { "epoch": 0.18414018414018413, "grad_norm": 0.40642765164375305, "learning_rate": 2.8895158895158895e-05, "loss": 0.0829, "step": 620 }, { "epoch": 0.18711018711018712, "grad_norm": 1.1361258029937744, "learning_rate": 2.8877338877338877e-05, "loss": 0.0893, "step": 630 }, { "epoch": 0.1900801900801901, "grad_norm": 0.7784861922264099, "learning_rate": 2.885951885951886e-05, "loss": 0.0888, "step": 640 }, { "epoch": 0.19305019305019305, "grad_norm": 0.43066325783729553, "learning_rate": 2.8841698841698842e-05, "loss": 0.0966, "step": 650 }, { "epoch": 0.19602019602019602, "grad_norm": 0.36752209067344666, "learning_rate": 2.8823878823878824e-05, "loss": 0.0999, "step": 660 }, { "epoch": 0.19899019899019899, "grad_norm": 0.9712108969688416, "learning_rate": 2.880605880605881e-05, "loss": 0.0906, "step": 670 }, { "epoch": 0.20196020196020195, "grad_norm": 0.714443564414978, "learning_rate": 2.878823878823879e-05, "loss": 0.1049, "step": 680 }, { "epoch": 0.20493020493020492, "grad_norm": 0.3934662640094757, "learning_rate": 2.877041877041877e-05, "loss": 0.09, "step": 690 }, { "epoch": 0.2079002079002079, "grad_norm": 1.9262911081314087, "learning_rate": 2.8752598752598753e-05, "loss": 0.1051, "step": 700 }, { "epoch": 0.21087021087021088, "grad_norm": 0.6336867809295654, "learning_rate": 2.8734778734778735e-05, "loss": 0.0852, "step": 710 }, { "epoch": 0.21384021384021384, "grad_norm": 0.45155736804008484, "learning_rate": 2.8716958716958717e-05, "loss": 0.0928, "step": 720 }, { "epoch": 0.2168102168102168, "grad_norm": 0.6008352041244507, "learning_rate": 2.86991386991387e-05, "loss": 0.0829, "step": 730 }, { "epoch": 0.21978021978021978, "grad_norm": 0.4825937747955322, "learning_rate": 2.8681318681318685e-05, "loss": 0.0995, "step": 740 }, { "epoch": 0.22275022275022274, "grad_norm": 1.0774333477020264, "learning_rate": 2.8663498663498664e-05, "loss": 0.1031, "step": 750 }, { "epoch": 0.2257202257202257, "grad_norm": 0.7147405743598938, "learning_rate": 2.8645678645678646e-05, "loss": 0.1018, "step": 760 }, { "epoch": 0.2286902286902287, "grad_norm": 0.6777707934379578, "learning_rate": 2.8627858627858628e-05, "loss": 0.0901, "step": 770 }, { "epoch": 0.23166023166023167, "grad_norm": 0.4215840697288513, "learning_rate": 2.861003861003861e-05, "loss": 0.0862, "step": 780 }, { "epoch": 0.23463023463023464, "grad_norm": 0.4555210471153259, "learning_rate": 2.8592218592218592e-05, "loss": 0.0825, "step": 790 }, { "epoch": 0.2376002376002376, "grad_norm": 0.7088650465011597, "learning_rate": 2.8574398574398574e-05, "loss": 0.0932, "step": 800 }, { "epoch": 0.24057024057024057, "grad_norm": 0.6595791578292847, "learning_rate": 2.855657855657856e-05, "loss": 0.098, "step": 810 }, { "epoch": 0.24354024354024353, "grad_norm": 0.5375499725341797, "learning_rate": 2.853875853875854e-05, "loss": 0.0875, "step": 820 }, { "epoch": 0.2465102465102465, "grad_norm": 0.4199369549751282, "learning_rate": 2.852093852093852e-05, "loss": 0.0807, "step": 830 }, { "epoch": 0.2494802494802495, "grad_norm": 0.41728097200393677, "learning_rate": 2.8503118503118503e-05, "loss": 0.0925, "step": 840 }, { "epoch": 0.25245025245025243, "grad_norm": 0.6526634693145752, "learning_rate": 2.8485298485298485e-05, "loss": 0.093, "step": 850 }, { "epoch": 0.2554202554202554, "grad_norm": 0.6086540222167969, "learning_rate": 2.8467478467478467e-05, "loss": 0.1036, "step": 860 }, { "epoch": 0.25839025839025836, "grad_norm": 0.8363798260688782, "learning_rate": 2.844965844965845e-05, "loss": 0.0871, "step": 870 }, { "epoch": 0.26136026136026136, "grad_norm": 0.49175241589546204, "learning_rate": 2.8431838431838435e-05, "loss": 0.0953, "step": 880 }, { "epoch": 0.26433026433026435, "grad_norm": 0.6891732811927795, "learning_rate": 2.8414018414018414e-05, "loss": 0.0789, "step": 890 }, { "epoch": 0.2673002673002673, "grad_norm": 0.7982739210128784, "learning_rate": 2.8396198396198396e-05, "loss": 0.1105, "step": 900 }, { "epoch": 0.2702702702702703, "grad_norm": 1.5775654315948486, "learning_rate": 2.8378378378378378e-05, "loss": 0.0902, "step": 910 }, { "epoch": 0.2732402732402732, "grad_norm": 0.42223745584487915, "learning_rate": 2.836055836055836e-05, "loss": 0.0937, "step": 920 }, { "epoch": 0.2762102762102762, "grad_norm": 0.8138239979743958, "learning_rate": 2.8342738342738343e-05, "loss": 0.0989, "step": 930 }, { "epoch": 0.27918027918027916, "grad_norm": 0.5486122965812683, "learning_rate": 2.8324918324918325e-05, "loss": 0.0901, "step": 940 }, { "epoch": 0.28215028215028215, "grad_norm": 0.5096667408943176, "learning_rate": 2.830709830709831e-05, "loss": 0.0799, "step": 950 }, { "epoch": 0.28512028512028514, "grad_norm": 0.5797027945518494, "learning_rate": 2.8289278289278293e-05, "loss": 0.0907, "step": 960 }, { "epoch": 0.2880902880902881, "grad_norm": 0.7815655469894409, "learning_rate": 2.827145827145827e-05, "loss": 0.0793, "step": 970 }, { "epoch": 0.2910602910602911, "grad_norm": 0.5682644248008728, "learning_rate": 2.8253638253638253e-05, "loss": 0.0799, "step": 980 }, { "epoch": 0.294030294030294, "grad_norm": 0.5554261207580566, "learning_rate": 2.8235818235818236e-05, "loss": 0.0949, "step": 990 }, { "epoch": 0.297000297000297, "grad_norm": 0.5728469491004944, "learning_rate": 2.8217998217998218e-05, "loss": 0.1111, "step": 1000 }, { "epoch": 0.29997029997029995, "grad_norm": 0.5483665466308594, "learning_rate": 2.82001782001782e-05, "loss": 0.0901, "step": 1010 }, { "epoch": 0.30294030294030294, "grad_norm": 0.7061681151390076, "learning_rate": 2.8182358182358186e-05, "loss": 0.0942, "step": 1020 }, { "epoch": 0.30591030591030594, "grad_norm": 0.4503157436847687, "learning_rate": 2.8164538164538168e-05, "loss": 0.0837, "step": 1030 }, { "epoch": 0.3088803088803089, "grad_norm": 1.187880277633667, "learning_rate": 2.8146718146718146e-05, "loss": 0.0866, "step": 1040 }, { "epoch": 0.31185031185031187, "grad_norm": 1.120139718055725, "learning_rate": 2.812889812889813e-05, "loss": 0.1058, "step": 1050 }, { "epoch": 0.3148203148203148, "grad_norm": 0.7681704759597778, "learning_rate": 2.811107811107811e-05, "loss": 0.0864, "step": 1060 }, { "epoch": 0.3177903177903178, "grad_norm": 0.6372396349906921, "learning_rate": 2.8093258093258093e-05, "loss": 0.0886, "step": 1070 }, { "epoch": 0.32076032076032074, "grad_norm": 0.7018745541572571, "learning_rate": 2.8075438075438075e-05, "loss": 0.0786, "step": 1080 }, { "epoch": 0.32373032373032373, "grad_norm": 0.8289116621017456, "learning_rate": 2.805761805761806e-05, "loss": 0.0964, "step": 1090 }, { "epoch": 0.3267003267003267, "grad_norm": 0.7211658954620361, "learning_rate": 2.8039798039798043e-05, "loss": 0.1066, "step": 1100 }, { "epoch": 0.32967032967032966, "grad_norm": 0.677126407623291, "learning_rate": 2.802197802197802e-05, "loss": 0.081, "step": 1110 }, { "epoch": 0.33264033264033266, "grad_norm": 0.3897887170314789, "learning_rate": 2.8004158004158004e-05, "loss": 0.0937, "step": 1120 }, { "epoch": 0.3356103356103356, "grad_norm": 0.5881434679031372, "learning_rate": 2.7986337986337986e-05, "loss": 0.0852, "step": 1130 }, { "epoch": 0.3385803385803386, "grad_norm": 0.6897678971290588, "learning_rate": 2.7968517968517968e-05, "loss": 0.0873, "step": 1140 }, { "epoch": 0.34155034155034153, "grad_norm": 0.6038883328437805, "learning_rate": 2.795069795069795e-05, "loss": 0.0805, "step": 1150 }, { "epoch": 0.3445203445203445, "grad_norm": 0.4414396286010742, "learning_rate": 2.7932877932877936e-05, "loss": 0.0981, "step": 1160 }, { "epoch": 0.3474903474903475, "grad_norm": 0.48170387744903564, "learning_rate": 2.7915057915057918e-05, "loss": 0.0938, "step": 1170 }, { "epoch": 0.35046035046035046, "grad_norm": 0.5567618012428284, "learning_rate": 2.7897237897237897e-05, "loss": 0.0897, "step": 1180 }, { "epoch": 0.35343035343035345, "grad_norm": 0.6452346444129944, "learning_rate": 2.787941787941788e-05, "loss": 0.0948, "step": 1190 }, { "epoch": 0.3564003564003564, "grad_norm": 0.4139314889907837, "learning_rate": 2.786159786159786e-05, "loss": 0.0849, "step": 1200 }, { "epoch": 0.3593703593703594, "grad_norm": 0.5524829030036926, "learning_rate": 2.7843777843777843e-05, "loss": 0.0742, "step": 1210 }, { "epoch": 0.3623403623403623, "grad_norm": 1.0731943845748901, "learning_rate": 2.7825957825957826e-05, "loss": 0.0834, "step": 1220 }, { "epoch": 0.3653103653103653, "grad_norm": 0.6787437796592712, "learning_rate": 2.780813780813781e-05, "loss": 0.0921, "step": 1230 }, { "epoch": 0.36828036828036825, "grad_norm": 0.536044716835022, "learning_rate": 2.7790317790317793e-05, "loss": 0.0837, "step": 1240 }, { "epoch": 0.37125037125037125, "grad_norm": 0.4149301052093506, "learning_rate": 2.7772497772497772e-05, "loss": 0.0847, "step": 1250 }, { "epoch": 0.37422037422037424, "grad_norm": 0.6760357618331909, "learning_rate": 2.7754677754677754e-05, "loss": 0.0965, "step": 1260 }, { "epoch": 0.3771903771903772, "grad_norm": 0.8695538640022278, "learning_rate": 2.7736857736857736e-05, "loss": 0.0862, "step": 1270 }, { "epoch": 0.3801603801603802, "grad_norm": 1.1023316383361816, "learning_rate": 2.771903771903772e-05, "loss": 0.0818, "step": 1280 }, { "epoch": 0.3831303831303831, "grad_norm": 1.0046688318252563, "learning_rate": 2.77012177012177e-05, "loss": 0.0923, "step": 1290 }, { "epoch": 0.3861003861003861, "grad_norm": 0.4843716323375702, "learning_rate": 2.7683397683397686e-05, "loss": 0.0774, "step": 1300 }, { "epoch": 0.38907038907038904, "grad_norm": 0.6335024833679199, "learning_rate": 2.766557766557767e-05, "loss": 0.083, "step": 1310 }, { "epoch": 0.39204039204039204, "grad_norm": 0.5234698057174683, "learning_rate": 2.7647757647757647e-05, "loss": 0.0755, "step": 1320 }, { "epoch": 0.39501039501039503, "grad_norm": 0.477662056684494, "learning_rate": 2.762993762993763e-05, "loss": 0.0756, "step": 1330 }, { "epoch": 0.39798039798039797, "grad_norm": 0.5107772350311279, "learning_rate": 2.761211761211761e-05, "loss": 0.0688, "step": 1340 }, { "epoch": 0.40095040095040096, "grad_norm": 0.6898319125175476, "learning_rate": 2.7594297594297594e-05, "loss": 0.0856, "step": 1350 }, { "epoch": 0.4039204039204039, "grad_norm": 0.5590442419052124, "learning_rate": 2.7576477576477576e-05, "loss": 0.0857, "step": 1360 }, { "epoch": 0.4068904068904069, "grad_norm": 0.6682615280151367, "learning_rate": 2.755865755865756e-05, "loss": 0.0828, "step": 1370 }, { "epoch": 0.40986040986040984, "grad_norm": 0.27072158455848694, "learning_rate": 2.7540837540837544e-05, "loss": 0.0775, "step": 1380 }, { "epoch": 0.41283041283041283, "grad_norm": 0.6918196082115173, "learning_rate": 2.7523017523017522e-05, "loss": 0.0734, "step": 1390 }, { "epoch": 0.4158004158004158, "grad_norm": 0.6403471827507019, "learning_rate": 2.7505197505197505e-05, "loss": 0.0814, "step": 1400 }, { "epoch": 0.41877041877041876, "grad_norm": 0.7018643617630005, "learning_rate": 2.7487377487377487e-05, "loss": 0.0811, "step": 1410 }, { "epoch": 0.42174042174042176, "grad_norm": 0.6571378111839294, "learning_rate": 2.746955746955747e-05, "loss": 0.0786, "step": 1420 }, { "epoch": 0.4247104247104247, "grad_norm": 0.7818433046340942, "learning_rate": 2.745173745173745e-05, "loss": 0.0743, "step": 1430 }, { "epoch": 0.4276804276804277, "grad_norm": 0.7524327635765076, "learning_rate": 2.7433917433917437e-05, "loss": 0.0757, "step": 1440 }, { "epoch": 0.4306504306504306, "grad_norm": 0.8632511496543884, "learning_rate": 2.741609741609742e-05, "loss": 0.084, "step": 1450 }, { "epoch": 0.4336204336204336, "grad_norm": 0.6295231580734253, "learning_rate": 2.7398277398277398e-05, "loss": 0.0892, "step": 1460 }, { "epoch": 0.4365904365904366, "grad_norm": 0.6907210946083069, "learning_rate": 2.738045738045738e-05, "loss": 0.1006, "step": 1470 }, { "epoch": 0.43956043956043955, "grad_norm": 0.617152988910675, "learning_rate": 2.7362637362637362e-05, "loss": 0.097, "step": 1480 }, { "epoch": 0.44253044253044255, "grad_norm": 0.6373753547668457, "learning_rate": 2.7344817344817344e-05, "loss": 0.0784, "step": 1490 }, { "epoch": 0.4455004455004455, "grad_norm": 0.7640069723129272, "learning_rate": 2.7326997326997326e-05, "loss": 0.0729, "step": 1500 }, { "epoch": 0.4484704484704485, "grad_norm": 0.5482354164123535, "learning_rate": 2.7309177309177312e-05, "loss": 0.0876, "step": 1510 }, { "epoch": 0.4514404514404514, "grad_norm": 0.7966523766517639, "learning_rate": 2.7291357291357294e-05, "loss": 0.0833, "step": 1520 }, { "epoch": 0.4544104544104544, "grad_norm": 0.6484697461128235, "learning_rate": 2.7273537273537276e-05, "loss": 0.0854, "step": 1530 }, { "epoch": 0.4573804573804574, "grad_norm": 0.43090665340423584, "learning_rate": 2.7255717255717255e-05, "loss": 0.0914, "step": 1540 }, { "epoch": 0.46035046035046034, "grad_norm": 0.5118837356567383, "learning_rate": 2.7237897237897237e-05, "loss": 0.0819, "step": 1550 }, { "epoch": 0.46332046332046334, "grad_norm": 0.9723702669143677, "learning_rate": 2.722007722007722e-05, "loss": 0.088, "step": 1560 }, { "epoch": 0.4662904662904663, "grad_norm": 1.0589011907577515, "learning_rate": 2.72022572022572e-05, "loss": 0.0963, "step": 1570 }, { "epoch": 0.46926046926046927, "grad_norm": 0.6201198697090149, "learning_rate": 2.7184437184437187e-05, "loss": 0.0702, "step": 1580 }, { "epoch": 0.4722304722304722, "grad_norm": 0.40020257234573364, "learning_rate": 2.716661716661717e-05, "loss": 0.0752, "step": 1590 }, { "epoch": 0.4752004752004752, "grad_norm": 0.8229923844337463, "learning_rate": 2.714879714879715e-05, "loss": 0.1031, "step": 1600 }, { "epoch": 0.4781704781704782, "grad_norm": 0.5380883812904358, "learning_rate": 2.713097713097713e-05, "loss": 0.0911, "step": 1610 }, { "epoch": 0.48114048114048114, "grad_norm": 0.507243812084198, "learning_rate": 2.7113157113157112e-05, "loss": 0.079, "step": 1620 }, { "epoch": 0.48411048411048413, "grad_norm": 0.6244765520095825, "learning_rate": 2.7095337095337095e-05, "loss": 0.0643, "step": 1630 }, { "epoch": 0.48708048708048707, "grad_norm": 1.1058402061462402, "learning_rate": 2.7077517077517077e-05, "loss": 0.0928, "step": 1640 }, { "epoch": 0.49005049005049006, "grad_norm": 0.8316872715950012, "learning_rate": 2.7059697059697062e-05, "loss": 0.0729, "step": 1650 }, { "epoch": 0.493020493020493, "grad_norm": 0.6039434671401978, "learning_rate": 2.7041877041877044e-05, "loss": 0.0907, "step": 1660 }, { "epoch": 0.495990495990496, "grad_norm": 0.47073495388031006, "learning_rate": 2.7024057024057027e-05, "loss": 0.0806, "step": 1670 }, { "epoch": 0.498960498960499, "grad_norm": 0.4234858453273773, "learning_rate": 2.7006237006237005e-05, "loss": 0.0796, "step": 1680 }, { "epoch": 0.5019305019305019, "grad_norm": 0.7585604190826416, "learning_rate": 2.6988416988416988e-05, "loss": 0.0922, "step": 1690 }, { "epoch": 0.5049005049005049, "grad_norm": 0.5006585717201233, "learning_rate": 2.697059697059697e-05, "loss": 0.0826, "step": 1700 }, { "epoch": 0.5078705078705079, "grad_norm": 0.6841594576835632, "learning_rate": 2.6952776952776952e-05, "loss": 0.0879, "step": 1710 }, { "epoch": 0.5108405108405109, "grad_norm": 0.6505159139633179, "learning_rate": 2.6934956934956937e-05, "loss": 0.097, "step": 1720 }, { "epoch": 0.5138105138105138, "grad_norm": 0.48233747482299805, "learning_rate": 2.691713691713692e-05, "loss": 0.079, "step": 1730 }, { "epoch": 0.5167805167805167, "grad_norm": 0.5792484879493713, "learning_rate": 2.6899316899316902e-05, "loss": 0.0847, "step": 1740 }, { "epoch": 0.5197505197505198, "grad_norm": 0.6649707555770874, "learning_rate": 2.688149688149688e-05, "loss": 0.0584, "step": 1750 }, { "epoch": 0.5227205227205227, "grad_norm": 0.6543247699737549, "learning_rate": 2.6863676863676863e-05, "loss": 0.0558, "step": 1760 }, { "epoch": 0.5256905256905257, "grad_norm": 0.6927476525306702, "learning_rate": 2.6845856845856845e-05, "loss": 0.0828, "step": 1770 }, { "epoch": 0.5286605286605287, "grad_norm": 0.9066148996353149, "learning_rate": 2.6828036828036827e-05, "loss": 0.0827, "step": 1780 }, { "epoch": 0.5316305316305316, "grad_norm": 0.6122345924377441, "learning_rate": 2.6810216810216813e-05, "loss": 0.0831, "step": 1790 }, { "epoch": 0.5346005346005346, "grad_norm": 0.5523887872695923, "learning_rate": 2.6792396792396795e-05, "loss": 0.0925, "step": 1800 }, { "epoch": 0.5375705375705375, "grad_norm": 0.9167420268058777, "learning_rate": 2.6774576774576777e-05, "loss": 0.0756, "step": 1810 }, { "epoch": 0.5405405405405406, "grad_norm": 0.4656206965446472, "learning_rate": 2.6756756756756756e-05, "loss": 0.0827, "step": 1820 }, { "epoch": 0.5435105435105435, "grad_norm": 0.49738115072250366, "learning_rate": 2.6738936738936738e-05, "loss": 0.0996, "step": 1830 }, { "epoch": 0.5464805464805464, "grad_norm": 0.7212559580802917, "learning_rate": 2.672111672111672e-05, "loss": 0.0791, "step": 1840 }, { "epoch": 0.5494505494505495, "grad_norm": 0.6626265645027161, "learning_rate": 2.6703296703296702e-05, "loss": 0.091, "step": 1850 }, { "epoch": 0.5524205524205524, "grad_norm": 0.38933899998664856, "learning_rate": 2.6685476685476688e-05, "loss": 0.0788, "step": 1860 }, { "epoch": 0.5553905553905554, "grad_norm": 0.41860514879226685, "learning_rate": 2.666765666765667e-05, "loss": 0.1031, "step": 1870 }, { "epoch": 0.5583605583605583, "grad_norm": 0.5364987850189209, "learning_rate": 2.6649836649836652e-05, "loss": 0.0891, "step": 1880 }, { "epoch": 0.5613305613305614, "grad_norm": 0.28089386224746704, "learning_rate": 2.663201663201663e-05, "loss": 0.0921, "step": 1890 }, { "epoch": 0.5643005643005643, "grad_norm": 0.6708937287330627, "learning_rate": 2.6614196614196613e-05, "loss": 0.0876, "step": 1900 }, { "epoch": 0.5672705672705672, "grad_norm": 0.49499982595443726, "learning_rate": 2.6596376596376595e-05, "loss": 0.0889, "step": 1910 }, { "epoch": 0.5702405702405703, "grad_norm": 0.5181038975715637, "learning_rate": 2.6578556578556577e-05, "loss": 0.0687, "step": 1920 }, { "epoch": 0.5732105732105732, "grad_norm": 0.4590006172657013, "learning_rate": 2.6560736560736563e-05, "loss": 0.0843, "step": 1930 }, { "epoch": 0.5761805761805762, "grad_norm": 0.542353630065918, "learning_rate": 2.6542916542916545e-05, "loss": 0.0807, "step": 1940 }, { "epoch": 0.5791505791505791, "grad_norm": 0.4152495861053467, "learning_rate": 2.6525096525096527e-05, "loss": 0.0833, "step": 1950 }, { "epoch": 0.5821205821205822, "grad_norm": 0.4847126603126526, "learning_rate": 2.6507276507276506e-05, "loss": 0.0844, "step": 1960 }, { "epoch": 0.5850905850905851, "grad_norm": 0.5619663596153259, "learning_rate": 2.648945648945649e-05, "loss": 0.0768, "step": 1970 }, { "epoch": 0.588060588060588, "grad_norm": 0.6558105945587158, "learning_rate": 2.647163647163647e-05, "loss": 0.0754, "step": 1980 }, { "epoch": 0.5910305910305911, "grad_norm": 0.9754857420921326, "learning_rate": 2.6453816453816453e-05, "loss": 0.0868, "step": 1990 }, { "epoch": 0.594000594000594, "grad_norm": 0.4641966223716736, "learning_rate": 2.6435996435996438e-05, "loss": 0.0929, "step": 2000 }, { "epoch": 0.596970596970597, "grad_norm": 0.46997398138046265, "learning_rate": 2.641817641817642e-05, "loss": 0.0822, "step": 2010 }, { "epoch": 0.5999405999405999, "grad_norm": 0.6096898913383484, "learning_rate": 2.6400356400356403e-05, "loss": 0.0871, "step": 2020 }, { "epoch": 0.6029106029106029, "grad_norm": 0.4723495543003082, "learning_rate": 2.638253638253638e-05, "loss": 0.0767, "step": 2030 }, { "epoch": 0.6058806058806059, "grad_norm": 0.5081328749656677, "learning_rate": 2.6364716364716364e-05, "loss": 0.0728, "step": 2040 }, { "epoch": 0.6088506088506088, "grad_norm": 0.5929988026618958, "learning_rate": 2.6346896346896346e-05, "loss": 0.0777, "step": 2050 }, { "epoch": 0.6118206118206119, "grad_norm": 0.5095152854919434, "learning_rate": 2.6329076329076328e-05, "loss": 0.084, "step": 2060 }, { "epoch": 0.6147906147906148, "grad_norm": 0.47717463970184326, "learning_rate": 2.6311256311256313e-05, "loss": 0.0724, "step": 2070 }, { "epoch": 0.6177606177606177, "grad_norm": 0.3432537615299225, "learning_rate": 2.6293436293436296e-05, "loss": 0.0727, "step": 2080 }, { "epoch": 0.6207306207306207, "grad_norm": 0.6386498212814331, "learning_rate": 2.6275616275616278e-05, "loss": 0.0814, "step": 2090 }, { "epoch": 0.6237006237006237, "grad_norm": 0.5590204000473022, "learning_rate": 2.625779625779626e-05, "loss": 0.0803, "step": 2100 }, { "epoch": 0.6266706266706267, "grad_norm": 0.3727136552333832, "learning_rate": 2.623997623997624e-05, "loss": 0.0784, "step": 2110 }, { "epoch": 0.6296406296406296, "grad_norm": 0.9345456957817078, "learning_rate": 2.622215622215622e-05, "loss": 0.102, "step": 2120 }, { "epoch": 0.6326106326106327, "grad_norm": 0.6383994221687317, "learning_rate": 2.6204336204336203e-05, "loss": 0.0799, "step": 2130 }, { "epoch": 0.6355806355806356, "grad_norm": 0.6339811682701111, "learning_rate": 2.618651618651619e-05, "loss": 0.0697, "step": 2140 }, { "epoch": 0.6385506385506385, "grad_norm": 0.6489042639732361, "learning_rate": 2.616869616869617e-05, "loss": 0.0762, "step": 2150 }, { "epoch": 0.6415206415206415, "grad_norm": 0.43688729405403137, "learning_rate": 2.6150876150876153e-05, "loss": 0.0856, "step": 2160 }, { "epoch": 0.6444906444906445, "grad_norm": 0.5854159593582153, "learning_rate": 2.6133056133056135e-05, "loss": 0.0923, "step": 2170 }, { "epoch": 0.6474606474606475, "grad_norm": 0.4497719407081604, "learning_rate": 2.6115236115236114e-05, "loss": 0.0863, "step": 2180 }, { "epoch": 0.6504306504306504, "grad_norm": 0.39971357583999634, "learning_rate": 2.6097416097416096e-05, "loss": 0.0693, "step": 2190 }, { "epoch": 0.6534006534006535, "grad_norm": 0.6880261301994324, "learning_rate": 2.6079596079596078e-05, "loss": 0.0861, "step": 2200 }, { "epoch": 0.6563706563706564, "grad_norm": 0.39452025294303894, "learning_rate": 2.6061776061776064e-05, "loss": 0.0666, "step": 2210 }, { "epoch": 0.6593406593406593, "grad_norm": 0.4145357310771942, "learning_rate": 2.6043956043956046e-05, "loss": 0.07, "step": 2220 }, { "epoch": 0.6623106623106623, "grad_norm": 0.6330484747886658, "learning_rate": 2.6026136026136028e-05, "loss": 0.084, "step": 2230 }, { "epoch": 0.6652806652806653, "grad_norm": 0.5894971489906311, "learning_rate": 2.600831600831601e-05, "loss": 0.0925, "step": 2240 }, { "epoch": 0.6682506682506683, "grad_norm": 0.3733588457107544, "learning_rate": 2.599049599049599e-05, "loss": 0.082, "step": 2250 }, { "epoch": 0.6712206712206712, "grad_norm": 0.45527949929237366, "learning_rate": 2.597267597267597e-05, "loss": 0.0769, "step": 2260 }, { "epoch": 0.6741906741906742, "grad_norm": 0.6295212507247925, "learning_rate": 2.5954855954855953e-05, "loss": 0.0798, "step": 2270 }, { "epoch": 0.6771606771606772, "grad_norm": 0.4148741066455841, "learning_rate": 2.593703593703594e-05, "loss": 0.0702, "step": 2280 }, { "epoch": 0.6801306801306801, "grad_norm": 0.4446201026439667, "learning_rate": 2.591921591921592e-05, "loss": 0.081, "step": 2290 }, { "epoch": 0.6831006831006831, "grad_norm": 0.5348713397979736, "learning_rate": 2.5901395901395903e-05, "loss": 0.0804, "step": 2300 }, { "epoch": 0.6860706860706861, "grad_norm": 0.7064197659492493, "learning_rate": 2.5883575883575886e-05, "loss": 0.0766, "step": 2310 }, { "epoch": 0.689040689040689, "grad_norm": 0.5868175029754639, "learning_rate": 2.5865755865755864e-05, "loss": 0.0797, "step": 2320 }, { "epoch": 0.692010692010692, "grad_norm": 0.6839095950126648, "learning_rate": 2.5847935847935846e-05, "loss": 0.0794, "step": 2330 }, { "epoch": 0.694980694980695, "grad_norm": 0.41192343831062317, "learning_rate": 2.583011583011583e-05, "loss": 0.0706, "step": 2340 }, { "epoch": 0.697950697950698, "grad_norm": 0.7668315768241882, "learning_rate": 2.5812295812295814e-05, "loss": 0.0785, "step": 2350 }, { "epoch": 0.7009207009207009, "grad_norm": 0.43974947929382324, "learning_rate": 2.5794475794475796e-05, "loss": 0.0712, "step": 2360 }, { "epoch": 0.7038907038907039, "grad_norm": 0.3848420977592468, "learning_rate": 2.577665577665578e-05, "loss": 0.077, "step": 2370 }, { "epoch": 0.7068607068607069, "grad_norm": 0.6403735280036926, "learning_rate": 2.575883575883576e-05, "loss": 0.0729, "step": 2380 }, { "epoch": 0.7098307098307098, "grad_norm": 0.5417028665542603, "learning_rate": 2.574101574101574e-05, "loss": 0.0834, "step": 2390 }, { "epoch": 0.7128007128007128, "grad_norm": 0.9361075162887573, "learning_rate": 2.572319572319572e-05, "loss": 0.077, "step": 2400 }, { "epoch": 0.7157707157707157, "grad_norm": 0.483093798160553, "learning_rate": 2.5705375705375707e-05, "loss": 0.088, "step": 2410 }, { "epoch": 0.7187407187407188, "grad_norm": 0.4506361782550812, "learning_rate": 2.568755568755569e-05, "loss": 0.0919, "step": 2420 }, { "epoch": 0.7217107217107217, "grad_norm": 0.6593904495239258, "learning_rate": 2.566973566973567e-05, "loss": 0.087, "step": 2430 }, { "epoch": 0.7246807246807246, "grad_norm": 0.5274522304534912, "learning_rate": 2.5651915651915654e-05, "loss": 0.0768, "step": 2440 }, { "epoch": 0.7276507276507277, "grad_norm": 0.5065791606903076, "learning_rate": 2.5634095634095636e-05, "loss": 0.0828, "step": 2450 }, { "epoch": 0.7306207306207306, "grad_norm": 0.6130974888801575, "learning_rate": 2.5616275616275615e-05, "loss": 0.0742, "step": 2460 }, { "epoch": 0.7335907335907336, "grad_norm": 0.6379355192184448, "learning_rate": 2.5598455598455597e-05, "loss": 0.0847, "step": 2470 }, { "epoch": 0.7365607365607365, "grad_norm": 0.6738227009773254, "learning_rate": 2.5580635580635582e-05, "loss": 0.0793, "step": 2480 }, { "epoch": 0.7395307395307396, "grad_norm": 0.6309618949890137, "learning_rate": 2.5562815562815565e-05, "loss": 0.0871, "step": 2490 }, { "epoch": 0.7425007425007425, "grad_norm": 0.2825660705566406, "learning_rate": 2.5544995544995547e-05, "loss": 0.074, "step": 2500 }, { "epoch": 0.7454707454707454, "grad_norm": 0.43583425879478455, "learning_rate": 2.552717552717553e-05, "loss": 0.0858, "step": 2510 }, { "epoch": 0.7484407484407485, "grad_norm": 0.7557492256164551, "learning_rate": 2.550935550935551e-05, "loss": 0.0691, "step": 2520 }, { "epoch": 0.7514107514107514, "grad_norm": 0.44126811623573303, "learning_rate": 2.549153549153549e-05, "loss": 0.0664, "step": 2530 }, { "epoch": 0.7543807543807544, "grad_norm": 0.5966764092445374, "learning_rate": 2.5473715473715472e-05, "loss": 0.0766, "step": 2540 }, { "epoch": 0.7573507573507573, "grad_norm": 0.4621107578277588, "learning_rate": 2.5455895455895458e-05, "loss": 0.0834, "step": 2550 }, { "epoch": 0.7603207603207603, "grad_norm": 0.593605637550354, "learning_rate": 2.543807543807544e-05, "loss": 0.0812, "step": 2560 }, { "epoch": 0.7632907632907633, "grad_norm": 0.8139130473136902, "learning_rate": 2.5420255420255422e-05, "loss": 0.0663, "step": 2570 }, { "epoch": 0.7662607662607662, "grad_norm": 0.4853007197380066, "learning_rate": 2.5402435402435404e-05, "loss": 0.0789, "step": 2580 }, { "epoch": 0.7692307692307693, "grad_norm": 0.4105505645275116, "learning_rate": 2.5384615384615386e-05, "loss": 0.0702, "step": 2590 }, { "epoch": 0.7722007722007722, "grad_norm": 0.5971934795379639, "learning_rate": 2.5366795366795365e-05, "loss": 0.0847, "step": 2600 }, { "epoch": 0.7751707751707752, "grad_norm": 0.34833744168281555, "learning_rate": 2.5348975348975347e-05, "loss": 0.064, "step": 2610 }, { "epoch": 0.7781407781407781, "grad_norm": 0.35726526379585266, "learning_rate": 2.5331155331155333e-05, "loss": 0.0758, "step": 2620 }, { "epoch": 0.7811107811107811, "grad_norm": 0.4475048780441284, "learning_rate": 2.5313335313335315e-05, "loss": 0.0768, "step": 2630 }, { "epoch": 0.7840807840807841, "grad_norm": 0.48018935322761536, "learning_rate": 2.5295515295515297e-05, "loss": 0.0723, "step": 2640 }, { "epoch": 0.787050787050787, "grad_norm": 0.47765350341796875, "learning_rate": 2.527769527769528e-05, "loss": 0.0765, "step": 2650 }, { "epoch": 0.7900207900207901, "grad_norm": 0.6376664638519287, "learning_rate": 2.525987525987526e-05, "loss": 0.0777, "step": 2660 }, { "epoch": 0.792990792990793, "grad_norm": 0.7332932353019714, "learning_rate": 2.524205524205524e-05, "loss": 0.0963, "step": 2670 }, { "epoch": 0.7959607959607959, "grad_norm": 0.6165478825569153, "learning_rate": 2.5224235224235222e-05, "loss": 0.0827, "step": 2680 }, { "epoch": 0.7989307989307989, "grad_norm": 0.693350613117218, "learning_rate": 2.5206415206415208e-05, "loss": 0.0752, "step": 2690 }, { "epoch": 0.8019008019008019, "grad_norm": 0.5711894035339355, "learning_rate": 2.518859518859519e-05, "loss": 0.0699, "step": 2700 }, { "epoch": 0.8048708048708049, "grad_norm": 0.6042230725288391, "learning_rate": 2.5170775170775172e-05, "loss": 0.0681, "step": 2710 }, { "epoch": 0.8078408078408078, "grad_norm": 0.43989643454551697, "learning_rate": 2.5152955152955155e-05, "loss": 0.0684, "step": 2720 }, { "epoch": 0.8108108108108109, "grad_norm": 0.3606058359146118, "learning_rate": 2.5135135135135137e-05, "loss": 0.0793, "step": 2730 }, { "epoch": 0.8137808137808138, "grad_norm": 0.578762412071228, "learning_rate": 2.511731511731512e-05, "loss": 0.0703, "step": 2740 }, { "epoch": 0.8167508167508167, "grad_norm": 0.5686031579971313, "learning_rate": 2.5099495099495098e-05, "loss": 0.0851, "step": 2750 }, { "epoch": 0.8197208197208197, "grad_norm": 0.5423585772514343, "learning_rate": 2.5081675081675083e-05, "loss": 0.0744, "step": 2760 }, { "epoch": 0.8226908226908227, "grad_norm": 0.6459795236587524, "learning_rate": 2.5063855063855065e-05, "loss": 0.0749, "step": 2770 }, { "epoch": 0.8256608256608257, "grad_norm": 0.5151922106742859, "learning_rate": 2.5046035046035048e-05, "loss": 0.0838, "step": 2780 }, { "epoch": 0.8286308286308286, "grad_norm": 0.49044474959373474, "learning_rate": 2.502821502821503e-05, "loss": 0.081, "step": 2790 }, { "epoch": 0.8316008316008316, "grad_norm": 0.6159443855285645, "learning_rate": 2.5010395010395012e-05, "loss": 0.0814, "step": 2800 }, { "epoch": 0.8345708345708346, "grad_norm": 0.6860203146934509, "learning_rate": 2.4992574992574994e-05, "loss": 0.0731, "step": 2810 }, { "epoch": 0.8375408375408375, "grad_norm": 0.43102753162384033, "learning_rate": 2.4974754974754973e-05, "loss": 0.0867, "step": 2820 }, { "epoch": 0.8405108405108405, "grad_norm": 0.6863781809806824, "learning_rate": 2.495693495693496e-05, "loss": 0.0681, "step": 2830 }, { "epoch": 0.8434808434808435, "grad_norm": 0.6627882122993469, "learning_rate": 2.493911493911494e-05, "loss": 0.0692, "step": 2840 }, { "epoch": 0.8464508464508465, "grad_norm": 0.556719183921814, "learning_rate": 2.4921294921294923e-05, "loss": 0.0942, "step": 2850 }, { "epoch": 0.8494208494208494, "grad_norm": 0.6097808480262756, "learning_rate": 2.4903474903474905e-05, "loss": 0.0788, "step": 2860 }, { "epoch": 0.8523908523908524, "grad_norm": 0.3771260976791382, "learning_rate": 2.4885654885654887e-05, "loss": 0.0872, "step": 2870 }, { "epoch": 0.8553608553608554, "grad_norm": 0.2577713131904602, "learning_rate": 2.486783486783487e-05, "loss": 0.0849, "step": 2880 }, { "epoch": 0.8583308583308583, "grad_norm": 0.6618907451629639, "learning_rate": 2.4850014850014848e-05, "loss": 0.0794, "step": 2890 }, { "epoch": 0.8613008613008613, "grad_norm": 0.33715909719467163, "learning_rate": 2.4832194832194834e-05, "loss": 0.0689, "step": 2900 }, { "epoch": 0.8642708642708643, "grad_norm": 0.5500791072845459, "learning_rate": 2.4814374814374816e-05, "loss": 0.0862, "step": 2910 }, { "epoch": 0.8672408672408672, "grad_norm": 0.6228634119033813, "learning_rate": 2.4796554796554798e-05, "loss": 0.0769, "step": 2920 }, { "epoch": 0.8702108702108702, "grad_norm": 0.8019270896911621, "learning_rate": 2.477873477873478e-05, "loss": 0.071, "step": 2930 }, { "epoch": 0.8731808731808732, "grad_norm": 0.47143444418907166, "learning_rate": 2.4760914760914762e-05, "loss": 0.0812, "step": 2940 }, { "epoch": 0.8761508761508762, "grad_norm": 0.47617822885513306, "learning_rate": 2.4743094743094744e-05, "loss": 0.0769, "step": 2950 }, { "epoch": 0.8791208791208791, "grad_norm": 0.6791771054267883, "learning_rate": 2.4725274725274723e-05, "loss": 0.0693, "step": 2960 }, { "epoch": 0.882090882090882, "grad_norm": 0.4986003339290619, "learning_rate": 2.470745470745471e-05, "loss": 0.0778, "step": 2970 }, { "epoch": 0.8850608850608851, "grad_norm": 0.351012647151947, "learning_rate": 2.468963468963469e-05, "loss": 0.073, "step": 2980 }, { "epoch": 0.888030888030888, "grad_norm": 0.6079609394073486, "learning_rate": 2.4671814671814673e-05, "loss": 0.075, "step": 2990 }, { "epoch": 0.891000891000891, "grad_norm": 0.49167245626449585, "learning_rate": 2.4653994653994655e-05, "loss": 0.0745, "step": 3000 }, { "epoch": 0.893970893970894, "grad_norm": 0.49965718388557434, "learning_rate": 2.4636174636174637e-05, "loss": 0.0861, "step": 3010 }, { "epoch": 0.896940896940897, "grad_norm": 0.5942029356956482, "learning_rate": 2.461835461835462e-05, "loss": 0.0775, "step": 3020 }, { "epoch": 0.8999108999108999, "grad_norm": 0.5431137084960938, "learning_rate": 2.46005346005346e-05, "loss": 0.0732, "step": 3030 }, { "epoch": 0.9028809028809028, "grad_norm": 0.4982147514820099, "learning_rate": 2.4582714582714584e-05, "loss": 0.0749, "step": 3040 }, { "epoch": 0.9058509058509059, "grad_norm": 0.6718347072601318, "learning_rate": 2.4564894564894566e-05, "loss": 0.0843, "step": 3050 }, { "epoch": 0.9088209088209088, "grad_norm": 0.7574843168258667, "learning_rate": 2.454707454707455e-05, "loss": 0.0769, "step": 3060 }, { "epoch": 0.9117909117909118, "grad_norm": 0.5467488169670105, "learning_rate": 2.452925452925453e-05, "loss": 0.0802, "step": 3070 }, { "epoch": 0.9147609147609148, "grad_norm": 0.4699064791202545, "learning_rate": 2.4511434511434513e-05, "loss": 0.0813, "step": 3080 }, { "epoch": 0.9177309177309178, "grad_norm": 0.4939485788345337, "learning_rate": 2.4493614493614495e-05, "loss": 0.0763, "step": 3090 }, { "epoch": 0.9207009207009207, "grad_norm": 0.4790801405906677, "learning_rate": 2.4475794475794474e-05, "loss": 0.0765, "step": 3100 }, { "epoch": 0.9236709236709236, "grad_norm": 0.3700208365917206, "learning_rate": 2.445797445797446e-05, "loss": 0.0862, "step": 3110 }, { "epoch": 0.9266409266409267, "grad_norm": 0.5105488300323486, "learning_rate": 2.444015444015444e-05, "loss": 0.0773, "step": 3120 }, { "epoch": 0.9296109296109296, "grad_norm": 0.3455560803413391, "learning_rate": 2.4422334422334424e-05, "loss": 0.0716, "step": 3130 }, { "epoch": 0.9325809325809326, "grad_norm": 0.5318461656570435, "learning_rate": 2.4404514404514406e-05, "loss": 0.079, "step": 3140 }, { "epoch": 0.9355509355509356, "grad_norm": 0.42595726251602173, "learning_rate": 2.4386694386694388e-05, "loss": 0.0892, "step": 3150 }, { "epoch": 0.9385209385209385, "grad_norm": 0.651802659034729, "learning_rate": 2.436887436887437e-05, "loss": 0.0793, "step": 3160 }, { "epoch": 0.9414909414909415, "grad_norm": 0.6579793095588684, "learning_rate": 2.435105435105435e-05, "loss": 0.0661, "step": 3170 }, { "epoch": 0.9444609444609444, "grad_norm": 0.5980479717254639, "learning_rate": 2.4333234333234334e-05, "loss": 0.0757, "step": 3180 }, { "epoch": 0.9474309474309475, "grad_norm": 0.5788313746452332, "learning_rate": 2.4315414315414317e-05, "loss": 0.0921, "step": 3190 }, { "epoch": 0.9504009504009504, "grad_norm": 0.47703874111175537, "learning_rate": 2.42975942975943e-05, "loss": 0.0661, "step": 3200 }, { "epoch": 0.9533709533709533, "grad_norm": 0.5644926428794861, "learning_rate": 2.427977427977428e-05, "loss": 0.0706, "step": 3210 }, { "epoch": 0.9563409563409564, "grad_norm": 0.6008754372596741, "learning_rate": 2.4261954261954263e-05, "loss": 0.0757, "step": 3220 }, { "epoch": 0.9593109593109593, "grad_norm": 0.5607688426971436, "learning_rate": 2.4244134244134245e-05, "loss": 0.0718, "step": 3230 }, { "epoch": 0.9622809622809623, "grad_norm": 0.7359547019004822, "learning_rate": 2.4226314226314224e-05, "loss": 0.083, "step": 3240 }, { "epoch": 0.9652509652509652, "grad_norm": 0.596994161605835, "learning_rate": 2.420849420849421e-05, "loss": 0.0717, "step": 3250 }, { "epoch": 0.9682209682209683, "grad_norm": 0.7237496972084045, "learning_rate": 2.4190674190674192e-05, "loss": 0.0767, "step": 3260 }, { "epoch": 0.9711909711909712, "grad_norm": 0.6103200316429138, "learning_rate": 2.4172854172854174e-05, "loss": 0.0738, "step": 3270 }, { "epoch": 0.9741609741609741, "grad_norm": 0.7314611077308655, "learning_rate": 2.4155034155034156e-05, "loss": 0.0851, "step": 3280 }, { "epoch": 0.9771309771309772, "grad_norm": 0.496187299489975, "learning_rate": 2.4137214137214138e-05, "loss": 0.0757, "step": 3290 }, { "epoch": 0.9801009801009801, "grad_norm": 0.5102724432945251, "learning_rate": 2.411939411939412e-05, "loss": 0.0705, "step": 3300 }, { "epoch": 0.9830709830709831, "grad_norm": 0.43364787101745605, "learning_rate": 2.4101574101574103e-05, "loss": 0.0594, "step": 3310 }, { "epoch": 0.986040986040986, "grad_norm": 0.5329870581626892, "learning_rate": 2.4083754083754085e-05, "loss": 0.0757, "step": 3320 }, { "epoch": 0.989010989010989, "grad_norm": 0.5290941596031189, "learning_rate": 2.4065934065934067e-05, "loss": 0.0798, "step": 3330 }, { "epoch": 0.991980991980992, "grad_norm": 0.5744608044624329, "learning_rate": 2.404811404811405e-05, "loss": 0.072, "step": 3340 }, { "epoch": 0.9949509949509949, "grad_norm": 0.5449424386024475, "learning_rate": 2.403029403029403e-05, "loss": 0.0827, "step": 3350 }, { "epoch": 0.997920997920998, "grad_norm": 0.5638298392295837, "learning_rate": 2.4012474012474013e-05, "loss": 0.0796, "step": 3360 }, { "epoch": 1.0, "eval_f1": 0.49727767695099817, "eval_loss": 0.0686563029885292, "eval_runtime": 821.5096, "eval_samples_per_second": 46.279, "eval_steps_per_second": 0.724, "step": 3367 }, { "epoch": 1.0008910008910008, "grad_norm": 0.5497238039970398, "learning_rate": 2.3994653994653996e-05, "loss": 0.0703, "step": 3370 }, { "epoch": 1.0038610038610039, "grad_norm": 0.3895362913608551, "learning_rate": 2.3976833976833978e-05, "loss": 0.0714, "step": 3380 }, { "epoch": 1.006831006831007, "grad_norm": 0.5208247900009155, "learning_rate": 2.395901395901396e-05, "loss": 0.0882, "step": 3390 }, { "epoch": 1.0098010098010097, "grad_norm": 0.4272199273109436, "learning_rate": 2.3941193941193942e-05, "loss": 0.0735, "step": 3400 }, { "epoch": 1.0127710127710128, "grad_norm": 0.5025156140327454, "learning_rate": 2.3923373923373924e-05, "loss": 0.0706, "step": 3410 }, { "epoch": 1.0157410157410158, "grad_norm": 0.3242335617542267, "learning_rate": 2.3905553905553906e-05, "loss": 0.0678, "step": 3420 }, { "epoch": 1.0187110187110187, "grad_norm": 0.3997895121574402, "learning_rate": 2.388773388773389e-05, "loss": 0.0812, "step": 3430 }, { "epoch": 1.0216810216810217, "grad_norm": 0.752778172492981, "learning_rate": 2.386991386991387e-05, "loss": 0.0884, "step": 3440 }, { "epoch": 1.0246510246510248, "grad_norm": 0.8602269291877747, "learning_rate": 2.3852093852093853e-05, "loss": 0.0878, "step": 3450 }, { "epoch": 1.0276210276210276, "grad_norm": 0.4281240403652191, "learning_rate": 2.3834273834273835e-05, "loss": 0.0718, "step": 3460 }, { "epoch": 1.0305910305910306, "grad_norm": 0.5941810607910156, "learning_rate": 2.3816453816453817e-05, "loss": 0.0737, "step": 3470 }, { "epoch": 1.0335610335610335, "grad_norm": 0.573628306388855, "learning_rate": 2.37986337986338e-05, "loss": 0.0659, "step": 3480 }, { "epoch": 1.0365310365310365, "grad_norm": 0.6910396814346313, "learning_rate": 2.378081378081378e-05, "loss": 0.084, "step": 3490 }, { "epoch": 1.0395010395010396, "grad_norm": 0.38856300711631775, "learning_rate": 2.3762993762993764e-05, "loss": 0.0761, "step": 3500 }, { "epoch": 1.0424710424710424, "grad_norm": 0.41457536816596985, "learning_rate": 2.3745173745173746e-05, "loss": 0.082, "step": 3510 }, { "epoch": 1.0454410454410454, "grad_norm": 0.6538494825363159, "learning_rate": 2.3727353727353728e-05, "loss": 0.0817, "step": 3520 }, { "epoch": 1.0484110484110485, "grad_norm": 0.3478659689426422, "learning_rate": 2.370953370953371e-05, "loss": 0.0851, "step": 3530 }, { "epoch": 1.0513810513810513, "grad_norm": 0.546033501625061, "learning_rate": 2.3691713691713692e-05, "loss": 0.084, "step": 3540 }, { "epoch": 1.0543510543510544, "grad_norm": 0.4026525020599365, "learning_rate": 2.3673893673893675e-05, "loss": 0.0725, "step": 3550 }, { "epoch": 1.0573210573210574, "grad_norm": 0.5000739097595215, "learning_rate": 2.3656073656073657e-05, "loss": 0.0822, "step": 3560 }, { "epoch": 1.0602910602910602, "grad_norm": 0.48692411184310913, "learning_rate": 2.363825363825364e-05, "loss": 0.0796, "step": 3570 }, { "epoch": 1.0632610632610633, "grad_norm": 0.5664608478546143, "learning_rate": 2.362043362043362e-05, "loss": 0.0661, "step": 3580 }, { "epoch": 1.0662310662310661, "grad_norm": 0.502124547958374, "learning_rate": 2.3602613602613603e-05, "loss": 0.0638, "step": 3590 }, { "epoch": 1.0692010692010692, "grad_norm": 0.5469791889190674, "learning_rate": 2.3584793584793586e-05, "loss": 0.0719, "step": 3600 }, { "epoch": 1.0721710721710722, "grad_norm": 0.5133867859840393, "learning_rate": 2.3566973566973568e-05, "loss": 0.0772, "step": 3610 }, { "epoch": 1.075141075141075, "grad_norm": 0.5197412371635437, "learning_rate": 2.354915354915355e-05, "loss": 0.0862, "step": 3620 }, { "epoch": 1.078111078111078, "grad_norm": 0.4368208050727844, "learning_rate": 2.3531333531333532e-05, "loss": 0.0618, "step": 3630 }, { "epoch": 1.0810810810810811, "grad_norm": 0.46737584471702576, "learning_rate": 2.3513513513513514e-05, "loss": 0.0618, "step": 3640 }, { "epoch": 1.084051084051084, "grad_norm": 0.46774664521217346, "learning_rate": 2.3495693495693496e-05, "loss": 0.0844, "step": 3650 }, { "epoch": 1.087021087021087, "grad_norm": 0.5892476439476013, "learning_rate": 2.347787347787348e-05, "loss": 0.0823, "step": 3660 }, { "epoch": 1.08999108999109, "grad_norm": 0.32615166902542114, "learning_rate": 2.346005346005346e-05, "loss": 0.0579, "step": 3670 }, { "epoch": 1.092961092961093, "grad_norm": 0.4170238673686981, "learning_rate": 2.3442233442233443e-05, "loss": 0.0655, "step": 3680 }, { "epoch": 1.095931095931096, "grad_norm": 0.4704936146736145, "learning_rate": 2.3424413424413425e-05, "loss": 0.0814, "step": 3690 }, { "epoch": 1.098901098901099, "grad_norm": 0.5191180109977722, "learning_rate": 2.3406593406593407e-05, "loss": 0.0787, "step": 3700 }, { "epoch": 1.1018711018711018, "grad_norm": 0.48460114002227783, "learning_rate": 2.338877338877339e-05, "loss": 0.0522, "step": 3710 }, { "epoch": 1.1048411048411049, "grad_norm": 0.5503575205802917, "learning_rate": 2.337095337095337e-05, "loss": 0.0769, "step": 3720 }, { "epoch": 1.107811107811108, "grad_norm": 0.6398834586143494, "learning_rate": 2.3353133353133354e-05, "loss": 0.0664, "step": 3730 }, { "epoch": 1.1107811107811107, "grad_norm": 0.39908480644226074, "learning_rate": 2.3335313335313336e-05, "loss": 0.0759, "step": 3740 }, { "epoch": 1.1137511137511138, "grad_norm": 0.4675776958465576, "learning_rate": 2.3317493317493318e-05, "loss": 0.0765, "step": 3750 }, { "epoch": 1.1167211167211166, "grad_norm": 0.350972443819046, "learning_rate": 2.32996732996733e-05, "loss": 0.0777, "step": 3760 }, { "epoch": 1.1196911196911197, "grad_norm": 0.4611550569534302, "learning_rate": 2.3281853281853282e-05, "loss": 0.0709, "step": 3770 }, { "epoch": 1.1226611226611227, "grad_norm": 0.5342544913291931, "learning_rate": 2.3264033264033265e-05, "loss": 0.0649, "step": 3780 }, { "epoch": 1.1256311256311256, "grad_norm": 0.6507514119148254, "learning_rate": 2.3246213246213247e-05, "loss": 0.076, "step": 3790 }, { "epoch": 1.1286011286011286, "grad_norm": 0.7478254437446594, "learning_rate": 2.322839322839323e-05, "loss": 0.0857, "step": 3800 }, { "epoch": 1.1315711315711316, "grad_norm": 0.5067834258079529, "learning_rate": 2.321057321057321e-05, "loss": 0.0745, "step": 3810 }, { "epoch": 1.1345411345411345, "grad_norm": 0.6091060042381287, "learning_rate": 2.3192753192753193e-05, "loss": 0.0761, "step": 3820 }, { "epoch": 1.1375111375111375, "grad_norm": 0.4694317579269409, "learning_rate": 2.3174933174933175e-05, "loss": 0.0815, "step": 3830 }, { "epoch": 1.1404811404811406, "grad_norm": 0.5222705006599426, "learning_rate": 2.3157113157113158e-05, "loss": 0.0788, "step": 3840 }, { "epoch": 1.1434511434511434, "grad_norm": 0.5226296782493591, "learning_rate": 2.313929313929314e-05, "loss": 0.0773, "step": 3850 }, { "epoch": 1.1464211464211465, "grad_norm": 0.5545721054077148, "learning_rate": 2.3121473121473122e-05, "loss": 0.0675, "step": 3860 }, { "epoch": 1.1493911493911493, "grad_norm": 0.5250979065895081, "learning_rate": 2.3103653103653104e-05, "loss": 0.0815, "step": 3870 }, { "epoch": 1.1523611523611523, "grad_norm": 0.4267248213291168, "learning_rate": 2.3085833085833086e-05, "loss": 0.0704, "step": 3880 }, { "epoch": 1.1553311553311554, "grad_norm": 0.3308209478855133, "learning_rate": 2.306801306801307e-05, "loss": 0.08, "step": 3890 }, { "epoch": 1.1583011583011582, "grad_norm": 0.49279993772506714, "learning_rate": 2.305019305019305e-05, "loss": 0.0868, "step": 3900 }, { "epoch": 1.1612711612711613, "grad_norm": 0.49307748675346375, "learning_rate": 2.3032373032373033e-05, "loss": 0.081, "step": 3910 }, { "epoch": 1.1642411642411643, "grad_norm": 0.691349446773529, "learning_rate": 2.3014553014553015e-05, "loss": 0.0712, "step": 3920 }, { "epoch": 1.1672111672111671, "grad_norm": 0.4932047724723816, "learning_rate": 2.2996732996732997e-05, "loss": 0.0683, "step": 3930 }, { "epoch": 1.1701811701811702, "grad_norm": 0.5138940811157227, "learning_rate": 2.297891297891298e-05, "loss": 0.0646, "step": 3940 }, { "epoch": 1.1731511731511732, "grad_norm": 0.4573695659637451, "learning_rate": 2.2961092961092965e-05, "loss": 0.0593, "step": 3950 }, { "epoch": 1.176121176121176, "grad_norm": 0.6048777103424072, "learning_rate": 2.2943272943272944e-05, "loss": 0.0768, "step": 3960 }, { "epoch": 1.179091179091179, "grad_norm": 0.6311981678009033, "learning_rate": 2.2925452925452926e-05, "loss": 0.0901, "step": 3970 }, { "epoch": 1.1820611820611822, "grad_norm": 0.4408791661262512, "learning_rate": 2.2907632907632908e-05, "loss": 0.0729, "step": 3980 }, { "epoch": 1.185031185031185, "grad_norm": 0.3359534740447998, "learning_rate": 2.288981288981289e-05, "loss": 0.071, "step": 3990 }, { "epoch": 1.188001188001188, "grad_norm": 0.3939429223537445, "learning_rate": 2.2871992871992872e-05, "loss": 0.0681, "step": 4000 }, { "epoch": 1.190971190971191, "grad_norm": 0.46291255950927734, "learning_rate": 2.2854172854172855e-05, "loss": 0.0721, "step": 4010 }, { "epoch": 1.193941193941194, "grad_norm": 0.4679121971130371, "learning_rate": 2.283635283635284e-05, "loss": 0.0823, "step": 4020 }, { "epoch": 1.196911196911197, "grad_norm": 0.6498029232025146, "learning_rate": 2.281853281853282e-05, "loss": 0.064, "step": 4030 }, { "epoch": 1.1998811998811998, "grad_norm": 0.5375266671180725, "learning_rate": 2.28007128007128e-05, "loss": 0.0827, "step": 4040 }, { "epoch": 1.2028512028512028, "grad_norm": 0.7022712230682373, "learning_rate": 2.2782892782892783e-05, "loss": 0.0723, "step": 4050 }, { "epoch": 1.2058212058212059, "grad_norm": 0.888565182685852, "learning_rate": 2.2765072765072765e-05, "loss": 0.0699, "step": 4060 }, { "epoch": 1.2087912087912087, "grad_norm": 0.615304172039032, "learning_rate": 2.2747252747252748e-05, "loss": 0.0639, "step": 4070 }, { "epoch": 1.2117612117612118, "grad_norm": 1.067995309829712, "learning_rate": 2.272943272943273e-05, "loss": 0.0727, "step": 4080 }, { "epoch": 1.2147312147312148, "grad_norm": 0.38957396149635315, "learning_rate": 2.2711612711612715e-05, "loss": 0.0817, "step": 4090 }, { "epoch": 1.2177012177012176, "grad_norm": 0.4814799726009369, "learning_rate": 2.2693792693792694e-05, "loss": 0.0676, "step": 4100 }, { "epoch": 1.2206712206712207, "grad_norm": 0.33193427324295044, "learning_rate": 2.2675972675972676e-05, "loss": 0.0717, "step": 4110 }, { "epoch": 1.2236412236412235, "grad_norm": 0.5651602149009705, "learning_rate": 2.265815265815266e-05, "loss": 0.0669, "step": 4120 }, { "epoch": 1.2266112266112266, "grad_norm": 0.6378253102302551, "learning_rate": 2.264033264033264e-05, "loss": 0.0897, "step": 4130 }, { "epoch": 1.2295812295812296, "grad_norm": 0.6030372977256775, "learning_rate": 2.2622512622512623e-05, "loss": 0.0896, "step": 4140 }, { "epoch": 1.2325512325512324, "grad_norm": 0.8515591621398926, "learning_rate": 2.2604692604692605e-05, "loss": 0.0645, "step": 4150 }, { "epoch": 1.2355212355212355, "grad_norm": 0.6547635197639465, "learning_rate": 2.258687258687259e-05, "loss": 0.0736, "step": 4160 }, { "epoch": 1.2384912384912385, "grad_norm": 0.4761018753051758, "learning_rate": 2.256905256905257e-05, "loss": 0.0689, "step": 4170 }, { "epoch": 1.2414612414612414, "grad_norm": 0.39740657806396484, "learning_rate": 2.255123255123255e-05, "loss": 0.0696, "step": 4180 }, { "epoch": 1.2444312444312444, "grad_norm": 0.49501290917396545, "learning_rate": 2.2533412533412534e-05, "loss": 0.0779, "step": 4190 }, { "epoch": 1.2474012474012475, "grad_norm": 0.5703093409538269, "learning_rate": 2.2515592515592516e-05, "loss": 0.0663, "step": 4200 }, { "epoch": 1.2503712503712503, "grad_norm": 0.4675036370754242, "learning_rate": 2.2497772497772498e-05, "loss": 0.0772, "step": 4210 }, { "epoch": 1.2533412533412533, "grad_norm": 0.6520904898643494, "learning_rate": 2.247995247995248e-05, "loss": 0.074, "step": 4220 }, { "epoch": 1.2563112563112564, "grad_norm": 0.4377146065235138, "learning_rate": 2.2462132462132466e-05, "loss": 0.0752, "step": 4230 }, { "epoch": 1.2592812592812592, "grad_norm": 0.4791605472564697, "learning_rate": 2.2444312444312444e-05, "loss": 0.0614, "step": 4240 }, { "epoch": 1.2622512622512623, "grad_norm": 0.5933295488357544, "learning_rate": 2.2426492426492427e-05, "loss": 0.0832, "step": 4250 }, { "epoch": 1.2652212652212653, "grad_norm": 0.4189813435077667, "learning_rate": 2.240867240867241e-05, "loss": 0.069, "step": 4260 }, { "epoch": 1.2681912681912682, "grad_norm": 0.651421070098877, "learning_rate": 2.239085239085239e-05, "loss": 0.0791, "step": 4270 }, { "epoch": 1.2711612711612712, "grad_norm": 0.40593355894088745, "learning_rate": 2.2373032373032373e-05, "loss": 0.0638, "step": 4280 }, { "epoch": 1.2741312741312742, "grad_norm": 0.5226801037788391, "learning_rate": 2.2355212355212355e-05, "loss": 0.077, "step": 4290 }, { "epoch": 1.277101277101277, "grad_norm": 0.6062614321708679, "learning_rate": 2.233739233739234e-05, "loss": 0.068, "step": 4300 }, { "epoch": 1.2800712800712801, "grad_norm": 0.48023584485054016, "learning_rate": 2.231957231957232e-05, "loss": 0.0622, "step": 4310 }, { "epoch": 1.2830412830412832, "grad_norm": 0.4292398989200592, "learning_rate": 2.2301752301752302e-05, "loss": 0.0951, "step": 4320 }, { "epoch": 1.286011286011286, "grad_norm": 0.509908139705658, "learning_rate": 2.2283932283932284e-05, "loss": 0.0703, "step": 4330 }, { "epoch": 1.288981288981289, "grad_norm": 0.36277303099632263, "learning_rate": 2.2266112266112266e-05, "loss": 0.0752, "step": 4340 }, { "epoch": 1.2919512919512919, "grad_norm": 0.4135016202926636, "learning_rate": 2.2248292248292248e-05, "loss": 0.0673, "step": 4350 }, { "epoch": 1.294921294921295, "grad_norm": 0.4465673863887787, "learning_rate": 2.223047223047223e-05, "loss": 0.0774, "step": 4360 }, { "epoch": 1.2978912978912978, "grad_norm": 0.3581428825855255, "learning_rate": 2.2212652212652216e-05, "loss": 0.0722, "step": 4370 }, { "epoch": 1.3008613008613008, "grad_norm": 0.8216081261634827, "learning_rate": 2.2194832194832195e-05, "loss": 0.0695, "step": 4380 }, { "epoch": 1.3038313038313039, "grad_norm": 0.3974524736404419, "learning_rate": 2.2177012177012177e-05, "loss": 0.0548, "step": 4390 }, { "epoch": 1.3068013068013067, "grad_norm": 0.40166157484054565, "learning_rate": 2.215919215919216e-05, "loss": 0.0821, "step": 4400 }, { "epoch": 1.3097713097713097, "grad_norm": 0.6108930706977844, "learning_rate": 2.214137214137214e-05, "loss": 0.0771, "step": 4410 }, { "epoch": 1.3127413127413128, "grad_norm": 0.33659735321998596, "learning_rate": 2.2123552123552123e-05, "loss": 0.0866, "step": 4420 }, { "epoch": 1.3157113157113156, "grad_norm": 0.41419750452041626, "learning_rate": 2.2105732105732106e-05, "loss": 0.066, "step": 4430 }, { "epoch": 1.3186813186813187, "grad_norm": 0.39843958616256714, "learning_rate": 2.208791208791209e-05, "loss": 0.0717, "step": 4440 }, { "epoch": 1.3216513216513217, "grad_norm": 0.4193469285964966, "learning_rate": 2.207009207009207e-05, "loss": 0.0608, "step": 4450 }, { "epoch": 1.3246213246213245, "grad_norm": 0.310855507850647, "learning_rate": 2.2052272052272052e-05, "loss": 0.0623, "step": 4460 }, { "epoch": 1.3275913275913276, "grad_norm": 0.3885134160518646, "learning_rate": 2.2034452034452034e-05, "loss": 0.0565, "step": 4470 }, { "epoch": 1.3305613305613306, "grad_norm": 0.31589820981025696, "learning_rate": 2.2016632016632017e-05, "loss": 0.0591, "step": 4480 }, { "epoch": 1.3335313335313335, "grad_norm": 0.4833143651485443, "learning_rate": 2.1998811998812e-05, "loss": 0.0758, "step": 4490 }, { "epoch": 1.3365013365013365, "grad_norm": 0.47030189633369446, "learning_rate": 2.198099198099198e-05, "loss": 0.0644, "step": 4500 }, { "epoch": 1.3394713394713396, "grad_norm": 0.44581151008605957, "learning_rate": 2.1963171963171966e-05, "loss": 0.0675, "step": 4510 }, { "epoch": 1.3424413424413424, "grad_norm": 0.5004817247390747, "learning_rate": 2.1945351945351945e-05, "loss": 0.0835, "step": 4520 }, { "epoch": 1.3454113454113454, "grad_norm": 0.5188937783241272, "learning_rate": 2.1927531927531927e-05, "loss": 0.0739, "step": 4530 }, { "epoch": 1.3483813483813485, "grad_norm": 0.386055052280426, "learning_rate": 2.190971190971191e-05, "loss": 0.0752, "step": 4540 }, { "epoch": 1.3513513513513513, "grad_norm": 0.5287050008773804, "learning_rate": 2.1891891891891892e-05, "loss": 0.0704, "step": 4550 }, { "epoch": 1.3543213543213544, "grad_norm": 0.5197706818580627, "learning_rate": 2.1874071874071874e-05, "loss": 0.0722, "step": 4560 }, { "epoch": 1.3572913572913574, "grad_norm": 1.044822335243225, "learning_rate": 2.1856251856251856e-05, "loss": 0.0774, "step": 4570 }, { "epoch": 1.3602613602613602, "grad_norm": 0.35167747735977173, "learning_rate": 2.183843183843184e-05, "loss": 0.0688, "step": 4580 }, { "epoch": 1.3632313632313633, "grad_norm": 0.5518337488174438, "learning_rate": 2.1820611820611824e-05, "loss": 0.0899, "step": 4590 }, { "epoch": 1.3662013662013661, "grad_norm": 0.5644456148147583, "learning_rate": 2.1802791802791803e-05, "loss": 0.0808, "step": 4600 }, { "epoch": 1.3691713691713692, "grad_norm": 0.45010289549827576, "learning_rate": 2.1784971784971785e-05, "loss": 0.0839, "step": 4610 }, { "epoch": 1.3721413721413722, "grad_norm": 0.6567732095718384, "learning_rate": 2.1767151767151767e-05, "loss": 0.0761, "step": 4620 }, { "epoch": 1.375111375111375, "grad_norm": 0.582931399345398, "learning_rate": 2.174933174933175e-05, "loss": 0.0669, "step": 4630 }, { "epoch": 1.378081378081378, "grad_norm": 0.39117926359176636, "learning_rate": 2.173151173151173e-05, "loss": 0.0763, "step": 4640 }, { "epoch": 1.381051381051381, "grad_norm": 0.44285526871681213, "learning_rate": 2.1713691713691717e-05, "loss": 0.071, "step": 4650 }, { "epoch": 1.384021384021384, "grad_norm": 0.6497974395751953, "learning_rate": 2.16958716958717e-05, "loss": 0.0647, "step": 4660 }, { "epoch": 1.386991386991387, "grad_norm": 0.4394398033618927, "learning_rate": 2.1678051678051678e-05, "loss": 0.0666, "step": 4670 }, { "epoch": 1.3899613899613898, "grad_norm": 0.6339782476425171, "learning_rate": 2.166023166023166e-05, "loss": 0.0693, "step": 4680 }, { "epoch": 1.392931392931393, "grad_norm": 0.24844326078891754, "learning_rate": 2.1642411642411642e-05, "loss": 0.0631, "step": 4690 }, { "epoch": 1.395901395901396, "grad_norm": 0.41448843479156494, "learning_rate": 2.1624591624591624e-05, "loss": 0.0667, "step": 4700 }, { "epoch": 1.3988713988713988, "grad_norm": 0.30131953954696655, "learning_rate": 2.1606771606771606e-05, "loss": 0.0625, "step": 4710 }, { "epoch": 1.4018414018414018, "grad_norm": 0.7573267817497253, "learning_rate": 2.1588951588951592e-05, "loss": 0.0672, "step": 4720 }, { "epoch": 1.4048114048114049, "grad_norm": 0.5527480840682983, "learning_rate": 2.1571131571131574e-05, "loss": 0.0597, "step": 4730 }, { "epoch": 1.4077814077814077, "grad_norm": 0.5866405367851257, "learning_rate": 2.1553311553311553e-05, "loss": 0.0676, "step": 4740 }, { "epoch": 1.4107514107514108, "grad_norm": 0.3691079318523407, "learning_rate": 2.1535491535491535e-05, "loss": 0.073, "step": 4750 }, { "epoch": 1.4137214137214138, "grad_norm": 0.46354126930236816, "learning_rate": 2.1517671517671517e-05, "loss": 0.062, "step": 4760 }, { "epoch": 1.4166914166914166, "grad_norm": 0.4648849368095398, "learning_rate": 2.14998514998515e-05, "loss": 0.0854, "step": 4770 }, { "epoch": 1.4196614196614197, "grad_norm": 0.4591132402420044, "learning_rate": 2.148203148203148e-05, "loss": 0.0571, "step": 4780 }, { "epoch": 1.4226314226314227, "grad_norm": 0.6278248429298401, "learning_rate": 2.1464211464211467e-05, "loss": 0.0669, "step": 4790 }, { "epoch": 1.4256014256014256, "grad_norm": 0.7873584032058716, "learning_rate": 2.144639144639145e-05, "loss": 0.0708, "step": 4800 }, { "epoch": 1.4285714285714286, "grad_norm": 0.42913201451301575, "learning_rate": 2.1428571428571428e-05, "loss": 0.0666, "step": 4810 }, { "epoch": 1.4315414315414317, "grad_norm": 0.34143778681755066, "learning_rate": 2.141075141075141e-05, "loss": 0.0829, "step": 4820 }, { "epoch": 1.4345114345114345, "grad_norm": 0.47077706456184387, "learning_rate": 2.1392931392931392e-05, "loss": 0.0794, "step": 4830 }, { "epoch": 1.4374814374814375, "grad_norm": 0.4886973202228546, "learning_rate": 2.1375111375111375e-05, "loss": 0.0646, "step": 4840 }, { "epoch": 1.4404514404514406, "grad_norm": 0.4241088628768921, "learning_rate": 2.1357291357291357e-05, "loss": 0.0762, "step": 4850 }, { "epoch": 1.4434214434214434, "grad_norm": 0.4464230537414551, "learning_rate": 2.1339471339471342e-05, "loss": 0.0698, "step": 4860 }, { "epoch": 1.4463914463914465, "grad_norm": 0.36223044991493225, "learning_rate": 2.1321651321651325e-05, "loss": 0.0587, "step": 4870 }, { "epoch": 1.4493614493614493, "grad_norm": 0.5170213580131531, "learning_rate": 2.1303831303831303e-05, "loss": 0.0663, "step": 4880 }, { "epoch": 1.4523314523314523, "grad_norm": 0.43765634298324585, "learning_rate": 2.1286011286011286e-05, "loss": 0.07, "step": 4890 }, { "epoch": 1.4553014553014554, "grad_norm": 0.30947327613830566, "learning_rate": 2.1268191268191268e-05, "loss": 0.0682, "step": 4900 }, { "epoch": 1.4582714582714582, "grad_norm": 0.480027973651886, "learning_rate": 2.125037125037125e-05, "loss": 0.0652, "step": 4910 }, { "epoch": 1.4612414612414613, "grad_norm": 0.7047821283340454, "learning_rate": 2.1232551232551232e-05, "loss": 0.0643, "step": 4920 }, { "epoch": 1.464211464211464, "grad_norm": 0.741016685962677, "learning_rate": 2.1214731214731218e-05, "loss": 0.0645, "step": 4930 }, { "epoch": 1.4671814671814671, "grad_norm": 0.5473170280456543, "learning_rate": 2.11969111969112e-05, "loss": 0.0636, "step": 4940 }, { "epoch": 1.4701514701514702, "grad_norm": 0.4111592471599579, "learning_rate": 2.117909117909118e-05, "loss": 0.0676, "step": 4950 }, { "epoch": 1.473121473121473, "grad_norm": 0.7355438470840454, "learning_rate": 2.116127116127116e-05, "loss": 0.0666, "step": 4960 }, { "epoch": 1.476091476091476, "grad_norm": 0.2529616355895996, "learning_rate": 2.1143451143451143e-05, "loss": 0.0626, "step": 4970 }, { "epoch": 1.4790614790614791, "grad_norm": 0.7075737714767456, "learning_rate": 2.1125631125631125e-05, "loss": 0.061, "step": 4980 }, { "epoch": 1.482031482031482, "grad_norm": 0.39400067925453186, "learning_rate": 2.1107811107811107e-05, "loss": 0.0714, "step": 4990 }, { "epoch": 1.485001485001485, "grad_norm": 0.4059322774410248, "learning_rate": 2.1089991089991093e-05, "loss": 0.0587, "step": 5000 }, { "epoch": 1.487971487971488, "grad_norm": 0.3679432272911072, "learning_rate": 2.1072171072171075e-05, "loss": 0.0707, "step": 5010 }, { "epoch": 1.4909414909414909, "grad_norm": 0.45325401425361633, "learning_rate": 2.1054351054351054e-05, "loss": 0.0789, "step": 5020 }, { "epoch": 1.493911493911494, "grad_norm": 0.36480912566185, "learning_rate": 2.1036531036531036e-05, "loss": 0.0742, "step": 5030 }, { "epoch": 1.496881496881497, "grad_norm": 0.4680189788341522, "learning_rate": 2.1018711018711018e-05, "loss": 0.0691, "step": 5040 }, { "epoch": 1.4998514998514998, "grad_norm": 0.40691086649894714, "learning_rate": 2.1000891000891e-05, "loss": 0.0841, "step": 5050 }, { "epoch": 1.5028215028215028, "grad_norm": 0.30459049344062805, "learning_rate": 2.0983070983070982e-05, "loss": 0.0569, "step": 5060 }, { "epoch": 1.505791505791506, "grad_norm": 0.8983843326568604, "learning_rate": 2.0965250965250968e-05, "loss": 0.0719, "step": 5070 }, { "epoch": 1.5087615087615087, "grad_norm": 0.5937901139259338, "learning_rate": 2.094743094743095e-05, "loss": 0.076, "step": 5080 }, { "epoch": 1.5117315117315118, "grad_norm": 0.3914330005645752, "learning_rate": 2.092961092961093e-05, "loss": 0.0698, "step": 5090 }, { "epoch": 1.5147015147015148, "grad_norm": 0.38608691096305847, "learning_rate": 2.091179091179091e-05, "loss": 0.0777, "step": 5100 }, { "epoch": 1.5176715176715176, "grad_norm": 0.7738357186317444, "learning_rate": 2.0893970893970893e-05, "loss": 0.0693, "step": 5110 }, { "epoch": 1.5206415206415207, "grad_norm": 0.6664383411407471, "learning_rate": 2.0876150876150875e-05, "loss": 0.072, "step": 5120 }, { "epoch": 1.5236115236115237, "grad_norm": 0.38639238476753235, "learning_rate": 2.0858330858330858e-05, "loss": 0.0724, "step": 5130 }, { "epoch": 1.5265815265815266, "grad_norm": 0.7060205936431885, "learning_rate": 2.0840510840510843e-05, "loss": 0.0665, "step": 5140 }, { "epoch": 1.5295515295515294, "grad_norm": 0.674248993396759, "learning_rate": 2.0822690822690825e-05, "loss": 0.0769, "step": 5150 }, { "epoch": 1.5325215325215327, "grad_norm": 0.45502710342407227, "learning_rate": 2.0804870804870808e-05, "loss": 0.0808, "step": 5160 }, { "epoch": 1.5354915354915355, "grad_norm": 0.4794248938560486, "learning_rate": 2.0787050787050786e-05, "loss": 0.0701, "step": 5170 }, { "epoch": 1.5384615384615383, "grad_norm": 0.6008143424987793, "learning_rate": 2.076923076923077e-05, "loss": 0.0697, "step": 5180 }, { "epoch": 1.5414315414315416, "grad_norm": 0.5068689584732056, "learning_rate": 2.075141075141075e-05, "loss": 0.079, "step": 5190 }, { "epoch": 1.5444015444015444, "grad_norm": 0.4885605275630951, "learning_rate": 2.0733590733590733e-05, "loss": 0.0749, "step": 5200 }, { "epoch": 1.5473715473715473, "grad_norm": 0.5522565841674805, "learning_rate": 2.071577071577072e-05, "loss": 0.071, "step": 5210 }, { "epoch": 1.5503415503415503, "grad_norm": 0.32774174213409424, "learning_rate": 2.06979506979507e-05, "loss": 0.0556, "step": 5220 }, { "epoch": 1.5533115533115534, "grad_norm": 0.5104330778121948, "learning_rate": 2.0680130680130683e-05, "loss": 0.0738, "step": 5230 }, { "epoch": 1.5562815562815562, "grad_norm": 0.5387243628501892, "learning_rate": 2.066231066231066e-05, "loss": 0.0656, "step": 5240 }, { "epoch": 1.5592515592515592, "grad_norm": 0.49494025111198425, "learning_rate": 2.0644490644490644e-05, "loss": 0.0684, "step": 5250 }, { "epoch": 1.5622215622215623, "grad_norm": 0.5351789593696594, "learning_rate": 2.0626670626670626e-05, "loss": 0.0629, "step": 5260 }, { "epoch": 1.565191565191565, "grad_norm": 0.5836585760116577, "learning_rate": 2.0608850608850608e-05, "loss": 0.0725, "step": 5270 }, { "epoch": 1.5681615681615682, "grad_norm": 0.5254115462303162, "learning_rate": 2.0591030591030594e-05, "loss": 0.0617, "step": 5280 }, { "epoch": 1.5711315711315712, "grad_norm": 0.9055864810943604, "learning_rate": 2.0573210573210576e-05, "loss": 0.0702, "step": 5290 }, { "epoch": 1.574101574101574, "grad_norm": 0.6344959139823914, "learning_rate": 2.0555390555390558e-05, "loss": 0.0571, "step": 5300 }, { "epoch": 1.577071577071577, "grad_norm": 0.4069235622882843, "learning_rate": 2.0537570537570537e-05, "loss": 0.0562, "step": 5310 }, { "epoch": 1.5800415800415801, "grad_norm": 0.4786476194858551, "learning_rate": 2.051975051975052e-05, "loss": 0.0661, "step": 5320 }, { "epoch": 1.583011583011583, "grad_norm": 0.45690423250198364, "learning_rate": 2.05019305019305e-05, "loss": 0.0754, "step": 5330 }, { "epoch": 1.585981585981586, "grad_norm": 0.3506830036640167, "learning_rate": 2.0484110484110483e-05, "loss": 0.0671, "step": 5340 }, { "epoch": 1.588951588951589, "grad_norm": 0.6035703420639038, "learning_rate": 2.046629046629047e-05, "loss": 0.0697, "step": 5350 }, { "epoch": 1.5919215919215919, "grad_norm": 0.5453073382377625, "learning_rate": 2.044847044847045e-05, "loss": 0.0782, "step": 5360 }, { "epoch": 1.594891594891595, "grad_norm": 0.5534022450447083, "learning_rate": 2.0430650430650433e-05, "loss": 0.058, "step": 5370 }, { "epoch": 1.597861597861598, "grad_norm": 0.6920284032821655, "learning_rate": 2.0412830412830412e-05, "loss": 0.0824, "step": 5380 }, { "epoch": 1.6008316008316008, "grad_norm": 0.4295329451560974, "learning_rate": 2.0395010395010394e-05, "loss": 0.0708, "step": 5390 }, { "epoch": 1.6038016038016036, "grad_norm": 0.6782526969909668, "learning_rate": 2.0377190377190376e-05, "loss": 0.0687, "step": 5400 }, { "epoch": 1.606771606771607, "grad_norm": 0.37526410818099976, "learning_rate": 2.035937035937036e-05, "loss": 0.0706, "step": 5410 }, { "epoch": 1.6097416097416097, "grad_norm": 0.581466555595398, "learning_rate": 2.0341550341550344e-05, "loss": 0.0658, "step": 5420 }, { "epoch": 1.6127116127116126, "grad_norm": 0.6016833186149597, "learning_rate": 2.0323730323730326e-05, "loss": 0.0778, "step": 5430 }, { "epoch": 1.6156816156816158, "grad_norm": 0.46572285890579224, "learning_rate": 2.0305910305910308e-05, "loss": 0.0704, "step": 5440 }, { "epoch": 1.6186516186516187, "grad_norm": 0.4586747884750366, "learning_rate": 2.0288090288090287e-05, "loss": 0.0582, "step": 5450 }, { "epoch": 1.6216216216216215, "grad_norm": 0.5045327544212341, "learning_rate": 2.027027027027027e-05, "loss": 0.0774, "step": 5460 }, { "epoch": 1.6245916245916245, "grad_norm": 0.4661884307861328, "learning_rate": 2.025245025245025e-05, "loss": 0.0706, "step": 5470 }, { "epoch": 1.6275616275616276, "grad_norm": 0.3280268609523773, "learning_rate": 2.0234630234630234e-05, "loss": 0.0663, "step": 5480 }, { "epoch": 1.6305316305316304, "grad_norm": 0.4486147165298462, "learning_rate": 2.021681021681022e-05, "loss": 0.0698, "step": 5490 }, { "epoch": 1.6335016335016335, "grad_norm": 0.5801326036453247, "learning_rate": 2.01989901989902e-05, "loss": 0.0808, "step": 5500 }, { "epoch": 1.6364716364716365, "grad_norm": 0.43352991342544556, "learning_rate": 2.0181170181170183e-05, "loss": 0.0522, "step": 5510 }, { "epoch": 1.6394416394416393, "grad_norm": 0.5242543816566467, "learning_rate": 2.0163350163350162e-05, "loss": 0.0744, "step": 5520 }, { "epoch": 1.6424116424116424, "grad_norm": 0.5735893249511719, "learning_rate": 2.0145530145530144e-05, "loss": 0.0663, "step": 5530 }, { "epoch": 1.6453816453816454, "grad_norm": 0.3472582697868347, "learning_rate": 2.0127710127710127e-05, "loss": 0.0685, "step": 5540 }, { "epoch": 1.6483516483516483, "grad_norm": 0.4069629907608032, "learning_rate": 2.010989010989011e-05, "loss": 0.0619, "step": 5550 }, { "epoch": 1.6513216513216513, "grad_norm": 0.5932218432426453, "learning_rate": 2.0092070092070094e-05, "loss": 0.0711, "step": 5560 }, { "epoch": 1.6542916542916544, "grad_norm": 0.7638351321220398, "learning_rate": 2.0074250074250076e-05, "loss": 0.0837, "step": 5570 }, { "epoch": 1.6572616572616572, "grad_norm": 0.7104766368865967, "learning_rate": 2.005643005643006e-05, "loss": 0.0659, "step": 5580 }, { "epoch": 1.6602316602316602, "grad_norm": 0.6623921394348145, "learning_rate": 2.0038610038610037e-05, "loss": 0.0693, "step": 5590 }, { "epoch": 1.6632016632016633, "grad_norm": 0.5632063746452332, "learning_rate": 2.002079002079002e-05, "loss": 0.0647, "step": 5600 }, { "epoch": 1.6661716661716661, "grad_norm": 0.36101019382476807, "learning_rate": 2.0002970002970002e-05, "loss": 0.0724, "step": 5610 }, { "epoch": 1.6691416691416692, "grad_norm": 0.4157385230064392, "learning_rate": 1.9985149985149984e-05, "loss": 0.0609, "step": 5620 }, { "epoch": 1.6721116721116722, "grad_norm": 0.4751082956790924, "learning_rate": 1.996732996732997e-05, "loss": 0.0681, "step": 5630 }, { "epoch": 1.675081675081675, "grad_norm": 0.5545091032981873, "learning_rate": 1.994950994950995e-05, "loss": 0.0726, "step": 5640 }, { "epoch": 1.678051678051678, "grad_norm": 0.7477673888206482, "learning_rate": 1.9931689931689934e-05, "loss": 0.0707, "step": 5650 }, { "epoch": 1.6810216810216811, "grad_norm": 0.8139877915382385, "learning_rate": 1.9913869913869913e-05, "loss": 0.0708, "step": 5660 }, { "epoch": 1.683991683991684, "grad_norm": 0.26891762018203735, "learning_rate": 1.9896049896049895e-05, "loss": 0.0703, "step": 5670 }, { "epoch": 1.6869616869616868, "grad_norm": 0.47424066066741943, "learning_rate": 1.9878229878229877e-05, "loss": 0.0772, "step": 5680 }, { "epoch": 1.68993168993169, "grad_norm": 0.41330039501190186, "learning_rate": 1.986040986040986e-05, "loss": 0.078, "step": 5690 }, { "epoch": 1.692901692901693, "grad_norm": 0.45802241563796997, "learning_rate": 1.9842589842589845e-05, "loss": 0.0588, "step": 5700 }, { "epoch": 1.6958716958716957, "grad_norm": 0.5270569920539856, "learning_rate": 1.9824769824769827e-05, "loss": 0.066, "step": 5710 }, { "epoch": 1.698841698841699, "grad_norm": 0.5334698557853699, "learning_rate": 1.980694980694981e-05, "loss": 0.0795, "step": 5720 }, { "epoch": 1.7018117018117018, "grad_norm": 0.4093966484069824, "learning_rate": 1.978912978912979e-05, "loss": 0.0752, "step": 5730 }, { "epoch": 1.7047817047817047, "grad_norm": 0.5499134659767151, "learning_rate": 1.977130977130977e-05, "loss": 0.071, "step": 5740 }, { "epoch": 1.7077517077517077, "grad_norm": 0.5507758259773254, "learning_rate": 1.9753489753489752e-05, "loss": 0.0762, "step": 5750 }, { "epoch": 1.7107217107217108, "grad_norm": 0.726193904876709, "learning_rate": 1.9735669735669734e-05, "loss": 0.073, "step": 5760 }, { "epoch": 1.7136917136917136, "grad_norm": 0.499423086643219, "learning_rate": 1.971784971784972e-05, "loss": 0.0669, "step": 5770 }, { "epoch": 1.7166617166617166, "grad_norm": 0.4177100956439972, "learning_rate": 1.9700029700029702e-05, "loss": 0.0643, "step": 5780 }, { "epoch": 1.7196317196317197, "grad_norm": 0.7960310578346252, "learning_rate": 1.9682209682209684e-05, "loss": 0.0724, "step": 5790 }, { "epoch": 1.7226017226017225, "grad_norm": 0.4406733512878418, "learning_rate": 1.9664389664389666e-05, "loss": 0.0776, "step": 5800 }, { "epoch": 1.7255717255717256, "grad_norm": 0.530737042427063, "learning_rate": 1.9646569646569645e-05, "loss": 0.0693, "step": 5810 }, { "epoch": 1.7285417285417286, "grad_norm": 0.29855164885520935, "learning_rate": 1.9628749628749627e-05, "loss": 0.0719, "step": 5820 }, { "epoch": 1.7315117315117314, "grad_norm": 0.5606129765510559, "learning_rate": 1.961092961092961e-05, "loss": 0.0773, "step": 5830 }, { "epoch": 1.7344817344817345, "grad_norm": 0.4716852307319641, "learning_rate": 1.9593109593109595e-05, "loss": 0.0699, "step": 5840 }, { "epoch": 1.7374517374517375, "grad_norm": 0.39249199628829956, "learning_rate": 1.9575289575289577e-05, "loss": 0.078, "step": 5850 }, { "epoch": 1.7404217404217404, "grad_norm": 0.5014438629150391, "learning_rate": 1.955746955746956e-05, "loss": 0.0753, "step": 5860 }, { "epoch": 1.7433917433917434, "grad_norm": 0.535271167755127, "learning_rate": 1.953964953964954e-05, "loss": 0.0722, "step": 5870 }, { "epoch": 1.7463617463617465, "grad_norm": 0.3693440854549408, "learning_rate": 1.952182952182952e-05, "loss": 0.0824, "step": 5880 }, { "epoch": 1.7493317493317493, "grad_norm": 0.6997837424278259, "learning_rate": 1.9504009504009503e-05, "loss": 0.0719, "step": 5890 }, { "epoch": 1.7523017523017523, "grad_norm": 0.3417227864265442, "learning_rate": 1.9486189486189485e-05, "loss": 0.0701, "step": 5900 }, { "epoch": 1.7552717552717554, "grad_norm": 1.077194094657898, "learning_rate": 1.946836946836947e-05, "loss": 0.0787, "step": 5910 }, { "epoch": 1.7582417582417582, "grad_norm": 0.7957248687744141, "learning_rate": 1.9450549450549452e-05, "loss": 0.0878, "step": 5920 }, { "epoch": 1.7612117612117613, "grad_norm": 0.7661225199699402, "learning_rate": 1.9432729432729435e-05, "loss": 0.0701, "step": 5930 }, { "epoch": 1.7641817641817643, "grad_norm": 0.4629841446876526, "learning_rate": 1.9414909414909417e-05, "loss": 0.0704, "step": 5940 }, { "epoch": 1.7671517671517671, "grad_norm": 0.9389346241950989, "learning_rate": 1.9397089397089396e-05, "loss": 0.076, "step": 5950 }, { "epoch": 1.77012177012177, "grad_norm": 0.3709728717803955, "learning_rate": 1.9379269379269378e-05, "loss": 0.064, "step": 5960 }, { "epoch": 1.7730917730917732, "grad_norm": 0.4123302102088928, "learning_rate": 1.936144936144936e-05, "loss": 0.0762, "step": 5970 }, { "epoch": 1.776061776061776, "grad_norm": 0.5153429508209229, "learning_rate": 1.9343629343629345e-05, "loss": 0.0825, "step": 5980 }, { "epoch": 1.779031779031779, "grad_norm": 0.2630942761898041, "learning_rate": 1.9325809325809328e-05, "loss": 0.0578, "step": 5990 }, { "epoch": 1.7820017820017822, "grad_norm": 0.4419863522052765, "learning_rate": 1.930798930798931e-05, "loss": 0.0548, "step": 6000 }, { "epoch": 1.784971784971785, "grad_norm": 0.46090295910835266, "learning_rate": 1.9290169290169292e-05, "loss": 0.073, "step": 6010 }, { "epoch": 1.7879417879417878, "grad_norm": 1.1012392044067383, "learning_rate": 1.927234927234927e-05, "loss": 0.0725, "step": 6020 }, { "epoch": 1.7909117909117909, "grad_norm": 0.422880083322525, "learning_rate": 1.9254529254529253e-05, "loss": 0.0674, "step": 6030 }, { "epoch": 1.793881793881794, "grad_norm": 0.6051161885261536, "learning_rate": 1.9236709236709235e-05, "loss": 0.0669, "step": 6040 }, { "epoch": 1.7968517968517967, "grad_norm": 0.351578027009964, "learning_rate": 1.921888921888922e-05, "loss": 0.069, "step": 6050 }, { "epoch": 1.7998217998217998, "grad_norm": 0.606691300868988, "learning_rate": 1.9201069201069203e-05, "loss": 0.0641, "step": 6060 }, { "epoch": 1.8027918027918028, "grad_norm": 0.8968992829322815, "learning_rate": 1.9183249183249185e-05, "loss": 0.0734, "step": 6070 }, { "epoch": 1.8057618057618057, "grad_norm": 0.5204905867576599, "learning_rate": 1.9165429165429167e-05, "loss": 0.0741, "step": 6080 }, { "epoch": 1.8087318087318087, "grad_norm": 0.6135872602462769, "learning_rate": 1.9147609147609146e-05, "loss": 0.0791, "step": 6090 }, { "epoch": 1.8117018117018118, "grad_norm": 0.5273720622062683, "learning_rate": 1.9129789129789128e-05, "loss": 0.0655, "step": 6100 }, { "epoch": 1.8146718146718146, "grad_norm": 0.4117693305015564, "learning_rate": 1.911196911196911e-05, "loss": 0.0658, "step": 6110 }, { "epoch": 1.8176418176418176, "grad_norm": 0.5177286267280579, "learning_rate": 1.9094149094149096e-05, "loss": 0.0773, "step": 6120 }, { "epoch": 1.8206118206118207, "grad_norm": 0.5179166793823242, "learning_rate": 1.9076329076329078e-05, "loss": 0.06, "step": 6130 }, { "epoch": 1.8235818235818235, "grad_norm": 0.48499223589897156, "learning_rate": 1.905850905850906e-05, "loss": 0.0876, "step": 6140 }, { "epoch": 1.8265518265518266, "grad_norm": 0.5573757886886597, "learning_rate": 1.9040689040689042e-05, "loss": 0.0684, "step": 6150 }, { "epoch": 1.8295218295218296, "grad_norm": 0.481963574886322, "learning_rate": 1.902286902286902e-05, "loss": 0.0807, "step": 6160 }, { "epoch": 1.8324918324918325, "grad_norm": 0.4293064475059509, "learning_rate": 1.9005049005049003e-05, "loss": 0.0779, "step": 6170 }, { "epoch": 1.8354618354618355, "grad_norm": 0.4655805826187134, "learning_rate": 1.8987228987228986e-05, "loss": 0.0782, "step": 6180 }, { "epoch": 1.8384318384318385, "grad_norm": 0.5430210828781128, "learning_rate": 1.896940896940897e-05, "loss": 0.0601, "step": 6190 }, { "epoch": 1.8414018414018414, "grad_norm": 0.9118245244026184, "learning_rate": 1.8951588951588953e-05, "loss": 0.076, "step": 6200 }, { "epoch": 1.8443718443718444, "grad_norm": 0.3974968194961548, "learning_rate": 1.8933768933768935e-05, "loss": 0.0742, "step": 6210 }, { "epoch": 1.8473418473418475, "grad_norm": 0.393530935049057, "learning_rate": 1.8915948915948918e-05, "loss": 0.0755, "step": 6220 }, { "epoch": 1.8503118503118503, "grad_norm": 0.6730740070343018, "learning_rate": 1.8898128898128896e-05, "loss": 0.0675, "step": 6230 }, { "epoch": 1.8532818532818531, "grad_norm": 0.5142623782157898, "learning_rate": 1.888030888030888e-05, "loss": 0.0694, "step": 6240 }, { "epoch": 1.8562518562518564, "grad_norm": 0.344099223613739, "learning_rate": 1.886248886248886e-05, "loss": 0.0591, "step": 6250 }, { "epoch": 1.8592218592218592, "grad_norm": 0.5664836168289185, "learning_rate": 1.8844668844668846e-05, "loss": 0.0725, "step": 6260 }, { "epoch": 1.862191862191862, "grad_norm": 0.2773604989051819, "learning_rate": 1.882684882684883e-05, "loss": 0.0653, "step": 6270 }, { "epoch": 1.865161865161865, "grad_norm": 0.35496875643730164, "learning_rate": 1.880902880902881e-05, "loss": 0.0711, "step": 6280 }, { "epoch": 1.8681318681318682, "grad_norm": 0.2887316644191742, "learning_rate": 1.8791208791208793e-05, "loss": 0.0661, "step": 6290 }, { "epoch": 1.871101871101871, "grad_norm": 0.5518425107002258, "learning_rate": 1.8773388773388775e-05, "loss": 0.0663, "step": 6300 }, { "epoch": 1.874071874071874, "grad_norm": 0.6332146525382996, "learning_rate": 1.8755568755568754e-05, "loss": 0.079, "step": 6310 }, { "epoch": 1.877041877041877, "grad_norm": 0.49415746331214905, "learning_rate": 1.8737748737748736e-05, "loss": 0.071, "step": 6320 }, { "epoch": 1.88001188001188, "grad_norm": 0.6736321449279785, "learning_rate": 1.871992871992872e-05, "loss": 0.0749, "step": 6330 }, { "epoch": 1.882981882981883, "grad_norm": 0.9153728485107422, "learning_rate": 1.8702108702108704e-05, "loss": 0.0689, "step": 6340 }, { "epoch": 1.885951885951886, "grad_norm": 0.3608382046222687, "learning_rate": 1.8684288684288686e-05, "loss": 0.064, "step": 6350 }, { "epoch": 1.8889218889218888, "grad_norm": 0.3779090344905853, "learning_rate": 1.8666468666468668e-05, "loss": 0.072, "step": 6360 }, { "epoch": 1.8918918918918919, "grad_norm": 0.5436325669288635, "learning_rate": 1.864864864864865e-05, "loss": 0.0738, "step": 6370 }, { "epoch": 1.894861894861895, "grad_norm": 0.720585823059082, "learning_rate": 1.863082863082863e-05, "loss": 0.0651, "step": 6380 }, { "epoch": 1.8978318978318978, "grad_norm": 0.3137255609035492, "learning_rate": 1.861300861300861e-05, "loss": 0.0634, "step": 6390 }, { "epoch": 1.9008019008019008, "grad_norm": 0.6744168400764465, "learning_rate": 1.8595188595188597e-05, "loss": 0.0652, "step": 6400 }, { "epoch": 1.9037719037719039, "grad_norm": 0.33474233746528625, "learning_rate": 1.857736857736858e-05, "loss": 0.0663, "step": 6410 }, { "epoch": 1.9067419067419067, "grad_norm": 0.3249685764312744, "learning_rate": 1.855954855954856e-05, "loss": 0.0772, "step": 6420 }, { "epoch": 1.9097119097119097, "grad_norm": 0.8481233716011047, "learning_rate": 1.8541728541728543e-05, "loss": 0.0715, "step": 6430 }, { "epoch": 1.9126819126819128, "grad_norm": 0.40865710377693176, "learning_rate": 1.8523908523908525e-05, "loss": 0.0672, "step": 6440 }, { "epoch": 1.9156519156519156, "grad_norm": 0.40034782886505127, "learning_rate": 1.8506088506088504e-05, "loss": 0.0599, "step": 6450 }, { "epoch": 1.9186219186219187, "grad_norm": 0.37332504987716675, "learning_rate": 1.8488268488268486e-05, "loss": 0.0726, "step": 6460 }, { "epoch": 1.9215919215919217, "grad_norm": 0.39983534812927246, "learning_rate": 1.8470448470448472e-05, "loss": 0.0673, "step": 6470 }, { "epoch": 1.9245619245619245, "grad_norm": 0.3581918776035309, "learning_rate": 1.8452628452628454e-05, "loss": 0.0644, "step": 6480 }, { "epoch": 1.9275319275319274, "grad_norm": 0.4143809676170349, "learning_rate": 1.8434808434808436e-05, "loss": 0.0582, "step": 6490 }, { "epoch": 1.9305019305019306, "grad_norm": 0.42415744066238403, "learning_rate": 1.841698841698842e-05, "loss": 0.067, "step": 6500 }, { "epoch": 1.9334719334719335, "grad_norm": 0.5267529487609863, "learning_rate": 1.83991683991684e-05, "loss": 0.0562, "step": 6510 }, { "epoch": 1.9364419364419363, "grad_norm": 0.44437262415885925, "learning_rate": 1.838134838134838e-05, "loss": 0.0723, "step": 6520 }, { "epoch": 1.9394119394119396, "grad_norm": 0.5024462938308716, "learning_rate": 1.836352836352836e-05, "loss": 0.0788, "step": 6530 }, { "epoch": 1.9423819423819424, "grad_norm": 0.3392117917537689, "learning_rate": 1.8345708345708347e-05, "loss": 0.0685, "step": 6540 }, { "epoch": 1.9453519453519452, "grad_norm": 0.4275413751602173, "learning_rate": 1.832788832788833e-05, "loss": 0.0728, "step": 6550 }, { "epoch": 1.9483219483219483, "grad_norm": 0.3413922190666199, "learning_rate": 1.831006831006831e-05, "loss": 0.0694, "step": 6560 }, { "epoch": 1.9512919512919513, "grad_norm": 0.4779782295227051, "learning_rate": 1.8292248292248294e-05, "loss": 0.0654, "step": 6570 }, { "epoch": 1.9542619542619541, "grad_norm": 0.4912964701652527, "learning_rate": 1.8274428274428276e-05, "loss": 0.0729, "step": 6580 }, { "epoch": 1.9572319572319572, "grad_norm": 0.3358478546142578, "learning_rate": 1.8256608256608254e-05, "loss": 0.069, "step": 6590 }, { "epoch": 1.9602019602019602, "grad_norm": 0.5066028237342834, "learning_rate": 1.8238788238788237e-05, "loss": 0.0626, "step": 6600 }, { "epoch": 1.963171963171963, "grad_norm": 0.5891350507736206, "learning_rate": 1.8220968220968222e-05, "loss": 0.0643, "step": 6610 }, { "epoch": 1.9661419661419661, "grad_norm": 0.5142768621444702, "learning_rate": 1.8203148203148204e-05, "loss": 0.06, "step": 6620 }, { "epoch": 1.9691119691119692, "grad_norm": 0.463016539812088, "learning_rate": 1.8185328185328187e-05, "loss": 0.062, "step": 6630 }, { "epoch": 1.972081972081972, "grad_norm": 0.27797237038612366, "learning_rate": 1.816750816750817e-05, "loss": 0.0638, "step": 6640 }, { "epoch": 1.975051975051975, "grad_norm": 0.8923652768135071, "learning_rate": 1.814968814968815e-05, "loss": 0.0815, "step": 6650 }, { "epoch": 1.978021978021978, "grad_norm": 0.43557631969451904, "learning_rate": 1.813186813186813e-05, "loss": 0.066, "step": 6660 }, { "epoch": 1.980991980991981, "grad_norm": 0.40481114387512207, "learning_rate": 1.8114048114048112e-05, "loss": 0.0685, "step": 6670 }, { "epoch": 1.983961983961984, "grad_norm": 0.5298916101455688, "learning_rate": 1.8096228096228097e-05, "loss": 0.0546, "step": 6680 }, { "epoch": 1.986931986931987, "grad_norm": 0.687917172908783, "learning_rate": 1.807840807840808e-05, "loss": 0.0713, "step": 6690 }, { "epoch": 1.9899019899019899, "grad_norm": 0.44359517097473145, "learning_rate": 1.8060588060588062e-05, "loss": 0.0659, "step": 6700 }, { "epoch": 1.992871992871993, "grad_norm": 0.48727744817733765, "learning_rate": 1.8042768042768044e-05, "loss": 0.075, "step": 6710 }, { "epoch": 1.995841995841996, "grad_norm": 0.46480777859687805, "learning_rate": 1.8024948024948026e-05, "loss": 0.0758, "step": 6720 }, { "epoch": 1.9988119988119988, "grad_norm": 0.7983739376068115, "learning_rate": 1.8007128007128005e-05, "loss": 0.0611, "step": 6730 }, { "epoch": 2.0, "eval_f1": 0.49727767695099817, "eval_loss": 0.05854379013180733, "eval_runtime": 176.456, "eval_samples_per_second": 215.459, "eval_steps_per_second": 3.372, "step": 6734 }, { "epoch": 2.0017820017820016, "grad_norm": 0.7805753946304321, "learning_rate": 1.7989307989307987e-05, "loss": 0.0529, "step": 6740 }, { "epoch": 2.004752004752005, "grad_norm": 0.436716765165329, "learning_rate": 1.7971487971487973e-05, "loss": 0.0657, "step": 6750 }, { "epoch": 2.0077220077220077, "grad_norm": 0.347323477268219, "learning_rate": 1.7953667953667955e-05, "loss": 0.0733, "step": 6760 }, { "epoch": 2.0106920106920105, "grad_norm": 0.4401879608631134, "learning_rate": 1.7935847935847937e-05, "loss": 0.0709, "step": 6770 }, { "epoch": 2.013662013662014, "grad_norm": 0.6687297224998474, "learning_rate": 1.791802791802792e-05, "loss": 0.0655, "step": 6780 }, { "epoch": 2.0166320166320166, "grad_norm": 0.6250702142715454, "learning_rate": 1.79002079002079e-05, "loss": 0.072, "step": 6790 }, { "epoch": 2.0196020196020195, "grad_norm": 0.7498565912246704, "learning_rate": 1.788238788238788e-05, "loss": 0.0724, "step": 6800 }, { "epoch": 2.0225720225720227, "grad_norm": 0.5209968090057373, "learning_rate": 1.7864567864567862e-05, "loss": 0.0686, "step": 6810 }, { "epoch": 2.0255420255420256, "grad_norm": 0.6656198501586914, "learning_rate": 1.7846747846747848e-05, "loss": 0.0636, "step": 6820 }, { "epoch": 2.0285120285120284, "grad_norm": 0.4398493766784668, "learning_rate": 1.782892782892783e-05, "loss": 0.0626, "step": 6830 }, { "epoch": 2.0314820314820317, "grad_norm": 0.3464367985725403, "learning_rate": 1.7811107811107812e-05, "loss": 0.0636, "step": 6840 }, { "epoch": 2.0344520344520345, "grad_norm": 0.5368358492851257, "learning_rate": 1.7793287793287794e-05, "loss": 0.0661, "step": 6850 }, { "epoch": 2.0374220374220373, "grad_norm": 0.6472384929656982, "learning_rate": 1.7775467775467776e-05, "loss": 0.0647, "step": 6860 }, { "epoch": 2.0403920403920406, "grad_norm": 0.49248170852661133, "learning_rate": 1.7757647757647755e-05, "loss": 0.0689, "step": 6870 }, { "epoch": 2.0433620433620434, "grad_norm": 0.2520935535430908, "learning_rate": 1.7739827739827737e-05, "loss": 0.074, "step": 6880 }, { "epoch": 2.0463320463320462, "grad_norm": 0.5810942053794861, "learning_rate": 1.7722007722007723e-05, "loss": 0.0675, "step": 6890 }, { "epoch": 2.0493020493020495, "grad_norm": 0.7744080424308777, "learning_rate": 1.7704187704187705e-05, "loss": 0.0636, "step": 6900 }, { "epoch": 2.0522720522720523, "grad_norm": 0.6021985411643982, "learning_rate": 1.7686367686367687e-05, "loss": 0.0683, "step": 6910 }, { "epoch": 2.055242055242055, "grad_norm": 0.6123180985450745, "learning_rate": 1.766854766854767e-05, "loss": 0.0791, "step": 6920 }, { "epoch": 2.0582120582120584, "grad_norm": 0.6447744965553284, "learning_rate": 1.765072765072765e-05, "loss": 0.0705, "step": 6930 }, { "epoch": 2.0611820611820613, "grad_norm": 0.5168854594230652, "learning_rate": 1.7632907632907634e-05, "loss": 0.0611, "step": 6940 }, { "epoch": 2.064152064152064, "grad_norm": 0.9751071333885193, "learning_rate": 1.7615087615087613e-05, "loss": 0.0662, "step": 6950 }, { "epoch": 2.067122067122067, "grad_norm": 0.5001913905143738, "learning_rate": 1.7597267597267598e-05, "loss": 0.0654, "step": 6960 }, { "epoch": 2.07009207009207, "grad_norm": 0.6123823523521423, "learning_rate": 1.757944757944758e-05, "loss": 0.0806, "step": 6970 }, { "epoch": 2.073062073062073, "grad_norm": 0.5449308156967163, "learning_rate": 1.7561627561627563e-05, "loss": 0.0643, "step": 6980 }, { "epoch": 2.076032076032076, "grad_norm": 0.31201791763305664, "learning_rate": 1.7543807543807545e-05, "loss": 0.0739, "step": 6990 }, { "epoch": 2.079002079002079, "grad_norm": 0.8544298410415649, "learning_rate": 1.7525987525987527e-05, "loss": 0.069, "step": 7000 }, { "epoch": 2.081972081972082, "grad_norm": 0.5308842062950134, "learning_rate": 1.750816750816751e-05, "loss": 0.0658, "step": 7010 }, { "epoch": 2.0849420849420848, "grad_norm": 0.47668206691741943, "learning_rate": 1.7490347490347488e-05, "loss": 0.076, "step": 7020 }, { "epoch": 2.087912087912088, "grad_norm": 0.47977229952812195, "learning_rate": 1.7472527472527473e-05, "loss": 0.0683, "step": 7030 }, { "epoch": 2.090882090882091, "grad_norm": 0.9374080896377563, "learning_rate": 1.7454707454707456e-05, "loss": 0.0743, "step": 7040 }, { "epoch": 2.0938520938520937, "grad_norm": 0.5665603280067444, "learning_rate": 1.7436887436887438e-05, "loss": 0.0725, "step": 7050 }, { "epoch": 2.096822096822097, "grad_norm": 0.44158872961997986, "learning_rate": 1.741906741906742e-05, "loss": 0.0769, "step": 7060 }, { "epoch": 2.0997920997921, "grad_norm": 0.36570894718170166, "learning_rate": 1.7401247401247402e-05, "loss": 0.0686, "step": 7070 }, { "epoch": 2.1027621027621026, "grad_norm": 0.4633289575576782, "learning_rate": 1.7383427383427384e-05, "loss": 0.0717, "step": 7080 }, { "epoch": 2.105732105732106, "grad_norm": 0.6178393959999084, "learning_rate": 1.7365607365607363e-05, "loss": 0.0719, "step": 7090 }, { "epoch": 2.1087021087021087, "grad_norm": 0.3964090049266815, "learning_rate": 1.734778734778735e-05, "loss": 0.0542, "step": 7100 }, { "epoch": 2.1116721116721116, "grad_norm": 0.3831978738307953, "learning_rate": 1.732996732996733e-05, "loss": 0.0711, "step": 7110 }, { "epoch": 2.114642114642115, "grad_norm": 0.4152994453907013, "learning_rate": 1.7312147312147313e-05, "loss": 0.0617, "step": 7120 }, { "epoch": 2.1176121176121177, "grad_norm": 0.5786647796630859, "learning_rate": 1.7294327294327295e-05, "loss": 0.0678, "step": 7130 }, { "epoch": 2.1205821205821205, "grad_norm": 0.5444033145904541, "learning_rate": 1.7276507276507277e-05, "loss": 0.0605, "step": 7140 }, { "epoch": 2.1235521235521237, "grad_norm": 0.18499556183815002, "learning_rate": 1.725868725868726e-05, "loss": 0.0568, "step": 7150 }, { "epoch": 2.1265221265221266, "grad_norm": 0.3817172050476074, "learning_rate": 1.7240867240867238e-05, "loss": 0.0559, "step": 7160 }, { "epoch": 2.1294921294921294, "grad_norm": 0.5504813194274902, "learning_rate": 1.7223047223047224e-05, "loss": 0.0642, "step": 7170 }, { "epoch": 2.1324621324621322, "grad_norm": 0.34808218479156494, "learning_rate": 1.7205227205227206e-05, "loss": 0.0483, "step": 7180 }, { "epoch": 2.1354321354321355, "grad_norm": 0.45135316252708435, "learning_rate": 1.7187407187407188e-05, "loss": 0.0591, "step": 7190 }, { "epoch": 2.1384021384021383, "grad_norm": 0.5405902862548828, "learning_rate": 1.716958716958717e-05, "loss": 0.0548, "step": 7200 }, { "epoch": 2.141372141372141, "grad_norm": 0.4525381624698639, "learning_rate": 1.7151767151767152e-05, "loss": 0.0775, "step": 7210 }, { "epoch": 2.1443421443421444, "grad_norm": 0.9278238415718079, "learning_rate": 1.7133947133947135e-05, "loss": 0.0748, "step": 7220 }, { "epoch": 2.1473121473121473, "grad_norm": 0.34462785720825195, "learning_rate": 1.7116127116127117e-05, "loss": 0.0693, "step": 7230 }, { "epoch": 2.15028215028215, "grad_norm": 0.5502927899360657, "learning_rate": 1.70983070983071e-05, "loss": 0.0704, "step": 7240 }, { "epoch": 2.1532521532521534, "grad_norm": 0.5558304786682129, "learning_rate": 1.708048708048708e-05, "loss": 0.072, "step": 7250 }, { "epoch": 2.156222156222156, "grad_norm": 0.43772050738334656, "learning_rate": 1.7062667062667063e-05, "loss": 0.0609, "step": 7260 }, { "epoch": 2.159192159192159, "grad_norm": 0.85486900806427, "learning_rate": 1.7044847044847045e-05, "loss": 0.0768, "step": 7270 }, { "epoch": 2.1621621621621623, "grad_norm": 0.31786465644836426, "learning_rate": 1.7027027027027028e-05, "loss": 0.0578, "step": 7280 }, { "epoch": 2.165132165132165, "grad_norm": 0.37934377789497375, "learning_rate": 1.700920700920701e-05, "loss": 0.0724, "step": 7290 }, { "epoch": 2.168102168102168, "grad_norm": 0.5212098360061646, "learning_rate": 1.6991386991386992e-05, "loss": 0.0587, "step": 7300 }, { "epoch": 2.171072171072171, "grad_norm": 0.4610010087490082, "learning_rate": 1.6973566973566974e-05, "loss": 0.0659, "step": 7310 }, { "epoch": 2.174042174042174, "grad_norm": 0.4683549404144287, "learning_rate": 1.6955746955746956e-05, "loss": 0.0559, "step": 7320 }, { "epoch": 2.177012177012177, "grad_norm": 1.537858009338379, "learning_rate": 1.693792693792694e-05, "loss": 0.0657, "step": 7330 }, { "epoch": 2.17998217998218, "grad_norm": 0.8588612675666809, "learning_rate": 1.692010692010692e-05, "loss": 0.0818, "step": 7340 }, { "epoch": 2.182952182952183, "grad_norm": 0.5644201636314392, "learning_rate": 1.6902286902286903e-05, "loss": 0.0553, "step": 7350 }, { "epoch": 2.185922185922186, "grad_norm": 0.5589690804481506, "learning_rate": 1.6884466884466885e-05, "loss": 0.0634, "step": 7360 }, { "epoch": 2.188892188892189, "grad_norm": 0.4421149790287018, "learning_rate": 1.6866646866646867e-05, "loss": 0.0714, "step": 7370 }, { "epoch": 2.191862191862192, "grad_norm": 0.7251006960868835, "learning_rate": 1.684882684882685e-05, "loss": 0.0741, "step": 7380 }, { "epoch": 2.1948321948321947, "grad_norm": 0.5653437972068787, "learning_rate": 1.683100683100683e-05, "loss": 0.0636, "step": 7390 }, { "epoch": 2.197802197802198, "grad_norm": 0.37989261746406555, "learning_rate": 1.6813186813186814e-05, "loss": 0.0681, "step": 7400 }, { "epoch": 2.200772200772201, "grad_norm": 0.38947612047195435, "learning_rate": 1.6795366795366796e-05, "loss": 0.0584, "step": 7410 }, { "epoch": 2.2037422037422036, "grad_norm": 0.5566168427467346, "learning_rate": 1.6777546777546778e-05, "loss": 0.0567, "step": 7420 }, { "epoch": 2.206712206712207, "grad_norm": 0.664364755153656, "learning_rate": 1.675972675972676e-05, "loss": 0.0735, "step": 7430 }, { "epoch": 2.2096822096822097, "grad_norm": 0.3879406154155731, "learning_rate": 1.6741906741906742e-05, "loss": 0.0684, "step": 7440 }, { "epoch": 2.2126522126522126, "grad_norm": 0.34745240211486816, "learning_rate": 1.6724086724086725e-05, "loss": 0.0727, "step": 7450 }, { "epoch": 2.215622215622216, "grad_norm": 0.48188093304634094, "learning_rate": 1.6706266706266707e-05, "loss": 0.0622, "step": 7460 }, { "epoch": 2.2185922185922187, "grad_norm": 0.6234533786773682, "learning_rate": 1.668844668844669e-05, "loss": 0.0681, "step": 7470 }, { "epoch": 2.2215622215622215, "grad_norm": 0.6129441261291504, "learning_rate": 1.667062667062667e-05, "loss": 0.0674, "step": 7480 }, { "epoch": 2.2245322245322248, "grad_norm": 0.36409103870391846, "learning_rate": 1.6652806652806653e-05, "loss": 0.0673, "step": 7490 }, { "epoch": 2.2275022275022276, "grad_norm": 0.8309186697006226, "learning_rate": 1.6634986634986635e-05, "loss": 0.0543, "step": 7500 }, { "epoch": 2.2304722304722304, "grad_norm": 0.4031508266925812, "learning_rate": 1.6617166617166618e-05, "loss": 0.0842, "step": 7510 }, { "epoch": 2.2334422334422332, "grad_norm": 0.35200947523117065, "learning_rate": 1.65993465993466e-05, "loss": 0.0611, "step": 7520 }, { "epoch": 2.2364122364122365, "grad_norm": 0.5327655673027039, "learning_rate": 1.6581526581526582e-05, "loss": 0.0611, "step": 7530 }, { "epoch": 2.2393822393822393, "grad_norm": 0.3595449924468994, "learning_rate": 1.6563706563706564e-05, "loss": 0.0718, "step": 7540 }, { "epoch": 2.242352242352242, "grad_norm": 0.6577255129814148, "learning_rate": 1.6545886545886546e-05, "loss": 0.0748, "step": 7550 }, { "epoch": 2.2453222453222454, "grad_norm": 0.46405327320098877, "learning_rate": 1.652806652806653e-05, "loss": 0.0735, "step": 7560 }, { "epoch": 2.2482922482922483, "grad_norm": 0.6792459487915039, "learning_rate": 1.651024651024651e-05, "loss": 0.0809, "step": 7570 }, { "epoch": 2.251262251262251, "grad_norm": 0.4969274401664734, "learning_rate": 1.6492426492426496e-05, "loss": 0.0616, "step": 7580 }, { "epoch": 2.2542322542322544, "grad_norm": 0.7882120609283447, "learning_rate": 1.6474606474606475e-05, "loss": 0.0756, "step": 7590 }, { "epoch": 2.257202257202257, "grad_norm": 0.4611985683441162, "learning_rate": 1.6456786456786457e-05, "loss": 0.0658, "step": 7600 }, { "epoch": 2.26017226017226, "grad_norm": 0.5099868774414062, "learning_rate": 1.643896643896644e-05, "loss": 0.0691, "step": 7610 }, { "epoch": 2.2631422631422633, "grad_norm": 0.461518257856369, "learning_rate": 1.642114642114642e-05, "loss": 0.0604, "step": 7620 }, { "epoch": 2.266112266112266, "grad_norm": 0.3580944240093231, "learning_rate": 1.6403326403326404e-05, "loss": 0.0609, "step": 7630 }, { "epoch": 2.269082269082269, "grad_norm": 0.36803242564201355, "learning_rate": 1.6385506385506386e-05, "loss": 0.0674, "step": 7640 }, { "epoch": 2.2720522720522722, "grad_norm": 0.3887629806995392, "learning_rate": 1.636768636768637e-05, "loss": 0.0733, "step": 7650 }, { "epoch": 2.275022275022275, "grad_norm": 0.6474005579948425, "learning_rate": 1.634986634986635e-05, "loss": 0.0679, "step": 7660 }, { "epoch": 2.277992277992278, "grad_norm": 0.6048378944396973, "learning_rate": 1.6332046332046332e-05, "loss": 0.0643, "step": 7670 }, { "epoch": 2.280962280962281, "grad_norm": 0.45778071880340576, "learning_rate": 1.6314226314226314e-05, "loss": 0.0673, "step": 7680 }, { "epoch": 2.283932283932284, "grad_norm": 0.6061732172966003, "learning_rate": 1.6296406296406297e-05, "loss": 0.0786, "step": 7690 }, { "epoch": 2.286902286902287, "grad_norm": 0.4730798602104187, "learning_rate": 1.627858627858628e-05, "loss": 0.0572, "step": 7700 }, { "epoch": 2.2898722898722896, "grad_norm": 0.43137332797050476, "learning_rate": 1.626076626076626e-05, "loss": 0.0572, "step": 7710 }, { "epoch": 2.292842292842293, "grad_norm": 0.36243513226509094, "learning_rate": 1.6242946242946247e-05, "loss": 0.0722, "step": 7720 }, { "epoch": 2.2958122958122957, "grad_norm": 0.6039568781852722, "learning_rate": 1.6225126225126225e-05, "loss": 0.0628, "step": 7730 }, { "epoch": 2.2987822987822986, "grad_norm": 0.7253831028938293, "learning_rate": 1.6207306207306207e-05, "loss": 0.054, "step": 7740 }, { "epoch": 2.301752301752302, "grad_norm": 0.675613522529602, "learning_rate": 1.618948618948619e-05, "loss": 0.0689, "step": 7750 }, { "epoch": 2.3047223047223047, "grad_norm": 0.5215617418289185, "learning_rate": 1.6171666171666172e-05, "loss": 0.0733, "step": 7760 }, { "epoch": 2.3076923076923075, "grad_norm": 0.6846175193786621, "learning_rate": 1.6153846153846154e-05, "loss": 0.0615, "step": 7770 }, { "epoch": 2.3106623106623108, "grad_norm": 0.397480845451355, "learning_rate": 1.6136026136026136e-05, "loss": 0.0641, "step": 7780 }, { "epoch": 2.3136323136323136, "grad_norm": 0.33144304156303406, "learning_rate": 1.6118206118206122e-05, "loss": 0.0605, "step": 7790 }, { "epoch": 2.3166023166023164, "grad_norm": 0.452396035194397, "learning_rate": 1.61003861003861e-05, "loss": 0.0787, "step": 7800 }, { "epoch": 2.3195723195723197, "grad_norm": 0.4840039908885956, "learning_rate": 1.6082566082566083e-05, "loss": 0.0756, "step": 7810 }, { "epoch": 2.3225423225423225, "grad_norm": 0.3425714671611786, "learning_rate": 1.6064746064746065e-05, "loss": 0.0661, "step": 7820 }, { "epoch": 2.3255123255123253, "grad_norm": 0.5093306303024292, "learning_rate": 1.6046926046926047e-05, "loss": 0.0765, "step": 7830 }, { "epoch": 2.3284823284823286, "grad_norm": 0.37477800250053406, "learning_rate": 1.602910602910603e-05, "loss": 0.0751, "step": 7840 }, { "epoch": 2.3314523314523314, "grad_norm": 0.6057155132293701, "learning_rate": 1.601128601128601e-05, "loss": 0.0738, "step": 7850 }, { "epoch": 2.3344223344223343, "grad_norm": 0.5092906355857849, "learning_rate": 1.5993465993465997e-05, "loss": 0.0684, "step": 7860 }, { "epoch": 2.3373923373923375, "grad_norm": 0.464747816324234, "learning_rate": 1.5975645975645976e-05, "loss": 0.0614, "step": 7870 }, { "epoch": 2.3403623403623404, "grad_norm": 0.4768252372741699, "learning_rate": 1.5957825957825958e-05, "loss": 0.0722, "step": 7880 }, { "epoch": 2.343332343332343, "grad_norm": 0.5153640508651733, "learning_rate": 1.594000594000594e-05, "loss": 0.0713, "step": 7890 }, { "epoch": 2.3463023463023465, "grad_norm": 0.6367037296295166, "learning_rate": 1.5922185922185922e-05, "loss": 0.0681, "step": 7900 }, { "epoch": 2.3492723492723493, "grad_norm": 0.36707803606987, "learning_rate": 1.5904365904365904e-05, "loss": 0.0625, "step": 7910 }, { "epoch": 2.352242352242352, "grad_norm": 0.21663016080856323, "learning_rate": 1.5886545886545887e-05, "loss": 0.0602, "step": 7920 }, { "epoch": 2.3552123552123554, "grad_norm": 0.5676469206809998, "learning_rate": 1.5868725868725872e-05, "loss": 0.0605, "step": 7930 }, { "epoch": 2.358182358182358, "grad_norm": 0.4324367940425873, "learning_rate": 1.585090585090585e-05, "loss": 0.0661, "step": 7940 }, { "epoch": 2.361152361152361, "grad_norm": 0.40506285429000854, "learning_rate": 1.5833085833085833e-05, "loss": 0.0555, "step": 7950 }, { "epoch": 2.3641223641223643, "grad_norm": 0.30328163504600525, "learning_rate": 1.5815265815265815e-05, "loss": 0.0676, "step": 7960 }, { "epoch": 2.367092367092367, "grad_norm": 0.449945330619812, "learning_rate": 1.5797445797445797e-05, "loss": 0.0731, "step": 7970 }, { "epoch": 2.37006237006237, "grad_norm": 0.5466241836547852, "learning_rate": 1.577962577962578e-05, "loss": 0.0587, "step": 7980 }, { "epoch": 2.3730323730323732, "grad_norm": 0.2828434407711029, "learning_rate": 1.5761805761805762e-05, "loss": 0.0699, "step": 7990 }, { "epoch": 2.376002376002376, "grad_norm": 0.7119054794311523, "learning_rate": 1.5743985743985747e-05, "loss": 0.0615, "step": 8000 }, { "epoch": 2.378972378972379, "grad_norm": 0.5612084269523621, "learning_rate": 1.5726165726165726e-05, "loss": 0.0713, "step": 8010 }, { "epoch": 2.381942381942382, "grad_norm": 0.3906906545162201, "learning_rate": 1.5708345708345708e-05, "loss": 0.0628, "step": 8020 }, { "epoch": 2.384912384912385, "grad_norm": 0.4246062636375427, "learning_rate": 1.569052569052569e-05, "loss": 0.0693, "step": 8030 }, { "epoch": 2.387882387882388, "grad_norm": 0.7282578349113464, "learning_rate": 1.5672705672705673e-05, "loss": 0.0627, "step": 8040 }, { "epoch": 2.390852390852391, "grad_norm": 0.4457809627056122, "learning_rate": 1.5654885654885655e-05, "loss": 0.0756, "step": 8050 }, { "epoch": 2.393822393822394, "grad_norm": 0.460112065076828, "learning_rate": 1.5637065637065637e-05, "loss": 0.0715, "step": 8060 }, { "epoch": 2.3967923967923968, "grad_norm": 0.8285795450210571, "learning_rate": 1.5619245619245622e-05, "loss": 0.0731, "step": 8070 }, { "epoch": 2.3997623997623996, "grad_norm": 0.6187976002693176, "learning_rate": 1.56014256014256e-05, "loss": 0.0711, "step": 8080 }, { "epoch": 2.402732402732403, "grad_norm": 0.42191964387893677, "learning_rate": 1.5583605583605583e-05, "loss": 0.062, "step": 8090 }, { "epoch": 2.4057024057024057, "grad_norm": 0.3665825128555298, "learning_rate": 1.5565785565785566e-05, "loss": 0.0726, "step": 8100 }, { "epoch": 2.4086724086724085, "grad_norm": 0.535072386264801, "learning_rate": 1.5547965547965548e-05, "loss": 0.0767, "step": 8110 }, { "epoch": 2.4116424116424118, "grad_norm": 0.5114570260047913, "learning_rate": 1.553014553014553e-05, "loss": 0.0571, "step": 8120 }, { "epoch": 2.4146124146124146, "grad_norm": 0.5549605488777161, "learning_rate": 1.5512325512325512e-05, "loss": 0.0741, "step": 8130 }, { "epoch": 2.4175824175824174, "grad_norm": 0.47435063123703003, "learning_rate": 1.5494505494505498e-05, "loss": 0.0715, "step": 8140 }, { "epoch": 2.4205524205524207, "grad_norm": 0.45239725708961487, "learning_rate": 1.547668547668548e-05, "loss": 0.076, "step": 8150 }, { "epoch": 2.4235224235224235, "grad_norm": 0.3530510663986206, "learning_rate": 1.545886545886546e-05, "loss": 0.0603, "step": 8160 }, { "epoch": 2.4264924264924264, "grad_norm": 0.34246182441711426, "learning_rate": 1.544104544104544e-05, "loss": 0.0687, "step": 8170 }, { "epoch": 2.4294624294624296, "grad_norm": 0.4563390612602234, "learning_rate": 1.5423225423225423e-05, "loss": 0.0592, "step": 8180 }, { "epoch": 2.4324324324324325, "grad_norm": 0.6311094760894775, "learning_rate": 1.5405405405405405e-05, "loss": 0.0734, "step": 8190 }, { "epoch": 2.4354024354024353, "grad_norm": 0.398874431848526, "learning_rate": 1.5387585387585387e-05, "loss": 0.0658, "step": 8200 }, { "epoch": 2.4383724383724386, "grad_norm": 0.47651001811027527, "learning_rate": 1.5369765369765373e-05, "loss": 0.0653, "step": 8210 }, { "epoch": 2.4413424413424414, "grad_norm": 0.5543814897537231, "learning_rate": 1.5351945351945355e-05, "loss": 0.077, "step": 8220 }, { "epoch": 2.444312444312444, "grad_norm": 0.6091018915176392, "learning_rate": 1.5334125334125334e-05, "loss": 0.0673, "step": 8230 }, { "epoch": 2.447282447282447, "grad_norm": 0.5908657312393188, "learning_rate": 1.5316305316305316e-05, "loss": 0.0592, "step": 8240 }, { "epoch": 2.4502524502524503, "grad_norm": 0.707524836063385, "learning_rate": 1.5298485298485298e-05, "loss": 0.067, "step": 8250 }, { "epoch": 2.453222453222453, "grad_norm": 0.5802726745605469, "learning_rate": 1.528066528066528e-05, "loss": 0.0717, "step": 8260 }, { "epoch": 2.456192456192456, "grad_norm": 0.5758719444274902, "learning_rate": 1.5262845262845263e-05, "loss": 0.0654, "step": 8270 }, { "epoch": 2.4591624591624592, "grad_norm": 0.2951982617378235, "learning_rate": 1.5245025245025246e-05, "loss": 0.0646, "step": 8280 }, { "epoch": 2.462132462132462, "grad_norm": 0.6033930778503418, "learning_rate": 1.5227205227205229e-05, "loss": 0.0616, "step": 8290 }, { "epoch": 2.465102465102465, "grad_norm": 0.7335968613624573, "learning_rate": 1.520938520938521e-05, "loss": 0.0527, "step": 8300 }, { "epoch": 2.468072468072468, "grad_norm": 0.2696143686771393, "learning_rate": 1.5191565191565193e-05, "loss": 0.062, "step": 8310 }, { "epoch": 2.471042471042471, "grad_norm": 0.24488599598407745, "learning_rate": 1.5173745173745173e-05, "loss": 0.0661, "step": 8320 }, { "epoch": 2.474012474012474, "grad_norm": 0.43688690662384033, "learning_rate": 1.5155925155925156e-05, "loss": 0.0563, "step": 8330 }, { "epoch": 2.476982476982477, "grad_norm": 0.557025134563446, "learning_rate": 1.5138105138105138e-05, "loss": 0.0598, "step": 8340 }, { "epoch": 2.47995247995248, "grad_norm": 0.6465451717376709, "learning_rate": 1.5120285120285122e-05, "loss": 0.0669, "step": 8350 }, { "epoch": 2.4829224829224827, "grad_norm": 0.38359346985816956, "learning_rate": 1.5102465102465104e-05, "loss": 0.0715, "step": 8360 }, { "epoch": 2.485892485892486, "grad_norm": 0.6876797080039978, "learning_rate": 1.5084645084645086e-05, "loss": 0.0799, "step": 8370 }, { "epoch": 2.488862488862489, "grad_norm": 0.47395193576812744, "learning_rate": 1.5066825066825068e-05, "loss": 0.0699, "step": 8380 }, { "epoch": 2.4918324918324917, "grad_norm": 0.4975154399871826, "learning_rate": 1.5049005049005049e-05, "loss": 0.0655, "step": 8390 }, { "epoch": 2.494802494802495, "grad_norm": 0.5417085886001587, "learning_rate": 1.503118503118503e-05, "loss": 0.0768, "step": 8400 }, { "epoch": 2.4977724977724978, "grad_norm": 0.593275785446167, "learning_rate": 1.5013365013365013e-05, "loss": 0.0613, "step": 8410 }, { "epoch": 2.5007425007425006, "grad_norm": 0.5146422386169434, "learning_rate": 1.4995544995544995e-05, "loss": 0.0539, "step": 8420 }, { "epoch": 2.503712503712504, "grad_norm": 0.5107398629188538, "learning_rate": 1.4977724977724977e-05, "loss": 0.0642, "step": 8430 }, { "epoch": 2.5066825066825067, "grad_norm": 0.40161600708961487, "learning_rate": 1.4959904959904961e-05, "loss": 0.0598, "step": 8440 }, { "epoch": 2.5096525096525095, "grad_norm": 0.3634410500526428, "learning_rate": 1.4942084942084943e-05, "loss": 0.064, "step": 8450 }, { "epoch": 2.512622512622513, "grad_norm": 0.29765084385871887, "learning_rate": 1.4924264924264924e-05, "loss": 0.0596, "step": 8460 }, { "epoch": 2.5155925155925156, "grad_norm": 0.5146110653877258, "learning_rate": 1.4906444906444908e-05, "loss": 0.0794, "step": 8470 }, { "epoch": 2.5185625185625184, "grad_norm": 0.49415668845176697, "learning_rate": 1.488862488862489e-05, "loss": 0.0612, "step": 8480 }, { "epoch": 2.5215325215325217, "grad_norm": 0.3173198997974396, "learning_rate": 1.487080487080487e-05, "loss": 0.0576, "step": 8490 }, { "epoch": 2.5245025245025245, "grad_norm": 0.4311143159866333, "learning_rate": 1.4852984852984852e-05, "loss": 0.0721, "step": 8500 }, { "epoch": 2.5274725274725274, "grad_norm": 0.3858831524848938, "learning_rate": 1.4835164835164836e-05, "loss": 0.0553, "step": 8510 }, { "epoch": 2.5304425304425306, "grad_norm": 0.4288255572319031, "learning_rate": 1.4817344817344818e-05, "loss": 0.0598, "step": 8520 }, { "epoch": 2.5334125334125335, "grad_norm": 0.6533911228179932, "learning_rate": 1.4799524799524799e-05, "loss": 0.0776, "step": 8530 }, { "epoch": 2.5363825363825363, "grad_norm": 0.4716707468032837, "learning_rate": 1.4781704781704783e-05, "loss": 0.0586, "step": 8540 }, { "epoch": 2.5393525393525396, "grad_norm": 0.40273916721343994, "learning_rate": 1.4763884763884765e-05, "loss": 0.0665, "step": 8550 }, { "epoch": 2.5423225423225424, "grad_norm": 0.5408886075019836, "learning_rate": 1.4746064746064745e-05, "loss": 0.0606, "step": 8560 }, { "epoch": 2.5452925452925452, "grad_norm": 0.4306439757347107, "learning_rate": 1.4728244728244728e-05, "loss": 0.0725, "step": 8570 }, { "epoch": 2.5482625482625485, "grad_norm": 0.6419402360916138, "learning_rate": 1.4710424710424711e-05, "loss": 0.0808, "step": 8580 }, { "epoch": 2.5512325512325513, "grad_norm": 0.4105408489704132, "learning_rate": 1.4692604692604694e-05, "loss": 0.0618, "step": 8590 }, { "epoch": 2.554202554202554, "grad_norm": 0.6038805246353149, "learning_rate": 1.4674784674784674e-05, "loss": 0.0583, "step": 8600 }, { "epoch": 2.5571725571725574, "grad_norm": 0.6562811732292175, "learning_rate": 1.4656964656964658e-05, "loss": 0.0631, "step": 8610 }, { "epoch": 2.5601425601425603, "grad_norm": 0.5605425238609314, "learning_rate": 1.463914463914464e-05, "loss": 0.0679, "step": 8620 }, { "epoch": 2.563112563112563, "grad_norm": 0.4276842474937439, "learning_rate": 1.4621324621324622e-05, "loss": 0.0592, "step": 8630 }, { "epoch": 2.5660825660825664, "grad_norm": 0.46107247471809387, "learning_rate": 1.4603504603504603e-05, "loss": 0.0585, "step": 8640 }, { "epoch": 2.569052569052569, "grad_norm": 0.5134992003440857, "learning_rate": 1.4585684585684587e-05, "loss": 0.0734, "step": 8650 }, { "epoch": 2.572022572022572, "grad_norm": 0.4581449031829834, "learning_rate": 1.4567864567864569e-05, "loss": 0.0549, "step": 8660 }, { "epoch": 2.574992574992575, "grad_norm": 0.5128817558288574, "learning_rate": 1.455004455004455e-05, "loss": 0.0648, "step": 8670 }, { "epoch": 2.577962577962578, "grad_norm": 0.5839508771896362, "learning_rate": 1.4532224532224533e-05, "loss": 0.0794, "step": 8680 }, { "epoch": 2.580932580932581, "grad_norm": 0.4006098210811615, "learning_rate": 1.4514404514404515e-05, "loss": 0.0679, "step": 8690 }, { "epoch": 2.5839025839025838, "grad_norm": 0.24022406339645386, "learning_rate": 1.4496584496584498e-05, "loss": 0.0671, "step": 8700 }, { "epoch": 2.586872586872587, "grad_norm": 0.390082448720932, "learning_rate": 1.4478764478764478e-05, "loss": 0.0533, "step": 8710 }, { "epoch": 2.58984258984259, "grad_norm": 0.5063132643699646, "learning_rate": 1.4460944460944462e-05, "loss": 0.065, "step": 8720 }, { "epoch": 2.5928125928125927, "grad_norm": 0.4413723647594452, "learning_rate": 1.4443124443124444e-05, "loss": 0.0628, "step": 8730 }, { "epoch": 2.5957825957825955, "grad_norm": 0.5134592056274414, "learning_rate": 1.4425304425304425e-05, "loss": 0.0726, "step": 8740 }, { "epoch": 2.598752598752599, "grad_norm": 0.6060248613357544, "learning_rate": 1.4407484407484408e-05, "loss": 0.0685, "step": 8750 }, { "epoch": 2.6017226017226016, "grad_norm": 0.54691481590271, "learning_rate": 1.438966438966439e-05, "loss": 0.0707, "step": 8760 }, { "epoch": 2.6046926046926044, "grad_norm": 0.3673838675022125, "learning_rate": 1.4371844371844373e-05, "loss": 0.0735, "step": 8770 }, { "epoch": 2.6076626076626077, "grad_norm": 0.47034141421318054, "learning_rate": 1.4354024354024353e-05, "loss": 0.0683, "step": 8780 }, { "epoch": 2.6106326106326105, "grad_norm": 0.6322289109230042, "learning_rate": 1.4336204336204337e-05, "loss": 0.0712, "step": 8790 }, { "epoch": 2.6136026136026134, "grad_norm": 0.3621010482311249, "learning_rate": 1.431838431838432e-05, "loss": 0.0599, "step": 8800 }, { "epoch": 2.6165726165726166, "grad_norm": 0.3426471948623657, "learning_rate": 1.43005643005643e-05, "loss": 0.0699, "step": 8810 }, { "epoch": 2.6195426195426195, "grad_norm": 0.4808509647846222, "learning_rate": 1.4282744282744284e-05, "loss": 0.0563, "step": 8820 }, { "epoch": 2.6225126225126223, "grad_norm": 0.588964581489563, "learning_rate": 1.4264924264924266e-05, "loss": 0.0667, "step": 8830 }, { "epoch": 2.6254826254826256, "grad_norm": 0.48968905210494995, "learning_rate": 1.4247104247104248e-05, "loss": 0.0648, "step": 8840 }, { "epoch": 2.6284526284526284, "grad_norm": 0.4962276816368103, "learning_rate": 1.4229284229284228e-05, "loss": 0.0719, "step": 8850 }, { "epoch": 2.631422631422631, "grad_norm": 0.5036596059799194, "learning_rate": 1.4211464211464212e-05, "loss": 0.0736, "step": 8860 }, { "epoch": 2.6343926343926345, "grad_norm": 0.47525274753570557, "learning_rate": 1.4193644193644194e-05, "loss": 0.0606, "step": 8870 }, { "epoch": 2.6373626373626373, "grad_norm": 0.6138589978218079, "learning_rate": 1.4175824175824177e-05, "loss": 0.064, "step": 8880 }, { "epoch": 2.64033264033264, "grad_norm": 0.2877761721611023, "learning_rate": 1.4158004158004159e-05, "loss": 0.0645, "step": 8890 }, { "epoch": 2.6433026433026434, "grad_norm": 0.4664807617664337, "learning_rate": 1.4140184140184141e-05, "loss": 0.0573, "step": 8900 }, { "epoch": 2.6462726462726462, "grad_norm": 0.38519200682640076, "learning_rate": 1.4122364122364123e-05, "loss": 0.0666, "step": 8910 }, { "epoch": 2.649242649242649, "grad_norm": 0.7016706466674805, "learning_rate": 1.4104544104544104e-05, "loss": 0.0597, "step": 8920 }, { "epoch": 2.6522126522126523, "grad_norm": 0.5760989785194397, "learning_rate": 1.4086724086724087e-05, "loss": 0.0606, "step": 8930 }, { "epoch": 2.655182655182655, "grad_norm": 0.47811734676361084, "learning_rate": 1.406890406890407e-05, "loss": 0.0688, "step": 8940 }, { "epoch": 2.658152658152658, "grad_norm": 0.3496223986148834, "learning_rate": 1.4051084051084052e-05, "loss": 0.0569, "step": 8950 }, { "epoch": 2.6611226611226613, "grad_norm": 0.6245877742767334, "learning_rate": 1.4033264033264034e-05, "loss": 0.0694, "step": 8960 }, { "epoch": 2.664092664092664, "grad_norm": 0.38785070180892944, "learning_rate": 1.4015444015444016e-05, "loss": 0.0599, "step": 8970 }, { "epoch": 2.667062667062667, "grad_norm": 0.3740558624267578, "learning_rate": 1.3997623997623998e-05, "loss": 0.0562, "step": 8980 }, { "epoch": 2.67003267003267, "grad_norm": 0.4402414560317993, "learning_rate": 1.3979803979803979e-05, "loss": 0.0595, "step": 8990 }, { "epoch": 2.673002673002673, "grad_norm": 0.6891340017318726, "learning_rate": 1.3961983961983963e-05, "loss": 0.0712, "step": 9000 }, { "epoch": 2.675972675972676, "grad_norm": 0.44056686758995056, "learning_rate": 1.3944163944163945e-05, "loss": 0.0712, "step": 9010 }, { "epoch": 2.678942678942679, "grad_norm": 0.42997923493385315, "learning_rate": 1.3926343926343927e-05, "loss": 0.0509, "step": 9020 }, { "epoch": 2.681912681912682, "grad_norm": 0.4868006110191345, "learning_rate": 1.390852390852391e-05, "loss": 0.0722, "step": 9030 }, { "epoch": 2.684882684882685, "grad_norm": 0.4716143310070038, "learning_rate": 1.3890703890703891e-05, "loss": 0.0643, "step": 9040 }, { "epoch": 2.687852687852688, "grad_norm": 0.4905288815498352, "learning_rate": 1.3872883872883874e-05, "loss": 0.0592, "step": 9050 }, { "epoch": 2.690822690822691, "grad_norm": 0.4081631302833557, "learning_rate": 1.3855063855063854e-05, "loss": 0.0736, "step": 9060 }, { "epoch": 2.6937926937926937, "grad_norm": 0.447644978761673, "learning_rate": 1.3837243837243838e-05, "loss": 0.0654, "step": 9070 }, { "epoch": 2.696762696762697, "grad_norm": 0.22904683649539948, "learning_rate": 1.381942381942382e-05, "loss": 0.0562, "step": 9080 }, { "epoch": 2.6997326997327, "grad_norm": 0.5609009861946106, "learning_rate": 1.3801603801603802e-05, "loss": 0.0558, "step": 9090 }, { "epoch": 2.7027027027027026, "grad_norm": 0.6101239919662476, "learning_rate": 1.3783783783783784e-05, "loss": 0.0665, "step": 9100 }, { "epoch": 2.705672705672706, "grad_norm": 0.49575671553611755, "learning_rate": 1.3765963765963767e-05, "loss": 0.0589, "step": 9110 }, { "epoch": 2.7086427086427087, "grad_norm": 0.5980531573295593, "learning_rate": 1.3748143748143749e-05, "loss": 0.0715, "step": 9120 }, { "epoch": 2.7116127116127116, "grad_norm": 0.3581327497959137, "learning_rate": 1.373032373032373e-05, "loss": 0.0641, "step": 9130 }, { "epoch": 2.714582714582715, "grad_norm": 0.5521288514137268, "learning_rate": 1.3712503712503713e-05, "loss": 0.0611, "step": 9140 }, { "epoch": 2.7175527175527177, "grad_norm": 0.617689847946167, "learning_rate": 1.3694683694683695e-05, "loss": 0.0556, "step": 9150 }, { "epoch": 2.7205227205227205, "grad_norm": 0.32165491580963135, "learning_rate": 1.3676863676863677e-05, "loss": 0.0714, "step": 9160 }, { "epoch": 2.7234927234927238, "grad_norm": 0.3842147886753082, "learning_rate": 1.365904365904366e-05, "loss": 0.0599, "step": 9170 }, { "epoch": 2.7264627264627266, "grad_norm": 0.41680991649627686, "learning_rate": 1.3641223641223642e-05, "loss": 0.0628, "step": 9180 }, { "epoch": 2.7294327294327294, "grad_norm": 0.6326974630355835, "learning_rate": 1.3623403623403624e-05, "loss": 0.0571, "step": 9190 }, { "epoch": 2.7324027324027322, "grad_norm": 0.4563412070274353, "learning_rate": 1.3605583605583606e-05, "loss": 0.0647, "step": 9200 }, { "epoch": 2.7353727353727355, "grad_norm": 0.5637513995170593, "learning_rate": 1.3587763587763588e-05, "loss": 0.0661, "step": 9210 }, { "epoch": 2.7383427383427383, "grad_norm": 0.373116135597229, "learning_rate": 1.356994356994357e-05, "loss": 0.0552, "step": 9220 }, { "epoch": 2.741312741312741, "grad_norm": 0.6611971259117126, "learning_rate": 1.3552123552123553e-05, "loss": 0.0694, "step": 9230 }, { "epoch": 2.7442827442827444, "grad_norm": 0.5132849812507629, "learning_rate": 1.3534303534303535e-05, "loss": 0.0607, "step": 9240 }, { "epoch": 2.7472527472527473, "grad_norm": 0.40150707960128784, "learning_rate": 1.3516483516483517e-05, "loss": 0.0682, "step": 9250 }, { "epoch": 2.75022275022275, "grad_norm": 0.8982331156730652, "learning_rate": 1.3498663498663499e-05, "loss": 0.058, "step": 9260 }, { "epoch": 2.753192753192753, "grad_norm": 0.42595192790031433, "learning_rate": 1.3480843480843481e-05, "loss": 0.0673, "step": 9270 }, { "epoch": 2.756162756162756, "grad_norm": 0.5409243106842041, "learning_rate": 1.3463023463023463e-05, "loss": 0.0553, "step": 9280 }, { "epoch": 2.759132759132759, "grad_norm": 0.5729924440383911, "learning_rate": 1.3445203445203446e-05, "loss": 0.0677, "step": 9290 }, { "epoch": 2.762102762102762, "grad_norm": 0.4854719638824463, "learning_rate": 1.3427383427383428e-05, "loss": 0.054, "step": 9300 }, { "epoch": 2.765072765072765, "grad_norm": 0.7021495699882507, "learning_rate": 1.340956340956341e-05, "loss": 0.0618, "step": 9310 }, { "epoch": 2.768042768042768, "grad_norm": 0.5088809132575989, "learning_rate": 1.3391743391743392e-05, "loss": 0.0652, "step": 9320 }, { "epoch": 2.7710127710127708, "grad_norm": 0.3599695861339569, "learning_rate": 1.3373923373923374e-05, "loss": 0.0758, "step": 9330 }, { "epoch": 2.773982773982774, "grad_norm": 0.2429090142250061, "learning_rate": 1.3356103356103356e-05, "loss": 0.0721, "step": 9340 }, { "epoch": 2.776952776952777, "grad_norm": 0.42269906401634216, "learning_rate": 1.3338283338283339e-05, "loss": 0.0642, "step": 9350 }, { "epoch": 2.7799227799227797, "grad_norm": 0.5263569951057434, "learning_rate": 1.332046332046332e-05, "loss": 0.0635, "step": 9360 }, { "epoch": 2.782892782892783, "grad_norm": 0.3662327527999878, "learning_rate": 1.3302643302643303e-05, "loss": 0.0628, "step": 9370 }, { "epoch": 2.785862785862786, "grad_norm": 0.43335428833961487, "learning_rate": 1.3284823284823285e-05, "loss": 0.0713, "step": 9380 }, { "epoch": 2.7888327888327886, "grad_norm": 0.5907623767852783, "learning_rate": 1.3267003267003267e-05, "loss": 0.0534, "step": 9390 }, { "epoch": 2.791802791802792, "grad_norm": 0.340541809797287, "learning_rate": 1.324918324918325e-05, "loss": 0.052, "step": 9400 }, { "epoch": 2.7947727947727947, "grad_norm": 0.4090157151222229, "learning_rate": 1.3231363231363232e-05, "loss": 0.0686, "step": 9410 }, { "epoch": 2.7977427977427975, "grad_norm": 0.3752903640270233, "learning_rate": 1.3213543213543214e-05, "loss": 0.0612, "step": 9420 }, { "epoch": 2.800712800712801, "grad_norm": 0.48351070284843445, "learning_rate": 1.3195723195723196e-05, "loss": 0.058, "step": 9430 }, { "epoch": 2.8036828036828036, "grad_norm": 0.7287899851799011, "learning_rate": 1.3177903177903178e-05, "loss": 0.0787, "step": 9440 }, { "epoch": 2.8066528066528065, "grad_norm": 0.4591059684753418, "learning_rate": 1.316008316008316e-05, "loss": 0.0507, "step": 9450 }, { "epoch": 2.8096228096228097, "grad_norm": 0.6308128833770752, "learning_rate": 1.3142263142263142e-05, "loss": 0.0783, "step": 9460 }, { "epoch": 2.8125928125928126, "grad_norm": 0.5566859841346741, "learning_rate": 1.3124443124443125e-05, "loss": 0.067, "step": 9470 }, { "epoch": 2.8155628155628154, "grad_norm": 0.42038193345069885, "learning_rate": 1.3106623106623107e-05, "loss": 0.0554, "step": 9480 }, { "epoch": 2.8185328185328187, "grad_norm": 0.34577420353889465, "learning_rate": 1.3088803088803089e-05, "loss": 0.0742, "step": 9490 }, { "epoch": 2.8215028215028215, "grad_norm": 0.5111622214317322, "learning_rate": 1.3070983070983071e-05, "loss": 0.068, "step": 9500 }, { "epoch": 2.8244728244728243, "grad_norm": 0.24577349424362183, "learning_rate": 1.3053163053163053e-05, "loss": 0.0617, "step": 9510 }, { "epoch": 2.8274428274428276, "grad_norm": 0.3329918682575226, "learning_rate": 1.3035343035343037e-05, "loss": 0.072, "step": 9520 }, { "epoch": 2.8304128304128304, "grad_norm": 0.5380098819732666, "learning_rate": 1.3017523017523018e-05, "loss": 0.0618, "step": 9530 }, { "epoch": 2.8333828333828333, "grad_norm": 0.539607584476471, "learning_rate": 1.2999702999703e-05, "loss": 0.0529, "step": 9540 }, { "epoch": 2.8363528363528365, "grad_norm": 0.6192976236343384, "learning_rate": 1.2981882981882982e-05, "loss": 0.0695, "step": 9550 }, { "epoch": 2.8393228393228394, "grad_norm": 0.44225507974624634, "learning_rate": 1.2964062964062964e-05, "loss": 0.0537, "step": 9560 }, { "epoch": 2.842292842292842, "grad_norm": 0.5681447386741638, "learning_rate": 1.2946242946242946e-05, "loss": 0.0737, "step": 9570 }, { "epoch": 2.8452628452628455, "grad_norm": 0.5931240320205688, "learning_rate": 1.2928422928422929e-05, "loss": 0.0661, "step": 9580 }, { "epoch": 2.8482328482328483, "grad_norm": 0.4011771082878113, "learning_rate": 1.2910602910602912e-05, "loss": 0.0661, "step": 9590 }, { "epoch": 2.851202851202851, "grad_norm": 0.574195921421051, "learning_rate": 1.2892782892782893e-05, "loss": 0.0677, "step": 9600 }, { "epoch": 2.8541728541728544, "grad_norm": 0.5977892875671387, "learning_rate": 1.2874962874962875e-05, "loss": 0.075, "step": 9610 }, { "epoch": 2.857142857142857, "grad_norm": 0.3630739152431488, "learning_rate": 1.2857142857142857e-05, "loss": 0.0555, "step": 9620 }, { "epoch": 2.86011286011286, "grad_norm": 0.39152857661247253, "learning_rate": 1.283932283932284e-05, "loss": 0.0692, "step": 9630 }, { "epoch": 2.8630828630828633, "grad_norm": 0.2847200036048889, "learning_rate": 1.2821502821502822e-05, "loss": 0.0504, "step": 9640 }, { "epoch": 2.866052866052866, "grad_norm": 0.46334296464920044, "learning_rate": 1.2803682803682804e-05, "loss": 0.067, "step": 9650 }, { "epoch": 2.869022869022869, "grad_norm": 0.6711926460266113, "learning_rate": 1.2785862785862788e-05, "loss": 0.0739, "step": 9660 }, { "epoch": 2.8719928719928722, "grad_norm": 0.5789605975151062, "learning_rate": 1.2768042768042768e-05, "loss": 0.0649, "step": 9670 }, { "epoch": 2.874962874962875, "grad_norm": 0.5450757741928101, "learning_rate": 1.275022275022275e-05, "loss": 0.0659, "step": 9680 }, { "epoch": 2.877932877932878, "grad_norm": 0.4336056709289551, "learning_rate": 1.2732402732402732e-05, "loss": 0.064, "step": 9690 }, { "epoch": 2.880902880902881, "grad_norm": 0.43332991003990173, "learning_rate": 1.2714582714582715e-05, "loss": 0.0703, "step": 9700 }, { "epoch": 2.883872883872884, "grad_norm": 0.26582634449005127, "learning_rate": 1.2696762696762697e-05, "loss": 0.0573, "step": 9710 }, { "epoch": 2.886842886842887, "grad_norm": 0.39930054545402527, "learning_rate": 1.2678942678942679e-05, "loss": 0.0608, "step": 9720 }, { "epoch": 2.88981288981289, "grad_norm": 0.6703673601150513, "learning_rate": 1.2661122661122663e-05, "loss": 0.0706, "step": 9730 }, { "epoch": 2.892782892782893, "grad_norm": 0.3226848542690277, "learning_rate": 1.2643302643302643e-05, "loss": 0.0565, "step": 9740 }, { "epoch": 2.8957528957528957, "grad_norm": 0.4727514386177063, "learning_rate": 1.2625482625482625e-05, "loss": 0.0482, "step": 9750 }, { "epoch": 2.8987228987228986, "grad_norm": 0.744326651096344, "learning_rate": 1.2607662607662608e-05, "loss": 0.0679, "step": 9760 }, { "epoch": 2.901692901692902, "grad_norm": 0.46024101972579956, "learning_rate": 1.258984258984259e-05, "loss": 0.0648, "step": 9770 }, { "epoch": 2.9046629046629047, "grad_norm": 0.5013512969017029, "learning_rate": 1.2572022572022572e-05, "loss": 0.0563, "step": 9780 }, { "epoch": 2.9076329076329075, "grad_norm": 0.7148948907852173, "learning_rate": 1.2554202554202554e-05, "loss": 0.0735, "step": 9790 }, { "epoch": 2.9106029106029108, "grad_norm": 0.4620581865310669, "learning_rate": 1.2536382536382538e-05, "loss": 0.0678, "step": 9800 }, { "epoch": 2.9135729135729136, "grad_norm": 0.5615851879119873, "learning_rate": 1.2518562518562518e-05, "loss": 0.0599, "step": 9810 }, { "epoch": 2.9165429165429164, "grad_norm": 0.5745916366577148, "learning_rate": 1.25007425007425e-05, "loss": 0.0663, "step": 9820 }, { "epoch": 2.9195129195129192, "grad_norm": 0.34011173248291016, "learning_rate": 1.2482922482922483e-05, "loss": 0.0524, "step": 9830 }, { "epoch": 2.9224829224829225, "grad_norm": 0.5845355987548828, "learning_rate": 1.2465102465102467e-05, "loss": 0.0625, "step": 9840 }, { "epoch": 2.9254529254529253, "grad_norm": 0.5317063331604004, "learning_rate": 1.2447282447282447e-05, "loss": 0.0589, "step": 9850 }, { "epoch": 2.928422928422928, "grad_norm": 0.3282083570957184, "learning_rate": 1.242946242946243e-05, "loss": 0.059, "step": 9860 }, { "epoch": 2.9313929313929314, "grad_norm": 0.3801690638065338, "learning_rate": 1.2411642411642413e-05, "loss": 0.0628, "step": 9870 }, { "epoch": 2.9343629343629343, "grad_norm": 0.5469937324523926, "learning_rate": 1.2393822393822394e-05, "loss": 0.0681, "step": 9880 }, { "epoch": 2.937332937332937, "grad_norm": 0.7467171549797058, "learning_rate": 1.2376002376002376e-05, "loss": 0.0555, "step": 9890 }, { "epoch": 2.9403029403029404, "grad_norm": 0.5576099157333374, "learning_rate": 1.2358182358182358e-05, "loss": 0.0722, "step": 9900 }, { "epoch": 2.943272943272943, "grad_norm": 0.5140604972839355, "learning_rate": 1.2340362340362342e-05, "loss": 0.0628, "step": 9910 }, { "epoch": 2.946242946242946, "grad_norm": 0.6918432116508484, "learning_rate": 1.2322542322542322e-05, "loss": 0.0709, "step": 9920 }, { "epoch": 2.9492129492129493, "grad_norm": 0.4932166635990143, "learning_rate": 1.2304722304722305e-05, "loss": 0.0685, "step": 9930 }, { "epoch": 2.952182952182952, "grad_norm": 0.42789584398269653, "learning_rate": 1.2286902286902288e-05, "loss": 0.0654, "step": 9940 }, { "epoch": 2.955152955152955, "grad_norm": 0.6951575875282288, "learning_rate": 1.2269082269082269e-05, "loss": 0.0677, "step": 9950 }, { "epoch": 2.9581229581229582, "grad_norm": 0.5306366682052612, "learning_rate": 1.2251262251262251e-05, "loss": 0.0561, "step": 9960 }, { "epoch": 2.961092961092961, "grad_norm": 0.45280131697654724, "learning_rate": 1.2233442233442233e-05, "loss": 0.0594, "step": 9970 }, { "epoch": 2.964062964062964, "grad_norm": 0.5789720416069031, "learning_rate": 1.2215622215622217e-05, "loss": 0.0691, "step": 9980 }, { "epoch": 2.967032967032967, "grad_norm": 0.4295837879180908, "learning_rate": 1.2197802197802198e-05, "loss": 0.0492, "step": 9990 }, { "epoch": 2.97000297000297, "grad_norm": 0.3032509684562683, "learning_rate": 1.217998217998218e-05, "loss": 0.0634, "step": 10000 }, { "epoch": 2.972972972972973, "grad_norm": 0.6462733149528503, "learning_rate": 1.2162162162162164e-05, "loss": 0.0667, "step": 10010 }, { "epoch": 2.975942975942976, "grad_norm": 0.5056395530700684, "learning_rate": 1.2144342144342144e-05, "loss": 0.0637, "step": 10020 }, { "epoch": 2.978912978912979, "grad_norm": 0.3662366569042206, "learning_rate": 1.2126522126522126e-05, "loss": 0.0709, "step": 10030 }, { "epoch": 2.9818829818829817, "grad_norm": 0.49650683999061584, "learning_rate": 1.2108702108702108e-05, "loss": 0.0711, "step": 10040 }, { "epoch": 2.984852984852985, "grad_norm": 0.44112861156463623, "learning_rate": 1.2090882090882092e-05, "loss": 0.066, "step": 10050 }, { "epoch": 2.987822987822988, "grad_norm": 0.5365132689476013, "learning_rate": 1.2073062073062073e-05, "loss": 0.0589, "step": 10060 }, { "epoch": 2.9907929907929907, "grad_norm": 0.4564819931983948, "learning_rate": 1.2055242055242055e-05, "loss": 0.0686, "step": 10070 }, { "epoch": 2.993762993762994, "grad_norm": 0.6063446402549744, "learning_rate": 1.2037422037422039e-05, "loss": 0.0673, "step": 10080 }, { "epoch": 2.9967329967329968, "grad_norm": 0.516140878200531, "learning_rate": 1.2019602019602021e-05, "loss": 0.0454, "step": 10090 }, { "epoch": 2.9997029997029996, "grad_norm": 0.36144575476646423, "learning_rate": 1.2001782001782001e-05, "loss": 0.0581, "step": 10100 }, { "epoch": 3.0, "eval_f1": 0.49727767695099817, "eval_loss": 0.059147998690605164, "eval_runtime": 179.7759, "eval_samples_per_second": 211.48, "eval_steps_per_second": 3.31, "step": 10101 }, { "epoch": 3.002673002673003, "grad_norm": 0.46553778648376465, "learning_rate": 1.1983961983961984e-05, "loss": 0.0572, "step": 10110 }, { "epoch": 3.0056430056430057, "grad_norm": 0.38310161232948303, "learning_rate": 1.1966141966141967e-05, "loss": 0.0653, "step": 10120 }, { "epoch": 3.0086130086130085, "grad_norm": 0.7176486253738403, "learning_rate": 1.1948321948321948e-05, "loss": 0.0696, "step": 10130 }, { "epoch": 3.011583011583012, "grad_norm": 0.3964185118675232, "learning_rate": 1.193050193050193e-05, "loss": 0.0559, "step": 10140 }, { "epoch": 3.0145530145530146, "grad_norm": 0.480051189661026, "learning_rate": 1.1912681912681914e-05, "loss": 0.0688, "step": 10150 }, { "epoch": 3.0175230175230174, "grad_norm": 0.4801310896873474, "learning_rate": 1.1894861894861896e-05, "loss": 0.0653, "step": 10160 }, { "epoch": 3.0204930204930207, "grad_norm": 0.7674263119697571, "learning_rate": 1.1877041877041877e-05, "loss": 0.0639, "step": 10170 }, { "epoch": 3.0234630234630235, "grad_norm": 0.35185834765434265, "learning_rate": 1.1859221859221859e-05, "loss": 0.0446, "step": 10180 }, { "epoch": 3.0264330264330264, "grad_norm": 0.6630620956420898, "learning_rate": 1.1841401841401843e-05, "loss": 0.0705, "step": 10190 }, { "epoch": 3.029403029403029, "grad_norm": 0.5050874352455139, "learning_rate": 1.1823581823581823e-05, "loss": 0.0563, "step": 10200 }, { "epoch": 3.0323730323730325, "grad_norm": 0.29523542523384094, "learning_rate": 1.1805761805761805e-05, "loss": 0.0598, "step": 10210 }, { "epoch": 3.0353430353430353, "grad_norm": 0.5692099928855896, "learning_rate": 1.1787941787941789e-05, "loss": 0.0733, "step": 10220 }, { "epoch": 3.038313038313038, "grad_norm": 0.714964747428894, "learning_rate": 1.1770121770121771e-05, "loss": 0.0547, "step": 10230 }, { "epoch": 3.0412830412830414, "grad_norm": 0.4890214502811432, "learning_rate": 1.1752301752301752e-05, "loss": 0.0679, "step": 10240 }, { "epoch": 3.044253044253044, "grad_norm": 0.5631494522094727, "learning_rate": 1.1734481734481734e-05, "loss": 0.06, "step": 10250 }, { "epoch": 3.047223047223047, "grad_norm": 0.6472118496894836, "learning_rate": 1.1716661716661718e-05, "loss": 0.0533, "step": 10260 }, { "epoch": 3.0501930501930503, "grad_norm": 0.6611066460609436, "learning_rate": 1.1698841698841698e-05, "loss": 0.0591, "step": 10270 }, { "epoch": 3.053163053163053, "grad_norm": 0.4274856448173523, "learning_rate": 1.168102168102168e-05, "loss": 0.0652, "step": 10280 }, { "epoch": 3.056133056133056, "grad_norm": 0.32548412680625916, "learning_rate": 1.1663201663201664e-05, "loss": 0.0678, "step": 10290 }, { "epoch": 3.0591030591030592, "grad_norm": 0.36015450954437256, "learning_rate": 1.1645381645381647e-05, "loss": 0.0691, "step": 10300 }, { "epoch": 3.062073062073062, "grad_norm": 0.5831524133682251, "learning_rate": 1.1627561627561627e-05, "loss": 0.0735, "step": 10310 }, { "epoch": 3.065043065043065, "grad_norm": 0.7021368741989136, "learning_rate": 1.160974160974161e-05, "loss": 0.0728, "step": 10320 }, { "epoch": 3.068013068013068, "grad_norm": 0.5424765944480896, "learning_rate": 1.1591921591921593e-05, "loss": 0.0617, "step": 10330 }, { "epoch": 3.070983070983071, "grad_norm": 0.7176571488380432, "learning_rate": 1.1574101574101574e-05, "loss": 0.0677, "step": 10340 }, { "epoch": 3.073953073953074, "grad_norm": 0.33526375889778137, "learning_rate": 1.1556281556281556e-05, "loss": 0.0626, "step": 10350 }, { "epoch": 3.076923076923077, "grad_norm": 0.4724681079387665, "learning_rate": 1.153846153846154e-05, "loss": 0.0575, "step": 10360 }, { "epoch": 3.07989307989308, "grad_norm": 0.6367087364196777, "learning_rate": 1.1520641520641522e-05, "loss": 0.0538, "step": 10370 }, { "epoch": 3.0828630828630827, "grad_norm": 0.31437206268310547, "learning_rate": 1.1502821502821502e-05, "loss": 0.0643, "step": 10380 }, { "epoch": 3.085833085833086, "grad_norm": 0.4423040449619293, "learning_rate": 1.1485001485001484e-05, "loss": 0.0684, "step": 10390 }, { "epoch": 3.088803088803089, "grad_norm": 0.4041610360145569, "learning_rate": 1.1467181467181468e-05, "loss": 0.0552, "step": 10400 }, { "epoch": 3.0917730917730917, "grad_norm": 0.4148096442222595, "learning_rate": 1.144936144936145e-05, "loss": 0.0693, "step": 10410 }, { "epoch": 3.094743094743095, "grad_norm": 0.30476808547973633, "learning_rate": 1.1431541431541431e-05, "loss": 0.0684, "step": 10420 }, { "epoch": 3.0977130977130978, "grad_norm": 0.7706785798072815, "learning_rate": 1.1413721413721415e-05, "loss": 0.0707, "step": 10430 }, { "epoch": 3.1006831006831006, "grad_norm": 0.3732987940311432, "learning_rate": 1.1395901395901397e-05, "loss": 0.0619, "step": 10440 }, { "epoch": 3.1036531036531034, "grad_norm": 0.4054795503616333, "learning_rate": 1.1378081378081377e-05, "loss": 0.0666, "step": 10450 }, { "epoch": 3.1066231066231067, "grad_norm": 0.660860538482666, "learning_rate": 1.136026136026136e-05, "loss": 0.0639, "step": 10460 }, { "epoch": 3.1095931095931095, "grad_norm": 0.5180338025093079, "learning_rate": 1.1342441342441343e-05, "loss": 0.068, "step": 10470 }, { "epoch": 3.1125631125631124, "grad_norm": 0.44153326749801636, "learning_rate": 1.1324621324621326e-05, "loss": 0.0621, "step": 10480 }, { "epoch": 3.1155331155331156, "grad_norm": 0.6957278251647949, "learning_rate": 1.1306801306801306e-05, "loss": 0.0553, "step": 10490 }, { "epoch": 3.1185031185031185, "grad_norm": 0.29442107677459717, "learning_rate": 1.128898128898129e-05, "loss": 0.0646, "step": 10500 }, { "epoch": 3.1214731214731213, "grad_norm": 0.4631500244140625, "learning_rate": 1.1271161271161272e-05, "loss": 0.0699, "step": 10510 }, { "epoch": 3.1244431244431246, "grad_norm": 0.4856095314025879, "learning_rate": 1.1253341253341253e-05, "loss": 0.0685, "step": 10520 }, { "epoch": 3.1274131274131274, "grad_norm": 0.7424579858779907, "learning_rate": 1.1235521235521235e-05, "loss": 0.0639, "step": 10530 }, { "epoch": 3.13038313038313, "grad_norm": 0.5345817804336548, "learning_rate": 1.1217701217701219e-05, "loss": 0.0641, "step": 10540 }, { "epoch": 3.1333531333531335, "grad_norm": 0.5012867450714111, "learning_rate": 1.11998811998812e-05, "loss": 0.0803, "step": 10550 }, { "epoch": 3.1363231363231363, "grad_norm": 0.5213742852210999, "learning_rate": 1.1182061182061181e-05, "loss": 0.0469, "step": 10560 }, { "epoch": 3.139293139293139, "grad_norm": 0.39430922269821167, "learning_rate": 1.1164241164241165e-05, "loss": 0.0502, "step": 10570 }, { "epoch": 3.1422631422631424, "grad_norm": 0.6875708699226379, "learning_rate": 1.1146421146421147e-05, "loss": 0.0617, "step": 10580 }, { "epoch": 3.1452331452331452, "grad_norm": 0.4213047921657562, "learning_rate": 1.1128601128601128e-05, "loss": 0.067, "step": 10590 }, { "epoch": 3.148203148203148, "grad_norm": 0.9495222568511963, "learning_rate": 1.111078111078111e-05, "loss": 0.0446, "step": 10600 }, { "epoch": 3.1511731511731513, "grad_norm": 0.37120023369789124, "learning_rate": 1.1092961092961094e-05, "loss": 0.0727, "step": 10610 }, { "epoch": 3.154143154143154, "grad_norm": 0.44335830211639404, "learning_rate": 1.1075141075141076e-05, "loss": 0.0574, "step": 10620 }, { "epoch": 3.157113157113157, "grad_norm": 0.6420602798461914, "learning_rate": 1.1057321057321056e-05, "loss": 0.0708, "step": 10630 }, { "epoch": 3.1600831600831603, "grad_norm": 0.4319610297679901, "learning_rate": 1.103950103950104e-05, "loss": 0.0646, "step": 10640 }, { "epoch": 3.163053163053163, "grad_norm": 0.34275874495506287, "learning_rate": 1.1021681021681022e-05, "loss": 0.0511, "step": 10650 }, { "epoch": 3.166023166023166, "grad_norm": 0.32853662967681885, "learning_rate": 1.1003861003861003e-05, "loss": 0.0476, "step": 10660 }, { "epoch": 3.168993168993169, "grad_norm": 0.7371835708618164, "learning_rate": 1.0986040986040985e-05, "loss": 0.067, "step": 10670 }, { "epoch": 3.171963171963172, "grad_norm": 0.23537606000900269, "learning_rate": 1.0968220968220969e-05, "loss": 0.0636, "step": 10680 }, { "epoch": 3.174933174933175, "grad_norm": 0.638041615486145, "learning_rate": 1.0950400950400951e-05, "loss": 0.0658, "step": 10690 }, { "epoch": 3.177903177903178, "grad_norm": 0.7828889489173889, "learning_rate": 1.0932580932580932e-05, "loss": 0.0603, "step": 10700 }, { "epoch": 3.180873180873181, "grad_norm": 0.41569939255714417, "learning_rate": 1.0914760914760916e-05, "loss": 0.0528, "step": 10710 }, { "epoch": 3.1838431838431838, "grad_norm": 0.4870140552520752, "learning_rate": 1.0896940896940898e-05, "loss": 0.0565, "step": 10720 }, { "epoch": 3.186813186813187, "grad_norm": 0.3599897623062134, "learning_rate": 1.087912087912088e-05, "loss": 0.0469, "step": 10730 }, { "epoch": 3.18978318978319, "grad_norm": 0.26678797602653503, "learning_rate": 1.086130086130086e-05, "loss": 0.0708, "step": 10740 }, { "epoch": 3.1927531927531927, "grad_norm": 0.6243604421615601, "learning_rate": 1.0843480843480844e-05, "loss": 0.0486, "step": 10750 }, { "epoch": 3.1957231957231955, "grad_norm": 0.5825532674789429, "learning_rate": 1.0825660825660826e-05, "loss": 0.0663, "step": 10760 }, { "epoch": 3.198693198693199, "grad_norm": 0.4092167913913727, "learning_rate": 1.0807840807840807e-05, "loss": 0.0704, "step": 10770 }, { "epoch": 3.2016632016632016, "grad_norm": 0.5701293349266052, "learning_rate": 1.079002079002079e-05, "loss": 0.0635, "step": 10780 }, { "epoch": 3.2046332046332044, "grad_norm": 0.25863227248191833, "learning_rate": 1.0772200772200773e-05, "loss": 0.0641, "step": 10790 }, { "epoch": 3.2076032076032077, "grad_norm": 0.3742627203464508, "learning_rate": 1.0754380754380755e-05, "loss": 0.0698, "step": 10800 }, { "epoch": 3.2105732105732105, "grad_norm": 0.3190728724002838, "learning_rate": 1.0736560736560736e-05, "loss": 0.0585, "step": 10810 }, { "epoch": 3.2135432135432134, "grad_norm": 0.49537599086761475, "learning_rate": 1.071874071874072e-05, "loss": 0.0635, "step": 10820 }, { "epoch": 3.2165132165132166, "grad_norm": 0.3896566927433014, "learning_rate": 1.0700920700920702e-05, "loss": 0.0753, "step": 10830 }, { "epoch": 3.2194832194832195, "grad_norm": 0.6234869956970215, "learning_rate": 1.0683100683100682e-05, "loss": 0.0545, "step": 10840 }, { "epoch": 3.2224532224532223, "grad_norm": 0.421795129776001, "learning_rate": 1.0665280665280666e-05, "loss": 0.0714, "step": 10850 }, { "epoch": 3.2254232254232256, "grad_norm": 0.6576681733131409, "learning_rate": 1.0647460647460648e-05, "loss": 0.0661, "step": 10860 }, { "epoch": 3.2283932283932284, "grad_norm": 0.5803960561752319, "learning_rate": 1.062964062964063e-05, "loss": 0.0732, "step": 10870 }, { "epoch": 3.2313632313632312, "grad_norm": 0.39635559916496277, "learning_rate": 1.0611820611820612e-05, "loss": 0.0473, "step": 10880 }, { "epoch": 3.2343332343332345, "grad_norm": 0.5573329329490662, "learning_rate": 1.0594000594000595e-05, "loss": 0.059, "step": 10890 }, { "epoch": 3.2373032373032373, "grad_norm": 0.6418017148971558, "learning_rate": 1.0576180576180577e-05, "loss": 0.0633, "step": 10900 }, { "epoch": 3.24027324027324, "grad_norm": 0.6030585169792175, "learning_rate": 1.0558360558360557e-05, "loss": 0.0596, "step": 10910 }, { "epoch": 3.2432432432432434, "grad_norm": 0.41735953092575073, "learning_rate": 1.0540540540540541e-05, "loss": 0.0585, "step": 10920 }, { "epoch": 3.2462132462132463, "grad_norm": 0.7560169696807861, "learning_rate": 1.0522720522720523e-05, "loss": 0.0566, "step": 10930 }, { "epoch": 3.249183249183249, "grad_norm": 0.2606422007083893, "learning_rate": 1.0504900504900505e-05, "loss": 0.0498, "step": 10940 }, { "epoch": 3.252153252153252, "grad_norm": 0.5863521695137024, "learning_rate": 1.0487080487080488e-05, "loss": 0.0603, "step": 10950 }, { "epoch": 3.255123255123255, "grad_norm": 0.4618661403656006, "learning_rate": 1.046926046926047e-05, "loss": 0.0709, "step": 10960 }, { "epoch": 3.258093258093258, "grad_norm": 0.3728097975254059, "learning_rate": 1.0451440451440452e-05, "loss": 0.0605, "step": 10970 }, { "epoch": 3.261063261063261, "grad_norm": 0.4798294007778168, "learning_rate": 1.0433620433620434e-05, "loss": 0.0559, "step": 10980 }, { "epoch": 3.264033264033264, "grad_norm": 0.6178519129753113, "learning_rate": 1.0415800415800416e-05, "loss": 0.0696, "step": 10990 }, { "epoch": 3.267003267003267, "grad_norm": 0.4793247580528259, "learning_rate": 1.0397980397980398e-05, "loss": 0.059, "step": 11000 }, { "epoch": 3.2699732699732698, "grad_norm": 0.33142969012260437, "learning_rate": 1.038016038016038e-05, "loss": 0.0439, "step": 11010 }, { "epoch": 3.272943272943273, "grad_norm": 0.261089950799942, "learning_rate": 1.0362340362340363e-05, "loss": 0.0586, "step": 11020 }, { "epoch": 3.275913275913276, "grad_norm": 0.34269529581069946, "learning_rate": 1.0344520344520345e-05, "loss": 0.0679, "step": 11030 }, { "epoch": 3.2788832788832787, "grad_norm": 0.4112348258495331, "learning_rate": 1.0326700326700327e-05, "loss": 0.0599, "step": 11040 }, { "epoch": 3.281853281853282, "grad_norm": 0.5969886183738708, "learning_rate": 1.030888030888031e-05, "loss": 0.0719, "step": 11050 }, { "epoch": 3.284823284823285, "grad_norm": 0.5105575323104858, "learning_rate": 1.0291060291060291e-05, "loss": 0.053, "step": 11060 }, { "epoch": 3.2877932877932876, "grad_norm": 0.4884382486343384, "learning_rate": 1.0273240273240274e-05, "loss": 0.0553, "step": 11070 }, { "epoch": 3.290763290763291, "grad_norm": 0.4914264678955078, "learning_rate": 1.0255420255420256e-05, "loss": 0.0592, "step": 11080 }, { "epoch": 3.2937332937332937, "grad_norm": 0.44552749395370483, "learning_rate": 1.0237600237600238e-05, "loss": 0.0527, "step": 11090 }, { "epoch": 3.2967032967032965, "grad_norm": 0.43704137206077576, "learning_rate": 1.021978021978022e-05, "loss": 0.062, "step": 11100 }, { "epoch": 3.2996732996733, "grad_norm": 0.45537468791007996, "learning_rate": 1.0201960201960202e-05, "loss": 0.0685, "step": 11110 }, { "epoch": 3.3026433026433026, "grad_norm": 0.45990774035453796, "learning_rate": 1.0184140184140184e-05, "loss": 0.0675, "step": 11120 }, { "epoch": 3.3056133056133055, "grad_norm": 0.375456303358078, "learning_rate": 1.0166320166320167e-05, "loss": 0.0585, "step": 11130 }, { "epoch": 3.3085833085833087, "grad_norm": 0.42089733481407166, "learning_rate": 1.0148500148500149e-05, "loss": 0.0636, "step": 11140 }, { "epoch": 3.3115533115533116, "grad_norm": 0.4135701060295105, "learning_rate": 1.0130680130680131e-05, "loss": 0.0742, "step": 11150 }, { "epoch": 3.3145233145233144, "grad_norm": 0.47297653555870056, "learning_rate": 1.0112860112860113e-05, "loss": 0.0723, "step": 11160 }, { "epoch": 3.3174933174933177, "grad_norm": 0.5323516726493835, "learning_rate": 1.0095040095040095e-05, "loss": 0.0577, "step": 11170 }, { "epoch": 3.3204633204633205, "grad_norm": 0.37327128648757935, "learning_rate": 1.0077220077220078e-05, "loss": 0.0606, "step": 11180 }, { "epoch": 3.3234333234333233, "grad_norm": 0.9338498711585999, "learning_rate": 1.005940005940006e-05, "loss": 0.0679, "step": 11190 }, { "epoch": 3.3264033264033266, "grad_norm": 0.5313315987586975, "learning_rate": 1.0041580041580042e-05, "loss": 0.0592, "step": 11200 }, { "epoch": 3.3293733293733294, "grad_norm": 0.27918875217437744, "learning_rate": 1.0023760023760024e-05, "loss": 0.0661, "step": 11210 }, { "epoch": 3.3323433323433322, "grad_norm": 0.3626916706562042, "learning_rate": 1.0005940005940006e-05, "loss": 0.0493, "step": 11220 }, { "epoch": 3.3353133353133355, "grad_norm": 0.43011564016342163, "learning_rate": 9.988119988119988e-06, "loss": 0.0681, "step": 11230 }, { "epoch": 3.3382833382833383, "grad_norm": 0.5412601232528687, "learning_rate": 9.97029997029997e-06, "loss": 0.0639, "step": 11240 }, { "epoch": 3.341253341253341, "grad_norm": 0.6399582028388977, "learning_rate": 9.952479952479953e-06, "loss": 0.0497, "step": 11250 }, { "epoch": 3.3442233442233444, "grad_norm": 0.44264036417007446, "learning_rate": 9.934659934659935e-06, "loss": 0.0582, "step": 11260 }, { "epoch": 3.3471933471933473, "grad_norm": 0.3276296854019165, "learning_rate": 9.916839916839917e-06, "loss": 0.0463, "step": 11270 }, { "epoch": 3.35016335016335, "grad_norm": 0.3752717077732086, "learning_rate": 9.8990198990199e-06, "loss": 0.0725, "step": 11280 }, { "epoch": 3.3531333531333534, "grad_norm": 0.5361660718917847, "learning_rate": 9.881199881199881e-06, "loss": 0.0674, "step": 11290 }, { "epoch": 3.356103356103356, "grad_norm": 0.5567395687103271, "learning_rate": 9.863379863379865e-06, "loss": 0.0708, "step": 11300 }, { "epoch": 3.359073359073359, "grad_norm": 0.4132004678249359, "learning_rate": 9.845559845559846e-06, "loss": 0.0667, "step": 11310 }, { "epoch": 3.362043362043362, "grad_norm": 0.5917862057685852, "learning_rate": 9.827739827739828e-06, "loss": 0.0494, "step": 11320 }, { "epoch": 3.365013365013365, "grad_norm": 0.41967251896858215, "learning_rate": 9.80991980991981e-06, "loss": 0.0526, "step": 11330 }, { "epoch": 3.367983367983368, "grad_norm": 0.2610296308994293, "learning_rate": 9.792099792099792e-06, "loss": 0.0597, "step": 11340 }, { "epoch": 3.3709533709533708, "grad_norm": 0.6055606603622437, "learning_rate": 9.774279774279774e-06, "loss": 0.0615, "step": 11350 }, { "epoch": 3.373923373923374, "grad_norm": 0.6907655596733093, "learning_rate": 9.756459756459757e-06, "loss": 0.0592, "step": 11360 }, { "epoch": 3.376893376893377, "grad_norm": 0.5287322402000427, "learning_rate": 9.73863973863974e-06, "loss": 0.0501, "step": 11370 }, { "epoch": 3.3798633798633797, "grad_norm": 0.3826773762702942, "learning_rate": 9.720819720819721e-06, "loss": 0.0654, "step": 11380 }, { "epoch": 3.382833382833383, "grad_norm": 0.4057276248931885, "learning_rate": 9.702999702999703e-06, "loss": 0.0686, "step": 11390 }, { "epoch": 3.385803385803386, "grad_norm": 0.3789379596710205, "learning_rate": 9.685179685179685e-06, "loss": 0.0705, "step": 11400 }, { "epoch": 3.3887733887733886, "grad_norm": 0.6244688630104065, "learning_rate": 9.667359667359667e-06, "loss": 0.0529, "step": 11410 }, { "epoch": 3.391743391743392, "grad_norm": 0.4109695255756378, "learning_rate": 9.64953964953965e-06, "loss": 0.0606, "step": 11420 }, { "epoch": 3.3947133947133947, "grad_norm": 0.5615403652191162, "learning_rate": 9.631719631719632e-06, "loss": 0.0619, "step": 11430 }, { "epoch": 3.3976833976833976, "grad_norm": 0.5328549742698669, "learning_rate": 9.613899613899616e-06, "loss": 0.0577, "step": 11440 }, { "epoch": 3.400653400653401, "grad_norm": 0.6064325571060181, "learning_rate": 9.596079596079596e-06, "loss": 0.0627, "step": 11450 }, { "epoch": 3.4036234036234037, "grad_norm": 0.3764317035675049, "learning_rate": 9.578259578259578e-06, "loss": 0.0492, "step": 11460 }, { "epoch": 3.4065934065934065, "grad_norm": 0.40372684597969055, "learning_rate": 9.56043956043956e-06, "loss": 0.0625, "step": 11470 }, { "epoch": 3.4095634095634098, "grad_norm": 0.5874956250190735, "learning_rate": 9.542619542619543e-06, "loss": 0.0554, "step": 11480 }, { "epoch": 3.4125334125334126, "grad_norm": 0.6757147908210754, "learning_rate": 9.524799524799525e-06, "loss": 0.0503, "step": 11490 }, { "epoch": 3.4155034155034154, "grad_norm": 0.33406156301498413, "learning_rate": 9.506979506979507e-06, "loss": 0.0751, "step": 11500 }, { "epoch": 3.4184734184734182, "grad_norm": 0.22471563518047333, "learning_rate": 9.48915948915949e-06, "loss": 0.0608, "step": 11510 }, { "epoch": 3.4214434214434215, "grad_norm": 0.29463276267051697, "learning_rate": 9.471339471339471e-06, "loss": 0.0541, "step": 11520 }, { "epoch": 3.4244134244134243, "grad_norm": 0.5052198171615601, "learning_rate": 9.453519453519453e-06, "loss": 0.0705, "step": 11530 }, { "epoch": 3.427383427383427, "grad_norm": 0.4901563823223114, "learning_rate": 9.435699435699436e-06, "loss": 0.0676, "step": 11540 }, { "epoch": 3.4303534303534304, "grad_norm": 0.8629029989242554, "learning_rate": 9.417879417879418e-06, "loss": 0.0601, "step": 11550 }, { "epoch": 3.4333234333234333, "grad_norm": 0.4550071656703949, "learning_rate": 9.4000594000594e-06, "loss": 0.056, "step": 11560 }, { "epoch": 3.436293436293436, "grad_norm": 0.4358525574207306, "learning_rate": 9.382239382239382e-06, "loss": 0.0581, "step": 11570 }, { "epoch": 3.4392634392634394, "grad_norm": 0.5264983773231506, "learning_rate": 9.364419364419366e-06, "loss": 0.0545, "step": 11580 }, { "epoch": 3.442233442233442, "grad_norm": 0.6651470065116882, "learning_rate": 9.346599346599347e-06, "loss": 0.0502, "step": 11590 }, { "epoch": 3.445203445203445, "grad_norm": 0.8447353839874268, "learning_rate": 9.328779328779329e-06, "loss": 0.0635, "step": 11600 }, { "epoch": 3.4481734481734483, "grad_norm": 0.39546453952789307, "learning_rate": 9.31095931095931e-06, "loss": 0.0569, "step": 11610 }, { "epoch": 3.451143451143451, "grad_norm": 0.5016785860061646, "learning_rate": 9.293139293139295e-06, "loss": 0.0566, "step": 11620 }, { "epoch": 3.454113454113454, "grad_norm": 0.4469011723995209, "learning_rate": 9.275319275319275e-06, "loss": 0.0561, "step": 11630 }, { "epoch": 3.457083457083457, "grad_norm": 0.7788525819778442, "learning_rate": 9.257499257499257e-06, "loss": 0.0609, "step": 11640 }, { "epoch": 3.46005346005346, "grad_norm": 0.38840779662132263, "learning_rate": 9.239679239679241e-06, "loss": 0.0563, "step": 11650 }, { "epoch": 3.463023463023463, "grad_norm": 0.4505913257598877, "learning_rate": 9.221859221859222e-06, "loss": 0.0551, "step": 11660 }, { "epoch": 3.465993465993466, "grad_norm": 0.4120921492576599, "learning_rate": 9.204039204039204e-06, "loss": 0.0626, "step": 11670 }, { "epoch": 3.468963468963469, "grad_norm": 0.32375073432922363, "learning_rate": 9.186219186219186e-06, "loss": 0.0702, "step": 11680 }, { "epoch": 3.471933471933472, "grad_norm": 0.7593043446540833, "learning_rate": 9.16839916839917e-06, "loss": 0.0585, "step": 11690 }, { "epoch": 3.474903474903475, "grad_norm": 0.8873201608657837, "learning_rate": 9.15057915057915e-06, "loss": 0.0556, "step": 11700 }, { "epoch": 3.477873477873478, "grad_norm": 0.23573075234889984, "learning_rate": 9.132759132759133e-06, "loss": 0.0691, "step": 11710 }, { "epoch": 3.4808434808434807, "grad_norm": 0.6220734119415283, "learning_rate": 9.114939114939116e-06, "loss": 0.0666, "step": 11720 }, { "epoch": 3.483813483813484, "grad_norm": 0.34220409393310547, "learning_rate": 9.097119097119097e-06, "loss": 0.064, "step": 11730 }, { "epoch": 3.486783486783487, "grad_norm": 0.46370092034339905, "learning_rate": 9.079299079299079e-06, "loss": 0.0723, "step": 11740 }, { "epoch": 3.4897534897534896, "grad_norm": 0.3304176926612854, "learning_rate": 9.061479061479061e-06, "loss": 0.0536, "step": 11750 }, { "epoch": 3.492723492723493, "grad_norm": 0.31319358944892883, "learning_rate": 9.043659043659045e-06, "loss": 0.0643, "step": 11760 }, { "epoch": 3.4956934956934957, "grad_norm": 0.7758733034133911, "learning_rate": 9.025839025839026e-06, "loss": 0.056, "step": 11770 }, { "epoch": 3.4986634986634986, "grad_norm": 0.48000404238700867, "learning_rate": 9.008019008019008e-06, "loss": 0.0533, "step": 11780 }, { "epoch": 3.501633501633502, "grad_norm": 0.6242011189460754, "learning_rate": 8.990198990198992e-06, "loss": 0.06, "step": 11790 }, { "epoch": 3.5046035046035047, "grad_norm": 0.6063371896743774, "learning_rate": 8.972378972378972e-06, "loss": 0.0478, "step": 11800 }, { "epoch": 3.5075735075735075, "grad_norm": 0.4791490137577057, "learning_rate": 8.954558954558954e-06, "loss": 0.0486, "step": 11810 }, { "epoch": 3.5105435105435108, "grad_norm": 0.3279794752597809, "learning_rate": 8.936738936738936e-06, "loss": 0.0655, "step": 11820 }, { "epoch": 3.5135135135135136, "grad_norm": 0.42596834897994995, "learning_rate": 8.91891891891892e-06, "loss": 0.0733, "step": 11830 }, { "epoch": 3.5164835164835164, "grad_norm": 0.4257424771785736, "learning_rate": 8.9010989010989e-06, "loss": 0.0624, "step": 11840 }, { "epoch": 3.5194535194535197, "grad_norm": 0.46473416686058044, "learning_rate": 8.883278883278883e-06, "loss": 0.0562, "step": 11850 }, { "epoch": 3.5224235224235225, "grad_norm": 0.6375032663345337, "learning_rate": 8.865458865458867e-06, "loss": 0.0589, "step": 11860 }, { "epoch": 3.5253935253935254, "grad_norm": 0.35437679290771484, "learning_rate": 8.847638847638847e-06, "loss": 0.0731, "step": 11870 }, { "epoch": 3.5283635283635286, "grad_norm": 0.5066477060317993, "learning_rate": 8.82981882981883e-06, "loss": 0.0614, "step": 11880 }, { "epoch": 3.5313335313335315, "grad_norm": 0.5429478883743286, "learning_rate": 8.811998811998812e-06, "loss": 0.0571, "step": 11890 }, { "epoch": 3.5343035343035343, "grad_norm": 0.6550652980804443, "learning_rate": 8.794178794178795e-06, "loss": 0.0569, "step": 11900 }, { "epoch": 3.5372735372735375, "grad_norm": 0.7283503413200378, "learning_rate": 8.776358776358776e-06, "loss": 0.0543, "step": 11910 }, { "epoch": 3.5402435402435404, "grad_norm": 0.3151548504829407, "learning_rate": 8.758538758538758e-06, "loss": 0.0566, "step": 11920 }, { "epoch": 3.543213543213543, "grad_norm": 0.507347583770752, "learning_rate": 8.740718740718742e-06, "loss": 0.053, "step": 11930 }, { "epoch": 3.546183546183546, "grad_norm": 0.7897441387176514, "learning_rate": 8.722898722898724e-06, "loss": 0.0688, "step": 11940 }, { "epoch": 3.5491535491535493, "grad_norm": 0.37791678309440613, "learning_rate": 8.705078705078705e-06, "loss": 0.0757, "step": 11950 }, { "epoch": 3.552123552123552, "grad_norm": 0.5913348197937012, "learning_rate": 8.687258687258687e-06, "loss": 0.0689, "step": 11960 }, { "epoch": 3.555093555093555, "grad_norm": 0.7024880647659302, "learning_rate": 8.66943866943867e-06, "loss": 0.0648, "step": 11970 }, { "epoch": 3.5580635580635582, "grad_norm": 0.37222567200660706, "learning_rate": 8.651618651618651e-06, "loss": 0.0595, "step": 11980 }, { "epoch": 3.561033561033561, "grad_norm": 0.45320606231689453, "learning_rate": 8.633798633798633e-06, "loss": 0.0638, "step": 11990 }, { "epoch": 3.564003564003564, "grad_norm": 0.46902260184288025, "learning_rate": 8.615978615978617e-06, "loss": 0.0776, "step": 12000 }, { "epoch": 3.5669735669735667, "grad_norm": 0.5722383260726929, "learning_rate": 8.5981585981586e-06, "loss": 0.0531, "step": 12010 }, { "epoch": 3.56994356994357, "grad_norm": 0.5090997815132141, "learning_rate": 8.58033858033858e-06, "loss": 0.0534, "step": 12020 }, { "epoch": 3.572913572913573, "grad_norm": 0.4689802825450897, "learning_rate": 8.562518562518562e-06, "loss": 0.0717, "step": 12030 }, { "epoch": 3.5758835758835756, "grad_norm": 0.4180223345756531, "learning_rate": 8.544698544698546e-06, "loss": 0.0456, "step": 12040 }, { "epoch": 3.578853578853579, "grad_norm": 0.30135074257850647, "learning_rate": 8.526878526878526e-06, "loss": 0.0548, "step": 12050 }, { "epoch": 3.5818235818235817, "grad_norm": 0.5609501600265503, "learning_rate": 8.509058509058509e-06, "loss": 0.0569, "step": 12060 }, { "epoch": 3.5847935847935846, "grad_norm": 0.30133068561553955, "learning_rate": 8.491238491238492e-06, "loss": 0.0499, "step": 12070 }, { "epoch": 3.587763587763588, "grad_norm": 0.4278302490711212, "learning_rate": 8.473418473418475e-06, "loss": 0.0556, "step": 12080 }, { "epoch": 3.5907335907335907, "grad_norm": 0.570552408695221, "learning_rate": 8.455598455598455e-06, "loss": 0.0667, "step": 12090 }, { "epoch": 3.5937035937035935, "grad_norm": 0.3945624828338623, "learning_rate": 8.437778437778437e-06, "loss": 0.0588, "step": 12100 }, { "epoch": 3.5966735966735968, "grad_norm": 0.5016827583312988, "learning_rate": 8.419958419958421e-06, "loss": 0.0454, "step": 12110 }, { "epoch": 3.5996435996435996, "grad_norm": 0.4540993869304657, "learning_rate": 8.402138402138402e-06, "loss": 0.0692, "step": 12120 }, { "epoch": 3.6026136026136024, "grad_norm": 0.4178672730922699, "learning_rate": 8.384318384318384e-06, "loss": 0.054, "step": 12130 }, { "epoch": 3.6055836055836057, "grad_norm": 0.6967900395393372, "learning_rate": 8.366498366498368e-06, "loss": 0.0629, "step": 12140 }, { "epoch": 3.6085536085536085, "grad_norm": 0.5746413469314575, "learning_rate": 8.34867834867835e-06, "loss": 0.0635, "step": 12150 }, { "epoch": 3.6115236115236113, "grad_norm": 0.45530760288238525, "learning_rate": 8.33085833085833e-06, "loss": 0.0632, "step": 12160 }, { "epoch": 3.6144936144936146, "grad_norm": 0.5083603858947754, "learning_rate": 8.313038313038312e-06, "loss": 0.0522, "step": 12170 }, { "epoch": 3.6174636174636174, "grad_norm": 0.4417908489704132, "learning_rate": 8.295218295218296e-06, "loss": 0.0537, "step": 12180 }, { "epoch": 3.6204336204336203, "grad_norm": 0.5929802656173706, "learning_rate": 8.277398277398278e-06, "loss": 0.0628, "step": 12190 }, { "epoch": 3.6234036234036235, "grad_norm": 0.35801371932029724, "learning_rate": 8.259578259578259e-06, "loss": 0.0568, "step": 12200 }, { "epoch": 3.6263736263736264, "grad_norm": 0.42152899503707886, "learning_rate": 8.241758241758243e-06, "loss": 0.0575, "step": 12210 }, { "epoch": 3.629343629343629, "grad_norm": 0.5134592652320862, "learning_rate": 8.223938223938225e-06, "loss": 0.0589, "step": 12220 }, { "epoch": 3.6323136323136325, "grad_norm": 0.5800890922546387, "learning_rate": 8.206118206118205e-06, "loss": 0.0711, "step": 12230 }, { "epoch": 3.6352836352836353, "grad_norm": 0.6621565222740173, "learning_rate": 8.188298188298188e-06, "loss": 0.0626, "step": 12240 }, { "epoch": 3.638253638253638, "grad_norm": 0.19206875562667847, "learning_rate": 8.170478170478171e-06, "loss": 0.054, "step": 12250 }, { "epoch": 3.6412236412236414, "grad_norm": 0.3461471199989319, "learning_rate": 8.152658152658154e-06, "loss": 0.063, "step": 12260 }, { "epoch": 3.644193644193644, "grad_norm": 0.5503948926925659, "learning_rate": 8.134838134838134e-06, "loss": 0.0674, "step": 12270 }, { "epoch": 3.647163647163647, "grad_norm": 0.3993360698223114, "learning_rate": 8.117018117018118e-06, "loss": 0.0523, "step": 12280 }, { "epoch": 3.6501336501336503, "grad_norm": 0.5561977624893188, "learning_rate": 8.0991980991981e-06, "loss": 0.0601, "step": 12290 }, { "epoch": 3.653103653103653, "grad_norm": 0.4218428134918213, "learning_rate": 8.08137808137808e-06, "loss": 0.0597, "step": 12300 }, { "epoch": 3.656073656073656, "grad_norm": 0.6830678582191467, "learning_rate": 8.063558063558063e-06, "loss": 0.0624, "step": 12310 }, { "epoch": 3.6590436590436592, "grad_norm": 0.5021694302558899, "learning_rate": 8.045738045738047e-06, "loss": 0.0725, "step": 12320 }, { "epoch": 3.662013662013662, "grad_norm": 0.6278291344642639, "learning_rate": 8.027918027918029e-06, "loss": 0.0715, "step": 12330 }, { "epoch": 3.664983664983665, "grad_norm": 0.7712084650993347, "learning_rate": 8.01009801009801e-06, "loss": 0.0609, "step": 12340 }, { "epoch": 3.667953667953668, "grad_norm": 0.47669193148612976, "learning_rate": 7.992277992277993e-06, "loss": 0.0645, "step": 12350 }, { "epoch": 3.670923670923671, "grad_norm": 0.5000527501106262, "learning_rate": 7.974457974457975e-06, "loss": 0.0463, "step": 12360 }, { "epoch": 3.673893673893674, "grad_norm": 0.409820020198822, "learning_rate": 7.956637956637956e-06, "loss": 0.0552, "step": 12370 }, { "epoch": 3.676863676863677, "grad_norm": 0.48183321952819824, "learning_rate": 7.938817938817938e-06, "loss": 0.0605, "step": 12380 }, { "epoch": 3.67983367983368, "grad_norm": 0.5534571409225464, "learning_rate": 7.920997920997922e-06, "loss": 0.0638, "step": 12390 }, { "epoch": 3.6828036828036828, "grad_norm": 0.4206744432449341, "learning_rate": 7.903177903177904e-06, "loss": 0.0634, "step": 12400 }, { "epoch": 3.685773685773686, "grad_norm": 0.5539330244064331, "learning_rate": 7.885357885357884e-06, "loss": 0.0583, "step": 12410 }, { "epoch": 3.688743688743689, "grad_norm": 0.32335200905799866, "learning_rate": 7.867537867537868e-06, "loss": 0.0604, "step": 12420 }, { "epoch": 3.6917136917136917, "grad_norm": 0.6858915686607361, "learning_rate": 7.84971784971785e-06, "loss": 0.0711, "step": 12430 }, { "epoch": 3.694683694683695, "grad_norm": 0.4419819116592407, "learning_rate": 7.831897831897831e-06, "loss": 0.0535, "step": 12440 }, { "epoch": 3.697653697653698, "grad_norm": 0.5330691933631897, "learning_rate": 7.814077814077813e-06, "loss": 0.0604, "step": 12450 }, { "epoch": 3.7006237006237006, "grad_norm": 0.5260715484619141, "learning_rate": 7.796257796257797e-06, "loss": 0.0607, "step": 12460 }, { "epoch": 3.7035937035937034, "grad_norm": 0.7059239149093628, "learning_rate": 7.77843777843778e-06, "loss": 0.0683, "step": 12470 }, { "epoch": 3.7065637065637067, "grad_norm": 0.31892430782318115, "learning_rate": 7.76061776061776e-06, "loss": 0.0701, "step": 12480 }, { "epoch": 3.7095337095337095, "grad_norm": 0.4127281606197357, "learning_rate": 7.742797742797744e-06, "loss": 0.0682, "step": 12490 }, { "epoch": 3.7125037125037124, "grad_norm": 0.23683589696884155, "learning_rate": 7.724977724977726e-06, "loss": 0.0612, "step": 12500 }, { "epoch": 3.7154737154737156, "grad_norm": 0.47517532110214233, "learning_rate": 7.707157707157708e-06, "loss": 0.0462, "step": 12510 }, { "epoch": 3.7184437184437185, "grad_norm": 0.6467389464378357, "learning_rate": 7.689337689337688e-06, "loss": 0.0664, "step": 12520 }, { "epoch": 3.7214137214137213, "grad_norm": 0.6246938705444336, "learning_rate": 7.671517671517672e-06, "loss": 0.0638, "step": 12530 }, { "epoch": 3.724383724383724, "grad_norm": 0.4938197433948517, "learning_rate": 7.653697653697654e-06, "loss": 0.0629, "step": 12540 }, { "epoch": 3.7273537273537274, "grad_norm": 0.5381590127944946, "learning_rate": 7.635877635877635e-06, "loss": 0.0621, "step": 12550 }, { "epoch": 3.73032373032373, "grad_norm": 0.2848157286643982, "learning_rate": 7.618057618057619e-06, "loss": 0.0641, "step": 12560 }, { "epoch": 3.733293733293733, "grad_norm": 0.4204511046409607, "learning_rate": 7.600237600237601e-06, "loss": 0.0606, "step": 12570 }, { "epoch": 3.7362637362637363, "grad_norm": 0.5741158723831177, "learning_rate": 7.582417582417582e-06, "loss": 0.0478, "step": 12580 }, { "epoch": 3.739233739233739, "grad_norm": 0.3851994574069977, "learning_rate": 7.564597564597564e-06, "loss": 0.067, "step": 12590 }, { "epoch": 3.742203742203742, "grad_norm": 0.35587117075920105, "learning_rate": 7.546777546777547e-06, "loss": 0.0665, "step": 12600 }, { "epoch": 3.7451737451737452, "grad_norm": 0.30616384744644165, "learning_rate": 7.528957528957529e-06, "loss": 0.064, "step": 12610 }, { "epoch": 3.748143748143748, "grad_norm": 0.5584198832511902, "learning_rate": 7.511137511137511e-06, "loss": 0.0546, "step": 12620 }, { "epoch": 3.751113751113751, "grad_norm": 0.3456946015357971, "learning_rate": 7.493317493317493e-06, "loss": 0.0571, "step": 12630 }, { "epoch": 3.754083754083754, "grad_norm": 0.5522321462631226, "learning_rate": 7.475497475497476e-06, "loss": 0.0809, "step": 12640 }, { "epoch": 3.757053757053757, "grad_norm": 0.42469412088394165, "learning_rate": 7.457677457677457e-06, "loss": 0.0629, "step": 12650 }, { "epoch": 3.76002376002376, "grad_norm": 0.5727609395980835, "learning_rate": 7.4398574398574404e-06, "loss": 0.0635, "step": 12660 }, { "epoch": 3.762993762993763, "grad_norm": 0.3833814859390259, "learning_rate": 7.422037422037423e-06, "loss": 0.0667, "step": 12670 }, { "epoch": 3.765963765963766, "grad_norm": 0.7218992114067078, "learning_rate": 7.404217404217404e-06, "loss": 0.0589, "step": 12680 }, { "epoch": 3.7689337689337687, "grad_norm": 0.5727225542068481, "learning_rate": 7.386397386397387e-06, "loss": 0.0528, "step": 12690 }, { "epoch": 3.771903771903772, "grad_norm": 0.38015714287757874, "learning_rate": 7.368577368577368e-06, "loss": 0.0538, "step": 12700 }, { "epoch": 3.774873774873775, "grad_norm": 0.32746824622154236, "learning_rate": 7.350757350757351e-06, "loss": 0.0511, "step": 12710 }, { "epoch": 3.7778437778437777, "grad_norm": 0.3238430321216583, "learning_rate": 7.332937332937333e-06, "loss": 0.0572, "step": 12720 }, { "epoch": 3.780813780813781, "grad_norm": 0.3043205142021179, "learning_rate": 7.315117315117316e-06, "loss": 0.0543, "step": 12730 }, { "epoch": 3.7837837837837838, "grad_norm": 0.23511236906051636, "learning_rate": 7.297297297297298e-06, "loss": 0.0567, "step": 12740 }, { "epoch": 3.7867537867537866, "grad_norm": 0.44706740975379944, "learning_rate": 7.27947727947728e-06, "loss": 0.0598, "step": 12750 }, { "epoch": 3.78972378972379, "grad_norm": 0.700774610042572, "learning_rate": 7.261657261657262e-06, "loss": 0.0671, "step": 12760 }, { "epoch": 3.7926937926937927, "grad_norm": 0.35849860310554504, "learning_rate": 7.2438372438372435e-06, "loss": 0.0586, "step": 12770 }, { "epoch": 3.7956637956637955, "grad_norm": 0.4785964787006378, "learning_rate": 7.2260172260172265e-06, "loss": 0.0616, "step": 12780 }, { "epoch": 3.798633798633799, "grad_norm": 0.6433180570602417, "learning_rate": 7.208197208197208e-06, "loss": 0.0582, "step": 12790 }, { "epoch": 3.8016038016038016, "grad_norm": 0.37284335494041443, "learning_rate": 7.190377190377191e-06, "loss": 0.0491, "step": 12800 }, { "epoch": 3.8045738045738045, "grad_norm": 0.576884388923645, "learning_rate": 7.172557172557173e-06, "loss": 0.0644, "step": 12810 }, { "epoch": 3.8075438075438077, "grad_norm": 0.5406507253646851, "learning_rate": 7.154737154737155e-06, "loss": 0.0621, "step": 12820 }, { "epoch": 3.8105138105138106, "grad_norm": 0.5398954749107361, "learning_rate": 7.136917136917137e-06, "loss": 0.0606, "step": 12830 }, { "epoch": 3.8134838134838134, "grad_norm": 0.3366580605506897, "learning_rate": 7.119097119097119e-06, "loss": 0.0543, "step": 12840 }, { "epoch": 3.8164538164538166, "grad_norm": 0.4284731149673462, "learning_rate": 7.101277101277102e-06, "loss": 0.0608, "step": 12850 }, { "epoch": 3.8194238194238195, "grad_norm": 0.6001728773117065, "learning_rate": 7.083457083457083e-06, "loss": 0.052, "step": 12860 }, { "epoch": 3.8223938223938223, "grad_norm": 0.6029428243637085, "learning_rate": 7.065637065637066e-06, "loss": 0.0516, "step": 12870 }, { "epoch": 3.8253638253638256, "grad_norm": 0.39351001381874084, "learning_rate": 7.047817047817048e-06, "loss": 0.0705, "step": 12880 }, { "epoch": 3.8283338283338284, "grad_norm": 0.5574610829353333, "learning_rate": 7.02999702999703e-06, "loss": 0.0749, "step": 12890 }, { "epoch": 3.8313038313038312, "grad_norm": 0.35019442439079285, "learning_rate": 7.0121770121770125e-06, "loss": 0.0612, "step": 12900 }, { "epoch": 3.8342738342738345, "grad_norm": 0.4754871129989624, "learning_rate": 6.994356994356995e-06, "loss": 0.0553, "step": 12910 }, { "epoch": 3.8372438372438373, "grad_norm": 0.42024219036102295, "learning_rate": 6.976536976536977e-06, "loss": 0.0655, "step": 12920 }, { "epoch": 3.84021384021384, "grad_norm": 0.5387091636657715, "learning_rate": 6.958716958716958e-06, "loss": 0.0599, "step": 12930 }, { "epoch": 3.8431838431838434, "grad_norm": 0.570585310459137, "learning_rate": 6.940896940896941e-06, "loss": 0.0614, "step": 12940 }, { "epoch": 3.8461538461538463, "grad_norm": 0.456065833568573, "learning_rate": 6.923076923076923e-06, "loss": 0.0654, "step": 12950 }, { "epoch": 3.849123849123849, "grad_norm": 0.7067524790763855, "learning_rate": 6.9052569052569056e-06, "loss": 0.0756, "step": 12960 }, { "epoch": 3.8520938520938524, "grad_norm": 0.46172401309013367, "learning_rate": 6.887436887436888e-06, "loss": 0.053, "step": 12970 }, { "epoch": 3.855063855063855, "grad_norm": 0.561151921749115, "learning_rate": 6.86961686961687e-06, "loss": 0.0503, "step": 12980 }, { "epoch": 3.858033858033858, "grad_norm": 0.3379230201244354, "learning_rate": 6.851796851796852e-06, "loss": 0.0626, "step": 12990 }, { "epoch": 3.861003861003861, "grad_norm": 0.6056146621704102, "learning_rate": 6.833976833976834e-06, "loss": 0.0628, "step": 13000 }, { "epoch": 3.863973863973864, "grad_norm": 0.48145750164985657, "learning_rate": 6.816156816156816e-06, "loss": 0.061, "step": 13010 }, { "epoch": 3.866943866943867, "grad_norm": 0.4073619246482849, "learning_rate": 6.7983367983367986e-06, "loss": 0.068, "step": 13020 }, { "epoch": 3.8699138699138698, "grad_norm": 0.4736767113208771, "learning_rate": 6.780516780516781e-06, "loss": 0.0595, "step": 13030 }, { "epoch": 3.872883872883873, "grad_norm": 0.4397349953651428, "learning_rate": 6.762696762696763e-06, "loss": 0.0676, "step": 13040 }, { "epoch": 3.875853875853876, "grad_norm": 0.4046313166618347, "learning_rate": 6.744876744876745e-06, "loss": 0.0552, "step": 13050 }, { "epoch": 3.8788238788238787, "grad_norm": 0.3561536371707916, "learning_rate": 6.727056727056727e-06, "loss": 0.0595, "step": 13060 }, { "epoch": 3.8817938817938815, "grad_norm": 0.5443368554115295, "learning_rate": 6.7092367092367094e-06, "loss": 0.0557, "step": 13070 }, { "epoch": 3.884763884763885, "grad_norm": 0.515012264251709, "learning_rate": 6.691416691416692e-06, "loss": 0.0488, "step": 13080 }, { "epoch": 3.8877338877338876, "grad_norm": 0.37932658195495605, "learning_rate": 6.673596673596674e-06, "loss": 0.0629, "step": 13090 }, { "epoch": 3.8907038907038904, "grad_norm": 0.4500630795955658, "learning_rate": 6.655776655776656e-06, "loss": 0.0589, "step": 13100 }, { "epoch": 3.8936738936738937, "grad_norm": 0.759432852268219, "learning_rate": 6.637956637956638e-06, "loss": 0.0547, "step": 13110 }, { "epoch": 3.8966438966438965, "grad_norm": 0.6136061549186707, "learning_rate": 6.62013662013662e-06, "loss": 0.0572, "step": 13120 }, { "epoch": 3.8996138996138994, "grad_norm": 0.5580719709396362, "learning_rate": 6.6023166023166025e-06, "loss": 0.0573, "step": 13130 }, { "epoch": 3.9025839025839026, "grad_norm": 0.5806880593299866, "learning_rate": 6.584496584496585e-06, "loss": 0.0624, "step": 13140 }, { "epoch": 3.9055539055539055, "grad_norm": 0.7236099243164062, "learning_rate": 6.566676566676567e-06, "loss": 0.069, "step": 13150 }, { "epoch": 3.9085239085239083, "grad_norm": 0.4525713622570038, "learning_rate": 6.548856548856549e-06, "loss": 0.065, "step": 13160 }, { "epoch": 3.9114939114939116, "grad_norm": 0.5687326788902283, "learning_rate": 6.531036531036531e-06, "loss": 0.0592, "step": 13170 }, { "epoch": 3.9144639144639144, "grad_norm": 0.4574839770793915, "learning_rate": 6.513216513216513e-06, "loss": 0.0691, "step": 13180 }, { "epoch": 3.9174339174339172, "grad_norm": 0.4538971483707428, "learning_rate": 6.4953964953964955e-06, "loss": 0.0562, "step": 13190 }, { "epoch": 3.9204039204039205, "grad_norm": 0.32448068261146545, "learning_rate": 6.477576477576478e-06, "loss": 0.0555, "step": 13200 }, { "epoch": 3.9233739233739233, "grad_norm": 0.5266978144645691, "learning_rate": 6.45975645975646e-06, "loss": 0.0532, "step": 13210 }, { "epoch": 3.926343926343926, "grad_norm": 0.48830193281173706, "learning_rate": 6.441936441936442e-06, "loss": 0.0609, "step": 13220 }, { "epoch": 3.9293139293139294, "grad_norm": 0.48386427760124207, "learning_rate": 6.424116424116425e-06, "loss": 0.0595, "step": 13230 }, { "epoch": 3.9322839322839322, "grad_norm": 0.33438950777053833, "learning_rate": 6.406296406296406e-06, "loss": 0.0638, "step": 13240 }, { "epoch": 3.935253935253935, "grad_norm": 0.5018361806869507, "learning_rate": 6.3884763884763885e-06, "loss": 0.0589, "step": 13250 }, { "epoch": 3.9382239382239383, "grad_norm": 0.5178138613700867, "learning_rate": 6.370656370656371e-06, "loss": 0.0619, "step": 13260 }, { "epoch": 3.941193941193941, "grad_norm": 0.4681033194065094, "learning_rate": 6.352836352836353e-06, "loss": 0.0577, "step": 13270 }, { "epoch": 3.944163944163944, "grad_norm": 0.6118980050086975, "learning_rate": 6.335016335016335e-06, "loss": 0.0563, "step": 13280 }, { "epoch": 3.9471339471339473, "grad_norm": 0.4309462010860443, "learning_rate": 6.317196317196317e-06, "loss": 0.0465, "step": 13290 }, { "epoch": 3.95010395010395, "grad_norm": 0.5277587175369263, "learning_rate": 6.2993762993763e-06, "loss": 0.0706, "step": 13300 }, { "epoch": 3.953073953073953, "grad_norm": 0.5027768611907959, "learning_rate": 6.2815562815562815e-06, "loss": 0.0491, "step": 13310 }, { "epoch": 3.956043956043956, "grad_norm": 0.560326874256134, "learning_rate": 6.2637362637362645e-06, "loss": 0.0487, "step": 13320 }, { "epoch": 3.959013959013959, "grad_norm": 0.5669682621955872, "learning_rate": 6.245916245916246e-06, "loss": 0.0491, "step": 13330 }, { "epoch": 3.961983961983962, "grad_norm": 0.49655118584632874, "learning_rate": 6.228096228096228e-06, "loss": 0.0497, "step": 13340 }, { "epoch": 3.964953964953965, "grad_norm": 0.8173234462738037, "learning_rate": 6.21027621027621e-06, "loss": 0.071, "step": 13350 }, { "epoch": 3.967923967923968, "grad_norm": 0.5078877210617065, "learning_rate": 6.192456192456192e-06, "loss": 0.0482, "step": 13360 }, { "epoch": 3.970893970893971, "grad_norm": 0.4183073937892914, "learning_rate": 6.174636174636175e-06, "loss": 0.0537, "step": 13370 }, { "epoch": 3.973863973863974, "grad_norm": 0.5460306406021118, "learning_rate": 6.156816156816157e-06, "loss": 0.0707, "step": 13380 }, { "epoch": 3.976833976833977, "grad_norm": 0.8355798125267029, "learning_rate": 6.13899613899614e-06, "loss": 0.0663, "step": 13390 }, { "epoch": 3.9798039798039797, "grad_norm": 0.5097036361694336, "learning_rate": 6.121176121176121e-06, "loss": 0.0652, "step": 13400 }, { "epoch": 3.982773982773983, "grad_norm": 0.5116889476776123, "learning_rate": 6.103356103356103e-06, "loss": 0.0584, "step": 13410 }, { "epoch": 3.985743985743986, "grad_norm": 0.4749346971511841, "learning_rate": 6.085536085536085e-06, "loss": 0.0598, "step": 13420 }, { "epoch": 3.9887139887139886, "grad_norm": 0.3732450306415558, "learning_rate": 6.0677160677160676e-06, "loss": 0.0623, "step": 13430 }, { "epoch": 3.991683991683992, "grad_norm": 0.37360072135925293, "learning_rate": 6.049896049896051e-06, "loss": 0.0577, "step": 13440 }, { "epoch": 3.9946539946539947, "grad_norm": 0.5997447967529297, "learning_rate": 6.032076032076032e-06, "loss": 0.0634, "step": 13450 }, { "epoch": 3.9976239976239976, "grad_norm": 0.4489789605140686, "learning_rate": 6.014256014256015e-06, "loss": 0.059, "step": 13460 }, { "epoch": 4.0, "eval_f1": 0.49727767695099817, "eval_loss": 0.054960619658231735, "eval_runtime": 178.7066, "eval_samples_per_second": 212.745, "eval_steps_per_second": 3.329, "step": 13468 }, { "epoch": 4.000594000594001, "grad_norm": 0.8586022257804871, "learning_rate": 5.996435996435996e-06, "loss": 0.0538, "step": 13470 }, { "epoch": 4.003564003564003, "grad_norm": 0.5165749192237854, "learning_rate": 5.978615978615979e-06, "loss": 0.0637, "step": 13480 }, { "epoch": 4.0065340065340065, "grad_norm": 0.6126316785812378, "learning_rate": 5.960795960795961e-06, "loss": 0.0514, "step": 13490 }, { "epoch": 4.00950400950401, "grad_norm": 0.572414219379425, "learning_rate": 5.942975942975943e-06, "loss": 0.0592, "step": 13500 }, { "epoch": 4.012474012474012, "grad_norm": 0.615561842918396, "learning_rate": 5.925155925155926e-06, "loss": 0.0551, "step": 13510 }, { "epoch": 4.015444015444015, "grad_norm": 0.5001277923583984, "learning_rate": 5.907335907335907e-06, "loss": 0.0586, "step": 13520 }, { "epoch": 4.018414018414019, "grad_norm": 0.4393932521343231, "learning_rate": 5.88951588951589e-06, "loss": 0.0567, "step": 13530 }, { "epoch": 4.021384021384021, "grad_norm": 0.3287888467311859, "learning_rate": 5.8716958716958714e-06, "loss": 0.0525, "step": 13540 }, { "epoch": 4.024354024354024, "grad_norm": 0.6189230680465698, "learning_rate": 5.8538758538758545e-06, "loss": 0.061, "step": 13550 }, { "epoch": 4.027324027324028, "grad_norm": 0.8372416496276855, "learning_rate": 5.836055836055836e-06, "loss": 0.0665, "step": 13560 }, { "epoch": 4.03029403029403, "grad_norm": 0.4526776373386383, "learning_rate": 5.818235818235818e-06, "loss": 0.0617, "step": 13570 }, { "epoch": 4.033264033264033, "grad_norm": 0.6300592422485352, "learning_rate": 5.800415800415801e-06, "loss": 0.0684, "step": 13580 }, { "epoch": 4.0362340362340365, "grad_norm": 0.7106212973594666, "learning_rate": 5.782595782595782e-06, "loss": 0.0635, "step": 13590 }, { "epoch": 4.039204039204039, "grad_norm": 0.47979801893234253, "learning_rate": 5.764775764775765e-06, "loss": 0.0543, "step": 13600 }, { "epoch": 4.042174042174042, "grad_norm": 0.704913854598999, "learning_rate": 5.746955746955747e-06, "loss": 0.0495, "step": 13610 }, { "epoch": 4.0451440451440455, "grad_norm": 0.5323979258537292, "learning_rate": 5.72913572913573e-06, "loss": 0.0549, "step": 13620 }, { "epoch": 4.048114048114048, "grad_norm": 0.6989266276359558, "learning_rate": 5.711315711315711e-06, "loss": 0.07, "step": 13630 }, { "epoch": 4.051084051084051, "grad_norm": 0.6013164520263672, "learning_rate": 5.693495693495694e-06, "loss": 0.0565, "step": 13640 }, { "epoch": 4.054054054054054, "grad_norm": 0.43801942467689514, "learning_rate": 5.675675675675676e-06, "loss": 0.0574, "step": 13650 }, { "epoch": 4.057024057024057, "grad_norm": 0.6650937795639038, "learning_rate": 5.6578556578556575e-06, "loss": 0.0637, "step": 13660 }, { "epoch": 4.05999405999406, "grad_norm": 0.4909881055355072, "learning_rate": 5.6400356400356405e-06, "loss": 0.0612, "step": 13670 }, { "epoch": 4.062964062964063, "grad_norm": 0.3323568105697632, "learning_rate": 5.622215622215622e-06, "loss": 0.0584, "step": 13680 }, { "epoch": 4.065934065934066, "grad_norm": 0.6184719800949097, "learning_rate": 5.604395604395605e-06, "loss": 0.0504, "step": 13690 }, { "epoch": 4.068904068904069, "grad_norm": 0.5047394037246704, "learning_rate": 5.586575586575586e-06, "loss": 0.0531, "step": 13700 }, { "epoch": 4.071874071874072, "grad_norm": 0.6481796503067017, "learning_rate": 5.568755568755569e-06, "loss": 0.0613, "step": 13710 }, { "epoch": 4.074844074844075, "grad_norm": 0.7215176224708557, "learning_rate": 5.550935550935551e-06, "loss": 0.0633, "step": 13720 }, { "epoch": 4.077814077814078, "grad_norm": 0.16339148581027985, "learning_rate": 5.533115533115533e-06, "loss": 0.058, "step": 13730 }, { "epoch": 4.080784080784081, "grad_norm": 0.5942262411117554, "learning_rate": 5.515295515295516e-06, "loss": 0.0733, "step": 13740 }, { "epoch": 4.0837540837540836, "grad_norm": 0.4894910454750061, "learning_rate": 5.497475497475497e-06, "loss": 0.0645, "step": 13750 }, { "epoch": 4.086724086724087, "grad_norm": 0.44156116247177124, "learning_rate": 5.47965547965548e-06, "loss": 0.0555, "step": 13760 }, { "epoch": 4.08969408969409, "grad_norm": 0.45034366846084595, "learning_rate": 5.461835461835461e-06, "loss": 0.0648, "step": 13770 }, { "epoch": 4.0926640926640925, "grad_norm": 0.43881091475486755, "learning_rate": 5.444015444015444e-06, "loss": 0.059, "step": 13780 }, { "epoch": 4.095634095634096, "grad_norm": 0.6181434988975525, "learning_rate": 5.4261954261954265e-06, "loss": 0.0698, "step": 13790 }, { "epoch": 4.098604098604099, "grad_norm": 0.39531105756759644, "learning_rate": 5.408375408375409e-06, "loss": 0.0563, "step": 13800 }, { "epoch": 4.101574101574101, "grad_norm": 0.44663333892822266, "learning_rate": 5.390555390555391e-06, "loss": 0.0529, "step": 13810 }, { "epoch": 4.104544104544105, "grad_norm": 0.591187059879303, "learning_rate": 5.372735372735372e-06, "loss": 0.0524, "step": 13820 }, { "epoch": 4.107514107514108, "grad_norm": 0.5794005990028381, "learning_rate": 5.354915354915355e-06, "loss": 0.0529, "step": 13830 }, { "epoch": 4.11048411048411, "grad_norm": 0.312919944524765, "learning_rate": 5.337095337095337e-06, "loss": 0.0651, "step": 13840 }, { "epoch": 4.113454113454114, "grad_norm": 0.5957525968551636, "learning_rate": 5.3192753192753196e-06, "loss": 0.0523, "step": 13850 }, { "epoch": 4.116424116424117, "grad_norm": 0.6151428818702698, "learning_rate": 5.301455301455302e-06, "loss": 0.0581, "step": 13860 }, { "epoch": 4.119394119394119, "grad_norm": 0.4753796458244324, "learning_rate": 5.283635283635284e-06, "loss": 0.0514, "step": 13870 }, { "epoch": 4.1223641223641225, "grad_norm": 0.45062291622161865, "learning_rate": 5.265815265815266e-06, "loss": 0.0467, "step": 13880 }, { "epoch": 4.125334125334125, "grad_norm": 0.4602527320384979, "learning_rate": 5.247995247995247e-06, "loss": 0.0583, "step": 13890 }, { "epoch": 4.128304128304128, "grad_norm": 0.543065071105957, "learning_rate": 5.2301752301752304e-06, "loss": 0.0493, "step": 13900 }, { "epoch": 4.1312741312741315, "grad_norm": 0.40139061212539673, "learning_rate": 5.212355212355213e-06, "loss": 0.0543, "step": 13910 }, { "epoch": 4.134244134244134, "grad_norm": 0.40932586789131165, "learning_rate": 5.194535194535195e-06, "loss": 0.0491, "step": 13920 }, { "epoch": 4.137214137214137, "grad_norm": 0.7136752605438232, "learning_rate": 5.176715176715177e-06, "loss": 0.0665, "step": 13930 }, { "epoch": 4.14018414018414, "grad_norm": 0.81020188331604, "learning_rate": 5.158895158895159e-06, "loss": 0.0516, "step": 13940 }, { "epoch": 4.143154143154143, "grad_norm": 0.3689301311969757, "learning_rate": 5.141075141075141e-06, "loss": 0.0617, "step": 13950 }, { "epoch": 4.146124146124146, "grad_norm": 0.296916663646698, "learning_rate": 5.1232551232551234e-06, "loss": 0.0507, "step": 13960 }, { "epoch": 4.149094149094149, "grad_norm": 0.45669737458229065, "learning_rate": 5.105435105435106e-06, "loss": 0.0589, "step": 13970 }, { "epoch": 4.152064152064152, "grad_norm": 0.7257834076881409, "learning_rate": 5.087615087615088e-06, "loss": 0.0762, "step": 13980 }, { "epoch": 4.155034155034155, "grad_norm": 0.4654732942581177, "learning_rate": 5.06979506979507e-06, "loss": 0.0503, "step": 13990 }, { "epoch": 4.158004158004158, "grad_norm": 0.4994029700756073, "learning_rate": 5.051975051975052e-06, "loss": 0.0535, "step": 14000 }, { "epoch": 4.160974160974161, "grad_norm": 0.47293511033058167, "learning_rate": 5.034155034155034e-06, "loss": 0.0564, "step": 14010 }, { "epoch": 4.163944163944164, "grad_norm": 0.3141496777534485, "learning_rate": 5.0163350163350165e-06, "loss": 0.0597, "step": 14020 }, { "epoch": 4.166914166914167, "grad_norm": 0.2851223051548004, "learning_rate": 4.998514998514999e-06, "loss": 0.0549, "step": 14030 }, { "epoch": 4.1698841698841695, "grad_norm": 0.9652001261711121, "learning_rate": 4.980694980694981e-06, "loss": 0.0605, "step": 14040 }, { "epoch": 4.172854172854173, "grad_norm": 0.6175165772438049, "learning_rate": 4.962874962874963e-06, "loss": 0.0574, "step": 14050 }, { "epoch": 4.175824175824176, "grad_norm": 0.39955687522888184, "learning_rate": 4.945054945054945e-06, "loss": 0.0733, "step": 14060 }, { "epoch": 4.1787941787941785, "grad_norm": 0.5539454817771912, "learning_rate": 4.927234927234927e-06, "loss": 0.0587, "step": 14070 }, { "epoch": 4.181764181764182, "grad_norm": 0.574409008026123, "learning_rate": 4.9094149094149095e-06, "loss": 0.0642, "step": 14080 }, { "epoch": 4.184734184734185, "grad_norm": 0.4297143816947937, "learning_rate": 4.891594891594892e-06, "loss": 0.0567, "step": 14090 }, { "epoch": 4.187704187704187, "grad_norm": 0.49302181601524353, "learning_rate": 4.873774873774874e-06, "loss": 0.0562, "step": 14100 }, { "epoch": 4.190674190674191, "grad_norm": 0.8171068429946899, "learning_rate": 4.855954855954856e-06, "loss": 0.0656, "step": 14110 }, { "epoch": 4.193644193644194, "grad_norm": 0.6117607951164246, "learning_rate": 4.838134838134839e-06, "loss": 0.0596, "step": 14120 }, { "epoch": 4.196614196614196, "grad_norm": 0.33238255977630615, "learning_rate": 4.82031482031482e-06, "loss": 0.0493, "step": 14130 }, { "epoch": 4.1995841995842, "grad_norm": 0.3627205789089203, "learning_rate": 4.8024948024948025e-06, "loss": 0.0579, "step": 14140 }, { "epoch": 4.202554202554203, "grad_norm": 0.6033427119255066, "learning_rate": 4.784674784674785e-06, "loss": 0.0568, "step": 14150 }, { "epoch": 4.205524205524205, "grad_norm": 0.5274185538291931, "learning_rate": 4.766854766854767e-06, "loss": 0.062, "step": 14160 }, { "epoch": 4.2084942084942085, "grad_norm": 0.4550093114376068, "learning_rate": 4.749034749034749e-06, "loss": 0.0747, "step": 14170 }, { "epoch": 4.211464211464212, "grad_norm": 0.382213294506073, "learning_rate": 4.731214731214731e-06, "loss": 0.0457, "step": 14180 }, { "epoch": 4.214434214434214, "grad_norm": 0.5736550092697144, "learning_rate": 4.713394713394714e-06, "loss": 0.0614, "step": 14190 }, { "epoch": 4.2174042174042174, "grad_norm": 0.5673187971115112, "learning_rate": 4.6955746955746955e-06, "loss": 0.0582, "step": 14200 }, { "epoch": 4.220374220374221, "grad_norm": 0.6587729454040527, "learning_rate": 4.677754677754678e-06, "loss": 0.0681, "step": 14210 }, { "epoch": 4.223344223344223, "grad_norm": 0.6249194741249084, "learning_rate": 4.65993465993466e-06, "loss": 0.0662, "step": 14220 }, { "epoch": 4.226314226314226, "grad_norm": 0.6569053530693054, "learning_rate": 4.642114642114642e-06, "loss": 0.0634, "step": 14230 }, { "epoch": 4.22928422928423, "grad_norm": 0.6076725125312805, "learning_rate": 4.624294624294624e-06, "loss": 0.0641, "step": 14240 }, { "epoch": 4.232254232254232, "grad_norm": 0.4433649182319641, "learning_rate": 4.606474606474606e-06, "loss": 0.0471, "step": 14250 }, { "epoch": 4.235224235224235, "grad_norm": 0.34535735845565796, "learning_rate": 4.588654588654589e-06, "loss": 0.0619, "step": 14260 }, { "epoch": 4.238194238194239, "grad_norm": 0.3933964967727661, "learning_rate": 4.570834570834571e-06, "loss": 0.0588, "step": 14270 }, { "epoch": 4.241164241164241, "grad_norm": 0.577758252620697, "learning_rate": 4.553014553014554e-06, "loss": 0.0625, "step": 14280 }, { "epoch": 4.244134244134244, "grad_norm": 0.4267483353614807, "learning_rate": 4.535194535194535e-06, "loss": 0.0456, "step": 14290 }, { "epoch": 4.2471042471042475, "grad_norm": 0.42397600412368774, "learning_rate": 4.517374517374517e-06, "loss": 0.056, "step": 14300 }, { "epoch": 4.25007425007425, "grad_norm": 0.3087056279182434, "learning_rate": 4.499554499554499e-06, "loss": 0.0564, "step": 14310 }, { "epoch": 4.253044253044253, "grad_norm": 0.3736560046672821, "learning_rate": 4.481734481734482e-06, "loss": 0.0647, "step": 14320 }, { "epoch": 4.256014256014256, "grad_norm": 0.37401074171066284, "learning_rate": 4.463914463914465e-06, "loss": 0.0526, "step": 14330 }, { "epoch": 4.258984258984259, "grad_norm": 0.6431254744529724, "learning_rate": 4.446094446094446e-06, "loss": 0.072, "step": 14340 }, { "epoch": 4.261954261954262, "grad_norm": 0.3994961380958557, "learning_rate": 4.428274428274429e-06, "loss": 0.0539, "step": 14350 }, { "epoch": 4.2649242649242645, "grad_norm": 0.5059460997581482, "learning_rate": 4.41045441045441e-06, "loss": 0.0557, "step": 14360 }, { "epoch": 4.267894267894268, "grad_norm": 0.201277494430542, "learning_rate": 4.392634392634393e-06, "loss": 0.0557, "step": 14370 }, { "epoch": 4.270864270864271, "grad_norm": 0.22198070585727692, "learning_rate": 4.374814374814375e-06, "loss": 0.0526, "step": 14380 }, { "epoch": 4.273834273834273, "grad_norm": 0.2608140707015991, "learning_rate": 4.356994356994357e-06, "loss": 0.0518, "step": 14390 }, { "epoch": 4.276804276804277, "grad_norm": 0.3986319601535797, "learning_rate": 4.33917433917434e-06, "loss": 0.0596, "step": 14400 }, { "epoch": 4.27977427977428, "grad_norm": 0.5883366465568542, "learning_rate": 4.321354321354321e-06, "loss": 0.058, "step": 14410 }, { "epoch": 4.282744282744282, "grad_norm": 0.7146860361099243, "learning_rate": 4.303534303534304e-06, "loss": 0.0638, "step": 14420 }, { "epoch": 4.285714285714286, "grad_norm": 0.630452036857605, "learning_rate": 4.2857142857142855e-06, "loss": 0.0519, "step": 14430 }, { "epoch": 4.288684288684289, "grad_norm": 0.7049713730812073, "learning_rate": 4.2678942678942685e-06, "loss": 0.0559, "step": 14440 }, { "epoch": 4.291654291654291, "grad_norm": 0.31321823596954346, "learning_rate": 4.25007425007425e-06, "loss": 0.054, "step": 14450 }, { "epoch": 4.2946242946242945, "grad_norm": 0.8444371223449707, "learning_rate": 4.232254232254232e-06, "loss": 0.0589, "step": 14460 }, { "epoch": 4.297594297594298, "grad_norm": 0.5905739665031433, "learning_rate": 4.214434214434215e-06, "loss": 0.0633, "step": 14470 }, { "epoch": 4.3005643005643, "grad_norm": 0.4641624093055725, "learning_rate": 4.196614196614196e-06, "loss": 0.0611, "step": 14480 }, { "epoch": 4.303534303534303, "grad_norm": 0.5575865507125854, "learning_rate": 4.178794178794179e-06, "loss": 0.0576, "step": 14490 }, { "epoch": 4.306504306504307, "grad_norm": 0.7232492566108704, "learning_rate": 4.160974160974161e-06, "loss": 0.0575, "step": 14500 }, { "epoch": 4.309474309474309, "grad_norm": 0.5242018103599548, "learning_rate": 4.143154143154144e-06, "loss": 0.0692, "step": 14510 }, { "epoch": 4.312444312444312, "grad_norm": 0.622914731502533, "learning_rate": 4.125334125334125e-06, "loss": 0.0688, "step": 14520 }, { "epoch": 4.315414315414316, "grad_norm": 0.5062875151634216, "learning_rate": 4.107514107514108e-06, "loss": 0.0542, "step": 14530 }, { "epoch": 4.318384318384318, "grad_norm": 0.5135970711708069, "learning_rate": 4.08969408969409e-06, "loss": 0.0525, "step": 14540 }, { "epoch": 4.321354321354321, "grad_norm": 0.2701030969619751, "learning_rate": 4.0718740718740715e-06, "loss": 0.0525, "step": 14550 }, { "epoch": 4.324324324324325, "grad_norm": 0.7602173089981079, "learning_rate": 4.0540540540540545e-06, "loss": 0.053, "step": 14560 }, { "epoch": 4.327294327294327, "grad_norm": 0.7320886254310608, "learning_rate": 4.036234036234036e-06, "loss": 0.0576, "step": 14570 }, { "epoch": 4.33026433026433, "grad_norm": 0.422878623008728, "learning_rate": 4.018414018414019e-06, "loss": 0.0578, "step": 14580 }, { "epoch": 4.3332343332343335, "grad_norm": 0.449724018573761, "learning_rate": 4.000594000594e-06, "loss": 0.0664, "step": 14590 }, { "epoch": 4.336204336204336, "grad_norm": 0.22872653603553772, "learning_rate": 3.982773982773983e-06, "loss": 0.0529, "step": 14600 }, { "epoch": 4.339174339174339, "grad_norm": 0.4547821581363678, "learning_rate": 3.964953964953965e-06, "loss": 0.0541, "step": 14610 }, { "epoch": 4.342144342144342, "grad_norm": 0.5161837339401245, "learning_rate": 3.947133947133947e-06, "loss": 0.0439, "step": 14620 }, { "epoch": 4.345114345114345, "grad_norm": 0.6731418371200562, "learning_rate": 3.92931392931393e-06, "loss": 0.0554, "step": 14630 }, { "epoch": 4.348084348084348, "grad_norm": 0.46018585562705994, "learning_rate": 3.911493911493911e-06, "loss": 0.0632, "step": 14640 }, { "epoch": 4.351054351054351, "grad_norm": 0.3375426232814789, "learning_rate": 3.893673893673894e-06, "loss": 0.065, "step": 14650 }, { "epoch": 4.354024354024354, "grad_norm": 0.5720539093017578, "learning_rate": 3.875853875853875e-06, "loss": 0.0595, "step": 14660 }, { "epoch": 4.356994356994357, "grad_norm": 0.542365312576294, "learning_rate": 3.858033858033858e-06, "loss": 0.0493, "step": 14670 }, { "epoch": 4.35996435996436, "grad_norm": 0.6491771340370178, "learning_rate": 3.8402138402138406e-06, "loss": 0.0588, "step": 14680 }, { "epoch": 4.362934362934363, "grad_norm": 0.7092576622962952, "learning_rate": 3.822393822393823e-06, "loss": 0.0637, "step": 14690 }, { "epoch": 4.365904365904366, "grad_norm": 0.5155068635940552, "learning_rate": 3.804573804573805e-06, "loss": 0.0502, "step": 14700 }, { "epoch": 4.368874368874369, "grad_norm": 0.31838563084602356, "learning_rate": 3.7867537867537867e-06, "loss": 0.0575, "step": 14710 }, { "epoch": 4.371844371844372, "grad_norm": 0.7911087274551392, "learning_rate": 3.7689337689337693e-06, "loss": 0.0628, "step": 14720 }, { "epoch": 4.374814374814375, "grad_norm": 0.26239511370658875, "learning_rate": 3.751113751113751e-06, "loss": 0.0452, "step": 14730 }, { "epoch": 4.377784377784378, "grad_norm": 0.5743318796157837, "learning_rate": 3.733293733293733e-06, "loss": 0.0618, "step": 14740 }, { "epoch": 4.3807543807543805, "grad_norm": 0.520468533039093, "learning_rate": 3.7154737154737153e-06, "loss": 0.0578, "step": 14750 }, { "epoch": 4.383724383724384, "grad_norm": 0.30406662821769714, "learning_rate": 3.6976536976536975e-06, "loss": 0.0479, "step": 14760 }, { "epoch": 4.386694386694387, "grad_norm": 0.363372266292572, "learning_rate": 3.67983367983368e-06, "loss": 0.0523, "step": 14770 }, { "epoch": 4.389664389664389, "grad_norm": 0.6119177341461182, "learning_rate": 3.6620136620136623e-06, "loss": 0.0593, "step": 14780 }, { "epoch": 4.392634392634393, "grad_norm": 0.7049959897994995, "learning_rate": 3.6441936441936444e-06, "loss": 0.0554, "step": 14790 }, { "epoch": 4.395604395604396, "grad_norm": 0.6827198266983032, "learning_rate": 3.6263736263736266e-06, "loss": 0.0536, "step": 14800 }, { "epoch": 4.398574398574398, "grad_norm": 0.4505496025085449, "learning_rate": 3.6085536085536088e-06, "loss": 0.044, "step": 14810 }, { "epoch": 4.401544401544402, "grad_norm": 0.36443957686424255, "learning_rate": 3.5907335907335905e-06, "loss": 0.065, "step": 14820 }, { "epoch": 4.404514404514405, "grad_norm": 0.4884301424026489, "learning_rate": 3.5729135729135727e-06, "loss": 0.0476, "step": 14830 }, { "epoch": 4.407484407484407, "grad_norm": 0.504188597202301, "learning_rate": 3.5550935550935553e-06, "loss": 0.0563, "step": 14840 }, { "epoch": 4.410454410454411, "grad_norm": 0.19332559406757355, "learning_rate": 3.5372735372735375e-06, "loss": 0.0727, "step": 14850 }, { "epoch": 4.413424413424414, "grad_norm": 0.33928439021110535, "learning_rate": 3.5194535194535196e-06, "loss": 0.0538, "step": 14860 }, { "epoch": 4.416394416394416, "grad_norm": 0.6077583432197571, "learning_rate": 3.501633501633502e-06, "loss": 0.0583, "step": 14870 }, { "epoch": 4.4193644193644195, "grad_norm": 0.5217536091804504, "learning_rate": 3.483813483813484e-06, "loss": 0.0515, "step": 14880 }, { "epoch": 4.422334422334423, "grad_norm": 0.7069948315620422, "learning_rate": 3.465993465993466e-06, "loss": 0.0787, "step": 14890 }, { "epoch": 4.425304425304425, "grad_norm": 0.5601736307144165, "learning_rate": 3.448173448173448e-06, "loss": 0.0603, "step": 14900 }, { "epoch": 4.428274428274428, "grad_norm": 0.687710702419281, "learning_rate": 3.4303534303534305e-06, "loss": 0.0743, "step": 14910 }, { "epoch": 4.431244431244432, "grad_norm": 0.4097294807434082, "learning_rate": 3.4125334125334127e-06, "loss": 0.053, "step": 14920 }, { "epoch": 4.434214434214434, "grad_norm": 0.42233291268348694, "learning_rate": 3.394713394713395e-06, "loss": 0.0547, "step": 14930 }, { "epoch": 4.437184437184437, "grad_norm": 0.44154420495033264, "learning_rate": 3.376893376893377e-06, "loss": 0.0619, "step": 14940 }, { "epoch": 4.440154440154441, "grad_norm": 0.8544827699661255, "learning_rate": 3.359073359073359e-06, "loss": 0.0567, "step": 14950 }, { "epoch": 4.443124443124443, "grad_norm": 0.5352084636688232, "learning_rate": 3.3412533412533413e-06, "loss": 0.0544, "step": 14960 }, { "epoch": 4.446094446094446, "grad_norm": 0.3393558859825134, "learning_rate": 3.3234333234333235e-06, "loss": 0.048, "step": 14970 }, { "epoch": 4.4490644490644495, "grad_norm": 0.4206056296825409, "learning_rate": 3.3056133056133057e-06, "loss": 0.0599, "step": 14980 }, { "epoch": 4.452034452034452, "grad_norm": 0.4742394983768463, "learning_rate": 3.287793287793288e-06, "loss": 0.067, "step": 14990 }, { "epoch": 4.455004455004455, "grad_norm": 0.5058844685554504, "learning_rate": 3.26997326997327e-06, "loss": 0.0625, "step": 15000 }, { "epoch": 4.457974457974458, "grad_norm": 0.39022761583328247, "learning_rate": 3.252153252153252e-06, "loss": 0.0634, "step": 15010 }, { "epoch": 4.460944460944461, "grad_norm": 0.46778586506843567, "learning_rate": 3.2343332343332344e-06, "loss": 0.0609, "step": 15020 }, { "epoch": 4.463914463914464, "grad_norm": 0.7826917767524719, "learning_rate": 3.2165132165132165e-06, "loss": 0.0652, "step": 15030 }, { "epoch": 4.4668844668844665, "grad_norm": 0.3851190507411957, "learning_rate": 3.1986931986931987e-06, "loss": 0.0706, "step": 15040 }, { "epoch": 4.46985446985447, "grad_norm": 0.5744338631629944, "learning_rate": 3.1808731808731813e-06, "loss": 0.0616, "step": 15050 }, { "epoch": 4.472824472824473, "grad_norm": 0.2627275288105011, "learning_rate": 3.163053163053163e-06, "loss": 0.0545, "step": 15060 }, { "epoch": 4.475794475794475, "grad_norm": 0.6903046369552612, "learning_rate": 3.1452331452331452e-06, "loss": 0.0538, "step": 15070 }, { "epoch": 4.478764478764479, "grad_norm": 0.49576228857040405, "learning_rate": 3.1274131274131274e-06, "loss": 0.0555, "step": 15080 }, { "epoch": 4.481734481734482, "grad_norm": 0.5750555396080017, "learning_rate": 3.1095931095931096e-06, "loss": 0.0526, "step": 15090 }, { "epoch": 4.484704484704484, "grad_norm": 0.5842902660369873, "learning_rate": 3.0917730917730917e-06, "loss": 0.0654, "step": 15100 }, { "epoch": 4.487674487674488, "grad_norm": 0.6240746974945068, "learning_rate": 3.073953073953074e-06, "loss": 0.0542, "step": 15110 }, { "epoch": 4.490644490644491, "grad_norm": 0.5041930079460144, "learning_rate": 3.0561330561330565e-06, "loss": 0.0539, "step": 15120 }, { "epoch": 4.493614493614493, "grad_norm": 0.7403512597084045, "learning_rate": 3.0383130383130387e-06, "loss": 0.0514, "step": 15130 }, { "epoch": 4.4965844965844965, "grad_norm": 0.39922061562538147, "learning_rate": 3.0204930204930204e-06, "loss": 0.0552, "step": 15140 }, { "epoch": 4.4995544995545, "grad_norm": 0.2986512780189514, "learning_rate": 3.0026730026730026e-06, "loss": 0.0417, "step": 15150 }, { "epoch": 4.502524502524502, "grad_norm": 0.5681390166282654, "learning_rate": 2.9848529848529848e-06, "loss": 0.0576, "step": 15160 }, { "epoch": 4.5054945054945055, "grad_norm": 0.3006349802017212, "learning_rate": 2.967032967032967e-06, "loss": 0.0556, "step": 15170 }, { "epoch": 4.508464508464509, "grad_norm": 0.35743093490600586, "learning_rate": 2.949212949212949e-06, "loss": 0.0598, "step": 15180 }, { "epoch": 4.511434511434511, "grad_norm": 0.7890453934669495, "learning_rate": 2.9313929313929317e-06, "loss": 0.0611, "step": 15190 }, { "epoch": 4.514404514404514, "grad_norm": 0.5027909874916077, "learning_rate": 2.913572913572914e-06, "loss": 0.0599, "step": 15200 }, { "epoch": 4.517374517374518, "grad_norm": 0.41626325249671936, "learning_rate": 2.895752895752896e-06, "loss": 0.0689, "step": 15210 }, { "epoch": 4.52034452034452, "grad_norm": 0.48036375641822815, "learning_rate": 2.877932877932878e-06, "loss": 0.0591, "step": 15220 }, { "epoch": 4.523314523314523, "grad_norm": 0.3339380919933319, "learning_rate": 2.86011286011286e-06, "loss": 0.0522, "step": 15230 }, { "epoch": 4.526284526284527, "grad_norm": 0.5192808508872986, "learning_rate": 2.842292842292842e-06, "loss": 0.0495, "step": 15240 }, { "epoch": 4.529254529254529, "grad_norm": 0.39185869693756104, "learning_rate": 2.8244728244728243e-06, "loss": 0.0523, "step": 15250 }, { "epoch": 4.532224532224532, "grad_norm": 0.5810967683792114, "learning_rate": 2.806652806652807e-06, "loss": 0.0596, "step": 15260 }, { "epoch": 4.5351945351945355, "grad_norm": 0.48891574144363403, "learning_rate": 2.788832788832789e-06, "loss": 0.0684, "step": 15270 }, { "epoch": 4.538164538164538, "grad_norm": 0.6249604821205139, "learning_rate": 2.7710127710127712e-06, "loss": 0.0605, "step": 15280 }, { "epoch": 4.541134541134541, "grad_norm": 0.5719090700149536, "learning_rate": 2.7531927531927534e-06, "loss": 0.0513, "step": 15290 }, { "epoch": 4.5441045441045445, "grad_norm": 0.5488110780715942, "learning_rate": 2.7353727353727356e-06, "loss": 0.0685, "step": 15300 }, { "epoch": 4.547074547074547, "grad_norm": 0.38646382093429565, "learning_rate": 2.7175527175527173e-06, "loss": 0.0557, "step": 15310 }, { "epoch": 4.55004455004455, "grad_norm": 0.4562876224517822, "learning_rate": 2.6997326997326995e-06, "loss": 0.0538, "step": 15320 }, { "epoch": 4.553014553014553, "grad_norm": 0.3206016719341278, "learning_rate": 2.681912681912682e-06, "loss": 0.0469, "step": 15330 }, { "epoch": 4.555984555984556, "grad_norm": 0.4230201542377472, "learning_rate": 2.6640926640926642e-06, "loss": 0.064, "step": 15340 }, { "epoch": 4.558954558954559, "grad_norm": 0.6635040640830994, "learning_rate": 2.6462726462726464e-06, "loss": 0.0572, "step": 15350 }, { "epoch": 4.561924561924562, "grad_norm": 0.6302227973937988, "learning_rate": 2.6284526284526286e-06, "loss": 0.0706, "step": 15360 }, { "epoch": 4.564894564894565, "grad_norm": 0.6194272637367249, "learning_rate": 2.6106326106326108e-06, "loss": 0.0787, "step": 15370 }, { "epoch": 4.567864567864568, "grad_norm": 0.7719616293907166, "learning_rate": 2.592812592812593e-06, "loss": 0.0657, "step": 15380 }, { "epoch": 4.57083457083457, "grad_norm": 0.541800856590271, "learning_rate": 2.574992574992575e-06, "loss": 0.0691, "step": 15390 }, { "epoch": 4.573804573804574, "grad_norm": 0.4170493483543396, "learning_rate": 2.5571725571725573e-06, "loss": 0.0623, "step": 15400 }, { "epoch": 4.576774576774577, "grad_norm": 0.6817463636398315, "learning_rate": 2.5393525393525394e-06, "loss": 0.0475, "step": 15410 }, { "epoch": 4.579744579744579, "grad_norm": 0.5324716567993164, "learning_rate": 2.5215325215325216e-06, "loss": 0.0609, "step": 15420 }, { "epoch": 4.5827145827145825, "grad_norm": 0.3345739245414734, "learning_rate": 2.5037125037125038e-06, "loss": 0.0547, "step": 15430 }, { "epoch": 4.585684585684586, "grad_norm": 0.5235359072685242, "learning_rate": 2.485892485892486e-06, "loss": 0.0439, "step": 15440 }, { "epoch": 4.588654588654588, "grad_norm": 0.5767530202865601, "learning_rate": 2.468072468072468e-06, "loss": 0.0509, "step": 15450 }, { "epoch": 4.5916245916245915, "grad_norm": 0.37491995096206665, "learning_rate": 2.4502524502524507e-06, "loss": 0.0537, "step": 15460 }, { "epoch": 4.594594594594595, "grad_norm": 0.49927496910095215, "learning_rate": 2.4324324324324325e-06, "loss": 0.0612, "step": 15470 }, { "epoch": 4.597564597564597, "grad_norm": 0.8037787079811096, "learning_rate": 2.4146124146124146e-06, "loss": 0.059, "step": 15480 }, { "epoch": 4.6005346005346, "grad_norm": 0.6241805553436279, "learning_rate": 2.396792396792397e-06, "loss": 0.0667, "step": 15490 }, { "epoch": 4.603504603504604, "grad_norm": 0.4899803698062897, "learning_rate": 2.378972378972379e-06, "loss": 0.0547, "step": 15500 }, { "epoch": 4.606474606474606, "grad_norm": 0.7477651834487915, "learning_rate": 2.361152361152361e-06, "loss": 0.055, "step": 15510 }, { "epoch": 4.609444609444609, "grad_norm": 0.35865089297294617, "learning_rate": 2.3433323433323433e-06, "loss": 0.0518, "step": 15520 }, { "epoch": 4.612414612414613, "grad_norm": 0.6939175128936768, "learning_rate": 2.325512325512326e-06, "loss": 0.0556, "step": 15530 }, { "epoch": 4.615384615384615, "grad_norm": 0.6515450477600098, "learning_rate": 2.307692307692308e-06, "loss": 0.066, "step": 15540 }, { "epoch": 4.618354618354618, "grad_norm": 0.5336460471153259, "learning_rate": 2.28987228987229e-06, "loss": 0.0601, "step": 15550 }, { "epoch": 4.6213246213246215, "grad_norm": 0.43573182821273804, "learning_rate": 2.272052272052272e-06, "loss": 0.062, "step": 15560 }, { "epoch": 4.624294624294624, "grad_norm": 0.5898021459579468, "learning_rate": 2.254232254232254e-06, "loss": 0.0508, "step": 15570 }, { "epoch": 4.627264627264627, "grad_norm": 0.4121955931186676, "learning_rate": 2.2364122364122363e-06, "loss": 0.0572, "step": 15580 }, { "epoch": 4.63023463023463, "grad_norm": 0.4609485864639282, "learning_rate": 2.2185922185922185e-06, "loss": 0.0569, "step": 15590 }, { "epoch": 4.633204633204633, "grad_norm": 0.5508905053138733, "learning_rate": 2.200772200772201e-06, "loss": 0.0514, "step": 15600 }, { "epoch": 4.636174636174636, "grad_norm": 0.2802213430404663, "learning_rate": 2.1829521829521833e-06, "loss": 0.0655, "step": 15610 }, { "epoch": 4.639144639144639, "grad_norm": 0.4371926784515381, "learning_rate": 2.1651321651321654e-06, "loss": 0.0651, "step": 15620 }, { "epoch": 4.642114642114642, "grad_norm": 0.42453938722610474, "learning_rate": 2.147312147312147e-06, "loss": 0.0511, "step": 15630 }, { "epoch": 4.645084645084645, "grad_norm": 0.5437641143798828, "learning_rate": 2.1294921294921294e-06, "loss": 0.0668, "step": 15640 }, { "epoch": 4.648054648054648, "grad_norm": 0.3894469141960144, "learning_rate": 2.1116721116721115e-06, "loss": 0.062, "step": 15650 }, { "epoch": 4.651024651024651, "grad_norm": 0.47583234310150146, "learning_rate": 2.0938520938520937e-06, "loss": 0.0566, "step": 15660 }, { "epoch": 4.653994653994654, "grad_norm": 0.4548485279083252, "learning_rate": 2.0760320760320763e-06, "loss": 0.056, "step": 15670 }, { "epoch": 4.656964656964657, "grad_norm": 0.802291989326477, "learning_rate": 2.0582120582120585e-06, "loss": 0.0655, "step": 15680 }, { "epoch": 4.65993465993466, "grad_norm": 0.7516531944274902, "learning_rate": 2.0403920403920406e-06, "loss": 0.0639, "step": 15690 }, { "epoch": 4.662904662904663, "grad_norm": 0.32423585653305054, "learning_rate": 2.022572022572023e-06, "loss": 0.0569, "step": 15700 }, { "epoch": 4.665874665874666, "grad_norm": 0.6043174266815186, "learning_rate": 2.0047520047520046e-06, "loss": 0.0579, "step": 15710 }, { "epoch": 4.6688446688446685, "grad_norm": 0.6407160758972168, "learning_rate": 1.9869319869319867e-06, "loss": 0.0567, "step": 15720 }, { "epoch": 4.671814671814672, "grad_norm": 0.4470699727535248, "learning_rate": 1.969111969111969e-06, "loss": 0.0588, "step": 15730 }, { "epoch": 4.674784674784675, "grad_norm": 0.582695484161377, "learning_rate": 1.9512919512919515e-06, "loss": 0.0617, "step": 15740 }, { "epoch": 4.6777546777546775, "grad_norm": 0.5506203770637512, "learning_rate": 1.9334719334719337e-06, "loss": 0.0561, "step": 15750 }, { "epoch": 4.680724680724681, "grad_norm": 0.47693562507629395, "learning_rate": 1.915651915651916e-06, "loss": 0.0681, "step": 15760 }, { "epoch": 4.683694683694684, "grad_norm": 0.5993645191192627, "learning_rate": 1.8978318978318978e-06, "loss": 0.0568, "step": 15770 }, { "epoch": 4.686664686664686, "grad_norm": 0.4940324127674103, "learning_rate": 1.88001188001188e-06, "loss": 0.0598, "step": 15780 }, { "epoch": 4.68963468963469, "grad_norm": 0.2504422664642334, "learning_rate": 1.8621918621918623e-06, "loss": 0.0529, "step": 15790 }, { "epoch": 4.692604692604693, "grad_norm": 0.46793416142463684, "learning_rate": 1.8443718443718445e-06, "loss": 0.0512, "step": 15800 }, { "epoch": 4.695574695574695, "grad_norm": 0.5812580585479736, "learning_rate": 1.8265518265518265e-06, "loss": 0.0651, "step": 15810 }, { "epoch": 4.698544698544699, "grad_norm": 0.6013128757476807, "learning_rate": 1.8087318087318088e-06, "loss": 0.0468, "step": 15820 }, { "epoch": 4.701514701514702, "grad_norm": 0.7705390453338623, "learning_rate": 1.790911790911791e-06, "loss": 0.068, "step": 15830 }, { "epoch": 4.704484704484704, "grad_norm": 0.3266391158103943, "learning_rate": 1.7730917730917732e-06, "loss": 0.0599, "step": 15840 }, { "epoch": 4.7074547074547075, "grad_norm": 0.5801645517349243, "learning_rate": 1.7552717552717551e-06, "loss": 0.05, "step": 15850 }, { "epoch": 4.710424710424711, "grad_norm": 0.4369991719722748, "learning_rate": 1.7374517374517375e-06, "loss": 0.0555, "step": 15860 }, { "epoch": 4.713394713394713, "grad_norm": 0.411531925201416, "learning_rate": 1.7196317196317197e-06, "loss": 0.0548, "step": 15870 }, { "epoch": 4.716364716364716, "grad_norm": 0.3529096841812134, "learning_rate": 1.7018117018117019e-06, "loss": 0.0603, "step": 15880 }, { "epoch": 4.71933471933472, "grad_norm": 0.4712686240673065, "learning_rate": 1.683991683991684e-06, "loss": 0.044, "step": 15890 }, { "epoch": 4.722304722304722, "grad_norm": 0.5159938335418701, "learning_rate": 1.6661716661716662e-06, "loss": 0.0581, "step": 15900 }, { "epoch": 4.725274725274725, "grad_norm": 0.7334055304527283, "learning_rate": 1.6483516483516484e-06, "loss": 0.0515, "step": 15910 }, { "epoch": 4.728244728244729, "grad_norm": 0.6379159688949585, "learning_rate": 1.6305316305316306e-06, "loss": 0.0504, "step": 15920 }, { "epoch": 4.731214731214731, "grad_norm": 0.45981767773628235, "learning_rate": 1.6127116127116127e-06, "loss": 0.0656, "step": 15930 }, { "epoch": 4.734184734184734, "grad_norm": 0.39534708857536316, "learning_rate": 1.594891594891595e-06, "loss": 0.0563, "step": 15940 }, { "epoch": 4.737154737154738, "grad_norm": 0.40459519624710083, "learning_rate": 1.577071577071577e-06, "loss": 0.0539, "step": 15950 }, { "epoch": 4.74012474012474, "grad_norm": 0.353635311126709, "learning_rate": 1.5592515592515594e-06, "loss": 0.0473, "step": 15960 }, { "epoch": 4.743094743094743, "grad_norm": 0.45498237013816833, "learning_rate": 1.5414315414315414e-06, "loss": 0.0547, "step": 15970 }, { "epoch": 4.7460647460647465, "grad_norm": 0.47604072093963623, "learning_rate": 1.5236115236115236e-06, "loss": 0.0664, "step": 15980 }, { "epoch": 4.749034749034749, "grad_norm": 0.5030866265296936, "learning_rate": 1.5057915057915057e-06, "loss": 0.0542, "step": 15990 }, { "epoch": 4.752004752004752, "grad_norm": 0.6311854124069214, "learning_rate": 1.4879714879714881e-06, "loss": 0.0574, "step": 16000 }, { "epoch": 4.754974754974755, "grad_norm": 0.44515228271484375, "learning_rate": 1.47015147015147e-06, "loss": 0.0396, "step": 16010 }, { "epoch": 4.757944757944758, "grad_norm": 0.31190499663352966, "learning_rate": 1.4523314523314523e-06, "loss": 0.048, "step": 16020 }, { "epoch": 4.760914760914761, "grad_norm": 0.3565562069416046, "learning_rate": 1.4345114345114346e-06, "loss": 0.0548, "step": 16030 }, { "epoch": 4.763884763884764, "grad_norm": 0.4140501320362091, "learning_rate": 1.4166914166914168e-06, "loss": 0.0577, "step": 16040 }, { "epoch": 4.766854766854767, "grad_norm": 0.5318161845207214, "learning_rate": 1.3988713988713988e-06, "loss": 0.052, "step": 16050 }, { "epoch": 4.76982476982477, "grad_norm": 0.48852646350860596, "learning_rate": 1.381051381051381e-06, "loss": 0.0738, "step": 16060 }, { "epoch": 4.772794772794773, "grad_norm": 0.501015305519104, "learning_rate": 1.3632313632313633e-06, "loss": 0.0584, "step": 16070 }, { "epoch": 4.775764775764776, "grad_norm": 0.46425512433052063, "learning_rate": 1.3454113454113455e-06, "loss": 0.0639, "step": 16080 }, { "epoch": 4.778734778734779, "grad_norm": 0.4860481321811676, "learning_rate": 1.3275913275913275e-06, "loss": 0.0624, "step": 16090 }, { "epoch": 4.781704781704782, "grad_norm": 0.7363678812980652, "learning_rate": 1.3097713097713098e-06, "loss": 0.0735, "step": 16100 }, { "epoch": 4.784674784674785, "grad_norm": 0.6220631003379822, "learning_rate": 1.291951291951292e-06, "loss": 0.0591, "step": 16110 }, { "epoch": 4.787644787644788, "grad_norm": 0.3801935613155365, "learning_rate": 1.2741312741312742e-06, "loss": 0.0486, "step": 16120 }, { "epoch": 4.79061479061479, "grad_norm": 0.39542245864868164, "learning_rate": 1.2563112563112563e-06, "loss": 0.0567, "step": 16130 }, { "epoch": 4.7935847935847935, "grad_norm": 0.6074013113975525, "learning_rate": 1.2384912384912385e-06, "loss": 0.0597, "step": 16140 }, { "epoch": 4.796554796554797, "grad_norm": 0.3309972584247589, "learning_rate": 1.2206712206712207e-06, "loss": 0.0726, "step": 16150 }, { "epoch": 4.799524799524799, "grad_norm": 0.5621445775032043, "learning_rate": 1.2028512028512029e-06, "loss": 0.0561, "step": 16160 }, { "epoch": 4.802494802494802, "grad_norm": 0.571205198764801, "learning_rate": 1.185031185031185e-06, "loss": 0.0476, "step": 16170 }, { "epoch": 4.805464805464806, "grad_norm": 0.4768125116825104, "learning_rate": 1.1672111672111672e-06, "loss": 0.0513, "step": 16180 }, { "epoch": 4.808434808434808, "grad_norm": 0.5495672821998596, "learning_rate": 1.1493911493911494e-06, "loss": 0.0585, "step": 16190 }, { "epoch": 4.811404811404811, "grad_norm": 0.4319486916065216, "learning_rate": 1.1315711315711318e-06, "loss": 0.0559, "step": 16200 }, { "epoch": 4.814374814374815, "grad_norm": 0.5664613246917725, "learning_rate": 1.1137511137511137e-06, "loss": 0.0524, "step": 16210 }, { "epoch": 4.817344817344817, "grad_norm": 0.4833865463733673, "learning_rate": 1.0959310959310959e-06, "loss": 0.0592, "step": 16220 }, { "epoch": 4.82031482031482, "grad_norm": 0.49978017807006836, "learning_rate": 1.078111078111078e-06, "loss": 0.0616, "step": 16230 }, { "epoch": 4.8232848232848236, "grad_norm": 0.434505432844162, "learning_rate": 1.0602910602910604e-06, "loss": 0.0699, "step": 16240 }, { "epoch": 4.826254826254826, "grad_norm": 0.4497815668582916, "learning_rate": 1.0424710424710424e-06, "loss": 0.0573, "step": 16250 }, { "epoch": 4.829224829224829, "grad_norm": 0.5861119031906128, "learning_rate": 1.0246510246510246e-06, "loss": 0.0623, "step": 16260 }, { "epoch": 4.8321948321948325, "grad_norm": 0.3796347677707672, "learning_rate": 1.006831006831007e-06, "loss": 0.0628, "step": 16270 }, { "epoch": 4.835164835164835, "grad_norm": 0.5198697447776794, "learning_rate": 9.890109890109891e-07, "loss": 0.0502, "step": 16280 }, { "epoch": 4.838134838134838, "grad_norm": 0.8420373797416687, "learning_rate": 9.711909711909713e-07, "loss": 0.0627, "step": 16290 }, { "epoch": 4.841104841104841, "grad_norm": 0.5385600328445435, "learning_rate": 9.533709533709534e-07, "loss": 0.054, "step": 16300 }, { "epoch": 4.844074844074844, "grad_norm": 0.675041913986206, "learning_rate": 9.355509355509356e-07, "loss": 0.056, "step": 16310 }, { "epoch": 4.847044847044847, "grad_norm": 0.6432201862335205, "learning_rate": 9.177309177309178e-07, "loss": 0.046, "step": 16320 }, { "epoch": 4.85001485001485, "grad_norm": 0.26743176579475403, "learning_rate": 8.999108999109e-07, "loss": 0.0611, "step": 16330 }, { "epoch": 4.852984852984853, "grad_norm": 0.4432642459869385, "learning_rate": 8.820908820908821e-07, "loss": 0.0514, "step": 16340 }, { "epoch": 4.855954855954856, "grad_norm": 0.6377805471420288, "learning_rate": 8.642708642708643e-07, "loss": 0.0523, "step": 16350 }, { "epoch": 4.858924858924859, "grad_norm": 0.34250327944755554, "learning_rate": 8.464508464508465e-07, "loss": 0.053, "step": 16360 }, { "epoch": 4.861894861894862, "grad_norm": 0.32881438732147217, "learning_rate": 8.286308286308286e-07, "loss": 0.0568, "step": 16370 }, { "epoch": 4.864864864864865, "grad_norm": 0.38570886850357056, "learning_rate": 8.108108108108109e-07, "loss": 0.0611, "step": 16380 }, { "epoch": 4.867834867834868, "grad_norm": 0.34529945254325867, "learning_rate": 7.92990792990793e-07, "loss": 0.0528, "step": 16390 }, { "epoch": 4.870804870804871, "grad_norm": 0.401865690946579, "learning_rate": 7.751707751707753e-07, "loss": 0.0592, "step": 16400 }, { "epoch": 4.873774873774874, "grad_norm": 0.4996122419834137, "learning_rate": 7.573507573507573e-07, "loss": 0.0717, "step": 16410 }, { "epoch": 4.876744876744877, "grad_norm": 0.9507100582122803, "learning_rate": 7.395307395307396e-07, "loss": 0.0518, "step": 16420 }, { "epoch": 4.8797148797148795, "grad_norm": 0.5046108961105347, "learning_rate": 7.217107217107217e-07, "loss": 0.0649, "step": 16430 }, { "epoch": 4.882684882684883, "grad_norm": 0.6091616153717041, "learning_rate": 7.03890703890704e-07, "loss": 0.0544, "step": 16440 }, { "epoch": 4.885654885654886, "grad_norm": 0.526391863822937, "learning_rate": 6.860706860706861e-07, "loss": 0.0581, "step": 16450 }, { "epoch": 4.888624888624888, "grad_norm": 0.6778053641319275, "learning_rate": 6.682506682506683e-07, "loss": 0.0534, "step": 16460 }, { "epoch": 4.891594891594892, "grad_norm": 0.5965909957885742, "learning_rate": 6.504306504306505e-07, "loss": 0.0584, "step": 16470 }, { "epoch": 4.894564894564894, "grad_norm": 0.4215756058692932, "learning_rate": 6.326106326106326e-07, "loss": 0.0603, "step": 16480 }, { "epoch": 4.897534897534897, "grad_norm": 0.3755158483982086, "learning_rate": 6.147906147906148e-07, "loss": 0.0571, "step": 16490 }, { "epoch": 4.900504900504901, "grad_norm": 0.6438432335853577, "learning_rate": 5.969705969705971e-07, "loss": 0.0544, "step": 16500 }, { "epoch": 4.903474903474903, "grad_norm": 0.4326302111148834, "learning_rate": 5.791505791505791e-07, "loss": 0.064, "step": 16510 }, { "epoch": 4.906444906444906, "grad_norm": 0.39046719670295715, "learning_rate": 5.613305613305614e-07, "loss": 0.0541, "step": 16520 }, { "epoch": 4.9094149094149095, "grad_norm": 0.35211583971977234, "learning_rate": 5.435105435105435e-07, "loss": 0.0629, "step": 16530 }, { "epoch": 4.912384912384912, "grad_norm": 0.6884411573410034, "learning_rate": 5.256905256905258e-07, "loss": 0.0538, "step": 16540 }, { "epoch": 4.915354915354915, "grad_norm": 0.322244793176651, "learning_rate": 5.078705078705078e-07, "loss": 0.0696, "step": 16550 }, { "epoch": 4.9183249183249185, "grad_norm": 0.5389405488967896, "learning_rate": 4.900504900504901e-07, "loss": 0.0568, "step": 16560 }, { "epoch": 4.921294921294921, "grad_norm": 0.5457450747489929, "learning_rate": 4.7223047223047227e-07, "loss": 0.0529, "step": 16570 }, { "epoch": 4.924264924264924, "grad_norm": 0.4502221643924713, "learning_rate": 4.544104544104544e-07, "loss": 0.0438, "step": 16580 }, { "epoch": 4.927234927234927, "grad_norm": 0.486514151096344, "learning_rate": 4.365904365904366e-07, "loss": 0.0625, "step": 16590 }, { "epoch": 4.93020493020493, "grad_norm": 0.5433149337768555, "learning_rate": 4.187704187704188e-07, "loss": 0.0605, "step": 16600 }, { "epoch": 4.933174933174933, "grad_norm": 0.7155877351760864, "learning_rate": 4.0095040095040095e-07, "loss": 0.0538, "step": 16610 }, { "epoch": 4.936144936144936, "grad_norm": 0.24715302884578705, "learning_rate": 3.831303831303831e-07, "loss": 0.0484, "step": 16620 }, { "epoch": 4.939114939114939, "grad_norm": 0.47398287057876587, "learning_rate": 3.653103653103653e-07, "loss": 0.0638, "step": 16630 }, { "epoch": 4.942084942084942, "grad_norm": 0.31817615032196045, "learning_rate": 3.4749034749034746e-07, "loss": 0.0498, "step": 16640 }, { "epoch": 4.945054945054945, "grad_norm": 0.30618026852607727, "learning_rate": 3.296703296703297e-07, "loss": 0.0606, "step": 16650 }, { "epoch": 4.948024948024948, "grad_norm": 0.29916951060295105, "learning_rate": 3.1185031185031186e-07, "loss": 0.0471, "step": 16660 }, { "epoch": 4.950994950994951, "grad_norm": 0.711303174495697, "learning_rate": 2.9403029403029403e-07, "loss": 0.0548, "step": 16670 }, { "epoch": 4.953964953964954, "grad_norm": 0.34268659353256226, "learning_rate": 2.762102762102762e-07, "loss": 0.052, "step": 16680 }, { "epoch": 4.956934956934957, "grad_norm": 0.5266901850700378, "learning_rate": 2.5839025839025837e-07, "loss": 0.061, "step": 16690 }, { "epoch": 4.95990495990496, "grad_norm": 0.5868252515792847, "learning_rate": 2.4057024057024054e-07, "loss": 0.0563, "step": 16700 }, { "epoch": 4.962874962874963, "grad_norm": 0.24245183169841766, "learning_rate": 2.2275022275022276e-07, "loss": 0.0639, "step": 16710 }, { "epoch": 4.9658449658449655, "grad_norm": 0.4494710862636566, "learning_rate": 2.0493020493020493e-07, "loss": 0.0498, "step": 16720 }, { "epoch": 4.968814968814969, "grad_norm": 0.525842010974884, "learning_rate": 1.8711018711018713e-07, "loss": 0.0578, "step": 16730 }, { "epoch": 4.971784971784972, "grad_norm": 0.6086418032646179, "learning_rate": 1.692901692901693e-07, "loss": 0.0424, "step": 16740 }, { "epoch": 4.974754974754974, "grad_norm": 0.4772314429283142, "learning_rate": 1.5147015147015147e-07, "loss": 0.0534, "step": 16750 }, { "epoch": 4.977724977724978, "grad_norm": 0.7088252305984497, "learning_rate": 1.3365013365013367e-07, "loss": 0.055, "step": 16760 }, { "epoch": 4.980694980694981, "grad_norm": 0.5903290510177612, "learning_rate": 1.1583011583011584e-07, "loss": 0.0549, "step": 16770 }, { "epoch": 4.983664983664983, "grad_norm": 0.48824286460876465, "learning_rate": 9.801009801009801e-08, "loss": 0.0468, "step": 16780 }, { "epoch": 4.986634986634987, "grad_norm": 0.3941417634487152, "learning_rate": 8.019008019008019e-08, "loss": 0.0708, "step": 16790 }, { "epoch": 4.98960498960499, "grad_norm": 0.5667988657951355, "learning_rate": 6.237006237006238e-08, "loss": 0.0532, "step": 16800 }, { "epoch": 4.992574992574992, "grad_norm": 0.4835197627544403, "learning_rate": 4.4550044550044554e-08, "loss": 0.0511, "step": 16810 }, { "epoch": 4.9955449955449955, "grad_norm": 0.5256985425949097, "learning_rate": 2.673002673002673e-08, "loss": 0.0624, "step": 16820 }, { "epoch": 4.998514998514999, "grad_norm": 0.398252934217453, "learning_rate": 8.91000891000891e-09, "loss": 0.0563, "step": 16830 }, { "epoch": 5.0, "eval_f1": 0.49727767695099817, "eval_loss": 0.053983673453330994, "eval_runtime": 176.2895, "eval_samples_per_second": 215.662, "eval_steps_per_second": 3.375, "step": 16835 }, { "epoch": 4.251451653622823, "grad_norm": 0.4622216820716858, "learning_rate": 4.491290078263065e-06, "loss": 0.0754, "step": 16840 }, { "epoch": 4.253976268619035, "grad_norm": 0.38623130321502686, "learning_rate": 4.476142388285787e-06, "loss": 0.0505, "step": 16850 }, { "epoch": 4.2565008836152485, "grad_norm": 0.32597488164901733, "learning_rate": 4.460994698308508e-06, "loss": 0.0473, "step": 16860 }, { "epoch": 4.259025498611462, "grad_norm": 0.599904477596283, "learning_rate": 4.44584700833123e-06, "loss": 0.0524, "step": 16870 }, { "epoch": 4.261550113607675, "grad_norm": 0.4074048101902008, "learning_rate": 4.4306993183539506e-06, "loss": 0.0605, "step": 16880 }, { "epoch": 4.264074728603888, "grad_norm": 0.626695454120636, "learning_rate": 4.415551628376672e-06, "loss": 0.0584, "step": 16890 }, { "epoch": 4.266599343600101, "grad_norm": 0.46520286798477173, "learning_rate": 4.400403938399395e-06, "loss": 0.0452, "step": 16900 }, { "epoch": 4.269123958596314, "grad_norm": 0.7951592206954956, "learning_rate": 4.385256248422115e-06, "loss": 0.071, "step": 16910 }, { "epoch": 4.271648573592527, "grad_norm": 0.5409834384918213, "learning_rate": 4.370108558444837e-06, "loss": 0.0467, "step": 16920 }, { "epoch": 4.27417318858874, "grad_norm": 0.6036372780799866, "learning_rate": 4.354960868467559e-06, "loss": 0.064, "step": 16930 }, { "epoch": 4.276697803584954, "grad_norm": 0.4542910158634186, "learning_rate": 4.33981317849028e-06, "loss": 0.0547, "step": 16940 }, { "epoch": 4.279222418581166, "grad_norm": 0.6374622583389282, "learning_rate": 4.324665488513002e-06, "loss": 0.0537, "step": 16950 }, { "epoch": 4.281747033577379, "grad_norm": 0.6870420575141907, "learning_rate": 4.309517798535723e-06, "loss": 0.0639, "step": 16960 }, { "epoch": 4.2842716485735926, "grad_norm": 0.24296802282333374, "learning_rate": 4.294370108558445e-06, "loss": 0.0614, "step": 16970 }, { "epoch": 4.286796263569806, "grad_norm": 0.5068966150283813, "learning_rate": 4.2792224185811665e-06, "loss": 0.0667, "step": 16980 }, { "epoch": 4.289320878566019, "grad_norm": 0.49634042382240295, "learning_rate": 4.264074728603888e-06, "loss": 0.0482, "step": 16990 }, { "epoch": 4.291845493562231, "grad_norm": 0.8153424263000488, "learning_rate": 4.24892703862661e-06, "loss": 0.0542, "step": 17000 }, { "epoch": 4.294370108558445, "grad_norm": 0.19083431363105774, "learning_rate": 4.233779348649331e-06, "loss": 0.0612, "step": 17010 }, { "epoch": 4.296894723554658, "grad_norm": 0.4229993522167206, "learning_rate": 4.218631658672053e-06, "loss": 0.0518, "step": 17020 }, { "epoch": 4.299419338550871, "grad_norm": 0.8197377920150757, "learning_rate": 4.2034839686947745e-06, "loss": 0.0547, "step": 17030 }, { "epoch": 4.3019439535470845, "grad_norm": 0.44996774196624756, "learning_rate": 4.188336278717495e-06, "loss": 0.0495, "step": 17040 }, { "epoch": 4.304468568543297, "grad_norm": 0.4352714419364929, "learning_rate": 4.173188588740217e-06, "loss": 0.047, "step": 17050 }, { "epoch": 4.30699318353951, "grad_norm": 0.3896523714065552, "learning_rate": 4.158040898762939e-06, "loss": 0.0599, "step": 17060 }, { "epoch": 4.309517798535723, "grad_norm": 0.6314728260040283, "learning_rate": 4.14289320878566e-06, "loss": 0.0604, "step": 17070 }, { "epoch": 4.312042413531937, "grad_norm": 0.6164297461509705, "learning_rate": 4.127745518808382e-06, "loss": 0.0649, "step": 17080 }, { "epoch": 4.314567028528149, "grad_norm": 0.47392478585243225, "learning_rate": 4.112597828831104e-06, "loss": 0.0482, "step": 17090 }, { "epoch": 4.317091643524362, "grad_norm": 0.4184396266937256, "learning_rate": 4.097450138853825e-06, "loss": 0.0576, "step": 17100 }, { "epoch": 4.3196162585205755, "grad_norm": 0.3965582251548767, "learning_rate": 4.082302448876546e-06, "loss": 0.0658, "step": 17110 }, { "epoch": 4.322140873516789, "grad_norm": 0.4759332835674286, "learning_rate": 4.067154758899268e-06, "loss": 0.0694, "step": 17120 }, { "epoch": 4.324665488513002, "grad_norm": 0.6103851795196533, "learning_rate": 4.0520070689219896e-06, "loss": 0.0632, "step": 17130 }, { "epoch": 4.327190103509215, "grad_norm": 0.3435596525669098, "learning_rate": 4.036859378944711e-06, "loss": 0.0632, "step": 17140 }, { "epoch": 4.329714718505428, "grad_norm": 0.6255317330360413, "learning_rate": 4.021711688967433e-06, "loss": 0.0607, "step": 17150 }, { "epoch": 4.332239333501641, "grad_norm": 0.8034877181053162, "learning_rate": 4.006563998990154e-06, "loss": 0.0624, "step": 17160 }, { "epoch": 4.334763948497854, "grad_norm": 0.5104978084564209, "learning_rate": 3.991416309012875e-06, "loss": 0.0723, "step": 17170 }, { "epoch": 4.337288563494067, "grad_norm": 0.6457841992378235, "learning_rate": 3.976268619035597e-06, "loss": 0.0622, "step": 17180 }, { "epoch": 4.33981317849028, "grad_norm": 0.5124953985214233, "learning_rate": 3.961120929058319e-06, "loss": 0.0608, "step": 17190 }, { "epoch": 4.342337793486493, "grad_norm": 0.4378756582736969, "learning_rate": 3.94597323908104e-06, "loss": 0.0483, "step": 17200 }, { "epoch": 4.344862408482706, "grad_norm": 0.47140154242515564, "learning_rate": 3.9308255491037615e-06, "loss": 0.0649, "step": 17210 }, { "epoch": 4.34738702347892, "grad_norm": 0.39003312587738037, "learning_rate": 3.915677859126484e-06, "loss": 0.0589, "step": 17220 }, { "epoch": 4.349911638475133, "grad_norm": 0.5201835036277771, "learning_rate": 3.900530169149205e-06, "loss": 0.0528, "step": 17230 }, { "epoch": 4.352436253471345, "grad_norm": 0.4116949439048767, "learning_rate": 3.885382479171926e-06, "loss": 0.0531, "step": 17240 }, { "epoch": 4.3549608684675585, "grad_norm": 0.39697498083114624, "learning_rate": 3.870234789194648e-06, "loss": 0.0733, "step": 17250 }, { "epoch": 4.357485483463772, "grad_norm": 0.4850797653198242, "learning_rate": 3.8550870992173695e-06, "loss": 0.0654, "step": 17260 }, { "epoch": 4.360010098459985, "grad_norm": 0.42553943395614624, "learning_rate": 3.839939409240091e-06, "loss": 0.0451, "step": 17270 }, { "epoch": 4.362534713456198, "grad_norm": 0.27774763107299805, "learning_rate": 3.824791719262813e-06, "loss": 0.0532, "step": 17280 }, { "epoch": 4.365059328452411, "grad_norm": 0.36856329441070557, "learning_rate": 3.8096440292855342e-06, "loss": 0.0499, "step": 17290 }, { "epoch": 4.367583943448624, "grad_norm": 0.6865664720535278, "learning_rate": 3.7944963393082554e-06, "loss": 0.0646, "step": 17300 }, { "epoch": 4.370108558444837, "grad_norm": 0.809834897518158, "learning_rate": 3.7793486493309766e-06, "loss": 0.0623, "step": 17310 }, { "epoch": 4.37263317344105, "grad_norm": 0.5114462971687317, "learning_rate": 3.764200959353699e-06, "loss": 0.055, "step": 17320 }, { "epoch": 4.375157788437264, "grad_norm": 0.6078599095344543, "learning_rate": 3.74905326937642e-06, "loss": 0.0654, "step": 17330 }, { "epoch": 4.377682403433476, "grad_norm": 0.48811349272727966, "learning_rate": 3.733905579399142e-06, "loss": 0.0555, "step": 17340 }, { "epoch": 4.380207018429689, "grad_norm": 0.7374588847160339, "learning_rate": 3.718757889421863e-06, "loss": 0.0593, "step": 17350 }, { "epoch": 4.3827316334259026, "grad_norm": 0.6511560678482056, "learning_rate": 3.703610199444585e-06, "loss": 0.0544, "step": 17360 }, { "epoch": 4.385256248422116, "grad_norm": 0.4263114333152771, "learning_rate": 3.6884625094673066e-06, "loss": 0.064, "step": 17370 }, { "epoch": 4.387780863418329, "grad_norm": 0.2922056317329407, "learning_rate": 3.6733148194900277e-06, "loss": 0.0573, "step": 17380 }, { "epoch": 4.390305478414541, "grad_norm": 0.7642768025398254, "learning_rate": 3.6581671295127493e-06, "loss": 0.0558, "step": 17390 }, { "epoch": 4.392830093410755, "grad_norm": 0.5975914597511292, "learning_rate": 3.643019439535471e-06, "loss": 0.075, "step": 17400 }, { "epoch": 4.395354708406968, "grad_norm": 0.4351644515991211, "learning_rate": 3.6278717495581925e-06, "loss": 0.0635, "step": 17410 }, { "epoch": 4.397879323403181, "grad_norm": 0.6523928046226501, "learning_rate": 3.612724059580914e-06, "loss": 0.0545, "step": 17420 }, { "epoch": 4.4004039383993945, "grad_norm": 0.4286153018474579, "learning_rate": 3.5975763696036353e-06, "loss": 0.0507, "step": 17430 }, { "epoch": 4.402928553395607, "grad_norm": 0.402811735868454, "learning_rate": 3.582428679626357e-06, "loss": 0.0595, "step": 17440 }, { "epoch": 4.40545316839182, "grad_norm": 0.5500208139419556, "learning_rate": 3.567280989649079e-06, "loss": 0.0553, "step": 17450 }, { "epoch": 4.407977783388033, "grad_norm": 0.7133852243423462, "learning_rate": 3.5521332996718e-06, "loss": 0.059, "step": 17460 }, { "epoch": 4.410502398384247, "grad_norm": 0.8194918036460876, "learning_rate": 3.5369856096945217e-06, "loss": 0.0521, "step": 17470 }, { "epoch": 4.41302701338046, "grad_norm": 0.5027428865432739, "learning_rate": 3.521837919717243e-06, "loss": 0.0491, "step": 17480 }, { "epoch": 4.415551628376672, "grad_norm": 0.46674638986587524, "learning_rate": 3.506690229739965e-06, "loss": 0.0543, "step": 17490 }, { "epoch": 4.4180762433728855, "grad_norm": 0.6677160263061523, "learning_rate": 3.4915425397626865e-06, "loss": 0.0464, "step": 17500 }, { "epoch": 4.420600858369099, "grad_norm": 0.3993780314922333, "learning_rate": 3.4763948497854076e-06, "loss": 0.0635, "step": 17510 }, { "epoch": 4.423125473365312, "grad_norm": 0.44299226999282837, "learning_rate": 3.4612471598081292e-06, "loss": 0.0675, "step": 17520 }, { "epoch": 4.425650088361525, "grad_norm": 0.47991326451301575, "learning_rate": 3.4460994698308512e-06, "loss": 0.0491, "step": 17530 }, { "epoch": 4.428174703357738, "grad_norm": 0.5460741519927979, "learning_rate": 3.4309517798535724e-06, "loss": 0.0688, "step": 17540 }, { "epoch": 4.430699318353951, "grad_norm": 0.5100826621055603, "learning_rate": 3.415804089876294e-06, "loss": 0.0686, "step": 17550 }, { "epoch": 4.433223933350164, "grad_norm": 0.7981113195419312, "learning_rate": 3.400656399899015e-06, "loss": 0.0605, "step": 17560 }, { "epoch": 4.435748548346377, "grad_norm": 0.42095330357551575, "learning_rate": 3.385508709921737e-06, "loss": 0.0621, "step": 17570 }, { "epoch": 4.43827316334259, "grad_norm": 0.4400339722633362, "learning_rate": 3.3703610199444588e-06, "loss": 0.0636, "step": 17580 }, { "epoch": 4.440797778338803, "grad_norm": 0.4648873805999756, "learning_rate": 3.35521332996718e-06, "loss": 0.0746, "step": 17590 }, { "epoch": 4.443322393335016, "grad_norm": 0.4564558267593384, "learning_rate": 3.3400656399899016e-06, "loss": 0.0546, "step": 17600 }, { "epoch": 4.44584700833123, "grad_norm": 0.4136642515659332, "learning_rate": 3.324917950012623e-06, "loss": 0.0557, "step": 17610 }, { "epoch": 4.448371623327443, "grad_norm": 0.4328581392765045, "learning_rate": 3.3097702600353447e-06, "loss": 0.0565, "step": 17620 }, { "epoch": 4.450896238323656, "grad_norm": 0.3888933062553406, "learning_rate": 3.2946225700580663e-06, "loss": 0.057, "step": 17630 }, { "epoch": 4.4534208533198685, "grad_norm": 0.5712131857872009, "learning_rate": 3.2794748800807875e-06, "loss": 0.0484, "step": 17640 }, { "epoch": 4.455945468316082, "grad_norm": 0.5881834626197815, "learning_rate": 3.2643271901035095e-06, "loss": 0.0531, "step": 17650 }, { "epoch": 4.458470083312295, "grad_norm": 0.5216571688652039, "learning_rate": 3.249179500126231e-06, "loss": 0.0522, "step": 17660 }, { "epoch": 4.460994698308508, "grad_norm": 0.5654059648513794, "learning_rate": 3.2340318101489523e-06, "loss": 0.0498, "step": 17670 }, { "epoch": 4.463519313304721, "grad_norm": 0.6211521625518799, "learning_rate": 3.218884120171674e-06, "loss": 0.0546, "step": 17680 }, { "epoch": 4.466043928300934, "grad_norm": 0.48394614458084106, "learning_rate": 3.2037364301943955e-06, "loss": 0.0588, "step": 17690 }, { "epoch": 4.468568543297147, "grad_norm": 0.7053552269935608, "learning_rate": 3.188588740217117e-06, "loss": 0.0598, "step": 17700 }, { "epoch": 4.47109315829336, "grad_norm": 0.4579329192638397, "learning_rate": 3.1734410502398387e-06, "loss": 0.0512, "step": 17710 }, { "epoch": 4.473617773289574, "grad_norm": 0.3756571114063263, "learning_rate": 3.15829336026256e-06, "loss": 0.0512, "step": 17720 }, { "epoch": 4.476142388285786, "grad_norm": 0.3513215482234955, "learning_rate": 3.1431456702852814e-06, "loss": 0.0557, "step": 17730 }, { "epoch": 4.478667003281999, "grad_norm": 0.5204163193702698, "learning_rate": 3.127997980308003e-06, "loss": 0.0799, "step": 17740 }, { "epoch": 4.4811916182782126, "grad_norm": 0.5801984071731567, "learning_rate": 3.1128502903307246e-06, "loss": 0.0419, "step": 17750 }, { "epoch": 4.483716233274426, "grad_norm": 0.6490535140037537, "learning_rate": 3.0977026003534462e-06, "loss": 0.0551, "step": 17760 }, { "epoch": 4.486240848270639, "grad_norm": 0.5970304012298584, "learning_rate": 3.0825549103761674e-06, "loss": 0.061, "step": 17770 }, { "epoch": 4.488765463266851, "grad_norm": 0.8191946744918823, "learning_rate": 3.0674072203988894e-06, "loss": 0.058, "step": 17780 }, { "epoch": 4.491290078263065, "grad_norm": 0.7532091736793518, "learning_rate": 3.052259530421611e-06, "loss": 0.0578, "step": 17790 }, { "epoch": 4.493814693259278, "grad_norm": 0.6891248226165771, "learning_rate": 3.037111840444332e-06, "loss": 0.0776, "step": 17800 }, { "epoch": 4.496339308255491, "grad_norm": 0.3589613139629364, "learning_rate": 3.0219641504670538e-06, "loss": 0.0578, "step": 17810 }, { "epoch": 4.4988639232517045, "grad_norm": 0.4397825598716736, "learning_rate": 3.0068164604897754e-06, "loss": 0.0477, "step": 17820 }, { "epoch": 4.501388538247917, "grad_norm": 0.6630678772926331, "learning_rate": 2.991668770512497e-06, "loss": 0.0621, "step": 17830 }, { "epoch": 4.50391315324413, "grad_norm": 0.4310142695903778, "learning_rate": 2.9765210805352185e-06, "loss": 0.0486, "step": 17840 }, { "epoch": 4.506437768240343, "grad_norm": 0.5123319625854492, "learning_rate": 2.9613733905579397e-06, "loss": 0.0419, "step": 17850 }, { "epoch": 4.508962383236557, "grad_norm": 0.8451969027519226, "learning_rate": 2.9462257005806617e-06, "loss": 0.0636, "step": 17860 }, { "epoch": 4.51148699823277, "grad_norm": 0.5869598388671875, "learning_rate": 2.9310780106033833e-06, "loss": 0.0592, "step": 17870 }, { "epoch": 4.514011613228982, "grad_norm": 0.8282822370529175, "learning_rate": 2.9159303206261045e-06, "loss": 0.0614, "step": 17880 }, { "epoch": 4.5165362282251955, "grad_norm": 0.5392325520515442, "learning_rate": 2.900782630648826e-06, "loss": 0.0537, "step": 17890 }, { "epoch": 4.519060843221409, "grad_norm": 0.6844165325164795, "learning_rate": 2.8856349406715477e-06, "loss": 0.0628, "step": 17900 }, { "epoch": 4.521585458217622, "grad_norm": 0.5177090764045715, "learning_rate": 2.8704872506942693e-06, "loss": 0.0536, "step": 17910 }, { "epoch": 4.524110073213835, "grad_norm": 0.395877480506897, "learning_rate": 2.855339560716991e-06, "loss": 0.0602, "step": 17920 }, { "epoch": 4.526634688210048, "grad_norm": 0.30185338854789734, "learning_rate": 2.840191870739712e-06, "loss": 0.0624, "step": 17930 }, { "epoch": 4.529159303206261, "grad_norm": 0.5236132740974426, "learning_rate": 2.825044180762434e-06, "loss": 0.0648, "step": 17940 }, { "epoch": 4.531683918202474, "grad_norm": 0.5160847306251526, "learning_rate": 2.8098964907851552e-06, "loss": 0.0554, "step": 17950 }, { "epoch": 4.534208533198687, "grad_norm": 0.5891533493995667, "learning_rate": 2.794748800807877e-06, "loss": 0.065, "step": 17960 }, { "epoch": 4.5367331481949, "grad_norm": 0.3929848074913025, "learning_rate": 2.7796011108305984e-06, "loss": 0.064, "step": 17970 }, { "epoch": 4.539257763191113, "grad_norm": 0.4245711863040924, "learning_rate": 2.76445342085332e-06, "loss": 0.0589, "step": 17980 }, { "epoch": 4.541782378187326, "grad_norm": 0.3840174376964569, "learning_rate": 2.7493057308760416e-06, "loss": 0.0533, "step": 17990 }, { "epoch": 4.54430699318354, "grad_norm": 0.5826268196105957, "learning_rate": 2.734158040898763e-06, "loss": 0.0574, "step": 18000 }, { "epoch": 4.546831608179753, "grad_norm": 0.34730613231658936, "learning_rate": 2.7190103509214844e-06, "loss": 0.0567, "step": 18010 }, { "epoch": 4.549356223175966, "grad_norm": 0.45773857831954956, "learning_rate": 2.703862660944206e-06, "loss": 0.0601, "step": 18020 }, { "epoch": 4.5518808381721785, "grad_norm": 0.3971637189388275, "learning_rate": 2.6887149709669276e-06, "loss": 0.0551, "step": 18030 }, { "epoch": 4.554405453168392, "grad_norm": 0.46903976798057556, "learning_rate": 2.673567280989649e-06, "loss": 0.0553, "step": 18040 }, { "epoch": 4.556930068164605, "grad_norm": 0.5735597014427185, "learning_rate": 2.6584195910123708e-06, "loss": 0.0544, "step": 18050 }, { "epoch": 4.559454683160818, "grad_norm": 0.7702049612998962, "learning_rate": 2.643271901035092e-06, "loss": 0.0424, "step": 18060 }, { "epoch": 4.561979298157031, "grad_norm": 0.7742976546287537, "learning_rate": 2.628124211057814e-06, "loss": 0.0625, "step": 18070 }, { "epoch": 4.564503913153244, "grad_norm": 0.7160853147506714, "learning_rate": 2.612976521080535e-06, "loss": 0.058, "step": 18080 }, { "epoch": 4.567028528149457, "grad_norm": 0.23403172194957733, "learning_rate": 2.5978288311032567e-06, "loss": 0.0477, "step": 18090 }, { "epoch": 4.56955314314567, "grad_norm": 0.25679340958595276, "learning_rate": 2.5826811411259783e-06, "loss": 0.0578, "step": 18100 }, { "epoch": 4.572077758141884, "grad_norm": 0.6108934879302979, "learning_rate": 2.5675334511487e-06, "loss": 0.0567, "step": 18110 }, { "epoch": 4.574602373138097, "grad_norm": 0.570832371711731, "learning_rate": 2.5523857611714215e-06, "loss": 0.0503, "step": 18120 }, { "epoch": 4.577126988134309, "grad_norm": 0.49613040685653687, "learning_rate": 2.537238071194143e-06, "loss": 0.0552, "step": 18130 }, { "epoch": 4.5796516031305226, "grad_norm": 0.43599942326545715, "learning_rate": 2.5220903812168643e-06, "loss": 0.0736, "step": 18140 }, { "epoch": 4.582176218126736, "grad_norm": 0.5823941230773926, "learning_rate": 2.5069426912395863e-06, "loss": 0.0452, "step": 18150 }, { "epoch": 4.584700833122949, "grad_norm": 0.6966807842254639, "learning_rate": 2.4917950012623075e-06, "loss": 0.0567, "step": 18160 }, { "epoch": 4.587225448119161, "grad_norm": 0.9933467507362366, "learning_rate": 2.476647311285029e-06, "loss": 0.0621, "step": 18170 }, { "epoch": 4.589750063115375, "grad_norm": 0.44380226731300354, "learning_rate": 2.4614996213077506e-06, "loss": 0.0607, "step": 18180 }, { "epoch": 4.592274678111588, "grad_norm": 0.3192310631275177, "learning_rate": 2.4463519313304722e-06, "loss": 0.0601, "step": 18190 }, { "epoch": 4.594799293107801, "grad_norm": 0.5151782035827637, "learning_rate": 2.431204241353194e-06, "loss": 0.0441, "step": 18200 }, { "epoch": 4.5973239081040145, "grad_norm": 0.8137912154197693, "learning_rate": 2.4160565513759154e-06, "loss": 0.0708, "step": 18210 }, { "epoch": 4.599848523100227, "grad_norm": 0.4802444875240326, "learning_rate": 2.4009088613986366e-06, "loss": 0.0553, "step": 18220 }, { "epoch": 4.60237313809644, "grad_norm": 0.41935741901397705, "learning_rate": 2.3857611714213586e-06, "loss": 0.057, "step": 18230 }, { "epoch": 4.604897753092653, "grad_norm": 0.42669227719306946, "learning_rate": 2.3706134814440798e-06, "loss": 0.0671, "step": 18240 }, { "epoch": 4.607422368088867, "grad_norm": 0.5261390209197998, "learning_rate": 2.3554657914668014e-06, "loss": 0.0561, "step": 18250 }, { "epoch": 4.60994698308508, "grad_norm": 0.495779424905777, "learning_rate": 2.340318101489523e-06, "loss": 0.059, "step": 18260 }, { "epoch": 4.612471598081292, "grad_norm": 0.5515862107276917, "learning_rate": 2.325170411512244e-06, "loss": 0.061, "step": 18270 }, { "epoch": 4.6149962130775055, "grad_norm": 0.8136048913002014, "learning_rate": 2.310022721534966e-06, "loss": 0.0572, "step": 18280 }, { "epoch": 4.617520828073719, "grad_norm": 0.393250972032547, "learning_rate": 2.2948750315576873e-06, "loss": 0.0523, "step": 18290 }, { "epoch": 4.620045443069932, "grad_norm": 0.5420840978622437, "learning_rate": 2.279727341580409e-06, "loss": 0.0605, "step": 18300 }, { "epoch": 4.622570058066145, "grad_norm": 0.5676819086074829, "learning_rate": 2.2645796516031305e-06, "loss": 0.0523, "step": 18310 }, { "epoch": 4.625094673062358, "grad_norm": 0.36500880122184753, "learning_rate": 2.249431961625852e-06, "loss": 0.0562, "step": 18320 }, { "epoch": 4.627619288058571, "grad_norm": 0.5303543210029602, "learning_rate": 2.2342842716485737e-06, "loss": 0.0474, "step": 18330 }, { "epoch": 4.630143903054784, "grad_norm": 0.4387858510017395, "learning_rate": 2.2191365816712953e-06, "loss": 0.0438, "step": 18340 }, { "epoch": 4.632668518050997, "grad_norm": 0.2990294098854065, "learning_rate": 2.2039888916940165e-06, "loss": 0.0447, "step": 18350 }, { "epoch": 4.63519313304721, "grad_norm": 0.37644967436790466, "learning_rate": 2.1888412017167385e-06, "loss": 0.049, "step": 18360 }, { "epoch": 4.637717748043423, "grad_norm": 0.5996664762496948, "learning_rate": 2.1736935117394597e-06, "loss": 0.0618, "step": 18370 }, { "epoch": 4.640242363039636, "grad_norm": 0.5396886467933655, "learning_rate": 2.1585458217621813e-06, "loss": 0.054, "step": 18380 }, { "epoch": 4.64276697803585, "grad_norm": 0.5311841368675232, "learning_rate": 2.143398131784903e-06, "loss": 0.059, "step": 18390 }, { "epoch": 4.645291593032063, "grad_norm": 0.6080347299575806, "learning_rate": 2.1282504418076244e-06, "loss": 0.063, "step": 18400 }, { "epoch": 4.647816208028276, "grad_norm": 0.720029354095459, "learning_rate": 2.113102751830346e-06, "loss": 0.0674, "step": 18410 }, { "epoch": 4.6503408230244885, "grad_norm": 0.26142123341560364, "learning_rate": 2.0979550618530672e-06, "loss": 0.043, "step": 18420 }, { "epoch": 4.652865438020702, "grad_norm": 0.8284344673156738, "learning_rate": 2.082807371875789e-06, "loss": 0.0632, "step": 18430 }, { "epoch": 4.655390053016915, "grad_norm": 0.5045512914657593, "learning_rate": 2.067659681898511e-06, "loss": 0.0558, "step": 18440 }, { "epoch": 4.657914668013128, "grad_norm": 0.3474113345146179, "learning_rate": 2.052511991921232e-06, "loss": 0.0484, "step": 18450 }, { "epoch": 4.660439283009341, "grad_norm": 0.4987110197544098, "learning_rate": 2.0373643019439536e-06, "loss": 0.0542, "step": 18460 }, { "epoch": 4.662963898005554, "grad_norm": 0.48412591218948364, "learning_rate": 2.022216611966675e-06, "loss": 0.0585, "step": 18470 }, { "epoch": 4.665488513001767, "grad_norm": 0.48798561096191406, "learning_rate": 2.0070689219893968e-06, "loss": 0.0512, "step": 18480 }, { "epoch": 4.66801312799798, "grad_norm": 0.3808564245700836, "learning_rate": 1.9919212320121184e-06, "loss": 0.0607, "step": 18490 }, { "epoch": 4.670537742994194, "grad_norm": 0.3918999135494232, "learning_rate": 1.9767735420348395e-06, "loss": 0.061, "step": 18500 }, { "epoch": 4.673062357990407, "grad_norm": 0.7011407017707825, "learning_rate": 1.961625852057561e-06, "loss": 0.0621, "step": 18510 }, { "epoch": 4.675586972986619, "grad_norm": 0.31626319885253906, "learning_rate": 1.946478162080283e-06, "loss": 0.0564, "step": 18520 }, { "epoch": 4.6781115879828326, "grad_norm": 0.5955636501312256, "learning_rate": 1.9313304721030043e-06, "loss": 0.0473, "step": 18530 }, { "epoch": 4.680636202979046, "grad_norm": 0.50102698802948, "learning_rate": 1.916182782125726e-06, "loss": 0.0585, "step": 18540 }, { "epoch": 4.683160817975259, "grad_norm": 0.46877047419548035, "learning_rate": 1.9010350921484475e-06, "loss": 0.0519, "step": 18550 }, { "epoch": 4.685685432971471, "grad_norm": 0.45812228322029114, "learning_rate": 1.885887402171169e-06, "loss": 0.0466, "step": 18560 }, { "epoch": 4.688210047967685, "grad_norm": 0.8704932332038879, "learning_rate": 1.8707397121938903e-06, "loss": 0.0538, "step": 18570 }, { "epoch": 4.690734662963898, "grad_norm": 0.61441969871521, "learning_rate": 1.855592022216612e-06, "loss": 0.0577, "step": 18580 }, { "epoch": 4.693259277960111, "grad_norm": 0.32448869943618774, "learning_rate": 1.8404443322393335e-06, "loss": 0.0501, "step": 18590 }, { "epoch": 4.6957838929563245, "grad_norm": 0.5979995727539062, "learning_rate": 1.825296642262055e-06, "loss": 0.0502, "step": 18600 }, { "epoch": 4.698308507952537, "grad_norm": 0.38927385210990906, "learning_rate": 1.8101489522847765e-06, "loss": 0.0562, "step": 18610 }, { "epoch": 4.70083312294875, "grad_norm": 0.6443197131156921, "learning_rate": 1.7950012623074983e-06, "loss": 0.0485, "step": 18620 }, { "epoch": 4.703357737944963, "grad_norm": 0.5753923654556274, "learning_rate": 1.7798535723302196e-06, "loss": 0.0455, "step": 18630 }, { "epoch": 4.705882352941177, "grad_norm": 0.5932863354682922, "learning_rate": 1.7647058823529412e-06, "loss": 0.0651, "step": 18640 }, { "epoch": 4.70840696793739, "grad_norm": 0.3984706401824951, "learning_rate": 1.7495581923756626e-06, "loss": 0.0575, "step": 18650 }, { "epoch": 4.710931582933602, "grad_norm": 0.5167907476425171, "learning_rate": 1.7344105023983844e-06, "loss": 0.0639, "step": 18660 }, { "epoch": 4.7134561979298155, "grad_norm": 0.5363221764564514, "learning_rate": 1.7192628124211058e-06, "loss": 0.0418, "step": 18670 }, { "epoch": 4.715980812926029, "grad_norm": 0.584365963935852, "learning_rate": 1.7041151224438274e-06, "loss": 0.0477, "step": 18680 }, { "epoch": 4.718505427922242, "grad_norm": 0.2417188286781311, "learning_rate": 1.6889674324665488e-06, "loss": 0.0543, "step": 18690 }, { "epoch": 4.721030042918455, "grad_norm": 0.5733222365379333, "learning_rate": 1.6738197424892706e-06, "loss": 0.0684, "step": 18700 }, { "epoch": 4.723554657914668, "grad_norm": 0.7107726335525513, "learning_rate": 1.658672052511992e-06, "loss": 0.0543, "step": 18710 }, { "epoch": 4.726079272910881, "grad_norm": 0.7579614520072937, "learning_rate": 1.6435243625347136e-06, "loss": 0.0507, "step": 18720 }, { "epoch": 4.728603887907094, "grad_norm": 0.4801480174064636, "learning_rate": 1.628376672557435e-06, "loss": 0.0611, "step": 18730 }, { "epoch": 4.731128502903307, "grad_norm": 0.5679193139076233, "learning_rate": 1.6132289825801565e-06, "loss": 0.0555, "step": 18740 }, { "epoch": 4.733653117899521, "grad_norm": 0.4143296480178833, "learning_rate": 1.5980812926028781e-06, "loss": 0.0564, "step": 18750 }, { "epoch": 4.736177732895733, "grad_norm": 0.5309060215950012, "learning_rate": 1.5829336026255997e-06, "loss": 0.047, "step": 18760 }, { "epoch": 4.738702347891946, "grad_norm": 0.49305611848831177, "learning_rate": 1.5677859126483211e-06, "loss": 0.053, "step": 18770 }, { "epoch": 4.74122696288816, "grad_norm": 0.5996381044387817, "learning_rate": 1.5526382226710427e-06, "loss": 0.0659, "step": 18780 }, { "epoch": 4.743751577884373, "grad_norm": 0.6321601271629333, "learning_rate": 1.5374905326937643e-06, "loss": 0.0447, "step": 18790 }, { "epoch": 4.746276192880586, "grad_norm": 0.7180649638175964, "learning_rate": 1.522342842716486e-06, "loss": 0.0489, "step": 18800 }, { "epoch": 4.7488008078767985, "grad_norm": 0.40703126788139343, "learning_rate": 1.5071951527392073e-06, "loss": 0.0584, "step": 18810 }, { "epoch": 4.751325422873012, "grad_norm": 0.5110107064247131, "learning_rate": 1.4920474627619289e-06, "loss": 0.0486, "step": 18820 }, { "epoch": 4.753850037869225, "grad_norm": 0.5644401907920837, "learning_rate": 1.4768997727846505e-06, "loss": 0.0562, "step": 18830 }, { "epoch": 4.756374652865438, "grad_norm": 0.5056483745574951, "learning_rate": 1.4617520828073719e-06, "loss": 0.0705, "step": 18840 }, { "epoch": 4.758899267861651, "grad_norm": 0.48723912239074707, "learning_rate": 1.4466043928300934e-06, "loss": 0.0561, "step": 18850 }, { "epoch": 4.761423882857864, "grad_norm": 0.47025883197784424, "learning_rate": 1.4314567028528148e-06, "loss": 0.0602, "step": 18860 }, { "epoch": 4.763948497854077, "grad_norm": 0.3964708745479584, "learning_rate": 1.4163090128755366e-06, "loss": 0.0634, "step": 18870 }, { "epoch": 4.76647311285029, "grad_norm": 0.7490302920341492, "learning_rate": 1.401161322898258e-06, "loss": 0.0648, "step": 18880 }, { "epoch": 4.768997727846504, "grad_norm": 0.3066612184047699, "learning_rate": 1.3860136329209796e-06, "loss": 0.063, "step": 18890 }, { "epoch": 4.771522342842717, "grad_norm": 0.4892236590385437, "learning_rate": 1.370865942943701e-06, "loss": 0.0498, "step": 18900 }, { "epoch": 4.774046957838929, "grad_norm": 0.5352323651313782, "learning_rate": 1.3557182529664226e-06, "loss": 0.0514, "step": 18910 }, { "epoch": 4.7765715728351426, "grad_norm": 0.6631128191947937, "learning_rate": 1.3405705629891442e-06, "loss": 0.0567, "step": 18920 }, { "epoch": 4.779096187831356, "grad_norm": 0.49421215057373047, "learning_rate": 1.3254228730118658e-06, "loss": 0.0556, "step": 18930 }, { "epoch": 4.781620802827569, "grad_norm": 0.36637288331985474, "learning_rate": 1.3102751830345872e-06, "loss": 0.0569, "step": 18940 }, { "epoch": 4.784145417823781, "grad_norm": 0.32764676213264465, "learning_rate": 1.2951274930573088e-06, "loss": 0.0584, "step": 18950 }, { "epoch": 4.786670032819995, "grad_norm": 0.7302456498146057, "learning_rate": 1.2799798030800304e-06, "loss": 0.0535, "step": 18960 }, { "epoch": 4.789194647816208, "grad_norm": 0.5171737670898438, "learning_rate": 1.264832113102752e-06, "loss": 0.0524, "step": 18970 }, { "epoch": 4.791719262812421, "grad_norm": 0.6158316135406494, "learning_rate": 1.2496844231254733e-06, "loss": 0.0626, "step": 18980 }, { "epoch": 4.7942438778086345, "grad_norm": 0.5882306694984436, "learning_rate": 1.234536733148195e-06, "loss": 0.0542, "step": 18990 }, { "epoch": 4.796768492804848, "grad_norm": 0.6305384039878845, "learning_rate": 1.2193890431709165e-06, "loss": 0.0576, "step": 19000 }, { "epoch": 4.79929310780106, "grad_norm": 0.46403953433036804, "learning_rate": 1.2042413531936381e-06, "loss": 0.0453, "step": 19010 }, { "epoch": 4.801817722797273, "grad_norm": 0.6074075698852539, "learning_rate": 1.1890936632163595e-06, "loss": 0.0682, "step": 19020 }, { "epoch": 4.804342337793487, "grad_norm": 0.43722423911094666, "learning_rate": 1.173945973239081e-06, "loss": 0.0542, "step": 19030 }, { "epoch": 4.8068669527897, "grad_norm": 0.35191863775253296, "learning_rate": 1.1587982832618027e-06, "loss": 0.0639, "step": 19040 }, { "epoch": 4.809391567785912, "grad_norm": 0.6911765336990356, "learning_rate": 1.1436505932845243e-06, "loss": 0.0553, "step": 19050 }, { "epoch": 4.8119161827821255, "grad_norm": 0.22319677472114563, "learning_rate": 1.1285029033072457e-06, "loss": 0.066, "step": 19060 }, { "epoch": 4.814440797778339, "grad_norm": 0.520487904548645, "learning_rate": 1.1133552133299673e-06, "loss": 0.0596, "step": 19070 }, { "epoch": 4.816965412774552, "grad_norm": 0.40240347385406494, "learning_rate": 1.0982075233526886e-06, "loss": 0.0618, "step": 19080 }, { "epoch": 4.819490027770765, "grad_norm": 0.6012730598449707, "learning_rate": 1.0830598333754104e-06, "loss": 0.0401, "step": 19090 }, { "epoch": 4.822014642766978, "grad_norm": 0.6411862373352051, "learning_rate": 1.0679121433981318e-06, "loss": 0.0569, "step": 19100 }, { "epoch": 4.824539257763191, "grad_norm": 0.7546837329864502, "learning_rate": 1.0527644534208532e-06, "loss": 0.0619, "step": 19110 }, { "epoch": 4.827063872759404, "grad_norm": 0.5956974625587463, "learning_rate": 1.0376167634435748e-06, "loss": 0.0651, "step": 19120 }, { "epoch": 4.829588487755617, "grad_norm": 0.41826269030570984, "learning_rate": 1.0224690734662964e-06, "loss": 0.0459, "step": 19130 }, { "epoch": 4.832113102751831, "grad_norm": 0.39252969622612, "learning_rate": 1.007321383489018e-06, "loss": 0.0549, "step": 19140 }, { "epoch": 4.834637717748043, "grad_norm": 0.45689401030540466, "learning_rate": 9.921736935117394e-07, "loss": 0.048, "step": 19150 }, { "epoch": 4.837162332744256, "grad_norm": 0.47611868381500244, "learning_rate": 9.77026003534461e-07, "loss": 0.0472, "step": 19160 }, { "epoch": 4.83968694774047, "grad_norm": 0.5146605968475342, "learning_rate": 9.618783135571826e-07, "loss": 0.0537, "step": 19170 }, { "epoch": 4.842211562736683, "grad_norm": 0.5189658999443054, "learning_rate": 9.467306235799042e-07, "loss": 0.0682, "step": 19180 }, { "epoch": 4.844736177732896, "grad_norm": 0.37280428409576416, "learning_rate": 9.315829336026256e-07, "loss": 0.0581, "step": 19190 }, { "epoch": 4.8472607927291085, "grad_norm": 0.5254796743392944, "learning_rate": 9.164352436253472e-07, "loss": 0.0653, "step": 19200 }, { "epoch": 4.849785407725322, "grad_norm": 0.5634022951126099, "learning_rate": 9.012875536480687e-07, "loss": 0.0641, "step": 19210 }, { "epoch": 4.852310022721535, "grad_norm": 0.6558578014373779, "learning_rate": 8.861398636707903e-07, "loss": 0.0665, "step": 19220 }, { "epoch": 4.854834637717748, "grad_norm": 0.7143204808235168, "learning_rate": 8.709921736935118e-07, "loss": 0.0615, "step": 19230 }, { "epoch": 4.857359252713961, "grad_norm": 0.40588563680648804, "learning_rate": 8.558444837162333e-07, "loss": 0.059, "step": 19240 }, { "epoch": 4.859883867710174, "grad_norm": 0.5825479030609131, "learning_rate": 8.406967937389548e-07, "loss": 0.0575, "step": 19250 }, { "epoch": 4.862408482706387, "grad_norm": 0.6735103726387024, "learning_rate": 8.255491037616763e-07, "loss": 0.0667, "step": 19260 }, { "epoch": 4.8649330977026, "grad_norm": 0.5344639420509338, "learning_rate": 8.104014137843979e-07, "loss": 0.0563, "step": 19270 }, { "epoch": 4.867457712698814, "grad_norm": 0.611815869808197, "learning_rate": 7.952537238071194e-07, "loss": 0.0589, "step": 19280 }, { "epoch": 4.869982327695027, "grad_norm": 0.5727031826972961, "learning_rate": 7.80106033829841e-07, "loss": 0.0601, "step": 19290 }, { "epoch": 4.872506942691239, "grad_norm": 0.39414718747138977, "learning_rate": 7.649583438525624e-07, "loss": 0.0542, "step": 19300 }, { "epoch": 4.8750315576874526, "grad_norm": 0.49244511127471924, "learning_rate": 7.49810653875284e-07, "loss": 0.0598, "step": 19310 }, { "epoch": 4.877556172683666, "grad_norm": 0.5638169050216675, "learning_rate": 7.346629638980055e-07, "loss": 0.0537, "step": 19320 }, { "epoch": 4.880080787679879, "grad_norm": 0.4944647550582886, "learning_rate": 7.195152739207271e-07, "loss": 0.0515, "step": 19330 }, { "epoch": 4.882605402676091, "grad_norm": 0.847815215587616, "learning_rate": 7.043675839434486e-07, "loss": 0.0653, "step": 19340 }, { "epoch": 4.885130017672305, "grad_norm": 0.7950305938720703, "learning_rate": 6.892198939661702e-07, "loss": 0.057, "step": 19350 }, { "epoch": 4.887654632668518, "grad_norm": 0.680915892124176, "learning_rate": 6.740722039888917e-07, "loss": 0.0554, "step": 19360 }, { "epoch": 4.890179247664731, "grad_norm": 0.42906680703163147, "learning_rate": 6.589245140116133e-07, "loss": 0.0535, "step": 19370 }, { "epoch": 4.8927038626609445, "grad_norm": 0.872386634349823, "learning_rate": 6.437768240343348e-07, "loss": 0.0622, "step": 19380 }, { "epoch": 4.895228477657158, "grad_norm": 0.619981586933136, "learning_rate": 6.286291340570563e-07, "loss": 0.0672, "step": 19390 }, { "epoch": 4.89775309265337, "grad_norm": 0.538330614566803, "learning_rate": 6.134814440797779e-07, "loss": 0.0525, "step": 19400 }, { "epoch": 4.900277707649583, "grad_norm": 0.4021759033203125, "learning_rate": 5.983337541024993e-07, "loss": 0.0546, "step": 19410 }, { "epoch": 4.902802322645797, "grad_norm": 0.6232868432998657, "learning_rate": 5.831860641252209e-07, "loss": 0.0581, "step": 19420 }, { "epoch": 4.90532693764201, "grad_norm": 0.6456800699234009, "learning_rate": 5.680383741479424e-07, "loss": 0.0545, "step": 19430 }, { "epoch": 4.907851552638222, "grad_norm": 0.5507019758224487, "learning_rate": 5.52890684170664e-07, "loss": 0.0576, "step": 19440 }, { "epoch": 4.9103761676344355, "grad_norm": 0.2918814718723297, "learning_rate": 5.377429941933855e-07, "loss": 0.0527, "step": 19450 }, { "epoch": 4.912900782630649, "grad_norm": 0.35016146302223206, "learning_rate": 5.225953042161071e-07, "loss": 0.0636, "step": 19460 }, { "epoch": 4.915425397626862, "grad_norm": 0.5368366837501526, "learning_rate": 5.074476142388286e-07, "loss": 0.0569, "step": 19470 }, { "epoch": 4.917950012623075, "grad_norm": 0.5466439723968506, "learning_rate": 4.922999242615502e-07, "loss": 0.0508, "step": 19480 }, { "epoch": 4.9204746276192886, "grad_norm": 0.6173040270805359, "learning_rate": 4.771522342842717e-07, "loss": 0.0504, "step": 19490 }, { "epoch": 4.922999242615501, "grad_norm": 0.28498920798301697, "learning_rate": 4.6200454430699317e-07, "loss": 0.0572, "step": 19500 }, { "epoch": 4.925523857611714, "grad_norm": 0.7897679209709167, "learning_rate": 4.468568543297147e-07, "loss": 0.053, "step": 19510 }, { "epoch": 4.928048472607927, "grad_norm": 0.4405366778373718, "learning_rate": 4.3170916435243625e-07, "loss": 0.051, "step": 19520 }, { "epoch": 4.930573087604141, "grad_norm": 0.7264717221260071, "learning_rate": 4.165614743751578e-07, "loss": 0.0535, "step": 19530 }, { "epoch": 4.933097702600353, "grad_norm": 0.47195565700531006, "learning_rate": 4.0141378439787934e-07, "loss": 0.0416, "step": 19540 }, { "epoch": 4.935622317596566, "grad_norm": 0.4767369031906128, "learning_rate": 3.862660944206009e-07, "loss": 0.049, "step": 19550 }, { "epoch": 4.93814693259278, "grad_norm": 0.5228800177574158, "learning_rate": 3.711184044433224e-07, "loss": 0.0568, "step": 19560 }, { "epoch": 4.940671547588993, "grad_norm": 0.5455029010772705, "learning_rate": 3.5597071446604396e-07, "loss": 0.061, "step": 19570 }, { "epoch": 4.943196162585206, "grad_norm": 0.4548329710960388, "learning_rate": 3.408230244887655e-07, "loss": 0.0543, "step": 19580 }, { "epoch": 4.9457207775814185, "grad_norm": 0.41128185391426086, "learning_rate": 3.2567533451148704e-07, "loss": 0.0568, "step": 19590 }, { "epoch": 4.948245392577632, "grad_norm": 0.3675704598426819, "learning_rate": 3.1052764453420853e-07, "loss": 0.0646, "step": 19600 }, { "epoch": 4.950770007573845, "grad_norm": 0.49481600522994995, "learning_rate": 2.9537995455693007e-07, "loss": 0.0659, "step": 19610 }, { "epoch": 4.953294622570058, "grad_norm": 0.3610905706882477, "learning_rate": 2.802322645796516e-07, "loss": 0.0606, "step": 19620 }, { "epoch": 4.9558192375662715, "grad_norm": 0.4303690493106842, "learning_rate": 2.6508457460237316e-07, "loss": 0.0485, "step": 19630 }, { "epoch": 4.958343852562484, "grad_norm": 0.4692881405353546, "learning_rate": 2.4993688462509464e-07, "loss": 0.0552, "step": 19640 }, { "epoch": 4.960868467558697, "grad_norm": 0.7063325047492981, "learning_rate": 2.3478919464781619e-07, "loss": 0.0614, "step": 19650 }, { "epoch": 4.96339308255491, "grad_norm": 0.6039048433303833, "learning_rate": 2.1964150467053775e-07, "loss": 0.0729, "step": 19660 }, { "epoch": 4.965917697551124, "grad_norm": 0.38355937600135803, "learning_rate": 2.044938146932593e-07, "loss": 0.054, "step": 19670 }, { "epoch": 4.968442312547337, "grad_norm": 0.7297325134277344, "learning_rate": 1.893461247159808e-07, "loss": 0.057, "step": 19680 }, { "epoch": 4.970966927543549, "grad_norm": 0.618418276309967, "learning_rate": 1.7419843473870235e-07, "loss": 0.0533, "step": 19690 }, { "epoch": 4.9734915425397626, "grad_norm": 0.44941627979278564, "learning_rate": 1.5905074476142387e-07, "loss": 0.0626, "step": 19700 }, { "epoch": 4.976016157535976, "grad_norm": 0.5745902061462402, "learning_rate": 1.439030547841454e-07, "loss": 0.0528, "step": 19710 }, { "epoch": 4.978540772532189, "grad_norm": 0.6372010707855225, "learning_rate": 1.2875536480686695e-07, "loss": 0.0548, "step": 19720 }, { "epoch": 4.9810653875284014, "grad_norm": 0.5590953826904297, "learning_rate": 1.1360767482958849e-07, "loss": 0.0515, "step": 19730 }, { "epoch": 4.983590002524615, "grad_norm": 0.3603893518447876, "learning_rate": 9.845998485231003e-08, "loss": 0.057, "step": 19740 }, { "epoch": 4.986114617520828, "grad_norm": 0.42396554350852966, "learning_rate": 8.331229487503156e-08, "loss": 0.0522, "step": 19750 }, { "epoch": 4.988639232517041, "grad_norm": 0.6315743327140808, "learning_rate": 6.816460489775309e-08, "loss": 0.0475, "step": 19760 }, { "epoch": 4.9911638475132545, "grad_norm": 0.535829484462738, "learning_rate": 5.301691492047463e-08, "loss": 0.0602, "step": 19770 }, { "epoch": 4.993688462509468, "grad_norm": 0.706295371055603, "learning_rate": 3.786922494319616e-08, "loss": 0.0476, "step": 19780 }, { "epoch": 4.99621307750568, "grad_norm": 0.5887550711631775, "learning_rate": 2.2721534965917698e-08, "loss": 0.0555, "step": 19790 }, { "epoch": 4.998737692501893, "grad_norm": 0.2900368273258209, "learning_rate": 7.573844988639233e-09, "loss": 0.0532, "step": 19800 }, { "epoch": 5.0, "eval_f1": 0.9705180789481339, "eval_loss": 0.04290741682052612, "eval_runtime": 1160.2076, "eval_samples_per_second": 177.78, "eval_steps_per_second": 2.778, "step": 19805 }, { "epoch": 5.0, "step": 19805, "total_flos": 9.82001462664467e+19, "train_loss": 0.0, "train_runtime": 0.0662, "train_samples_per_second": 19152190.372, "train_steps_per_second": 299304.924 } ], "logging_steps": 10, "max_steps": 19805, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.82001462664467e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }