{ "best_metric": 6.66681432723999, "best_model_checkpoint": "ModernBERT-base-dnb/checkpoint-27850", "epoch": 5.0, "eval_steps": 500, "global_step": 27850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004488330341113106, "grad_norm": 21.10218048095703, "learning_rate": 5.3859964093357274e-08, "loss": 20.235, "step": 25 }, { "epoch": 0.008976660682226212, "grad_norm": 21.044858932495117, "learning_rate": 1.0556552962298026e-07, "loss": 20.2822, "step": 50 }, { "epoch": 0.013464991023339317, "grad_norm": 15.749917984008789, "learning_rate": 1.5942549371633754e-07, "loss": 20.2111, "step": 75 }, { "epoch": 0.017953321364452424, "grad_norm": 53.41726303100586, "learning_rate": 2.1113105924596052e-07, "loss": 20.219, "step": 100 }, { "epoch": 0.02244165170556553, "grad_norm": 19.08318328857422, "learning_rate": 2.649910233393178e-07, "loss": 20.2098, "step": 125 }, { "epoch": 0.026929982046678635, "grad_norm": 17.500648498535156, "learning_rate": 3.188509874326751e-07, "loss": 20.1833, "step": 150 }, { "epoch": 0.03141831238779174, "grad_norm": 15.890292167663574, "learning_rate": 3.7271095152603236e-07, "loss": 20.1506, "step": 175 }, { "epoch": 0.03590664272890485, "grad_norm": 15.574706077575684, "learning_rate": 4.265709156193896e-07, "loss": 20.2048, "step": 200 }, { "epoch": 0.04039497307001795, "grad_norm": 16.508468627929688, "learning_rate": 4.804308797127469e-07, "loss": 20.1701, "step": 225 }, { "epoch": 0.04488330341113106, "grad_norm": 14.636826515197754, "learning_rate": 5.342908438061041e-07, "loss": 20.0568, "step": 250 }, { "epoch": 0.04937163375224417, "grad_norm": 12.061074256896973, "learning_rate": 5.881508078994614e-07, "loss": 20.0871, "step": 275 }, { "epoch": 0.05385996409335727, "grad_norm": 13.407825469970703, "learning_rate": 6.420107719928187e-07, "loss": 20.116, "step": 300 }, { "epoch": 0.05834829443447038, "grad_norm": 14.116663932800293, "learning_rate": 6.95870736086176e-07, "loss": 20.1185, "step": 325 }, { "epoch": 0.06283662477558348, "grad_norm": 13.583087921142578, "learning_rate": 7.497307001795332e-07, "loss": 20.1201, "step": 350 }, { "epoch": 0.06732495511669659, "grad_norm": 16.15342903137207, "learning_rate": 8.035906642728905e-07, "loss": 20.0418, "step": 375 }, { "epoch": 0.0718132854578097, "grad_norm": 11.18100643157959, "learning_rate": 8.574506283662477e-07, "loss": 20.0282, "step": 400 }, { "epoch": 0.0763016157989228, "grad_norm": 11.952834129333496, "learning_rate": 9.113105924596051e-07, "loss": 20.032, "step": 425 }, { "epoch": 0.0807899461400359, "grad_norm": 11.063940048217773, "learning_rate": 9.651705565529624e-07, "loss": 20.0012, "step": 450 }, { "epoch": 0.08527827648114901, "grad_norm": 11.023772239685059, "learning_rate": 1.0190305206463197e-06, "loss": 20.0264, "step": 475 }, { "epoch": 0.08976660682226212, "grad_norm": 11.103007316589355, "learning_rate": 1.072890484739677e-06, "loss": 19.964, "step": 500 }, { "epoch": 0.09425493716337523, "grad_norm": 12.964080810546875, "learning_rate": 1.126750448833034e-06, "loss": 20.0389, "step": 525 }, { "epoch": 0.09874326750448834, "grad_norm": 11.313638687133789, "learning_rate": 1.1806104129263915e-06, "loss": 20.0112, "step": 550 }, { "epoch": 0.10323159784560143, "grad_norm": 9.363356590270996, "learning_rate": 1.2344703770197488e-06, "loss": 19.9996, "step": 575 }, { "epoch": 0.10771992818671454, "grad_norm": 11.197516441345215, "learning_rate": 1.2883303411131059e-06, "loss": 19.9399, "step": 600 }, { "epoch": 0.11220825852782765, "grad_norm": 9.123501777648926, "learning_rate": 1.3421903052064631e-06, "loss": 20.0128, "step": 625 }, { "epoch": 0.11669658886894076, "grad_norm": 14.137397766113281, "learning_rate": 1.3960502692998206e-06, "loss": 20.0196, "step": 650 }, { "epoch": 0.12118491921005387, "grad_norm": 9.076166152954102, "learning_rate": 1.449910233393178e-06, "loss": 19.9431, "step": 675 }, { "epoch": 0.12567324955116696, "grad_norm": 11.227516174316406, "learning_rate": 1.503770197486535e-06, "loss": 19.892, "step": 700 }, { "epoch": 0.13016157989228008, "grad_norm": 9.613359451293945, "learning_rate": 1.5576301615798923e-06, "loss": 19.9289, "step": 725 }, { "epoch": 0.13464991023339318, "grad_norm": 9.49264907836914, "learning_rate": 1.6114901256732495e-06, "loss": 19.886, "step": 750 }, { "epoch": 0.13913824057450627, "grad_norm": 11.677379608154297, "learning_rate": 1.6653500897666068e-06, "loss": 19.8904, "step": 775 }, { "epoch": 0.1436265709156194, "grad_norm": 11.741113662719727, "learning_rate": 1.719210053859964e-06, "loss": 19.901, "step": 800 }, { "epoch": 0.1481149012567325, "grad_norm": 12.074057579040527, "learning_rate": 1.7730700179533214e-06, "loss": 19.903, "step": 825 }, { "epoch": 0.1526032315978456, "grad_norm": 17.792566299438477, "learning_rate": 1.8269299820466787e-06, "loss": 19.8621, "step": 850 }, { "epoch": 0.1570915619389587, "grad_norm": 10.892045021057129, "learning_rate": 1.880789946140036e-06, "loss": 19.8345, "step": 875 }, { "epoch": 0.1615798922800718, "grad_norm": 11.912057876586914, "learning_rate": 1.9346499102333932e-06, "loss": 19.8252, "step": 900 }, { "epoch": 0.16606822262118492, "grad_norm": 11.374829292297363, "learning_rate": 1.9885098743267503e-06, "loss": 19.8219, "step": 925 }, { "epoch": 0.17055655296229802, "grad_norm": 11.979461669921875, "learning_rate": 2.0423698384201078e-06, "loss": 19.7829, "step": 950 }, { "epoch": 0.17504488330341114, "grad_norm": 12.24255657196045, "learning_rate": 2.096229802513465e-06, "loss": 19.7962, "step": 975 }, { "epoch": 0.17953321364452424, "grad_norm": 11.375764846801758, "learning_rate": 2.150089766606822e-06, "loss": 19.7298, "step": 1000 }, { "epoch": 0.18402154398563733, "grad_norm": 12.321856498718262, "learning_rate": 2.20394973070018e-06, "loss": 19.712, "step": 1025 }, { "epoch": 0.18850987432675045, "grad_norm": 12.325765609741211, "learning_rate": 2.257809694793537e-06, "loss": 19.6841, "step": 1050 }, { "epoch": 0.19299820466786355, "grad_norm": 12.724334716796875, "learning_rate": 2.3116696588868944e-06, "loss": 19.6671, "step": 1075 }, { "epoch": 0.19748653500897667, "grad_norm": 12.078253746032715, "learning_rate": 2.3655296229802515e-06, "loss": 19.6296, "step": 1100 }, { "epoch": 0.20197486535008977, "grad_norm": 13.096288681030273, "learning_rate": 2.4193895870736085e-06, "loss": 19.5341, "step": 1125 }, { "epoch": 0.20646319569120286, "grad_norm": 13.682557106018066, "learning_rate": 2.473249551166966e-06, "loss": 19.6408, "step": 1150 }, { "epoch": 0.21095152603231598, "grad_norm": 13.981725692749023, "learning_rate": 2.527109515260323e-06, "loss": 19.6481, "step": 1175 }, { "epoch": 0.21543985637342908, "grad_norm": 13.742826461791992, "learning_rate": 2.58096947935368e-06, "loss": 19.5424, "step": 1200 }, { "epoch": 0.2199281867145422, "grad_norm": 16.546175003051758, "learning_rate": 2.634829443447038e-06, "loss": 19.5218, "step": 1225 }, { "epoch": 0.2244165170556553, "grad_norm": 15.267694473266602, "learning_rate": 2.688689407540395e-06, "loss": 19.5041, "step": 1250 }, { "epoch": 0.2289048473967684, "grad_norm": 15.6527681350708, "learning_rate": 2.7425493716337522e-06, "loss": 19.6472, "step": 1275 }, { "epoch": 0.2333931777378815, "grad_norm": 17.371788024902344, "learning_rate": 2.7964093357271097e-06, "loss": 19.4608, "step": 1300 }, { "epoch": 0.2378815080789946, "grad_norm": 15.061119079589844, "learning_rate": 2.8502692998204668e-06, "loss": 19.4584, "step": 1325 }, { "epoch": 0.24236983842010773, "grad_norm": 13.703615188598633, "learning_rate": 2.90197486535009e-06, "loss": 19.4636, "step": 1350 }, { "epoch": 0.24685816876122083, "grad_norm": 13.755026817321777, "learning_rate": 2.9558348294434473e-06, "loss": 19.391, "step": 1375 }, { "epoch": 0.2513464991023339, "grad_norm": 15.358574867248535, "learning_rate": 3.0096947935368044e-06, "loss": 19.3758, "step": 1400 }, { "epoch": 0.25583482944344704, "grad_norm": 14.703276634216309, "learning_rate": 3.063554757630162e-06, "loss": 19.357, "step": 1425 }, { "epoch": 0.26032315978456017, "grad_norm": 14.63382625579834, "learning_rate": 3.117414721723519e-06, "loss": 19.2794, "step": 1450 }, { "epoch": 0.26481149012567323, "grad_norm": 15.199682235717773, "learning_rate": 3.171274685816876e-06, "loss": 19.2949, "step": 1475 }, { "epoch": 0.26929982046678635, "grad_norm": 14.768528938293457, "learning_rate": 3.2251346499102335e-06, "loss": 19.3093, "step": 1500 }, { "epoch": 0.2737881508078995, "grad_norm": 14.896871566772461, "learning_rate": 3.2789946140035906e-06, "loss": 19.2305, "step": 1525 }, { "epoch": 0.27827648114901254, "grad_norm": 15.565362930297852, "learning_rate": 3.3328545780969477e-06, "loss": 19.2403, "step": 1550 }, { "epoch": 0.28276481149012567, "grad_norm": 16.003311157226562, "learning_rate": 3.3867145421903056e-06, "loss": 19.324, "step": 1575 }, { "epoch": 0.2872531418312388, "grad_norm": 15.933990478515625, "learning_rate": 3.4405745062836626e-06, "loss": 19.194, "step": 1600 }, { "epoch": 0.2917414721723519, "grad_norm": 16.488842010498047, "learning_rate": 3.49443447037702e-06, "loss": 19.3091, "step": 1625 }, { "epoch": 0.296229802513465, "grad_norm": 15.880293846130371, "learning_rate": 3.548294434470377e-06, "loss": 19.1561, "step": 1650 }, { "epoch": 0.3007181328545781, "grad_norm": 15.783681869506836, "learning_rate": 3.6021543985637343e-06, "loss": 19.0354, "step": 1675 }, { "epoch": 0.3052064631956912, "grad_norm": 17.726228713989258, "learning_rate": 3.6560143626570918e-06, "loss": 19.0885, "step": 1700 }, { "epoch": 0.3096947935368043, "grad_norm": 15.17348575592041, "learning_rate": 3.709874326750449e-06, "loss": 19.2063, "step": 1725 }, { "epoch": 0.3141831238779174, "grad_norm": 16.843894958496094, "learning_rate": 3.763734290843806e-06, "loss": 18.9388, "step": 1750 }, { "epoch": 0.31867145421903054, "grad_norm": 23.23088264465332, "learning_rate": 3.817594254937163e-06, "loss": 19.1281, "step": 1775 }, { "epoch": 0.3231597845601436, "grad_norm": 15.54453182220459, "learning_rate": 3.8714542190305205e-06, "loss": 19.1134, "step": 1800 }, { "epoch": 0.3276481149012567, "grad_norm": 16.050777435302734, "learning_rate": 3.925314183123878e-06, "loss": 18.9336, "step": 1825 }, { "epoch": 0.33213644524236985, "grad_norm": 15.230576515197754, "learning_rate": 3.979174147217235e-06, "loss": 19.063, "step": 1850 }, { "epoch": 0.33662477558348297, "grad_norm": 16.179035186767578, "learning_rate": 4.0330341113105925e-06, "loss": 18.9994, "step": 1875 }, { "epoch": 0.34111310592459604, "grad_norm": 16.51089096069336, "learning_rate": 4.0868940754039504e-06, "loss": 18.9635, "step": 1900 }, { "epoch": 0.34560143626570916, "grad_norm": 15.68458080291748, "learning_rate": 4.140754039497307e-06, "loss": 18.9526, "step": 1925 }, { "epoch": 0.3500897666068223, "grad_norm": 16.986204147338867, "learning_rate": 4.1946140035906646e-06, "loss": 19.042, "step": 1950 }, { "epoch": 0.35457809694793535, "grad_norm": 15.371708869934082, "learning_rate": 4.248473967684022e-06, "loss": 18.8415, "step": 1975 }, { "epoch": 0.3590664272890485, "grad_norm": 15.591832160949707, "learning_rate": 4.302333931777379e-06, "loss": 18.8101, "step": 2000 }, { "epoch": 0.3635547576301616, "grad_norm": 15.236952781677246, "learning_rate": 4.356193895870736e-06, "loss": 18.8519, "step": 2025 }, { "epoch": 0.36804308797127466, "grad_norm": 16.580678939819336, "learning_rate": 4.410053859964094e-06, "loss": 18.8845, "step": 2050 }, { "epoch": 0.3725314183123878, "grad_norm": 16.18427085876465, "learning_rate": 4.463913824057451e-06, "loss": 18.7093, "step": 2075 }, { "epoch": 0.3770197486535009, "grad_norm": 18.2146053314209, "learning_rate": 4.517773788150808e-06, "loss": 18.777, "step": 2100 }, { "epoch": 0.38150807899461403, "grad_norm": 16.10597038269043, "learning_rate": 4.571633752244166e-06, "loss": 18.7469, "step": 2125 }, { "epoch": 0.3859964093357271, "grad_norm": 15.864044189453125, "learning_rate": 4.625493716337523e-06, "loss": 18.8742, "step": 2150 }, { "epoch": 0.3904847396768402, "grad_norm": 18.99787139892578, "learning_rate": 4.67935368043088e-06, "loss": 18.7297, "step": 2175 }, { "epoch": 0.39497307001795334, "grad_norm": 16.34602928161621, "learning_rate": 4.733213644524237e-06, "loss": 18.6701, "step": 2200 }, { "epoch": 0.3994614003590664, "grad_norm": 16.323881149291992, "learning_rate": 4.787073608617595e-06, "loss": 18.7605, "step": 2225 }, { "epoch": 0.40394973070017953, "grad_norm": 16.251663208007812, "learning_rate": 4.840933572710951e-06, "loss": 18.736, "step": 2250 }, { "epoch": 0.40843806104129265, "grad_norm": 16.65889549255371, "learning_rate": 4.894793536804309e-06, "loss": 18.7178, "step": 2275 }, { "epoch": 0.4129263913824057, "grad_norm": 16.258752822875977, "learning_rate": 4.948653500897667e-06, "loss": 18.5482, "step": 2300 }, { "epoch": 0.41741472172351884, "grad_norm": 15.998190879821777, "learning_rate": 5.002513464991023e-06, "loss": 18.7417, "step": 2325 }, { "epoch": 0.42190305206463197, "grad_norm": 14.841017723083496, "learning_rate": 5.056373429084381e-06, "loss": 18.6716, "step": 2350 }, { "epoch": 0.4263913824057451, "grad_norm": 15.91247272491455, "learning_rate": 5.110233393177738e-06, "loss": 18.688, "step": 2375 }, { "epoch": 0.43087971274685816, "grad_norm": 17.436525344848633, "learning_rate": 5.164093357271095e-06, "loss": 18.5867, "step": 2400 }, { "epoch": 0.4353680430879713, "grad_norm": 16.675212860107422, "learning_rate": 5.217953321364452e-06, "loss": 18.7864, "step": 2425 }, { "epoch": 0.4398563734290844, "grad_norm": 16.56376075744629, "learning_rate": 5.27181328545781e-06, "loss": 18.469, "step": 2450 }, { "epoch": 0.44434470377019747, "grad_norm": 16.11998176574707, "learning_rate": 5.325673249551166e-06, "loss": 18.528, "step": 2475 }, { "epoch": 0.4488330341113106, "grad_norm": 16.21501922607422, "learning_rate": 5.379533213644524e-06, "loss": 18.5634, "step": 2500 }, { "epoch": 0.4533213644524237, "grad_norm": 17.891937255859375, "learning_rate": 5.433393177737882e-06, "loss": 18.5037, "step": 2525 }, { "epoch": 0.4578096947935368, "grad_norm": 18.845378875732422, "learning_rate": 5.4872531418312385e-06, "loss": 18.4701, "step": 2550 }, { "epoch": 0.4622980251346499, "grad_norm": 14.12865924835205, "learning_rate": 5.541113105924596e-06, "loss": 18.5924, "step": 2575 }, { "epoch": 0.466786355475763, "grad_norm": 19.014360427856445, "learning_rate": 5.5949730700179534e-06, "loss": 18.4597, "step": 2600 }, { "epoch": 0.47127468581687615, "grad_norm": 16.36025047302246, "learning_rate": 5.6488330341113105e-06, "loss": 18.6834, "step": 2625 }, { "epoch": 0.4757630161579892, "grad_norm": 16.406009674072266, "learning_rate": 5.7026929982046676e-06, "loss": 18.4796, "step": 2650 }, { "epoch": 0.48025134649910234, "grad_norm": 16.516311645507812, "learning_rate": 5.7565529622980255e-06, "loss": 18.408, "step": 2675 }, { "epoch": 0.48473967684021546, "grad_norm": 15.522378921508789, "learning_rate": 5.8104129263913826e-06, "loss": 18.664, "step": 2700 }, { "epoch": 0.48922800718132853, "grad_norm": 25.351648330688477, "learning_rate": 5.86427289048474e-06, "loss": 18.4765, "step": 2725 }, { "epoch": 0.49371633752244165, "grad_norm": 15.928828239440918, "learning_rate": 5.9181328545780975e-06, "loss": 18.5569, "step": 2750 }, { "epoch": 0.4982046678635548, "grad_norm": 14.958181381225586, "learning_rate": 5.971992818671455e-06, "loss": 18.4793, "step": 2775 }, { "epoch": 0.5026929982046678, "grad_norm": 16.40975570678711, "learning_rate": 6.025852782764812e-06, "loss": 18.3546, "step": 2800 }, { "epoch": 0.507181328545781, "grad_norm": 15.906976699829102, "learning_rate": 6.079712746858169e-06, "loss": 18.4363, "step": 2825 }, { "epoch": 0.5116696588868941, "grad_norm": 15.144857406616211, "learning_rate": 6.133572710951527e-06, "loss": 18.3984, "step": 2850 }, { "epoch": 0.5161579892280072, "grad_norm": 16.463176727294922, "learning_rate": 6.187432675044883e-06, "loss": 18.2991, "step": 2875 }, { "epoch": 0.5206463195691203, "grad_norm": 15.463627815246582, "learning_rate": 6.241292639138241e-06, "loss": 18.694, "step": 2900 }, { "epoch": 0.5251346499102334, "grad_norm": 17.69695281982422, "learning_rate": 6.295152603231599e-06, "loss": 18.2619, "step": 2925 }, { "epoch": 0.5296229802513465, "grad_norm": 17.653112411499023, "learning_rate": 6.349012567324955e-06, "loss": 18.19, "step": 2950 }, { "epoch": 0.5341113105924596, "grad_norm": 16.84458351135254, "learning_rate": 6.402872531418313e-06, "loss": 18.3045, "step": 2975 }, { "epoch": 0.5385996409335727, "grad_norm": 16.69748878479004, "learning_rate": 6.45673249551167e-06, "loss": 18.2229, "step": 3000 }, { "epoch": 0.5430879712746858, "grad_norm": 20.153520584106445, "learning_rate": 6.510592459605027e-06, "loss": 18.3158, "step": 3025 }, { "epoch": 0.547576301615799, "grad_norm": 18.79121208190918, "learning_rate": 6.564452423698384e-06, "loss": 18.3017, "step": 3050 }, { "epoch": 0.552064631956912, "grad_norm": 16.678478240966797, "learning_rate": 6.618312387791742e-06, "loss": 18.39, "step": 3075 }, { "epoch": 0.5565529622980251, "grad_norm": 22.23566246032715, "learning_rate": 6.672172351885098e-06, "loss": 18.3635, "step": 3100 }, { "epoch": 0.5610412926391383, "grad_norm": 17.27984619140625, "learning_rate": 6.726032315978456e-06, "loss": 18.1439, "step": 3125 }, { "epoch": 0.5655296229802513, "grad_norm": 16.337604522705078, "learning_rate": 6.779892280071814e-06, "loss": 18.378, "step": 3150 }, { "epoch": 0.5700179533213644, "grad_norm": 16.56863784790039, "learning_rate": 6.83375224416517e-06, "loss": 18.192, "step": 3175 }, { "epoch": 0.5745062836624776, "grad_norm": 17.132043838500977, "learning_rate": 6.887612208258528e-06, "loss": 18.3463, "step": 3200 }, { "epoch": 0.5789946140035906, "grad_norm": 17.860429763793945, "learning_rate": 6.941472172351885e-06, "loss": 18.2454, "step": 3225 }, { "epoch": 0.5834829443447038, "grad_norm": 14.994222640991211, "learning_rate": 6.995332136445242e-06, "loss": 18.3204, "step": 3250 }, { "epoch": 0.5879712746858169, "grad_norm": 16.782873153686523, "learning_rate": 7.049192100538599e-06, "loss": 18.2429, "step": 3275 }, { "epoch": 0.59245960502693, "grad_norm": 15.746573448181152, "learning_rate": 7.103052064631957e-06, "loss": 18.0945, "step": 3300 }, { "epoch": 0.5969479353680431, "grad_norm": 17.83595848083496, "learning_rate": 7.156912028725314e-06, "loss": 18.3612, "step": 3325 }, { "epoch": 0.6014362657091562, "grad_norm": 17.314441680908203, "learning_rate": 7.2107719928186714e-06, "loss": 18.1783, "step": 3350 }, { "epoch": 0.6059245960502693, "grad_norm": 16.076663970947266, "learning_rate": 7.264631956912029e-06, "loss": 18.3625, "step": 3375 }, { "epoch": 0.6104129263913824, "grad_norm": 16.62413215637207, "learning_rate": 7.318491921005386e-06, "loss": 18.3717, "step": 3400 }, { "epoch": 0.6149012567324955, "grad_norm": 19.139238357543945, "learning_rate": 7.3723518850987435e-06, "loss": 18.0023, "step": 3425 }, { "epoch": 0.6193895870736086, "grad_norm": 15.575067520141602, "learning_rate": 7.4262118491921005e-06, "loss": 18.3083, "step": 3450 }, { "epoch": 0.6238779174147218, "grad_norm": 18.650287628173828, "learning_rate": 7.4800718132854585e-06, "loss": 18.2143, "step": 3475 }, { "epoch": 0.6283662477558348, "grad_norm": 18.52598762512207, "learning_rate": 7.5339317773788155e-06, "loss": 18.0261, "step": 3500 }, { "epoch": 0.6328545780969479, "grad_norm": 17.653348922729492, "learning_rate": 7.587791741472173e-06, "loss": 17.919, "step": 3525 }, { "epoch": 0.6373429084380611, "grad_norm": 17.140901565551758, "learning_rate": 7.641651705565529e-06, "loss": 18.0068, "step": 3550 }, { "epoch": 0.6418312387791741, "grad_norm": 16.913959503173828, "learning_rate": 7.695511669658888e-06, "loss": 18.1295, "step": 3575 }, { "epoch": 0.6463195691202872, "grad_norm": 18.763505935668945, "learning_rate": 7.749371633752245e-06, "loss": 17.8775, "step": 3600 }, { "epoch": 0.6508078994614004, "grad_norm": 16.92535400390625, "learning_rate": 7.803231597845602e-06, "loss": 18.1007, "step": 3625 }, { "epoch": 0.6552962298025135, "grad_norm": 19.66353988647461, "learning_rate": 7.857091561938959e-06, "loss": 17.9173, "step": 3650 }, { "epoch": 0.6597845601436265, "grad_norm": 17.924484252929688, "learning_rate": 7.910951526032318e-06, "loss": 17.9126, "step": 3675 }, { "epoch": 0.6642728904847397, "grad_norm": 18.674997329711914, "learning_rate": 7.964811490125673e-06, "loss": 18.0945, "step": 3700 }, { "epoch": 0.6687612208258528, "grad_norm": 17.291763305664062, "learning_rate": 8.01867145421903e-06, "loss": 18.1315, "step": 3725 }, { "epoch": 0.6732495511669659, "grad_norm": 18.881961822509766, "learning_rate": 8.072531418312387e-06, "loss": 17.8323, "step": 3750 }, { "epoch": 0.677737881508079, "grad_norm": 18.964895248413086, "learning_rate": 8.126391382405746e-06, "loss": 17.8395, "step": 3775 }, { "epoch": 0.6822262118491921, "grad_norm": 16.793581008911133, "learning_rate": 8.180251346499103e-06, "loss": 18.0268, "step": 3800 }, { "epoch": 0.6867145421903053, "grad_norm": 16.940654754638672, "learning_rate": 8.23411131059246e-06, "loss": 17.9372, "step": 3825 }, { "epoch": 0.6912028725314183, "grad_norm": 16.80086898803711, "learning_rate": 8.287971274685817e-06, "loss": 17.7917, "step": 3850 }, { "epoch": 0.6956912028725314, "grad_norm": 21.22157859802246, "learning_rate": 8.341831238779174e-06, "loss": 18.1762, "step": 3875 }, { "epoch": 0.7001795332136446, "grad_norm": 18.476032257080078, "learning_rate": 8.395691202872531e-06, "loss": 17.7716, "step": 3900 }, { "epoch": 0.7046678635547576, "grad_norm": 15.498908996582031, "learning_rate": 8.447396768402154e-06, "loss": 18.0372, "step": 3925 }, { "epoch": 0.7091561938958707, "grad_norm": 18.895370483398438, "learning_rate": 8.501256732495513e-06, "loss": 18.0482, "step": 3950 }, { "epoch": 0.7136445242369839, "grad_norm": 21.163408279418945, "learning_rate": 8.55511669658887e-06, "loss": 17.745, "step": 3975 }, { "epoch": 0.718132854578097, "grad_norm": 15.291622161865234, "learning_rate": 8.608976660682225e-06, "loss": 17.9657, "step": 4000 }, { "epoch": 0.72262118491921, "grad_norm": 19.777557373046875, "learning_rate": 8.662836624775583e-06, "loss": 17.7042, "step": 4025 }, { "epoch": 0.7271095152603232, "grad_norm": 18.978593826293945, "learning_rate": 8.716696588868941e-06, "loss": 17.7297, "step": 4050 }, { "epoch": 0.7315978456014363, "grad_norm": 21.442642211914062, "learning_rate": 8.770556552962298e-06, "loss": 17.8813, "step": 4075 }, { "epoch": 0.7360861759425493, "grad_norm": 17.907501220703125, "learning_rate": 8.824416517055655e-06, "loss": 17.7841, "step": 4100 }, { "epoch": 0.7405745062836625, "grad_norm": 18.066967010498047, "learning_rate": 8.878276481149014e-06, "loss": 17.8453, "step": 4125 }, { "epoch": 0.7450628366247756, "grad_norm": 20.10198974609375, "learning_rate": 8.93213644524237e-06, "loss": 17.8353, "step": 4150 }, { "epoch": 0.7495511669658886, "grad_norm": 16.845745086669922, "learning_rate": 8.985996409335727e-06, "loss": 17.8432, "step": 4175 }, { "epoch": 0.7540394973070018, "grad_norm": 18.617088317871094, "learning_rate": 9.039856373429084e-06, "loss": 17.8202, "step": 4200 }, { "epoch": 0.7585278276481149, "grad_norm": 19.578916549682617, "learning_rate": 9.093716337522442e-06, "loss": 17.7985, "step": 4225 }, { "epoch": 0.7630161579892281, "grad_norm": 16.22163200378418, "learning_rate": 9.1475763016158e-06, "loss": 17.9493, "step": 4250 }, { "epoch": 0.7675044883303411, "grad_norm": 18.09850311279297, "learning_rate": 9.201436265709157e-06, "loss": 17.7751, "step": 4275 }, { "epoch": 0.7719928186714542, "grad_norm": 18.09538459777832, "learning_rate": 9.255296229802514e-06, "loss": 17.7249, "step": 4300 }, { "epoch": 0.7764811490125674, "grad_norm": 15.956615447998047, "learning_rate": 9.30915619389587e-06, "loss": 17.9555, "step": 4325 }, { "epoch": 0.7809694793536804, "grad_norm": 17.795026779174805, "learning_rate": 9.363016157989228e-06, "loss": 17.7529, "step": 4350 }, { "epoch": 0.7854578096947935, "grad_norm": 18.07413101196289, "learning_rate": 9.416876122082585e-06, "loss": 17.5449, "step": 4375 }, { "epoch": 0.7899461400359067, "grad_norm": 19.664108276367188, "learning_rate": 9.470736086175944e-06, "loss": 17.6976, "step": 4400 }, { "epoch": 0.7944344703770198, "grad_norm": 21.336183547973633, "learning_rate": 9.5245960502693e-06, "loss": 17.7112, "step": 4425 }, { "epoch": 0.7989228007181328, "grad_norm": 16.216899871826172, "learning_rate": 9.578456014362658e-06, "loss": 17.6466, "step": 4450 }, { "epoch": 0.803411131059246, "grad_norm": 17.492589950561523, "learning_rate": 9.632315978456013e-06, "loss": 17.823, "step": 4475 }, { "epoch": 0.8078994614003591, "grad_norm": 19.598114013671875, "learning_rate": 9.686175942549372e-06, "loss": 17.6314, "step": 4500 }, { "epoch": 0.8123877917414721, "grad_norm": 18.45696258544922, "learning_rate": 9.740035906642729e-06, "loss": 17.6494, "step": 4525 }, { "epoch": 0.8168761220825853, "grad_norm": 17.067623138427734, "learning_rate": 9.793895870736086e-06, "loss": 17.5437, "step": 4550 }, { "epoch": 0.8213644524236984, "grad_norm": 22.662578582763672, "learning_rate": 9.847755834829445e-06, "loss": 17.6709, "step": 4575 }, { "epoch": 0.8258527827648114, "grad_norm": 17.072893142700195, "learning_rate": 9.901615798922802e-06, "loss": 17.7612, "step": 4600 }, { "epoch": 0.8303411131059246, "grad_norm": 18.79728889465332, "learning_rate": 9.955475763016157e-06, "loss": 17.7251, "step": 4625 }, { "epoch": 0.8348294434470377, "grad_norm": 18.312700271606445, "learning_rate": 1.0009335727109514e-05, "loss": 17.5846, "step": 4650 }, { "epoch": 0.8393177737881508, "grad_norm": 18.2193660736084, "learning_rate": 1.0063195691202873e-05, "loss": 17.463, "step": 4675 }, { "epoch": 0.8438061041292639, "grad_norm": 20.095277786254883, "learning_rate": 1.011705565529623e-05, "loss": 17.1827, "step": 4700 }, { "epoch": 0.848294434470377, "grad_norm": 20.12647247314453, "learning_rate": 1.0170915619389587e-05, "loss": 17.4883, "step": 4725 }, { "epoch": 0.8527827648114902, "grad_norm": 18.37622833251953, "learning_rate": 1.0224775583482946e-05, "loss": 17.5654, "step": 4750 }, { "epoch": 0.8572710951526032, "grad_norm": 18.199060440063477, "learning_rate": 1.0278635547576303e-05, "loss": 17.526, "step": 4775 }, { "epoch": 0.8617594254937163, "grad_norm": 20.442859649658203, "learning_rate": 1.0332495511669658e-05, "loss": 17.6952, "step": 4800 }, { "epoch": 0.8662477558348295, "grad_norm": 18.540332794189453, "learning_rate": 1.0386355475763015e-05, "loss": 17.5526, "step": 4825 }, { "epoch": 0.8707360861759426, "grad_norm": 19.37755012512207, "learning_rate": 1.0440215439856374e-05, "loss": 17.463, "step": 4850 }, { "epoch": 0.8752244165170556, "grad_norm": 19.95784568786621, "learning_rate": 1.0494075403949731e-05, "loss": 17.671, "step": 4875 }, { "epoch": 0.8797127468581688, "grad_norm": 17.712932586669922, "learning_rate": 1.0547935368043088e-05, "loss": 17.4896, "step": 4900 }, { "epoch": 0.8842010771992819, "grad_norm": 19.43058967590332, "learning_rate": 1.0601795332136445e-05, "loss": 17.5935, "step": 4925 }, { "epoch": 0.8886894075403949, "grad_norm": 20.811893463134766, "learning_rate": 1.0655655296229803e-05, "loss": 17.5003, "step": 4950 }, { "epoch": 0.8931777378815081, "grad_norm": 21.070432662963867, "learning_rate": 1.070951526032316e-05, "loss": 17.479, "step": 4975 }, { "epoch": 0.8976660682226212, "grad_norm": 17.394657135009766, "learning_rate": 1.0763375224416517e-05, "loss": 17.5536, "step": 5000 }, { "epoch": 0.9021543985637342, "grad_norm": 19.602218627929688, "learning_rate": 1.0817235188509875e-05, "loss": 17.4437, "step": 5025 }, { "epoch": 0.9066427289048474, "grad_norm": 21.4727783203125, "learning_rate": 1.0871095152603232e-05, "loss": 17.3718, "step": 5050 }, { "epoch": 0.9111310592459605, "grad_norm": 18.7939510345459, "learning_rate": 1.092495511669659e-05, "loss": 17.1782, "step": 5075 }, { "epoch": 0.9156193895870736, "grad_norm": 21.373146057128906, "learning_rate": 1.0978815080789945e-05, "loss": 17.0867, "step": 5100 }, { "epoch": 0.9201077199281867, "grad_norm": 22.122276306152344, "learning_rate": 1.1032675044883304e-05, "loss": 17.5058, "step": 5125 }, { "epoch": 0.9245960502692998, "grad_norm": 20.753555297851562, "learning_rate": 1.108653500897666e-05, "loss": 16.9196, "step": 5150 }, { "epoch": 0.9290843806104129, "grad_norm": 20.26563262939453, "learning_rate": 1.1140394973070018e-05, "loss": 17.3307, "step": 5175 }, { "epoch": 0.933572710951526, "grad_norm": 19.536109924316406, "learning_rate": 1.1194254937163377e-05, "loss": 17.2833, "step": 5200 }, { "epoch": 0.9380610412926391, "grad_norm": 21.98984146118164, "learning_rate": 1.1248114901256734e-05, "loss": 17.3079, "step": 5225 }, { "epoch": 0.9425493716337523, "grad_norm": 20.069507598876953, "learning_rate": 1.1301974865350089e-05, "loss": 17.3164, "step": 5250 }, { "epoch": 0.9470377019748654, "grad_norm": 19.031282424926758, "learning_rate": 1.1355834829443446e-05, "loss": 17.4307, "step": 5275 }, { "epoch": 0.9515260323159784, "grad_norm": 21.127609252929688, "learning_rate": 1.1409694793536805e-05, "loss": 17.0626, "step": 5300 }, { "epoch": 0.9560143626570916, "grad_norm": 22.808317184448242, "learning_rate": 1.1463554757630162e-05, "loss": 17.2784, "step": 5325 }, { "epoch": 0.9605026929982047, "grad_norm": 20.546794891357422, "learning_rate": 1.1517414721723519e-05, "loss": 17.1925, "step": 5350 }, { "epoch": 0.9649910233393177, "grad_norm": 18.644824981689453, "learning_rate": 1.1571274685816878e-05, "loss": 17.1864, "step": 5375 }, { "epoch": 0.9694793536804309, "grad_norm": 19.968189239501953, "learning_rate": 1.1625134649910235e-05, "loss": 17.1675, "step": 5400 }, { "epoch": 0.973967684021544, "grad_norm": 19.67746925354004, "learning_rate": 1.167899461400359e-05, "loss": 17.1256, "step": 5425 }, { "epoch": 0.9784560143626571, "grad_norm": 21.387107849121094, "learning_rate": 1.1732854578096947e-05, "loss": 17.411, "step": 5450 }, { "epoch": 0.9829443447037702, "grad_norm": 17.536405563354492, "learning_rate": 1.1786714542190306e-05, "loss": 17.1895, "step": 5475 }, { "epoch": 0.9874326750448833, "grad_norm": 21.705129623413086, "learning_rate": 1.1840574506283663e-05, "loss": 17.3247, "step": 5500 }, { "epoch": 0.9919210053859964, "grad_norm": 20.925796508789062, "learning_rate": 1.189443447037702e-05, "loss": 17.2336, "step": 5525 }, { "epoch": 0.9964093357271095, "grad_norm": 19.501977920532227, "learning_rate": 1.1948294434470377e-05, "loss": 16.939, "step": 5550 }, { "epoch": 1.0, "eval_accuracy": 0.024535543123365092, "eval_f1_macro": 0.00018617887832660077, "eval_f1_micro": 0.024535543123365092, "eval_f1_weighted": 0.005391520089441958, "eval_loss": 8.812005996704102, "eval_precision_macro": 0.0002191599995913754, "eval_precision_micro": 0.024535543123365092, "eval_precision_weighted": 0.004721591271214293, "eval_recall_macro": 0.000766160477322559, "eval_recall_micro": 0.024535543123365092, "eval_recall_weighted": 0.024535543123365092, "eval_runtime": 128.9376, "eval_samples_per_second": 406.189, "eval_steps_per_second": 12.696, "step": 5570 }, { "epoch": 1.0008976660682227, "grad_norm": 19.746585845947266, "learning_rate": 1.2002154398563734e-05, "loss": 16.8127, "step": 5575 }, { "epoch": 1.0053859964093357, "grad_norm": 20.92815589904785, "learning_rate": 1.2056014362657091e-05, "loss": 16.6227, "step": 5600 }, { "epoch": 1.0098743267504489, "grad_norm": 23.12251091003418, "learning_rate": 1.2109874326750448e-05, "loss": 16.8028, "step": 5625 }, { "epoch": 1.014362657091562, "grad_norm": 21.773548126220703, "learning_rate": 1.2163734290843807e-05, "loss": 16.9285, "step": 5650 }, { "epoch": 1.018850987432675, "grad_norm": 18.216033935546875, "learning_rate": 1.2217594254937164e-05, "loss": 16.8515, "step": 5675 }, { "epoch": 1.0233393177737882, "grad_norm": 20.353927612304688, "learning_rate": 1.2271454219030521e-05, "loss": 16.9787, "step": 5700 }, { "epoch": 1.0278276481149013, "grad_norm": 22.886110305786133, "learning_rate": 1.2325314183123877e-05, "loss": 16.7614, "step": 5725 }, { "epoch": 1.0323159784560143, "grad_norm": 21.366548538208008, "learning_rate": 1.2379174147217235e-05, "loss": 16.3866, "step": 5750 }, { "epoch": 1.0368043087971275, "grad_norm": 23.675683975219727, "learning_rate": 1.2433034111310593e-05, "loss": 16.3334, "step": 5775 }, { "epoch": 1.0412926391382407, "grad_norm": 22.998641967773438, "learning_rate": 1.248689407540395e-05, "loss": 16.7228, "step": 5800 }, { "epoch": 1.0457809694793536, "grad_norm": 20.504121780395508, "learning_rate": 1.2540754039497308e-05, "loss": 16.5914, "step": 5825 }, { "epoch": 1.0502692998204668, "grad_norm": 22.66668128967285, "learning_rate": 1.2594614003590665e-05, "loss": 16.5995, "step": 5850 }, { "epoch": 1.05475763016158, "grad_norm": 20.194726943969727, "learning_rate": 1.264847396768402e-05, "loss": 16.7589, "step": 5875 }, { "epoch": 1.059245960502693, "grad_norm": 21.407981872558594, "learning_rate": 1.2702333931777378e-05, "loss": 16.6117, "step": 5900 }, { "epoch": 1.063734290843806, "grad_norm": 20.662927627563477, "learning_rate": 1.2756193895870737e-05, "loss": 16.5679, "step": 5925 }, { "epoch": 1.0682226211849193, "grad_norm": 24.050336837768555, "learning_rate": 1.2810053859964094e-05, "loss": 16.8247, "step": 5950 }, { "epoch": 1.0727109515260322, "grad_norm": 20.72054100036621, "learning_rate": 1.286391382405745e-05, "loss": 16.623, "step": 5975 }, { "epoch": 1.0771992818671454, "grad_norm": 23.1834659576416, "learning_rate": 1.291777378815081e-05, "loss": 16.4884, "step": 6000 }, { "epoch": 1.0816876122082586, "grad_norm": 21.00957679748535, "learning_rate": 1.2971633752244167e-05, "loss": 16.3869, "step": 6025 }, { "epoch": 1.0861759425493716, "grad_norm": 22.43168067932129, "learning_rate": 1.3025493716337522e-05, "loss": 16.3575, "step": 6050 }, { "epoch": 1.0906642728904847, "grad_norm": 21.6562557220459, "learning_rate": 1.3079353680430879e-05, "loss": 16.4085, "step": 6075 }, { "epoch": 1.095152603231598, "grad_norm": 23.325424194335938, "learning_rate": 1.3133213644524238e-05, "loss": 16.5846, "step": 6100 }, { "epoch": 1.0996409335727109, "grad_norm": 24.215314865112305, "learning_rate": 1.3187073608617595e-05, "loss": 16.4518, "step": 6125 }, { "epoch": 1.104129263913824, "grad_norm": 24.384559631347656, "learning_rate": 1.3240933572710952e-05, "loss": 16.5581, "step": 6150 }, { "epoch": 1.1086175942549372, "grad_norm": 24.343595504760742, "learning_rate": 1.3294793536804309e-05, "loss": 16.2876, "step": 6175 }, { "epoch": 1.1131059245960502, "grad_norm": 21.597131729125977, "learning_rate": 1.3348653500897666e-05, "loss": 16.4586, "step": 6200 }, { "epoch": 1.1175942549371634, "grad_norm": 22.356834411621094, "learning_rate": 1.3402513464991023e-05, "loss": 16.2934, "step": 6225 }, { "epoch": 1.1220825852782765, "grad_norm": 22.678932189941406, "learning_rate": 1.345637342908438e-05, "loss": 16.5327, "step": 6250 }, { "epoch": 1.1265709156193895, "grad_norm": 19.975004196166992, "learning_rate": 1.3510233393177739e-05, "loss": 16.4558, "step": 6275 }, { "epoch": 1.1310592459605027, "grad_norm": 23.107633590698242, "learning_rate": 1.3564093357271096e-05, "loss": 16.4774, "step": 6300 }, { "epoch": 1.1355475763016158, "grad_norm": 23.048038482666016, "learning_rate": 1.3617953321364453e-05, "loss": 16.8229, "step": 6325 }, { "epoch": 1.140035906642729, "grad_norm": 22.7868595123291, "learning_rate": 1.3671813285457809e-05, "loss": 16.5661, "step": 6350 }, { "epoch": 1.144524236983842, "grad_norm": 24.512683868408203, "learning_rate": 1.3725673249551167e-05, "loss": 16.4614, "step": 6375 }, { "epoch": 1.1490125673249552, "grad_norm": 22.137468338012695, "learning_rate": 1.3779533213644524e-05, "loss": 16.6419, "step": 6400 }, { "epoch": 1.1535008976660683, "grad_norm": 25.348499298095703, "learning_rate": 1.3833393177737881e-05, "loss": 16.2941, "step": 6425 }, { "epoch": 1.1579892280071813, "grad_norm": 20.657936096191406, "learning_rate": 1.388725314183124e-05, "loss": 16.4759, "step": 6450 }, { "epoch": 1.1624775583482945, "grad_norm": 21.39447021484375, "learning_rate": 1.3941113105924597e-05, "loss": 16.6561, "step": 6475 }, { "epoch": 1.1669658886894076, "grad_norm": 23.087963104248047, "learning_rate": 1.3994973070017953e-05, "loss": 15.8167, "step": 6500 }, { "epoch": 1.1714542190305206, "grad_norm": 23.6542911529541, "learning_rate": 1.404883303411131e-05, "loss": 16.531, "step": 6525 }, { "epoch": 1.1759425493716338, "grad_norm": 23.05323028564453, "learning_rate": 1.4102692998204668e-05, "loss": 16.3313, "step": 6550 }, { "epoch": 1.180430879712747, "grad_norm": 22.49639320373535, "learning_rate": 1.4156552962298026e-05, "loss": 16.2067, "step": 6575 }, { "epoch": 1.18491921005386, "grad_norm": 27.224279403686523, "learning_rate": 1.4210412926391383e-05, "loss": 16.502, "step": 6600 }, { "epoch": 1.189407540394973, "grad_norm": 21.412261962890625, "learning_rate": 1.4264272890484741e-05, "loss": 16.6798, "step": 6625 }, { "epoch": 1.1938958707360863, "grad_norm": 23.425609588623047, "learning_rate": 1.4318132854578098e-05, "loss": 16.2533, "step": 6650 }, { "epoch": 1.1983842010771992, "grad_norm": 23.98543357849121, "learning_rate": 1.4371992818671454e-05, "loss": 16.2683, "step": 6675 }, { "epoch": 1.2028725314183124, "grad_norm": 24.748369216918945, "learning_rate": 1.4425852782764811e-05, "loss": 16.4797, "step": 6700 }, { "epoch": 1.2073608617594256, "grad_norm": 20.175334930419922, "learning_rate": 1.447971274685817e-05, "loss": 16.493, "step": 6725 }, { "epoch": 1.2118491921005385, "grad_norm": 23.000167846679688, "learning_rate": 1.4533572710951527e-05, "loss": 16.1375, "step": 6750 }, { "epoch": 1.2163375224416517, "grad_norm": 21.749601364135742, "learning_rate": 1.4587432675044884e-05, "loss": 16.4148, "step": 6775 }, { "epoch": 1.220825852782765, "grad_norm": 23.57693099975586, "learning_rate": 1.464129263913824e-05, "loss": 15.9781, "step": 6800 }, { "epoch": 1.2253141831238779, "grad_norm": 22.823196411132812, "learning_rate": 1.4695152603231598e-05, "loss": 16.6314, "step": 6825 }, { "epoch": 1.229802513464991, "grad_norm": 22.367694854736328, "learning_rate": 1.4749012567324955e-05, "loss": 16.5215, "step": 6850 }, { "epoch": 1.2342908438061042, "grad_norm": 33.2826042175293, "learning_rate": 1.4802872531418312e-05, "loss": 16.4813, "step": 6875 }, { "epoch": 1.2387791741472172, "grad_norm": 23.414485931396484, "learning_rate": 1.485673249551167e-05, "loss": 15.8688, "step": 6900 }, { "epoch": 1.2432675044883303, "grad_norm": 24.831056594848633, "learning_rate": 1.4910592459605028e-05, "loss": 16.1143, "step": 6925 }, { "epoch": 1.2477558348294435, "grad_norm": 23.415950775146484, "learning_rate": 1.4964452423698385e-05, "loss": 15.7436, "step": 6950 }, { "epoch": 1.2522441651705565, "grad_norm": 22.253082275390625, "learning_rate": 1.5018312387791742e-05, "loss": 16.0852, "step": 6975 }, { "epoch": 1.2567324955116697, "grad_norm": 22.159162521362305, "learning_rate": 1.5072172351885099e-05, "loss": 16.3732, "step": 7000 }, { "epoch": 1.2612208258527828, "grad_norm": 22.717971801757812, "learning_rate": 1.5126032315978456e-05, "loss": 16.1097, "step": 7025 }, { "epoch": 1.2657091561938958, "grad_norm": 22.539794921875, "learning_rate": 1.5179892280071813e-05, "loss": 16.131, "step": 7050 }, { "epoch": 1.270197486535009, "grad_norm": 25.072383880615234, "learning_rate": 1.523375224416517e-05, "loss": 15.9651, "step": 7075 }, { "epoch": 1.2746858168761221, "grad_norm": 22.601781845092773, "learning_rate": 1.5287612208258526e-05, "loss": 16.0353, "step": 7100 }, { "epoch": 1.279174147217235, "grad_norm": 21.910064697265625, "learning_rate": 1.5341472172351888e-05, "loss": 16.1811, "step": 7125 }, { "epoch": 1.2836624775583483, "grad_norm": 23.791175842285156, "learning_rate": 1.5395332136445243e-05, "loss": 16.0497, "step": 7150 }, { "epoch": 1.2881508078994615, "grad_norm": 24.051387786865234, "learning_rate": 1.5449192100538602e-05, "loss": 16.0254, "step": 7175 }, { "epoch": 1.2926391382405744, "grad_norm": 20.40333366394043, "learning_rate": 1.5503052064631957e-05, "loss": 16.1743, "step": 7200 }, { "epoch": 1.2971274685816876, "grad_norm": 21.65686798095703, "learning_rate": 1.5556912028725313e-05, "loss": 16.0881, "step": 7225 }, { "epoch": 1.3016157989228008, "grad_norm": 23.0731201171875, "learning_rate": 1.561077199281867e-05, "loss": 15.7084, "step": 7250 }, { "epoch": 1.3061041292639137, "grad_norm": 23.546977996826172, "learning_rate": 1.5664631956912027e-05, "loss": 15.8303, "step": 7275 }, { "epoch": 1.310592459605027, "grad_norm": 24.602670669555664, "learning_rate": 1.571849192100539e-05, "loss": 16.0991, "step": 7300 }, { "epoch": 1.31508078994614, "grad_norm": 23.653459548950195, "learning_rate": 1.5772351885098744e-05, "loss": 16.011, "step": 7325 }, { "epoch": 1.319569120287253, "grad_norm": 23.47325325012207, "learning_rate": 1.5826211849192103e-05, "loss": 16.1551, "step": 7350 }, { "epoch": 1.3240574506283662, "grad_norm": 26.003053665161133, "learning_rate": 1.588007181328546e-05, "loss": 15.7585, "step": 7375 }, { "epoch": 1.3285457809694794, "grad_norm": 24.2227840423584, "learning_rate": 1.5933931777378814e-05, "loss": 16.1166, "step": 7400 }, { "epoch": 1.3330341113105924, "grad_norm": 23.852928161621094, "learning_rate": 1.5987791741472173e-05, "loss": 16.1642, "step": 7425 }, { "epoch": 1.3375224416517055, "grad_norm": 20.22197914123535, "learning_rate": 1.6041651705565528e-05, "loss": 16.2129, "step": 7450 }, { "epoch": 1.3420107719928187, "grad_norm": 23.04417610168457, "learning_rate": 1.609551166965889e-05, "loss": 15.9037, "step": 7475 }, { "epoch": 1.3464991023339317, "grad_norm": 22.43314552307129, "learning_rate": 1.6149371633752246e-05, "loss": 15.9117, "step": 7500 }, { "epoch": 1.3509874326750448, "grad_norm": 23.691787719726562, "learning_rate": 1.62032315978456e-05, "loss": 16.1126, "step": 7525 }, { "epoch": 1.355475763016158, "grad_norm": 22.891239166259766, "learning_rate": 1.625709156193896e-05, "loss": 15.8261, "step": 7550 }, { "epoch": 1.359964093357271, "grad_norm": 22.000856399536133, "learning_rate": 1.6310951526032315e-05, "loss": 16.0308, "step": 7575 }, { "epoch": 1.3644524236983842, "grad_norm": 23.784557342529297, "learning_rate": 1.6364811490125674e-05, "loss": 15.7405, "step": 7600 }, { "epoch": 1.3689407540394973, "grad_norm": 23.533695220947266, "learning_rate": 1.641867145421903e-05, "loss": 15.9525, "step": 7625 }, { "epoch": 1.3734290843806103, "grad_norm": 22.5810604095459, "learning_rate": 1.647253141831239e-05, "loss": 15.9572, "step": 7650 }, { "epoch": 1.3779174147217235, "grad_norm": 22.96384048461914, "learning_rate": 1.6526391382405747e-05, "loss": 16.0959, "step": 7675 }, { "epoch": 1.3824057450628366, "grad_norm": 22.757946014404297, "learning_rate": 1.6580251346499102e-05, "loss": 16.0045, "step": 7700 }, { "epoch": 1.3868940754039496, "grad_norm": 24.446062088012695, "learning_rate": 1.663411131059246e-05, "loss": 16.0387, "step": 7725 }, { "epoch": 1.3913824057450628, "grad_norm": 24.733076095581055, "learning_rate": 1.6687971274685816e-05, "loss": 15.6993, "step": 7750 }, { "epoch": 1.395870736086176, "grad_norm": 22.095415115356445, "learning_rate": 1.6741831238779175e-05, "loss": 16.108, "step": 7775 }, { "epoch": 1.400359066427289, "grad_norm": 22.528247833251953, "learning_rate": 1.679569120287253e-05, "loss": 15.9896, "step": 7800 }, { "epoch": 1.404847396768402, "grad_norm": 21.990081787109375, "learning_rate": 1.684955116696589e-05, "loss": 15.8753, "step": 7825 }, { "epoch": 1.4093357271095153, "grad_norm": 24.167387008666992, "learning_rate": 1.6903411131059248e-05, "loss": 15.7562, "step": 7850 }, { "epoch": 1.4138240574506284, "grad_norm": 23.88982391357422, "learning_rate": 1.6957271095152603e-05, "loss": 15.8713, "step": 7875 }, { "epoch": 1.4183123877917414, "grad_norm": 26.82301902770996, "learning_rate": 1.7011131059245962e-05, "loss": 15.5179, "step": 7900 }, { "epoch": 1.4228007181328546, "grad_norm": 25.797740936279297, "learning_rate": 1.7064991023339317e-05, "loss": 15.8012, "step": 7925 }, { "epoch": 1.4272890484739678, "grad_norm": 24.005008697509766, "learning_rate": 1.7118850987432676e-05, "loss": 16.0349, "step": 7950 }, { "epoch": 1.4317773788150807, "grad_norm": 21.801897048950195, "learning_rate": 1.717271095152603e-05, "loss": 16.2094, "step": 7975 }, { "epoch": 1.436265709156194, "grad_norm": 22.728696823120117, "learning_rate": 1.7226570915619387e-05, "loss": 15.8239, "step": 8000 }, { "epoch": 1.440754039497307, "grad_norm": 23.855932235717773, "learning_rate": 1.728043087971275e-05, "loss": 15.7778, "step": 8025 }, { "epoch": 1.44524236983842, "grad_norm": 24.114036560058594, "learning_rate": 1.7334290843806104e-05, "loss": 15.9458, "step": 8050 }, { "epoch": 1.4497307001795332, "grad_norm": 23.884950637817383, "learning_rate": 1.7388150807899463e-05, "loss": 15.7334, "step": 8075 }, { "epoch": 1.4542190305206464, "grad_norm": 26.62238311767578, "learning_rate": 1.744201077199282e-05, "loss": 15.4899, "step": 8100 }, { "epoch": 1.4587073608617596, "grad_norm": 25.22521209716797, "learning_rate": 1.7495870736086177e-05, "loss": 15.6142, "step": 8125 }, { "epoch": 1.4631956912028725, "grad_norm": 24.73517417907715, "learning_rate": 1.7549730700179533e-05, "loss": 16.046, "step": 8150 }, { "epoch": 1.4676840215439857, "grad_norm": 25.76804542541504, "learning_rate": 1.7603590664272888e-05, "loss": 15.5229, "step": 8175 }, { "epoch": 1.4721723518850989, "grad_norm": 24.096242904663086, "learning_rate": 1.765745062836625e-05, "loss": 15.485, "step": 8200 }, { "epoch": 1.4766606822262118, "grad_norm": 23.282133102416992, "learning_rate": 1.7711310592459606e-05, "loss": 15.4915, "step": 8225 }, { "epoch": 1.481149012567325, "grad_norm": 23.70339012145996, "learning_rate": 1.7765170556552964e-05, "loss": 15.778, "step": 8250 }, { "epoch": 1.4856373429084382, "grad_norm": 24.140331268310547, "learning_rate": 1.781903052064632e-05, "loss": 15.9428, "step": 8275 }, { "epoch": 1.4901256732495511, "grad_norm": 22.932546615600586, "learning_rate": 1.7872890484739675e-05, "loss": 15.6304, "step": 8300 }, { "epoch": 1.4946140035906643, "grad_norm": 24.020971298217773, "learning_rate": 1.7926750448833034e-05, "loss": 15.4193, "step": 8325 }, { "epoch": 1.4991023339317775, "grad_norm": 24.903371810913086, "learning_rate": 1.798061041292639e-05, "loss": 15.3212, "step": 8350 }, { "epoch": 1.5035906642728905, "grad_norm": 24.483036041259766, "learning_rate": 1.803447037701975e-05, "loss": 15.5371, "step": 8375 }, { "epoch": 1.5080789946140036, "grad_norm": 24.4531192779541, "learning_rate": 1.8088330341113107e-05, "loss": 15.6667, "step": 8400 }, { "epoch": 1.5125673249551168, "grad_norm": 23.508136749267578, "learning_rate": 1.8142190305206466e-05, "loss": 15.545, "step": 8425 }, { "epoch": 1.5170556552962298, "grad_norm": 25.5224666595459, "learning_rate": 1.819605026929982e-05, "loss": 15.8223, "step": 8450 }, { "epoch": 1.521543985637343, "grad_norm": 23.785808563232422, "learning_rate": 1.8249910233393176e-05, "loss": 15.736, "step": 8475 }, { "epoch": 1.5260323159784561, "grad_norm": 22.968332290649414, "learning_rate": 1.8303770197486535e-05, "loss": 15.5985, "step": 8500 }, { "epoch": 1.530520646319569, "grad_norm": 24.91457176208496, "learning_rate": 1.835763016157989e-05, "loss": 15.5723, "step": 8525 }, { "epoch": 1.5350089766606823, "grad_norm": 27.051095962524414, "learning_rate": 1.8411490125673253e-05, "loss": 15.5436, "step": 8550 }, { "epoch": 1.5394973070017954, "grad_norm": 26.1645565032959, "learning_rate": 1.8465350089766608e-05, "loss": 15.7189, "step": 8575 }, { "epoch": 1.5439856373429084, "grad_norm": 25.47484016418457, "learning_rate": 1.8519210053859967e-05, "loss": 15.4637, "step": 8600 }, { "epoch": 1.5484739676840216, "grad_norm": 21.521570205688477, "learning_rate": 1.8573070017953322e-05, "loss": 15.7744, "step": 8625 }, { "epoch": 1.5529622980251347, "grad_norm": 22.680315017700195, "learning_rate": 1.8626929982046677e-05, "loss": 15.516, "step": 8650 }, { "epoch": 1.5574506283662477, "grad_norm": 22.149436950683594, "learning_rate": 1.8680789946140036e-05, "loss": 16.09, "step": 8675 }, { "epoch": 1.5619389587073609, "grad_norm": 24.99411392211914, "learning_rate": 1.873464991023339e-05, "loss": 15.5241, "step": 8700 }, { "epoch": 1.566427289048474, "grad_norm": 24.49349021911621, "learning_rate": 1.8788509874326754e-05, "loss": 15.5149, "step": 8725 }, { "epoch": 1.570915619389587, "grad_norm": 24.748638153076172, "learning_rate": 1.884236983842011e-05, "loss": 15.4012, "step": 8750 }, { "epoch": 1.5754039497307002, "grad_norm": 23.619789123535156, "learning_rate": 1.8896229802513465e-05, "loss": 15.4603, "step": 8775 }, { "epoch": 1.5798922800718134, "grad_norm": 24.02398681640625, "learning_rate": 1.8950089766606823e-05, "loss": 15.7808, "step": 8800 }, { "epoch": 1.5843806104129263, "grad_norm": 24.2972354888916, "learning_rate": 1.900394973070018e-05, "loss": 15.1804, "step": 8825 }, { "epoch": 1.5888689407540395, "grad_norm": 24.516998291015625, "learning_rate": 1.9057809694793537e-05, "loss": 15.318, "step": 8850 }, { "epoch": 1.5933572710951527, "grad_norm": 24.47681999206543, "learning_rate": 1.9111669658886893e-05, "loss": 15.5455, "step": 8875 }, { "epoch": 1.5978456014362656, "grad_norm": 26.061948776245117, "learning_rate": 1.9165529622980255e-05, "loss": 15.204, "step": 8900 }, { "epoch": 1.6023339317773788, "grad_norm": 25.155284881591797, "learning_rate": 1.9217235188509875e-05, "loss": 15.7975, "step": 8925 }, { "epoch": 1.606822262118492, "grad_norm": 26.721513748168945, "learning_rate": 1.9271095152603233e-05, "loss": 14.8189, "step": 8950 }, { "epoch": 1.611310592459605, "grad_norm": 24.048892974853516, "learning_rate": 1.932495511669659e-05, "loss": 15.4657, "step": 8975 }, { "epoch": 1.6157989228007181, "grad_norm": 21.87297248840332, "learning_rate": 1.9378815080789948e-05, "loss": 15.2708, "step": 9000 }, { "epoch": 1.6202872531418313, "grad_norm": 23.78717613220215, "learning_rate": 1.9432675044883303e-05, "loss": 15.2762, "step": 9025 }, { "epoch": 1.6247755834829443, "grad_norm": 25.389694213867188, "learning_rate": 1.948653500897666e-05, "loss": 15.4709, "step": 9050 }, { "epoch": 1.6292639138240574, "grad_norm": 25.06108283996582, "learning_rate": 1.9540394973070017e-05, "loss": 15.4012, "step": 9075 }, { "epoch": 1.6337522441651706, "grad_norm": 22.665700912475586, "learning_rate": 1.9594254937163376e-05, "loss": 15.7225, "step": 9100 }, { "epoch": 1.6382405745062836, "grad_norm": 24.0644474029541, "learning_rate": 1.9648114901256735e-05, "loss": 15.5653, "step": 9125 }, { "epoch": 1.6427289048473968, "grad_norm": 25.258146286010742, "learning_rate": 1.970197486535009e-05, "loss": 15.5544, "step": 9150 }, { "epoch": 1.64721723518851, "grad_norm": 26.202850341796875, "learning_rate": 1.975583482944345e-05, "loss": 15.1562, "step": 9175 }, { "epoch": 1.6517055655296229, "grad_norm": 25.502126693725586, "learning_rate": 1.9809694793536804e-05, "loss": 15.5075, "step": 9200 }, { "epoch": 1.656193895870736, "grad_norm": 22.884952545166016, "learning_rate": 1.9863554757630163e-05, "loss": 15.386, "step": 9225 }, { "epoch": 1.6606822262118492, "grad_norm": 22.87488555908203, "learning_rate": 1.9917414721723518e-05, "loss": 15.1854, "step": 9250 }, { "epoch": 1.6651705565529622, "grad_norm": 25.1315975189209, "learning_rate": 1.9971274685816877e-05, "loss": 15.0536, "step": 9275 }, { "epoch": 1.6696588868940754, "grad_norm": 23.088226318359375, "learning_rate": 2.0025134649910236e-05, "loss": 15.6098, "step": 9300 }, { "epoch": 1.6741472172351886, "grad_norm": 24.46171760559082, "learning_rate": 2.007899461400359e-05, "loss": 15.2449, "step": 9325 }, { "epoch": 1.6786355475763015, "grad_norm": 25.243085861206055, "learning_rate": 2.013285457809695e-05, "loss": 15.3234, "step": 9350 }, { "epoch": 1.6831238779174147, "grad_norm": 24.72486686706543, "learning_rate": 2.0186714542190305e-05, "loss": 15.641, "step": 9375 }, { "epoch": 1.6876122082585279, "grad_norm": 23.12143898010254, "learning_rate": 2.024057450628366e-05, "loss": 15.2531, "step": 9400 }, { "epoch": 1.6921005385996408, "grad_norm": 24.512834548950195, "learning_rate": 2.029443447037702e-05, "loss": 15.5208, "step": 9425 }, { "epoch": 1.696588868940754, "grad_norm": 25.56024742126465, "learning_rate": 2.0348294434470378e-05, "loss": 15.0794, "step": 9450 }, { "epoch": 1.7010771992818672, "grad_norm": 25.564701080322266, "learning_rate": 2.0402154398563737e-05, "loss": 15.0259, "step": 9475 }, { "epoch": 1.7055655296229801, "grad_norm": 25.182714462280273, "learning_rate": 2.0456014362657092e-05, "loss": 15.3297, "step": 9500 }, { "epoch": 1.7100538599640933, "grad_norm": 25.756427764892578, "learning_rate": 2.050987432675045e-05, "loss": 15.0274, "step": 9525 }, { "epoch": 1.7145421903052065, "grad_norm": 24.414350509643555, "learning_rate": 2.0563734290843806e-05, "loss": 15.004, "step": 9550 }, { "epoch": 1.7190305206463194, "grad_norm": 26.023277282714844, "learning_rate": 2.0617594254937162e-05, "loss": 14.8821, "step": 9575 }, { "epoch": 1.7235188509874326, "grad_norm": 24.01046371459961, "learning_rate": 2.067145421903052e-05, "loss": 15.2589, "step": 9600 }, { "epoch": 1.7280071813285458, "grad_norm": 24.23836898803711, "learning_rate": 2.0725314183123876e-05, "loss": 15.1177, "step": 9625 }, { "epoch": 1.7324955116696588, "grad_norm": 23.774337768554688, "learning_rate": 2.0779174147217238e-05, "loss": 15.1478, "step": 9650 }, { "epoch": 1.736983842010772, "grad_norm": 28.614397048950195, "learning_rate": 2.0833034111310593e-05, "loss": 15.1796, "step": 9675 }, { "epoch": 1.7414721723518851, "grad_norm": 26.42593765258789, "learning_rate": 2.0886894075403952e-05, "loss": 15.0701, "step": 9700 }, { "epoch": 1.745960502692998, "grad_norm": 23.472248077392578, "learning_rate": 2.0940754039497308e-05, "loss": 15.3539, "step": 9725 }, { "epoch": 1.7504488330341115, "grad_norm": 23.4112491607666, "learning_rate": 2.0994614003590663e-05, "loss": 15.2332, "step": 9750 }, { "epoch": 1.7549371633752244, "grad_norm": 21.964303970336914, "learning_rate": 2.1048473967684022e-05, "loss": 15.1311, "step": 9775 }, { "epoch": 1.7594254937163374, "grad_norm": 25.997272491455078, "learning_rate": 2.1102333931777377e-05, "loss": 15.4253, "step": 9800 }, { "epoch": 1.7639138240574508, "grad_norm": 24.534364700317383, "learning_rate": 2.115619389587074e-05, "loss": 14.8151, "step": 9825 }, { "epoch": 1.7684021543985637, "grad_norm": 25.785430908203125, "learning_rate": 2.1210053859964095e-05, "loss": 15.1881, "step": 9850 }, { "epoch": 1.7728904847396767, "grad_norm": 24.27193832397461, "learning_rate": 2.126391382405745e-05, "loss": 15.0705, "step": 9875 }, { "epoch": 1.77737881508079, "grad_norm": 24.99488067626953, "learning_rate": 2.131777378815081e-05, "loss": 15.1269, "step": 9900 }, { "epoch": 1.781867145421903, "grad_norm": 25.080209732055664, "learning_rate": 2.1371633752244164e-05, "loss": 15.12, "step": 9925 }, { "epoch": 1.786355475763016, "grad_norm": 25.579904556274414, "learning_rate": 2.1425493716337523e-05, "loss": 14.9893, "step": 9950 }, { "epoch": 1.7908438061041294, "grad_norm": 25.11918067932129, "learning_rate": 2.1479353680430878e-05, "loss": 15.1663, "step": 9975 }, { "epoch": 1.7953321364452424, "grad_norm": 27.383655548095703, "learning_rate": 2.153321364452424e-05, "loss": 15.1548, "step": 10000 }, { "epoch": 1.7998204667863553, "grad_norm": 24.2135009765625, "learning_rate": 2.1587073608617596e-05, "loss": 15.0468, "step": 10025 }, { "epoch": 1.8043087971274687, "grad_norm": 26.53235626220703, "learning_rate": 2.164093357271095e-05, "loss": 14.9113, "step": 10050 }, { "epoch": 1.8087971274685817, "grad_norm": 25.139854431152344, "learning_rate": 2.169479353680431e-05, "loss": 15.2685, "step": 10075 }, { "epoch": 1.8132854578096946, "grad_norm": 26.078100204467773, "learning_rate": 2.1748653500897665e-05, "loss": 15.0769, "step": 10100 }, { "epoch": 1.817773788150808, "grad_norm": 32.14773941040039, "learning_rate": 2.1802513464991024e-05, "loss": 15.0157, "step": 10125 }, { "epoch": 1.822262118491921, "grad_norm": 25.352624893188477, "learning_rate": 2.185637342908438e-05, "loss": 15.2303, "step": 10150 }, { "epoch": 1.826750448833034, "grad_norm": 24.74574851989746, "learning_rate": 2.1910233393177738e-05, "loss": 14.5637, "step": 10175 }, { "epoch": 1.8312387791741473, "grad_norm": 26.362592697143555, "learning_rate": 2.1964093357271097e-05, "loss": 14.9059, "step": 10200 }, { "epoch": 1.8357271095152603, "grad_norm": 24.987171173095703, "learning_rate": 2.2017953321364452e-05, "loss": 15.069, "step": 10225 }, { "epoch": 1.8402154398563735, "grad_norm": 24.836288452148438, "learning_rate": 2.207181328545781e-05, "loss": 15.0462, "step": 10250 }, { "epoch": 1.8447037701974867, "grad_norm": 24.79768180847168, "learning_rate": 2.2125673249551166e-05, "loss": 14.8612, "step": 10275 }, { "epoch": 1.8491921005385996, "grad_norm": 25.61474609375, "learning_rate": 2.2179533213644525e-05, "loss": 14.7445, "step": 10300 }, { "epoch": 1.8536804308797128, "grad_norm": 25.009479522705078, "learning_rate": 2.223339317773788e-05, "loss": 14.9331, "step": 10325 }, { "epoch": 1.858168761220826, "grad_norm": 25.85749053955078, "learning_rate": 2.228725314183124e-05, "loss": 14.8491, "step": 10350 }, { "epoch": 1.862657091561939, "grad_norm": 24.728235244750977, "learning_rate": 2.2341113105924598e-05, "loss": 14.8084, "step": 10375 }, { "epoch": 1.867145421903052, "grad_norm": 23.449575424194336, "learning_rate": 2.2394973070017954e-05, "loss": 14.8635, "step": 10400 }, { "epoch": 1.8716337522441653, "grad_norm": 23.53273582458496, "learning_rate": 2.2448833034111312e-05, "loss": 15.0607, "step": 10425 }, { "epoch": 1.8761220825852782, "grad_norm": 26.236675262451172, "learning_rate": 2.2502692998204668e-05, "loss": 14.7338, "step": 10450 }, { "epoch": 1.8806104129263914, "grad_norm": 24.960784912109375, "learning_rate": 2.2556552962298026e-05, "loss": 14.831, "step": 10475 }, { "epoch": 1.8850987432675046, "grad_norm": 23.77855682373047, "learning_rate": 2.2610412926391382e-05, "loss": 14.9616, "step": 10500 }, { "epoch": 1.8895870736086176, "grad_norm": 25.975210189819336, "learning_rate": 2.266427289048474e-05, "loss": 14.5186, "step": 10525 }, { "epoch": 1.8940754039497307, "grad_norm": 26.122711181640625, "learning_rate": 2.27181328545781e-05, "loss": 14.3817, "step": 10550 }, { "epoch": 1.898563734290844, "grad_norm": 25.613475799560547, "learning_rate": 2.2771992818671455e-05, "loss": 14.5428, "step": 10575 }, { "epoch": 1.9030520646319569, "grad_norm": 24.9304141998291, "learning_rate": 2.2825852782764813e-05, "loss": 14.8628, "step": 10600 }, { "epoch": 1.90754039497307, "grad_norm": 25.525495529174805, "learning_rate": 2.287971274685817e-05, "loss": 14.7881, "step": 10625 }, { "epoch": 1.9120287253141832, "grad_norm": 24.550325393676758, "learning_rate": 2.2933572710951524e-05, "loss": 14.5237, "step": 10650 }, { "epoch": 1.9165170556552962, "grad_norm": 26.814821243286133, "learning_rate": 2.2987432675044883e-05, "loss": 14.9076, "step": 10675 }, { "epoch": 1.9210053859964094, "grad_norm": 25.589099884033203, "learning_rate": 2.3041292639138242e-05, "loss": 14.9983, "step": 10700 }, { "epoch": 1.9254937163375225, "grad_norm": 26.260356903076172, "learning_rate": 2.30951526032316e-05, "loss": 14.4078, "step": 10725 }, { "epoch": 1.9299820466786355, "grad_norm": 40.02426528930664, "learning_rate": 2.3149012567324956e-05, "loss": 14.6382, "step": 10750 }, { "epoch": 1.9344703770197487, "grad_norm": 24.463035583496094, "learning_rate": 2.3202872531418315e-05, "loss": 14.251, "step": 10775 }, { "epoch": 1.9389587073608618, "grad_norm": 26.021873474121094, "learning_rate": 2.325673249551167e-05, "loss": 14.783, "step": 10800 }, { "epoch": 1.9434470377019748, "grad_norm": 25.914993286132812, "learning_rate": 2.3310592459605025e-05, "loss": 14.6037, "step": 10825 }, { "epoch": 1.947935368043088, "grad_norm": 24.850980758666992, "learning_rate": 2.3364452423698384e-05, "loss": 14.692, "step": 10850 }, { "epoch": 1.9524236983842012, "grad_norm": 23.075193405151367, "learning_rate": 2.341831238779174e-05, "loss": 14.4149, "step": 10875 }, { "epoch": 1.9569120287253141, "grad_norm": 26.311481475830078, "learning_rate": 2.34721723518851e-05, "loss": 14.5615, "step": 10900 }, { "epoch": 1.9614003590664273, "grad_norm": 24.902671813964844, "learning_rate": 2.3526032315978457e-05, "loss": 14.267, "step": 10925 }, { "epoch": 1.9658886894075405, "grad_norm": 24.723201751708984, "learning_rate": 2.3579892280071816e-05, "loss": 14.5657, "step": 10950 }, { "epoch": 1.9703770197486534, "grad_norm": 26.1663818359375, "learning_rate": 2.363375224416517e-05, "loss": 14.6454, "step": 10975 }, { "epoch": 1.9748653500897666, "grad_norm": 24.78443145751953, "learning_rate": 2.3687612208258527e-05, "loss": 14.4444, "step": 11000 }, { "epoch": 1.9793536804308798, "grad_norm": 24.568164825439453, "learning_rate": 2.3741472172351885e-05, "loss": 14.6314, "step": 11025 }, { "epoch": 1.9838420107719927, "grad_norm": 26.20634651184082, "learning_rate": 2.379533213644524e-05, "loss": 14.6824, "step": 11050 }, { "epoch": 1.988330341113106, "grad_norm": 24.453754425048828, "learning_rate": 2.3849192100538603e-05, "loss": 14.7177, "step": 11075 }, { "epoch": 1.992818671454219, "grad_norm": 25.506359100341797, "learning_rate": 2.3903052064631958e-05, "loss": 14.5953, "step": 11100 }, { "epoch": 1.997307001795332, "grad_norm": 24.724069595336914, "learning_rate": 2.3956912028725314e-05, "loss": 14.5515, "step": 11125 }, { "epoch": 2.0, "eval_accuracy": 0.059114429190613486, "eval_f1_macro": 0.0023152878504535197, "eval_f1_micro": 0.059114429190613486, "eval_f1_weighted": 0.026248109088727177, "eval_loss": 7.744897365570068, "eval_precision_macro": 0.002216938480351738, "eval_precision_micro": 0.059114429190613486, "eval_precision_weighted": 0.02310532518025774, "eval_recall_macro": 0.005032399335473846, "eval_recall_micro": 0.059114429190613486, "eval_recall_weighted": 0.059114429190613486, "eval_runtime": 86.2961, "eval_samples_per_second": 606.899, "eval_steps_per_second": 18.97, "step": 11140 }, { "epoch": 2.0017953321364454, "grad_norm": 25.532371520996094, "learning_rate": 2.4010771992818672e-05, "loss": 14.3659, "step": 11150 }, { "epoch": 2.0062836624775584, "grad_norm": 25.71830177307129, "learning_rate": 2.4064631956912028e-05, "loss": 13.8419, "step": 11175 }, { "epoch": 2.0107719928186714, "grad_norm": 27.925411224365234, "learning_rate": 2.4118491921005386e-05, "loss": 13.9401, "step": 11200 }, { "epoch": 2.0152603231597848, "grad_norm": 26.441532135009766, "learning_rate": 2.4172351885098742e-05, "loss": 14.0487, "step": 11225 }, { "epoch": 2.0197486535008977, "grad_norm": 25.631881713867188, "learning_rate": 2.4226211849192104e-05, "loss": 13.4916, "step": 11250 }, { "epoch": 2.0242369838420107, "grad_norm": 25.339025497436523, "learning_rate": 2.428007181328546e-05, "loss": 13.5516, "step": 11275 }, { "epoch": 2.028725314183124, "grad_norm": 26.991966247558594, "learning_rate": 2.4333931777378815e-05, "loss": 13.4598, "step": 11300 }, { "epoch": 2.033213644524237, "grad_norm": 25.9316463470459, "learning_rate": 2.4387791741472174e-05, "loss": 13.7771, "step": 11325 }, { "epoch": 2.03770197486535, "grad_norm": 27.35523796081543, "learning_rate": 2.444165170556553e-05, "loss": 13.721, "step": 11350 }, { "epoch": 2.0421903052064634, "grad_norm": 27.451637268066406, "learning_rate": 2.4495511669658888e-05, "loss": 13.7572, "step": 11375 }, { "epoch": 2.0466786355475763, "grad_norm": 27.497739791870117, "learning_rate": 2.4549371633752243e-05, "loss": 13.7687, "step": 11400 }, { "epoch": 2.0511669658886893, "grad_norm": 26.42055892944336, "learning_rate": 2.4603231597845602e-05, "loss": 13.8483, "step": 11425 }, { "epoch": 2.0556552962298027, "grad_norm": 26.251361846923828, "learning_rate": 2.465709156193896e-05, "loss": 13.4564, "step": 11450 }, { "epoch": 2.0601436265709157, "grad_norm": 27.7249813079834, "learning_rate": 2.4710951526032316e-05, "loss": 13.6193, "step": 11475 }, { "epoch": 2.0646319569120286, "grad_norm": 29.7418155670166, "learning_rate": 2.4764811490125675e-05, "loss": 13.6154, "step": 11500 }, { "epoch": 2.069120287253142, "grad_norm": 28.159162521362305, "learning_rate": 2.481867145421903e-05, "loss": 13.505, "step": 11525 }, { "epoch": 2.073608617594255, "grad_norm": 27.0701904296875, "learning_rate": 2.487253141831239e-05, "loss": 13.6783, "step": 11550 }, { "epoch": 2.078096947935368, "grad_norm": 28.18494987487793, "learning_rate": 2.4924236983842012e-05, "loss": 13.6024, "step": 11575 }, { "epoch": 2.0825852782764813, "grad_norm": 25.40494155883789, "learning_rate": 2.4978096947935367e-05, "loss": 13.7101, "step": 11600 }, { "epoch": 2.0870736086175943, "grad_norm": 28.17936897277832, "learning_rate": 2.5031956912028726e-05, "loss": 13.8797, "step": 11625 }, { "epoch": 2.0915619389587072, "grad_norm": 28.881277084350586, "learning_rate": 2.5085816876122085e-05, "loss": 13.6777, "step": 11650 }, { "epoch": 2.0960502692998206, "grad_norm": 25.790342330932617, "learning_rate": 2.513967684021544e-05, "loss": 13.5791, "step": 11675 }, { "epoch": 2.1005385996409336, "grad_norm": 28.37506866455078, "learning_rate": 2.51935368043088e-05, "loss": 13.4982, "step": 11700 }, { "epoch": 2.1050269299820465, "grad_norm": 33.875404357910156, "learning_rate": 2.5247396768402154e-05, "loss": 13.5064, "step": 11725 }, { "epoch": 2.10951526032316, "grad_norm": 28.881078720092773, "learning_rate": 2.530125673249551e-05, "loss": 13.6496, "step": 11750 }, { "epoch": 2.114003590664273, "grad_norm": 26.983850479125977, "learning_rate": 2.535511669658887e-05, "loss": 13.3992, "step": 11775 }, { "epoch": 2.118491921005386, "grad_norm": 26.257688522338867, "learning_rate": 2.5408976660682227e-05, "loss": 13.7699, "step": 11800 }, { "epoch": 2.1229802513464993, "grad_norm": 28.320302963256836, "learning_rate": 2.5462836624775586e-05, "loss": 13.3924, "step": 11825 }, { "epoch": 2.127468581687612, "grad_norm": 28.05795669555664, "learning_rate": 2.551669658886894e-05, "loss": 13.5392, "step": 11850 }, { "epoch": 2.131956912028725, "grad_norm": 29.30341911315918, "learning_rate": 2.55705565529623e-05, "loss": 13.4195, "step": 11875 }, { "epoch": 2.1364452423698386, "grad_norm": 27.965492248535156, "learning_rate": 2.5624416517055655e-05, "loss": 13.6041, "step": 11900 }, { "epoch": 2.1409335727109515, "grad_norm": 29.342981338500977, "learning_rate": 2.567827648114901e-05, "loss": 13.4952, "step": 11925 }, { "epoch": 2.1454219030520645, "grad_norm": 29.504013061523438, "learning_rate": 2.573213644524237e-05, "loss": 13.3822, "step": 11950 }, { "epoch": 2.149910233393178, "grad_norm": 25.68410301208496, "learning_rate": 2.578599640933573e-05, "loss": 13.6285, "step": 11975 }, { "epoch": 2.154398563734291, "grad_norm": 27.036991119384766, "learning_rate": 2.5839856373429087e-05, "loss": 13.9489, "step": 12000 }, { "epoch": 2.158886894075404, "grad_norm": 28.70158576965332, "learning_rate": 2.5893716337522443e-05, "loss": 13.6128, "step": 12025 }, { "epoch": 2.163375224416517, "grad_norm": 27.817323684692383, "learning_rate": 2.59475763016158e-05, "loss": 13.8509, "step": 12050 }, { "epoch": 2.16786355475763, "grad_norm": 26.909086227416992, "learning_rate": 2.6001436265709157e-05, "loss": 13.4432, "step": 12075 }, { "epoch": 2.172351885098743, "grad_norm": 27.109466552734375, "learning_rate": 2.6055296229802512e-05, "loss": 13.3693, "step": 12100 }, { "epoch": 2.1768402154398565, "grad_norm": 29.08690643310547, "learning_rate": 2.610915619389587e-05, "loss": 13.7364, "step": 12125 }, { "epoch": 2.1813285457809695, "grad_norm": 28.68939971923828, "learning_rate": 2.616301615798923e-05, "loss": 13.7631, "step": 12150 }, { "epoch": 2.1858168761220824, "grad_norm": 28.95443344116211, "learning_rate": 2.621687612208259e-05, "loss": 14.0335, "step": 12175 }, { "epoch": 2.190305206463196, "grad_norm": 29.304248809814453, "learning_rate": 2.6270736086175944e-05, "loss": 13.6967, "step": 12200 }, { "epoch": 2.1947935368043088, "grad_norm": 27.95583152770996, "learning_rate": 2.63245960502693e-05, "loss": 13.5719, "step": 12225 }, { "epoch": 2.1992818671454217, "grad_norm": 27.92197608947754, "learning_rate": 2.6378456014362658e-05, "loss": 13.6523, "step": 12250 }, { "epoch": 2.203770197486535, "grad_norm": 29.322330474853516, "learning_rate": 2.6432315978456013e-05, "loss": 13.5422, "step": 12275 }, { "epoch": 2.208258527827648, "grad_norm": 29.324125289916992, "learning_rate": 2.6486175942549372e-05, "loss": 13.639, "step": 12300 }, { "epoch": 2.212746858168761, "grad_norm": 27.53671646118164, "learning_rate": 2.654003590664273e-05, "loss": 13.7083, "step": 12325 }, { "epoch": 2.2172351885098744, "grad_norm": 28.272226333618164, "learning_rate": 2.659389587073609e-05, "loss": 13.7521, "step": 12350 }, { "epoch": 2.2217235188509874, "grad_norm": 28.756206512451172, "learning_rate": 2.6647755834829445e-05, "loss": 13.3446, "step": 12375 }, { "epoch": 2.2262118491921004, "grad_norm": 27.521116256713867, "learning_rate": 2.67016157989228e-05, "loss": 13.3676, "step": 12400 }, { "epoch": 2.2307001795332138, "grad_norm": 28.232725143432617, "learning_rate": 2.675547576301616e-05, "loss": 13.7248, "step": 12425 }, { "epoch": 2.2351885098743267, "grad_norm": 27.95871353149414, "learning_rate": 2.6809335727109514e-05, "loss": 13.3986, "step": 12450 }, { "epoch": 2.2396768402154397, "grad_norm": 26.93558692932129, "learning_rate": 2.6863195691202873e-05, "loss": 13.5971, "step": 12475 }, { "epoch": 2.244165170556553, "grad_norm": 27.357070922851562, "learning_rate": 2.6914901256732496e-05, "loss": 13.8007, "step": 12500 }, { "epoch": 2.248653500897666, "grad_norm": 34.84161376953125, "learning_rate": 2.6968761220825855e-05, "loss": 13.3402, "step": 12525 }, { "epoch": 2.253141831238779, "grad_norm": 29.713102340698242, "learning_rate": 2.702262118491921e-05, "loss": 13.4515, "step": 12550 }, { "epoch": 2.2576301615798924, "grad_norm": 31.844457626342773, "learning_rate": 2.707648114901257e-05, "loss": 13.4538, "step": 12575 }, { "epoch": 2.2621184919210053, "grad_norm": 31.339860916137695, "learning_rate": 2.7130341113105924e-05, "loss": 13.7536, "step": 12600 }, { "epoch": 2.2666068222621183, "grad_norm": 27.18288803100586, "learning_rate": 2.7184201077199283e-05, "loss": 13.361, "step": 12625 }, { "epoch": 2.2710951526032317, "grad_norm": 25.645360946655273, "learning_rate": 2.723806104129264e-05, "loss": 13.7802, "step": 12650 }, { "epoch": 2.2755834829443446, "grad_norm": 28.508298873901367, "learning_rate": 2.7291921005385997e-05, "loss": 13.4987, "step": 12675 }, { "epoch": 2.280071813285458, "grad_norm": 26.898292541503906, "learning_rate": 2.7345780969479356e-05, "loss": 13.6794, "step": 12700 }, { "epoch": 2.284560143626571, "grad_norm": 40.84425354003906, "learning_rate": 2.739964093357271e-05, "loss": 13.3866, "step": 12725 }, { "epoch": 2.289048473967684, "grad_norm": 27.576169967651367, "learning_rate": 2.745350089766607e-05, "loss": 13.7029, "step": 12750 }, { "epoch": 2.293536804308797, "grad_norm": 27.815526962280273, "learning_rate": 2.7507360861759426e-05, "loss": 13.6855, "step": 12775 }, { "epoch": 2.2980251346499103, "grad_norm": 26.595399856567383, "learning_rate": 2.7561220825852784e-05, "loss": 13.3868, "step": 12800 }, { "epoch": 2.3025134649910233, "grad_norm": 27.15950584411621, "learning_rate": 2.761508078994614e-05, "loss": 13.3696, "step": 12825 }, { "epoch": 2.3070017953321367, "grad_norm": 28.6210994720459, "learning_rate": 2.7668940754039495e-05, "loss": 13.5515, "step": 12850 }, { "epoch": 2.3114901256732496, "grad_norm": 27.74658203125, "learning_rate": 2.7722800718132857e-05, "loss": 13.0361, "step": 12875 }, { "epoch": 2.3159784560143626, "grad_norm": 26.844989776611328, "learning_rate": 2.7776660682226213e-05, "loss": 13.5466, "step": 12900 }, { "epoch": 2.3204667863554755, "grad_norm": 27.64177703857422, "learning_rate": 2.783052064631957e-05, "loss": 13.8139, "step": 12925 }, { "epoch": 2.324955116696589, "grad_norm": 28.158784866333008, "learning_rate": 2.7884380610412927e-05, "loss": 13.7636, "step": 12950 }, { "epoch": 2.329443447037702, "grad_norm": 28.323238372802734, "learning_rate": 2.7938240574506286e-05, "loss": 13.3848, "step": 12975 }, { "epoch": 2.3339317773788153, "grad_norm": 28.48469352722168, "learning_rate": 2.799210053859964e-05, "loss": 13.6163, "step": 13000 }, { "epoch": 2.3384201077199283, "grad_norm": 26.27099609375, "learning_rate": 2.8045960502692996e-05, "loss": 13.3823, "step": 13025 }, { "epoch": 2.342908438061041, "grad_norm": 27.050186157226562, "learning_rate": 2.8099820466786355e-05, "loss": 13.6565, "step": 13050 }, { "epoch": 2.347396768402154, "grad_norm": 26.83416748046875, "learning_rate": 2.8153680430879714e-05, "loss": 13.5032, "step": 13075 }, { "epoch": 2.3518850987432676, "grad_norm": 25.751502990722656, "learning_rate": 2.8207540394973073e-05, "loss": 13.5242, "step": 13100 }, { "epoch": 2.3563734290843805, "grad_norm": 27.54896354675293, "learning_rate": 2.8261400359066428e-05, "loss": 13.6656, "step": 13125 }, { "epoch": 2.360861759425494, "grad_norm": 29.93552017211914, "learning_rate": 2.8315260323159787e-05, "loss": 13.7181, "step": 13150 }, { "epoch": 2.365350089766607, "grad_norm": 34.247100830078125, "learning_rate": 2.8369120287253142e-05, "loss": 13.3388, "step": 13175 }, { "epoch": 2.36983842010772, "grad_norm": 27.253677368164062, "learning_rate": 2.8422980251346498e-05, "loss": 12.8927, "step": 13200 }, { "epoch": 2.374326750448833, "grad_norm": 26.714345932006836, "learning_rate": 2.8476840215439856e-05, "loss": 13.1986, "step": 13225 }, { "epoch": 2.378815080789946, "grad_norm": 28.791046142578125, "learning_rate": 2.8530700179533215e-05, "loss": 13.3162, "step": 13250 }, { "epoch": 2.383303411131059, "grad_norm": 27.82441520690918, "learning_rate": 2.8584560143626574e-05, "loss": 13.6409, "step": 13275 }, { "epoch": 2.3877917414721725, "grad_norm": 27.760778427124023, "learning_rate": 2.863842010771993e-05, "loss": 13.4591, "step": 13300 }, { "epoch": 2.3922800718132855, "grad_norm": 35.298912048339844, "learning_rate": 2.8692280071813285e-05, "loss": 13.5868, "step": 13325 }, { "epoch": 2.3967684021543985, "grad_norm": 29.174081802368164, "learning_rate": 2.8746140035906643e-05, "loss": 12.9569, "step": 13350 }, { "epoch": 2.401256732495512, "grad_norm": 28.78097152709961, "learning_rate": 2.88e-05, "loss": 13.405, "step": 13375 }, { "epoch": 2.405745062836625, "grad_norm": 28.48590660095215, "learning_rate": 2.8853859964093357e-05, "loss": 13.8227, "step": 13400 }, { "epoch": 2.4102333931777378, "grad_norm": 27.466550827026367, "learning_rate": 2.8907719928186716e-05, "loss": 13.3373, "step": 13425 }, { "epoch": 2.414721723518851, "grad_norm": 26.298185348510742, "learning_rate": 2.8961579892280075e-05, "loss": 13.3942, "step": 13450 }, { "epoch": 2.419210053859964, "grad_norm": 27.673166275024414, "learning_rate": 2.901543985637343e-05, "loss": 13.3092, "step": 13475 }, { "epoch": 2.423698384201077, "grad_norm": 27.58799171447754, "learning_rate": 2.9069299820466786e-05, "loss": 13.2923, "step": 13500 }, { "epoch": 2.4281867145421905, "grad_norm": 28.616209030151367, "learning_rate": 2.9123159784560144e-05, "loss": 13.7892, "step": 13525 }, { "epoch": 2.4326750448833034, "grad_norm": 27.34395980834961, "learning_rate": 2.91770197486535e-05, "loss": 13.4703, "step": 13550 }, { "epoch": 2.4371633752244164, "grad_norm": 27.241291046142578, "learning_rate": 2.923087971274686e-05, "loss": 13.6906, "step": 13575 }, { "epoch": 2.44165170556553, "grad_norm": 31.22068214416504, "learning_rate": 2.9284739676840217e-05, "loss": 13.0538, "step": 13600 }, { "epoch": 2.4461400359066428, "grad_norm": 27.56983184814453, "learning_rate": 2.9338599640933573e-05, "loss": 13.4391, "step": 13625 }, { "epoch": 2.4506283662477557, "grad_norm": 27.46451187133789, "learning_rate": 2.939245960502693e-05, "loss": 13.4247, "step": 13650 }, { "epoch": 2.455116696588869, "grad_norm": 27.22041893005371, "learning_rate": 2.9446319569120287e-05, "loss": 13.2423, "step": 13675 }, { "epoch": 2.459605026929982, "grad_norm": 44.078704833984375, "learning_rate": 2.9500179533213646e-05, "loss": 12.9909, "step": 13700 }, { "epoch": 2.464093357271095, "grad_norm": 28.11593246459961, "learning_rate": 2.9554039497307e-05, "loss": 13.2222, "step": 13725 }, { "epoch": 2.4685816876122084, "grad_norm": 28.899824142456055, "learning_rate": 2.960789946140036e-05, "loss": 13.4138, "step": 13750 }, { "epoch": 2.4730700179533214, "grad_norm": 27.567039489746094, "learning_rate": 2.966175942549372e-05, "loss": 13.5078, "step": 13775 }, { "epoch": 2.4775583482944343, "grad_norm": 26.155046463012695, "learning_rate": 2.9715619389587074e-05, "loss": 13.2964, "step": 13800 }, { "epoch": 2.4820466786355477, "grad_norm": 26.821226119995117, "learning_rate": 2.9769479353680433e-05, "loss": 13.7765, "step": 13825 }, { "epoch": 2.4865350089766607, "grad_norm": 28.220781326293945, "learning_rate": 2.9823339317773788e-05, "loss": 13.6587, "step": 13850 }, { "epoch": 2.4910233393177736, "grad_norm": 29.53750228881836, "learning_rate": 2.9877199281867147e-05, "loss": 13.3947, "step": 13875 }, { "epoch": 2.495511669658887, "grad_norm": 26.887174606323242, "learning_rate": 2.9931059245960502e-05, "loss": 13.1447, "step": 13900 }, { "epoch": 2.5, "grad_norm": 27.31348419189453, "learning_rate": 2.998491921005386e-05, "loss": 13.2417, "step": 13925 }, { "epoch": 2.504488330341113, "grad_norm": 30.33110809326172, "learning_rate": 2.9995691202872533e-05, "loss": 13.3828, "step": 13950 }, { "epoch": 2.5089766606822264, "grad_norm": 27.644296646118164, "learning_rate": 2.9989706762417715e-05, "loss": 13.1784, "step": 13975 }, { "epoch": 2.5134649910233393, "grad_norm": 29.11074447631836, "learning_rate": 2.9983722321962897e-05, "loss": 13.2607, "step": 14000 }, { "epoch": 2.5179533213644523, "grad_norm": 27.747520446777344, "learning_rate": 2.997773788150808e-05, "loss": 13.5471, "step": 14025 }, { "epoch": 2.5224416517055657, "grad_norm": 26.138446807861328, "learning_rate": 2.9971753441053262e-05, "loss": 13.6022, "step": 14050 }, { "epoch": 2.5269299820466786, "grad_norm": 27.328989028930664, "learning_rate": 2.9965769000598445e-05, "loss": 13.1897, "step": 14075 }, { "epoch": 2.5314183123877916, "grad_norm": 26.998350143432617, "learning_rate": 2.9959784560143627e-05, "loss": 13.4755, "step": 14100 }, { "epoch": 2.535906642728905, "grad_norm": 27.311878204345703, "learning_rate": 2.995380011968881e-05, "loss": 12.9735, "step": 14125 }, { "epoch": 2.540394973070018, "grad_norm": 25.830198287963867, "learning_rate": 2.994781567923399e-05, "loss": 13.352, "step": 14150 }, { "epoch": 2.5448833034111313, "grad_norm": 26.87948989868164, "learning_rate": 2.9941831238779177e-05, "loss": 13.4053, "step": 14175 }, { "epoch": 2.5493716337522443, "grad_norm": 28.717430114746094, "learning_rate": 2.993584679832436e-05, "loss": 13.0502, "step": 14200 }, { "epoch": 2.5538599640933572, "grad_norm": 26.697654724121094, "learning_rate": 2.992986235786954e-05, "loss": 13.109, "step": 14225 }, { "epoch": 2.55834829443447, "grad_norm": 27.923320770263672, "learning_rate": 2.992387791741472e-05, "loss": 13.0734, "step": 14250 }, { "epoch": 2.5628366247755836, "grad_norm": 28.750234603881836, "learning_rate": 2.9917893476959903e-05, "loss": 13.2513, "step": 14275 }, { "epoch": 2.5673249551166966, "grad_norm": 28.91152000427246, "learning_rate": 2.9911909036505086e-05, "loss": 13.0385, "step": 14300 }, { "epoch": 2.57181328545781, "grad_norm": 26.92072105407715, "learning_rate": 2.990592459605027e-05, "loss": 13.289, "step": 14325 }, { "epoch": 2.576301615798923, "grad_norm": 26.42039680480957, "learning_rate": 2.9899940155595454e-05, "loss": 13.3365, "step": 14350 }, { "epoch": 2.580789946140036, "grad_norm": 26.49629783630371, "learning_rate": 2.9893955715140636e-05, "loss": 13.0466, "step": 14375 }, { "epoch": 2.585278276481149, "grad_norm": 26.710182189941406, "learning_rate": 2.988797127468582e-05, "loss": 13.196, "step": 14400 }, { "epoch": 2.5897666068222622, "grad_norm": 28.95528793334961, "learning_rate": 2.9881986834230998e-05, "loss": 13.2839, "step": 14425 }, { "epoch": 2.594254937163375, "grad_norm": 27.436601638793945, "learning_rate": 2.9876002393776183e-05, "loss": 13.195, "step": 14450 }, { "epoch": 2.5987432675044886, "grad_norm": 27.884984970092773, "learning_rate": 2.9870017953321366e-05, "loss": 13.3211, "step": 14475 }, { "epoch": 2.6032315978456015, "grad_norm": 28.28389549255371, "learning_rate": 2.9864033512866548e-05, "loss": 13.0379, "step": 14500 }, { "epoch": 2.6077199281867145, "grad_norm": 33.531089782714844, "learning_rate": 2.985804907241173e-05, "loss": 13.297, "step": 14525 }, { "epoch": 2.6122082585278275, "grad_norm": 27.326101303100586, "learning_rate": 2.9852064631956913e-05, "loss": 13.3307, "step": 14550 }, { "epoch": 2.616696588868941, "grad_norm": 26.402788162231445, "learning_rate": 2.9846080191502095e-05, "loss": 13.2087, "step": 14575 }, { "epoch": 2.621184919210054, "grad_norm": 28.52970314025879, "learning_rate": 2.984009575104728e-05, "loss": 13.0077, "step": 14600 }, { "epoch": 2.625673249551167, "grad_norm": 26.127384185791016, "learning_rate": 2.983411131059246e-05, "loss": 12.8619, "step": 14625 }, { "epoch": 2.63016157989228, "grad_norm": 26.900188446044922, "learning_rate": 2.9828126870137642e-05, "loss": 13.0219, "step": 14650 }, { "epoch": 2.634649910233393, "grad_norm": 28.075593948364258, "learning_rate": 2.9822142429682825e-05, "loss": 13.0576, "step": 14675 }, { "epoch": 2.639138240574506, "grad_norm": 27.4871883392334, "learning_rate": 2.9816157989228007e-05, "loss": 13.3393, "step": 14700 }, { "epoch": 2.6436265709156195, "grad_norm": 26.82506561279297, "learning_rate": 2.981017354877319e-05, "loss": 13.2037, "step": 14725 }, { "epoch": 2.6481149012567324, "grad_norm": 27.90208625793457, "learning_rate": 2.9804189108318375e-05, "loss": 13.2741, "step": 14750 }, { "epoch": 2.652603231597846, "grad_norm": 27.409181594848633, "learning_rate": 2.9798204667863557e-05, "loss": 13.0042, "step": 14775 }, { "epoch": 2.657091561938959, "grad_norm": 26.863079071044922, "learning_rate": 2.979222022740874e-05, "loss": 12.9851, "step": 14800 }, { "epoch": 2.6615798922800717, "grad_norm": 27.66518211364746, "learning_rate": 2.978623578695392e-05, "loss": 13.1473, "step": 14825 }, { "epoch": 2.6660682226211847, "grad_norm": 31.207706451416016, "learning_rate": 2.97802513464991e-05, "loss": 13.6109, "step": 14850 }, { "epoch": 2.670556552962298, "grad_norm": 26.27522087097168, "learning_rate": 2.9774266906044287e-05, "loss": 12.7777, "step": 14875 }, { "epoch": 2.675044883303411, "grad_norm": 28.05002784729004, "learning_rate": 2.976828246558947e-05, "loss": 13.4278, "step": 14900 }, { "epoch": 2.6795332136445245, "grad_norm": 27.554943084716797, "learning_rate": 2.976229802513465e-05, "loss": 13.4759, "step": 14925 }, { "epoch": 2.6840215439856374, "grad_norm": 28.544275283813477, "learning_rate": 2.9756313584679834e-05, "loss": 13.3619, "step": 14950 }, { "epoch": 2.6885098743267504, "grad_norm": 26.328645706176758, "learning_rate": 2.9750329144225016e-05, "loss": 13.3435, "step": 14975 }, { "epoch": 2.6929982046678633, "grad_norm": 28.869354248046875, "learning_rate": 2.97443447037702e-05, "loss": 12.8979, "step": 15000 }, { "epoch": 2.6974865350089767, "grad_norm": 26.356056213378906, "learning_rate": 2.973836026331538e-05, "loss": 13.2308, "step": 15025 }, { "epoch": 2.7019748653500897, "grad_norm": 26.76312828063965, "learning_rate": 2.9732375822860563e-05, "loss": 13.4595, "step": 15050 }, { "epoch": 2.706463195691203, "grad_norm": 26.2951717376709, "learning_rate": 2.9726391382405746e-05, "loss": 12.9706, "step": 15075 }, { "epoch": 2.710951526032316, "grad_norm": 26.80390739440918, "learning_rate": 2.9720406941950928e-05, "loss": 13.0039, "step": 15100 }, { "epoch": 2.715439856373429, "grad_norm": 27.611963272094727, "learning_rate": 2.971442250149611e-05, "loss": 12.9145, "step": 15125 }, { "epoch": 2.719928186714542, "grad_norm": 28.494123458862305, "learning_rate": 2.9708438061041293e-05, "loss": 13.2181, "step": 15150 }, { "epoch": 2.7244165170556554, "grad_norm": 26.697126388549805, "learning_rate": 2.970245362058648e-05, "loss": 13.3677, "step": 15175 }, { "epoch": 2.7289048473967683, "grad_norm": 27.672060012817383, "learning_rate": 2.9696469180131657e-05, "loss": 12.834, "step": 15200 }, { "epoch": 2.7333931777378817, "grad_norm": 28.86951446533203, "learning_rate": 2.969048473967684e-05, "loss": 13.2779, "step": 15225 }, { "epoch": 2.7378815080789947, "grad_norm": 26.693321228027344, "learning_rate": 2.9684500299222022e-05, "loss": 13.1227, "step": 15250 }, { "epoch": 2.7423698384201076, "grad_norm": 27.52298927307129, "learning_rate": 2.9678515858767205e-05, "loss": 12.6953, "step": 15275 }, { "epoch": 2.7468581687612206, "grad_norm": 26.291044235229492, "learning_rate": 2.9672531418312387e-05, "loss": 13.1016, "step": 15300 }, { "epoch": 2.751346499102334, "grad_norm": 27.562095642089844, "learning_rate": 2.9666546977857573e-05, "loss": 13.1168, "step": 15325 }, { "epoch": 2.755834829443447, "grad_norm": 26.268095016479492, "learning_rate": 2.9660562537402755e-05, "loss": 13.0068, "step": 15350 }, { "epoch": 2.7603231597845603, "grad_norm": 27.220062255859375, "learning_rate": 2.9654578096947937e-05, "loss": 12.8645, "step": 15375 }, { "epoch": 2.7648114901256733, "grad_norm": 27.46253776550293, "learning_rate": 2.9648593656493116e-05, "loss": 13.3971, "step": 15400 }, { "epoch": 2.7692998204667862, "grad_norm": 26.30550193786621, "learning_rate": 2.96426092160383e-05, "loss": 12.8522, "step": 15425 }, { "epoch": 2.773788150807899, "grad_norm": 27.612834930419922, "learning_rate": 2.9636624775583484e-05, "loss": 13.5025, "step": 15450 }, { "epoch": 2.7782764811490126, "grad_norm": 26.21208953857422, "learning_rate": 2.9630640335128667e-05, "loss": 12.552, "step": 15475 }, { "epoch": 2.7827648114901256, "grad_norm": 27.3443546295166, "learning_rate": 2.962465589467385e-05, "loss": 13.1858, "step": 15500 }, { "epoch": 2.787253141831239, "grad_norm": 27.457931518554688, "learning_rate": 2.961867145421903e-05, "loss": 13.2157, "step": 15525 }, { "epoch": 2.791741472172352, "grad_norm": 28.71920394897461, "learning_rate": 2.9612687013764214e-05, "loss": 13.213, "step": 15550 }, { "epoch": 2.796229802513465, "grad_norm": 25.985244750976562, "learning_rate": 2.9606702573309396e-05, "loss": 12.9935, "step": 15575 }, { "epoch": 2.800718132854578, "grad_norm": 25.949575424194336, "learning_rate": 2.960071813285458e-05, "loss": 12.9924, "step": 15600 }, { "epoch": 2.8052064631956912, "grad_norm": 26.65997314453125, "learning_rate": 2.959473369239976e-05, "loss": 13.1186, "step": 15625 }, { "epoch": 2.809694793536804, "grad_norm": 26.854761123657227, "learning_rate": 2.9588749251944943e-05, "loss": 13.0178, "step": 15650 }, { "epoch": 2.8141831238779176, "grad_norm": 26.749004364013672, "learning_rate": 2.9582764811490126e-05, "loss": 13.0751, "step": 15675 }, { "epoch": 2.8186714542190305, "grad_norm": 26.282302856445312, "learning_rate": 2.9576780371035308e-05, "loss": 12.8906, "step": 15700 }, { "epoch": 2.8231597845601435, "grad_norm": 26.395767211914062, "learning_rate": 2.957079593058049e-05, "loss": 13.3278, "step": 15725 }, { "epoch": 2.827648114901257, "grad_norm": 26.95943832397461, "learning_rate": 2.9564811490125676e-05, "loss": 12.9831, "step": 15750 }, { "epoch": 2.83213644524237, "grad_norm": 28.028095245361328, "learning_rate": 2.955882704967086e-05, "loss": 12.932, "step": 15775 }, { "epoch": 2.836624775583483, "grad_norm": 27.706012725830078, "learning_rate": 2.9552842609216038e-05, "loss": 12.7754, "step": 15800 }, { "epoch": 2.841113105924596, "grad_norm": 27.81289291381836, "learning_rate": 2.954685816876122e-05, "loss": 12.8489, "step": 15825 }, { "epoch": 2.845601436265709, "grad_norm": 26.71699333190918, "learning_rate": 2.9540873728306402e-05, "loss": 12.6091, "step": 15850 }, { "epoch": 2.850089766606822, "grad_norm": 65.94219207763672, "learning_rate": 2.9534889287851588e-05, "loss": 13.0053, "step": 15875 }, { "epoch": 2.8545780969479355, "grad_norm": 32.33103561401367, "learning_rate": 2.952890484739677e-05, "loss": 12.4834, "step": 15900 }, { "epoch": 2.8590664272890485, "grad_norm": 25.59375, "learning_rate": 2.9522920406941953e-05, "loss": 13.0441, "step": 15925 }, { "epoch": 2.8635547576301614, "grad_norm": 26.32273292541504, "learning_rate": 2.9516935966487135e-05, "loss": 12.701, "step": 15950 }, { "epoch": 2.868043087971275, "grad_norm": 27.84789276123047, "learning_rate": 2.9510951526032317e-05, "loss": 13.1712, "step": 15975 }, { "epoch": 2.872531418312388, "grad_norm": 27.111125946044922, "learning_rate": 2.9504967085577496e-05, "loss": 12.7789, "step": 16000 }, { "epoch": 2.8770197486535007, "grad_norm": 26.045406341552734, "learning_rate": 2.9498982645122682e-05, "loss": 12.7077, "step": 16025 }, { "epoch": 2.881508078994614, "grad_norm": 26.169029235839844, "learning_rate": 2.9492998204667865e-05, "loss": 13.0239, "step": 16050 }, { "epoch": 2.885996409335727, "grad_norm": 26.920217514038086, "learning_rate": 2.9487013764213047e-05, "loss": 12.978, "step": 16075 }, { "epoch": 2.89048473967684, "grad_norm": 26.622011184692383, "learning_rate": 2.948102932375823e-05, "loss": 12.825, "step": 16100 }, { "epoch": 2.8949730700179535, "grad_norm": 26.462886810302734, "learning_rate": 2.947504488330341e-05, "loss": 12.8208, "step": 16125 }, { "epoch": 2.8994614003590664, "grad_norm": 26.220985412597656, "learning_rate": 2.9469060442848594e-05, "loss": 13.3692, "step": 16150 }, { "epoch": 2.9039497307001794, "grad_norm": 27.57528305053711, "learning_rate": 2.946307600239378e-05, "loss": 12.7889, "step": 16175 }, { "epoch": 2.9084380610412928, "grad_norm": 27.193159103393555, "learning_rate": 2.945709156193896e-05, "loss": 12.9411, "step": 16200 }, { "epoch": 2.9129263913824057, "grad_norm": 28.573688507080078, "learning_rate": 2.945110712148414e-05, "loss": 13.0207, "step": 16225 }, { "epoch": 2.917414721723519, "grad_norm": 27.21942710876465, "learning_rate": 2.9445122681029323e-05, "loss": 12.8121, "step": 16250 }, { "epoch": 2.921903052064632, "grad_norm": 25.42641258239746, "learning_rate": 2.9439138240574506e-05, "loss": 13.0534, "step": 16275 }, { "epoch": 2.926391382405745, "grad_norm": 26.955564498901367, "learning_rate": 2.943315380011969e-05, "loss": 12.6557, "step": 16300 }, { "epoch": 2.930879712746858, "grad_norm": 26.791296005249023, "learning_rate": 2.9427169359664874e-05, "loss": 12.6982, "step": 16325 }, { "epoch": 2.9353680430879714, "grad_norm": 27.43919563293457, "learning_rate": 2.9421184919210056e-05, "loss": 13.1458, "step": 16350 }, { "epoch": 2.9398563734290843, "grad_norm": 26.005870819091797, "learning_rate": 2.941520047875524e-05, "loss": 13.3099, "step": 16375 }, { "epoch": 2.9443447037701977, "grad_norm": 26.166765213012695, "learning_rate": 2.9409216038300418e-05, "loss": 12.7118, "step": 16400 }, { "epoch": 2.9488330341113107, "grad_norm": 26.198945999145508, "learning_rate": 2.94032315978456e-05, "loss": 13.0126, "step": 16425 }, { "epoch": 2.9533213644524237, "grad_norm": 27.599916458129883, "learning_rate": 2.9397247157390786e-05, "loss": 12.4839, "step": 16450 }, { "epoch": 2.9578096947935366, "grad_norm": 26.379606246948242, "learning_rate": 2.9391262716935968e-05, "loss": 13.1094, "step": 16475 }, { "epoch": 2.96229802513465, "grad_norm": 26.30647850036621, "learning_rate": 2.938527827648115e-05, "loss": 13.0026, "step": 16500 }, { "epoch": 2.966786355475763, "grad_norm": 27.161256790161133, "learning_rate": 2.9379293836026333e-05, "loss": 12.795, "step": 16525 }, { "epoch": 2.9712746858168764, "grad_norm": 27.510034561157227, "learning_rate": 2.9373309395571515e-05, "loss": 12.4387, "step": 16550 }, { "epoch": 2.9757630161579893, "grad_norm": 28.14108657836914, "learning_rate": 2.9367324955116697e-05, "loss": 13.1286, "step": 16575 }, { "epoch": 2.9802513464991023, "grad_norm": 28.018766403198242, "learning_rate": 2.936134051466188e-05, "loss": 12.9119, "step": 16600 }, { "epoch": 2.9847396768402152, "grad_norm": 27.52519416809082, "learning_rate": 2.9355356074207062e-05, "loss": 13.2577, "step": 16625 }, { "epoch": 2.9892280071813286, "grad_norm": 26.498538970947266, "learning_rate": 2.9349371633752245e-05, "loss": 12.8444, "step": 16650 }, { "epoch": 2.9937163375224416, "grad_norm": 27.386394500732422, "learning_rate": 2.9343387193297427e-05, "loss": 12.9318, "step": 16675 }, { "epoch": 2.998204667863555, "grad_norm": 29.109481811523438, "learning_rate": 2.933740275284261e-05, "loss": 13.3042, "step": 16700 }, { "epoch": 3.0, "eval_accuracy": 0.07543963492639337, "eval_f1_macro": 0.005658449303781104, "eval_f1_micro": 0.07543963492639337, "eval_f1_weighted": 0.04143719976295692, "eval_loss": 7.06182861328125, "eval_precision_macro": 0.005690112768572151, "eval_precision_micro": 0.07543963492639337, "eval_precision_weighted": 0.0367941063687332, "eval_recall_macro": 0.009608965585832425, "eval_recall_micro": 0.07543963492639337, "eval_recall_weighted": 0.07543963492639337, "eval_runtime": 86.5339, "eval_samples_per_second": 605.231, "eval_steps_per_second": 18.917, "step": 16710 }, { "epoch": 3.002692998204668, "grad_norm": 26.483478546142578, "learning_rate": 2.933141831238779e-05, "loss": 11.5651, "step": 16725 }, { "epoch": 3.007181328545781, "grad_norm": 28.335594177246094, "learning_rate": 2.9325433871932977e-05, "loss": 11.4552, "step": 16750 }, { "epoch": 3.011669658886894, "grad_norm": 26.723102569580078, "learning_rate": 2.931944943147816e-05, "loss": 11.1541, "step": 16775 }, { "epoch": 3.0161579892280073, "grad_norm": 28.930675506591797, "learning_rate": 2.931346499102334e-05, "loss": 11.2597, "step": 16800 }, { "epoch": 3.02064631956912, "grad_norm": 30.39067268371582, "learning_rate": 2.930748055056852e-05, "loss": 11.1364, "step": 16825 }, { "epoch": 3.025134649910233, "grad_norm": 29.515583038330078, "learning_rate": 2.9301496110113703e-05, "loss": 11.1398, "step": 16850 }, { "epoch": 3.0296229802513466, "grad_norm": 29.533111572265625, "learning_rate": 2.929551166965889e-05, "loss": 11.3146, "step": 16875 }, { "epoch": 3.0341113105924595, "grad_norm": 28.315011978149414, "learning_rate": 2.928952722920407e-05, "loss": 10.9237, "step": 16900 }, { "epoch": 3.0385996409335725, "grad_norm": 27.643081665039062, "learning_rate": 2.9283542788749254e-05, "loss": 11.0213, "step": 16925 }, { "epoch": 3.043087971274686, "grad_norm": 30.351112365722656, "learning_rate": 2.9277558348294436e-05, "loss": 11.2419, "step": 16950 }, { "epoch": 3.047576301615799, "grad_norm": 31.334726333618164, "learning_rate": 2.927157390783962e-05, "loss": 11.1816, "step": 16975 }, { "epoch": 3.0520646319569122, "grad_norm": 28.574382781982422, "learning_rate": 2.9265589467384798e-05, "loss": 10.8629, "step": 17000 }, { "epoch": 3.056552962298025, "grad_norm": 30.646869659423828, "learning_rate": 2.9259605026929983e-05, "loss": 11.0935, "step": 17025 }, { "epoch": 3.061041292639138, "grad_norm": 38.04651641845703, "learning_rate": 2.9253620586475166e-05, "loss": 11.3436, "step": 17050 }, { "epoch": 3.0655296229802516, "grad_norm": 29.01982307434082, "learning_rate": 2.9247636146020348e-05, "loss": 11.0168, "step": 17075 }, { "epoch": 3.0700179533213645, "grad_norm": 31.702123641967773, "learning_rate": 2.924165170556553e-05, "loss": 11.1176, "step": 17100 }, { "epoch": 3.0745062836624775, "grad_norm": 31.976844787597656, "learning_rate": 2.9235667265110713e-05, "loss": 11.5311, "step": 17125 }, { "epoch": 3.078994614003591, "grad_norm": 29.563053131103516, "learning_rate": 2.9229682824655895e-05, "loss": 10.9299, "step": 17150 }, { "epoch": 3.083482944344704, "grad_norm": 31.436248779296875, "learning_rate": 2.9223698384201077e-05, "loss": 11.1627, "step": 17175 }, { "epoch": 3.087971274685817, "grad_norm": 30.475858688354492, "learning_rate": 2.921771394374626e-05, "loss": 11.3677, "step": 17200 }, { "epoch": 3.09245960502693, "grad_norm": 29.236719131469727, "learning_rate": 2.9211729503291442e-05, "loss": 11.2419, "step": 17225 }, { "epoch": 3.096947935368043, "grad_norm": 32.13743209838867, "learning_rate": 2.9205745062836625e-05, "loss": 11.1695, "step": 17250 }, { "epoch": 3.101436265709156, "grad_norm": 31.184057235717773, "learning_rate": 2.9199760622381807e-05, "loss": 10.8847, "step": 17275 }, { "epoch": 3.1059245960502695, "grad_norm": 35.40129852294922, "learning_rate": 2.9193776181926993e-05, "loss": 11.4488, "step": 17300 }, { "epoch": 3.1104129263913824, "grad_norm": 31.04747772216797, "learning_rate": 2.9187791741472175e-05, "loss": 11.3415, "step": 17325 }, { "epoch": 3.1149012567324954, "grad_norm": 30.742427825927734, "learning_rate": 2.9181807301017357e-05, "loss": 11.0896, "step": 17350 }, { "epoch": 3.119389587073609, "grad_norm": 29.326475143432617, "learning_rate": 2.9175822860562536e-05, "loss": 11.2907, "step": 17375 }, { "epoch": 3.1238779174147218, "grad_norm": 33.5991325378418, "learning_rate": 2.916983842010772e-05, "loss": 11.0482, "step": 17400 }, { "epoch": 3.1283662477558347, "grad_norm": 32.41011428833008, "learning_rate": 2.91638539796529e-05, "loss": 10.9863, "step": 17425 }, { "epoch": 3.132854578096948, "grad_norm": 29.50647735595703, "learning_rate": 2.9157869539198087e-05, "loss": 11.4484, "step": 17450 }, { "epoch": 3.137342908438061, "grad_norm": 30.07097625732422, "learning_rate": 2.915188509874327e-05, "loss": 10.8256, "step": 17475 }, { "epoch": 3.141831238779174, "grad_norm": 28.87929344177246, "learning_rate": 2.914590065828845e-05, "loss": 10.7541, "step": 17500 }, { "epoch": 3.1463195691202874, "grad_norm": 34.39598083496094, "learning_rate": 2.9139916217833634e-05, "loss": 11.3925, "step": 17525 }, { "epoch": 3.1508078994614004, "grad_norm": 30.967477798461914, "learning_rate": 2.9133931777378816e-05, "loss": 11.0977, "step": 17550 }, { "epoch": 3.1552962298025133, "grad_norm": 32.6968879699707, "learning_rate": 2.9127947336923995e-05, "loss": 10.9783, "step": 17575 }, { "epoch": 3.1597845601436267, "grad_norm": 30.194917678833008, "learning_rate": 2.9122202274087372e-05, "loss": 11.0674, "step": 17600 }, { "epoch": 3.1642728904847397, "grad_norm": 29.895421981811523, "learning_rate": 2.9116217833632558e-05, "loss": 11.3506, "step": 17625 }, { "epoch": 3.1687612208258527, "grad_norm": 30.785797119140625, "learning_rate": 2.911023339317774e-05, "loss": 11.044, "step": 17650 }, { "epoch": 3.173249551166966, "grad_norm": 31.407691955566406, "learning_rate": 2.9104248952722923e-05, "loss": 10.9295, "step": 17675 }, { "epoch": 3.177737881508079, "grad_norm": 29.658754348754883, "learning_rate": 2.9098264512268102e-05, "loss": 11.2406, "step": 17700 }, { "epoch": 3.182226211849192, "grad_norm": 30.043371200561523, "learning_rate": 2.9092280071813284e-05, "loss": 10.9478, "step": 17725 }, { "epoch": 3.1867145421903054, "grad_norm": 31.360021591186523, "learning_rate": 2.908629563135847e-05, "loss": 10.8657, "step": 17750 }, { "epoch": 3.1912028725314183, "grad_norm": 31.64422035217285, "learning_rate": 2.9080311190903652e-05, "loss": 10.8207, "step": 17775 }, { "epoch": 3.1956912028725313, "grad_norm": 30.953533172607422, "learning_rate": 2.9074326750448835e-05, "loss": 11.0474, "step": 17800 }, { "epoch": 3.2001795332136447, "grad_norm": 29.29545783996582, "learning_rate": 2.9068342309994017e-05, "loss": 11.4172, "step": 17825 }, { "epoch": 3.2046678635547576, "grad_norm": 28.73203468322754, "learning_rate": 2.90623578695392e-05, "loss": 11.0947, "step": 17850 }, { "epoch": 3.2091561938958706, "grad_norm": 29.092605590820312, "learning_rate": 2.905637342908438e-05, "loss": 10.7874, "step": 17875 }, { "epoch": 3.213644524236984, "grad_norm": 30.759441375732422, "learning_rate": 2.9050388988629564e-05, "loss": 11.1644, "step": 17900 }, { "epoch": 3.218132854578097, "grad_norm": 31.628297805786133, "learning_rate": 2.9044404548174746e-05, "loss": 11.6035, "step": 17925 }, { "epoch": 3.22262118491921, "grad_norm": 32.346553802490234, "learning_rate": 2.903842010771993e-05, "loss": 10.9969, "step": 17950 }, { "epoch": 3.2271095152603233, "grad_norm": 29.345993041992188, "learning_rate": 2.903243566726511e-05, "loss": 11.2449, "step": 17975 }, { "epoch": 3.2315978456014363, "grad_norm": 36.96156311035156, "learning_rate": 2.9026451226810293e-05, "loss": 11.157, "step": 18000 }, { "epoch": 3.236086175942549, "grad_norm": 31.43854522705078, "learning_rate": 2.9020466786355476e-05, "loss": 11.2116, "step": 18025 }, { "epoch": 3.2405745062836626, "grad_norm": 31.491018295288086, "learning_rate": 2.901448234590066e-05, "loss": 10.9847, "step": 18050 }, { "epoch": 3.2450628366247756, "grad_norm": 31.342721939086914, "learning_rate": 2.900849790544584e-05, "loss": 11.0106, "step": 18075 }, { "epoch": 3.2495511669658885, "grad_norm": 31.182981491088867, "learning_rate": 2.9002513464991023e-05, "loss": 11.2681, "step": 18100 }, { "epoch": 3.254039497307002, "grad_norm": 31.756725311279297, "learning_rate": 2.8996529024536205e-05, "loss": 11.1072, "step": 18125 }, { "epoch": 3.258527827648115, "grad_norm": 28.509653091430664, "learning_rate": 2.8990544584081388e-05, "loss": 11.184, "step": 18150 }, { "epoch": 3.263016157989228, "grad_norm": 31.49736785888672, "learning_rate": 2.8984560143626573e-05, "loss": 11.2929, "step": 18175 }, { "epoch": 3.2675044883303412, "grad_norm": 29.734132766723633, "learning_rate": 2.8978575703171756e-05, "loss": 11.1909, "step": 18200 }, { "epoch": 3.271992818671454, "grad_norm": 31.356077194213867, "learning_rate": 2.8972591262716938e-05, "loss": 11.468, "step": 18225 }, { "epoch": 3.276481149012567, "grad_norm": 28.490388870239258, "learning_rate": 2.896660682226212e-05, "loss": 11.3047, "step": 18250 }, { "epoch": 3.2809694793536806, "grad_norm": 30.780508041381836, "learning_rate": 2.89606223818073e-05, "loss": 11.3076, "step": 18275 }, { "epoch": 3.2854578096947935, "grad_norm": 29.654769897460938, "learning_rate": 2.8954637941352482e-05, "loss": 11.0864, "step": 18300 }, { "epoch": 3.2899461400359065, "grad_norm": 31.67804718017578, "learning_rate": 2.8948653500897668e-05, "loss": 11.2724, "step": 18325 }, { "epoch": 3.29443447037702, "grad_norm": 29.71087646484375, "learning_rate": 2.894266906044285e-05, "loss": 11.3839, "step": 18350 }, { "epoch": 3.298922800718133, "grad_norm": 30.625585556030273, "learning_rate": 2.8936684619988032e-05, "loss": 11.1948, "step": 18375 }, { "epoch": 3.3034111310592458, "grad_norm": 33.840885162353516, "learning_rate": 2.8930700179533215e-05, "loss": 11.5731, "step": 18400 }, { "epoch": 3.307899461400359, "grad_norm": 31.30687713623047, "learning_rate": 2.8924715739078397e-05, "loss": 11.3931, "step": 18425 }, { "epoch": 3.312387791741472, "grad_norm": 30.306846618652344, "learning_rate": 2.891873129862358e-05, "loss": 11.3634, "step": 18450 }, { "epoch": 3.316876122082585, "grad_norm": 31.10429573059082, "learning_rate": 2.891274685816876e-05, "loss": 11.383, "step": 18475 }, { "epoch": 3.3213644524236985, "grad_norm": 31.466232299804688, "learning_rate": 2.8906762417713944e-05, "loss": 11.3213, "step": 18500 }, { "epoch": 3.3258527827648114, "grad_norm": 31.928709030151367, "learning_rate": 2.8900777977259126e-05, "loss": 11.4474, "step": 18525 }, { "epoch": 3.3303411131059244, "grad_norm": 31.45096778869629, "learning_rate": 2.889479353680431e-05, "loss": 11.3677, "step": 18550 }, { "epoch": 3.334829443447038, "grad_norm": 31.321134567260742, "learning_rate": 2.888880909634949e-05, "loss": 11.0314, "step": 18575 }, { "epoch": 3.3393177737881508, "grad_norm": 30.80310821533203, "learning_rate": 2.8882824655894673e-05, "loss": 11.3432, "step": 18600 }, { "epoch": 3.343806104129264, "grad_norm": 33.093849182128906, "learning_rate": 2.887684021543986e-05, "loss": 11.4452, "step": 18625 }, { "epoch": 3.348294434470377, "grad_norm": 30.316701889038086, "learning_rate": 2.887085577498504e-05, "loss": 11.1295, "step": 18650 }, { "epoch": 3.35278276481149, "grad_norm": 30.940135955810547, "learning_rate": 2.886487133453022e-05, "loss": 11.2617, "step": 18675 }, { "epoch": 3.357271095152603, "grad_norm": 28.96495246887207, "learning_rate": 2.8858886894075403e-05, "loss": 11.4463, "step": 18700 }, { "epoch": 3.3617594254937164, "grad_norm": 30.461139678955078, "learning_rate": 2.8852902453620585e-05, "loss": 11.3384, "step": 18725 }, { "epoch": 3.3662477558348294, "grad_norm": 30.79012680053711, "learning_rate": 2.884691801316577e-05, "loss": 11.3678, "step": 18750 }, { "epoch": 3.370736086175943, "grad_norm": 31.620222091674805, "learning_rate": 2.8840933572710953e-05, "loss": 11.0789, "step": 18775 }, { "epoch": 3.3752244165170557, "grad_norm": 29.551908493041992, "learning_rate": 2.8834949132256136e-05, "loss": 11.2116, "step": 18800 }, { "epoch": 3.3797127468581687, "grad_norm": 31.130882263183594, "learning_rate": 2.8828964691801318e-05, "loss": 11.4108, "step": 18825 }, { "epoch": 3.3842010771992816, "grad_norm": 30.52980613708496, "learning_rate": 2.88229802513465e-05, "loss": 11.2848, "step": 18850 }, { "epoch": 3.388689407540395, "grad_norm": 31.423954010009766, "learning_rate": 2.881699581089168e-05, "loss": 11.1228, "step": 18875 }, { "epoch": 3.393177737881508, "grad_norm": 30.197856903076172, "learning_rate": 2.8811011370436865e-05, "loss": 10.955, "step": 18900 }, { "epoch": 3.3976660682226214, "grad_norm": 29.411909103393555, "learning_rate": 2.8805026929982048e-05, "loss": 11.4046, "step": 18925 }, { "epoch": 3.4021543985637344, "grad_norm": 30.500823974609375, "learning_rate": 2.879904248952723e-05, "loss": 11.3617, "step": 18950 }, { "epoch": 3.4066427289048473, "grad_norm": 31.399059295654297, "learning_rate": 2.8793058049072412e-05, "loss": 11.4427, "step": 18975 }, { "epoch": 3.4111310592459603, "grad_norm": 30.890851974487305, "learning_rate": 2.8787073608617595e-05, "loss": 11.4184, "step": 19000 }, { "epoch": 3.4156193895870737, "grad_norm": 31.299579620361328, "learning_rate": 2.8781089168162777e-05, "loss": 11.3553, "step": 19025 }, { "epoch": 3.4201077199281866, "grad_norm": 30.379802703857422, "learning_rate": 2.8775104727707963e-05, "loss": 11.2289, "step": 19050 }, { "epoch": 3.4245960502693, "grad_norm": 30.748916625976562, "learning_rate": 2.8769120287253142e-05, "loss": 10.9974, "step": 19075 }, { "epoch": 3.429084380610413, "grad_norm": 30.164533615112305, "learning_rate": 2.8763135846798324e-05, "loss": 11.2552, "step": 19100 }, { "epoch": 3.433572710951526, "grad_norm": 30.67738914489746, "learning_rate": 2.8757151406343506e-05, "loss": 11.514, "step": 19125 }, { "epoch": 3.438061041292639, "grad_norm": 31.51483154296875, "learning_rate": 2.875116696588869e-05, "loss": 11.4153, "step": 19150 }, { "epoch": 3.4425493716337523, "grad_norm": 32.316654205322266, "learning_rate": 2.8745182525433875e-05, "loss": 11.5824, "step": 19175 }, { "epoch": 3.4470377019748653, "grad_norm": 31.41953468322754, "learning_rate": 2.8739198084979057e-05, "loss": 11.259, "step": 19200 }, { "epoch": 3.4515260323159787, "grad_norm": 32.805870056152344, "learning_rate": 2.873321364452424e-05, "loss": 11.2999, "step": 19225 }, { "epoch": 3.4560143626570916, "grad_norm": 32.010826110839844, "learning_rate": 2.872722920406942e-05, "loss": 10.9906, "step": 19250 }, { "epoch": 3.4605026929982046, "grad_norm": 33.595767974853516, "learning_rate": 2.87212447636146e-05, "loss": 11.2258, "step": 19275 }, { "epoch": 3.464991023339318, "grad_norm": 31.329288482666016, "learning_rate": 2.8715260323159783e-05, "loss": 11.2326, "step": 19300 }, { "epoch": 3.469479353680431, "grad_norm": 31.369930267333984, "learning_rate": 2.870927588270497e-05, "loss": 11.2166, "step": 19325 }, { "epoch": 3.473967684021544, "grad_norm": 30.896013259887695, "learning_rate": 2.870329144225015e-05, "loss": 11.1363, "step": 19350 }, { "epoch": 3.4784560143626573, "grad_norm": 31.08727264404297, "learning_rate": 2.8697307001795333e-05, "loss": 11.4698, "step": 19375 }, { "epoch": 3.4829443447037702, "grad_norm": 28.412425994873047, "learning_rate": 2.8691322561340516e-05, "loss": 11.0108, "step": 19400 }, { "epoch": 3.487432675044883, "grad_norm": 31.50676155090332, "learning_rate": 2.8685338120885698e-05, "loss": 10.8741, "step": 19425 }, { "epoch": 3.4919210053859966, "grad_norm": 28.88292694091797, "learning_rate": 2.867935368043088e-05, "loss": 11.1386, "step": 19450 }, { "epoch": 3.4964093357271095, "grad_norm": 30.06254005432129, "learning_rate": 2.8673369239976063e-05, "loss": 11.1929, "step": 19475 }, { "epoch": 3.5008976660682225, "grad_norm": 34.148311614990234, "learning_rate": 2.8667384799521245e-05, "loss": 11.339, "step": 19500 }, { "epoch": 3.505385996409336, "grad_norm": 33.28491973876953, "learning_rate": 2.8661400359066428e-05, "loss": 11.1095, "step": 19525 }, { "epoch": 3.509874326750449, "grad_norm": 33.306026458740234, "learning_rate": 2.865541591861161e-05, "loss": 11.0814, "step": 19550 }, { "epoch": 3.514362657091562, "grad_norm": 31.115873336791992, "learning_rate": 2.8649431478156792e-05, "loss": 11.2325, "step": 19575 }, { "epoch": 3.5188509874326748, "grad_norm": 31.66822052001953, "learning_rate": 2.8643447037701978e-05, "loss": 11.1676, "step": 19600 }, { "epoch": 3.523339317773788, "grad_norm": 29.544313430786133, "learning_rate": 2.863746259724716e-05, "loss": 11.1635, "step": 19625 }, { "epoch": 3.527827648114901, "grad_norm": 34.17205810546875, "learning_rate": 2.8631478156792343e-05, "loss": 11.7224, "step": 19650 }, { "epoch": 3.5323159784560145, "grad_norm": 32.336727142333984, "learning_rate": 2.8625493716337522e-05, "loss": 11.3425, "step": 19675 }, { "epoch": 3.5368043087971275, "grad_norm": 32.560447692871094, "learning_rate": 2.8619509275882704e-05, "loss": 10.9515, "step": 19700 }, { "epoch": 3.5412926391382404, "grad_norm": 30.652273178100586, "learning_rate": 2.8613524835427886e-05, "loss": 11.2001, "step": 19725 }, { "epoch": 3.545780969479354, "grad_norm": 30.610469818115234, "learning_rate": 2.8607540394973072e-05, "loss": 11.5682, "step": 19750 }, { "epoch": 3.550269299820467, "grad_norm": 30.808074951171875, "learning_rate": 2.8601555954518255e-05, "loss": 11.453, "step": 19775 }, { "epoch": 3.5547576301615798, "grad_norm": 31.674997329711914, "learning_rate": 2.8595571514063437e-05, "loss": 11.2786, "step": 19800 }, { "epoch": 3.559245960502693, "grad_norm": 30.62029457092285, "learning_rate": 2.858958707360862e-05, "loss": 11.7177, "step": 19825 }, { "epoch": 3.563734290843806, "grad_norm": 31.6383113861084, "learning_rate": 2.8583602633153798e-05, "loss": 11.2354, "step": 19850 }, { "epoch": 3.568222621184919, "grad_norm": 32.37112045288086, "learning_rate": 2.857761819269898e-05, "loss": 11.2081, "step": 19875 }, { "epoch": 3.5727109515260325, "grad_norm": 29.828189849853516, "learning_rate": 2.8571873129862358e-05, "loss": 11.0898, "step": 19900 }, { "epoch": 3.5771992818671454, "grad_norm": 30.648529052734375, "learning_rate": 2.8565888689407543e-05, "loss": 10.8965, "step": 19925 }, { "epoch": 3.5816876122082584, "grad_norm": 41.849483489990234, "learning_rate": 2.8559904248952726e-05, "loss": 11.0246, "step": 19950 }, { "epoch": 3.5861759425493718, "grad_norm": 31.274961471557617, "learning_rate": 2.8553919808497905e-05, "loss": 11.4183, "step": 19975 }, { "epoch": 3.5906642728904847, "grad_norm": 30.798633575439453, "learning_rate": 2.8547935368043087e-05, "loss": 11.1237, "step": 20000 }, { "epoch": 3.5951526032315977, "grad_norm": 28.889543533325195, "learning_rate": 2.854195092758827e-05, "loss": 11.3662, "step": 20025 }, { "epoch": 3.599640933572711, "grad_norm": 29.90560531616211, "learning_rate": 2.8535966487133455e-05, "loss": 11.0125, "step": 20050 }, { "epoch": 3.604129263913824, "grad_norm": 30.499813079833984, "learning_rate": 2.8529982046678638e-05, "loss": 11.4072, "step": 20075 }, { "epoch": 3.608617594254937, "grad_norm": 30.493555068969727, "learning_rate": 2.852399760622382e-05, "loss": 11.2318, "step": 20100 }, { "epoch": 3.6131059245960504, "grad_norm": 31.599319458007812, "learning_rate": 2.8518013165769002e-05, "loss": 11.062, "step": 20125 }, { "epoch": 3.6175942549371634, "grad_norm": 30.47945213317871, "learning_rate": 2.8512028725314185e-05, "loss": 11.4558, "step": 20150 }, { "epoch": 3.6220825852782763, "grad_norm": 32.14970779418945, "learning_rate": 2.8506044284859364e-05, "loss": 10.5179, "step": 20175 }, { "epoch": 3.6265709156193897, "grad_norm": 31.17921257019043, "learning_rate": 2.850005984440455e-05, "loss": 10.968, "step": 20200 }, { "epoch": 3.6310592459605027, "grad_norm": 31.30777931213379, "learning_rate": 2.8494075403949732e-05, "loss": 11.336, "step": 20225 }, { "epoch": 3.635547576301616, "grad_norm": 30.182174682617188, "learning_rate": 2.8488090963494914e-05, "loss": 11.2265, "step": 20250 }, { "epoch": 3.640035906642729, "grad_norm": 31.07087516784668, "learning_rate": 2.8482106523040096e-05, "loss": 11.2847, "step": 20275 }, { "epoch": 3.644524236983842, "grad_norm": 28.739133834838867, "learning_rate": 2.847612208258528e-05, "loss": 11.427, "step": 20300 }, { "epoch": 3.649012567324955, "grad_norm": 31.23784637451172, "learning_rate": 2.847013764213046e-05, "loss": 11.0143, "step": 20325 }, { "epoch": 3.6535008976660683, "grad_norm": 30.830699920654297, "learning_rate": 2.8464153201675647e-05, "loss": 11.2841, "step": 20350 }, { "epoch": 3.6579892280071813, "grad_norm": 30.827341079711914, "learning_rate": 2.8458168761220826e-05, "loss": 10.9722, "step": 20375 }, { "epoch": 3.6624775583482947, "grad_norm": 29.842851638793945, "learning_rate": 2.8452184320766008e-05, "loss": 11.2758, "step": 20400 }, { "epoch": 3.6669658886894076, "grad_norm": 32.061363220214844, "learning_rate": 2.844619988031119e-05, "loss": 11.0502, "step": 20425 }, { "epoch": 3.6714542190305206, "grad_norm": 31.67589569091797, "learning_rate": 2.8440215439856373e-05, "loss": 11.2309, "step": 20450 }, { "epoch": 3.6759425493716336, "grad_norm": 29.2219295501709, "learning_rate": 2.8434230999401555e-05, "loss": 11.1079, "step": 20475 }, { "epoch": 3.680430879712747, "grad_norm": 29.494009017944336, "learning_rate": 2.842824655894674e-05, "loss": 11.095, "step": 20500 }, { "epoch": 3.68491921005386, "grad_norm": 30.72727394104004, "learning_rate": 2.8422262118491923e-05, "loss": 10.9651, "step": 20525 }, { "epoch": 3.6894075403949733, "grad_norm": 32.62581253051758, "learning_rate": 2.8416277678037106e-05, "loss": 11.1485, "step": 20550 }, { "epoch": 3.6938958707360863, "grad_norm": 32.17450714111328, "learning_rate": 2.8410293237582285e-05, "loss": 11.2383, "step": 20575 }, { "epoch": 3.6983842010771992, "grad_norm": 31.298063278198242, "learning_rate": 2.8404308797127467e-05, "loss": 11.4215, "step": 20600 }, { "epoch": 3.702872531418312, "grad_norm": 31.1262149810791, "learning_rate": 2.8398324356672653e-05, "loss": 11.0779, "step": 20625 }, { "epoch": 3.7073608617594256, "grad_norm": 31.340126037597656, "learning_rate": 2.8392339916217835e-05, "loss": 11.0714, "step": 20650 }, { "epoch": 3.7118491921005385, "grad_norm": 33.29624557495117, "learning_rate": 2.8386355475763018e-05, "loss": 11.0074, "step": 20675 }, { "epoch": 3.716337522441652, "grad_norm": 30.880542755126953, "learning_rate": 2.83803710353082e-05, "loss": 10.8339, "step": 20700 }, { "epoch": 3.720825852782765, "grad_norm": 29.898832321166992, "learning_rate": 2.8374386594853382e-05, "loss": 10.8654, "step": 20725 }, { "epoch": 3.725314183123878, "grad_norm": 29.32884979248047, "learning_rate": 2.8368402154398565e-05, "loss": 11.3662, "step": 20750 }, { "epoch": 3.729802513464991, "grad_norm": 32.064762115478516, "learning_rate": 2.8362417713943747e-05, "loss": 11.4581, "step": 20775 }, { "epoch": 3.734290843806104, "grad_norm": 32.138267517089844, "learning_rate": 2.835643327348893e-05, "loss": 10.9124, "step": 20800 }, { "epoch": 3.738779174147217, "grad_norm": 33.86062240600586, "learning_rate": 2.8350448833034112e-05, "loss": 11.0097, "step": 20825 }, { "epoch": 3.7432675044883306, "grad_norm": 30.490970611572266, "learning_rate": 2.8344464392579294e-05, "loss": 11.4534, "step": 20850 }, { "epoch": 3.7477558348294435, "grad_norm": 27.865781784057617, "learning_rate": 2.8338479952124477e-05, "loss": 11.4486, "step": 20875 }, { "epoch": 3.7522441651705565, "grad_norm": 31.9267520904541, "learning_rate": 2.833249551166966e-05, "loss": 11.5519, "step": 20900 }, { "epoch": 3.7567324955116694, "grad_norm": 29.056507110595703, "learning_rate": 2.8326511071214845e-05, "loss": 11.4834, "step": 20925 }, { "epoch": 3.761220825852783, "grad_norm": 30.026002883911133, "learning_rate": 2.8320526630760024e-05, "loss": 11.3338, "step": 20950 }, { "epoch": 3.765709156193896, "grad_norm": 30.737932205200195, "learning_rate": 2.8314542190305206e-05, "loss": 10.9653, "step": 20975 }, { "epoch": 3.770197486535009, "grad_norm": 30.978910446166992, "learning_rate": 2.8308557749850388e-05, "loss": 11.4722, "step": 21000 }, { "epoch": 3.774685816876122, "grad_norm": 30.23752212524414, "learning_rate": 2.830257330939557e-05, "loss": 11.3043, "step": 21025 }, { "epoch": 3.779174147217235, "grad_norm": 32.29151153564453, "learning_rate": 2.8296588868940756e-05, "loss": 11.0943, "step": 21050 }, { "epoch": 3.783662477558348, "grad_norm": 30.46995735168457, "learning_rate": 2.829060442848594e-05, "loss": 11.345, "step": 21075 }, { "epoch": 3.7881508078994615, "grad_norm": 32.500823974609375, "learning_rate": 2.828461998803112e-05, "loss": 11.2675, "step": 21100 }, { "epoch": 3.7926391382405744, "grad_norm": 31.643070220947266, "learning_rate": 2.8278635547576303e-05, "loss": 11.2081, "step": 21125 }, { "epoch": 3.797127468581688, "grad_norm": 31.303314208984375, "learning_rate": 2.8272651107121482e-05, "loss": 11.3969, "step": 21150 }, { "epoch": 3.8016157989228008, "grad_norm": 32.96514129638672, "learning_rate": 2.8266666666666665e-05, "loss": 11.1525, "step": 21175 }, { "epoch": 3.8061041292639137, "grad_norm": 30.002351760864258, "learning_rate": 2.826068222621185e-05, "loss": 10.8658, "step": 21200 }, { "epoch": 3.8105924596050267, "grad_norm": 31.169191360473633, "learning_rate": 2.8254697785757033e-05, "loss": 11.2565, "step": 21225 }, { "epoch": 3.81508078994614, "grad_norm": 31.06591033935547, "learning_rate": 2.8248713345302215e-05, "loss": 11.2624, "step": 21250 }, { "epoch": 3.819569120287253, "grad_norm": 28.5202579498291, "learning_rate": 2.8242728904847398e-05, "loss": 10.9322, "step": 21275 }, { "epoch": 3.8240574506283664, "grad_norm": 30.786962509155273, "learning_rate": 2.823674446439258e-05, "loss": 11.0254, "step": 21300 }, { "epoch": 3.8285457809694794, "grad_norm": 30.801992416381836, "learning_rate": 2.8230760023937762e-05, "loss": 11.156, "step": 21325 }, { "epoch": 3.8330341113105924, "grad_norm": 28.441688537597656, "learning_rate": 2.8224775583482945e-05, "loss": 11.1657, "step": 21350 }, { "epoch": 3.8375224416517053, "grad_norm": 29.77831268310547, "learning_rate": 2.8218791143028127e-05, "loss": 11.3975, "step": 21375 }, { "epoch": 3.8420107719928187, "grad_norm": 31.247785568237305, "learning_rate": 2.821280670257331e-05, "loss": 11.0109, "step": 21400 }, { "epoch": 3.8464991023339317, "grad_norm": 32.04808807373047, "learning_rate": 2.8206822262118492e-05, "loss": 11.0839, "step": 21425 }, { "epoch": 3.850987432675045, "grad_norm": 30.55583953857422, "learning_rate": 2.8200837821663674e-05, "loss": 10.8381, "step": 21450 }, { "epoch": 3.855475763016158, "grad_norm": 29.084171295166016, "learning_rate": 2.819485338120886e-05, "loss": 11.187, "step": 21475 }, { "epoch": 3.859964093357271, "grad_norm": 31.084972381591797, "learning_rate": 2.8188868940754042e-05, "loss": 11.52, "step": 21500 }, { "epoch": 3.864452423698384, "grad_norm": 31.979738235473633, "learning_rate": 2.8182884500299225e-05, "loss": 11.3922, "step": 21525 }, { "epoch": 3.8689407540394973, "grad_norm": 28.92717742919922, "learning_rate": 2.8176900059844404e-05, "loss": 11.064, "step": 21550 }, { "epoch": 3.8734290843806103, "grad_norm": 29.832292556762695, "learning_rate": 2.8170915619389586e-05, "loss": 11.2098, "step": 21575 }, { "epoch": 3.8779174147217237, "grad_norm": 31.74751091003418, "learning_rate": 2.816493117893477e-05, "loss": 10.9546, "step": 21600 }, { "epoch": 3.8824057450628366, "grad_norm": 32.31631088256836, "learning_rate": 2.8158946738479954e-05, "loss": 11.1283, "step": 21625 }, { "epoch": 3.8868940754039496, "grad_norm": 30.267370223999023, "learning_rate": 2.8152962298025136e-05, "loss": 10.9434, "step": 21650 }, { "epoch": 3.891382405745063, "grad_norm": 31.132080078125, "learning_rate": 2.814697785757032e-05, "loss": 11.404, "step": 21675 }, { "epoch": 3.895870736086176, "grad_norm": 31.34539794921875, "learning_rate": 2.81409934171155e-05, "loss": 11.3978, "step": 21700 }, { "epoch": 3.900359066427289, "grad_norm": 31.39044952392578, "learning_rate": 2.8135008976660684e-05, "loss": 11.187, "step": 21725 }, { "epoch": 3.9048473967684023, "grad_norm": 32.7244873046875, "learning_rate": 2.8129024536205862e-05, "loss": 11.2598, "step": 21750 }, { "epoch": 3.9093357271095153, "grad_norm": 28.18239974975586, "learning_rate": 2.8123040095751048e-05, "loss": 11.3187, "step": 21775 }, { "epoch": 3.9138240574506282, "grad_norm": 31.796775817871094, "learning_rate": 2.811705565529623e-05, "loss": 11.0664, "step": 21800 }, { "epoch": 3.9183123877917416, "grad_norm": 30.6005859375, "learning_rate": 2.8111071214841413e-05, "loss": 10.987, "step": 21825 }, { "epoch": 3.9228007181328546, "grad_norm": 30.108829498291016, "learning_rate": 2.8105086774386595e-05, "loss": 11.0541, "step": 21850 }, { "epoch": 3.9272890484739675, "grad_norm": 31.7265682220459, "learning_rate": 2.8099102333931778e-05, "loss": 11.1046, "step": 21875 }, { "epoch": 3.931777378815081, "grad_norm": 32.628074645996094, "learning_rate": 2.809311789347696e-05, "loss": 11.2953, "step": 21900 }, { "epoch": 3.936265709156194, "grad_norm": 28.80093765258789, "learning_rate": 2.8087133453022146e-05, "loss": 11.5525, "step": 21925 }, { "epoch": 3.940754039497307, "grad_norm": 29.523881912231445, "learning_rate": 2.8081149012567325e-05, "loss": 11.2298, "step": 21950 }, { "epoch": 3.9452423698384202, "grad_norm": 30.06547737121582, "learning_rate": 2.8075164572112507e-05, "loss": 10.7736, "step": 21975 }, { "epoch": 3.949730700179533, "grad_norm": 29.540449142456055, "learning_rate": 2.806918013165769e-05, "loss": 11.0709, "step": 22000 }, { "epoch": 3.954219030520646, "grad_norm": 33.31890869140625, "learning_rate": 2.8063195691202872e-05, "loss": 11.0698, "step": 22025 }, { "epoch": 3.9587073608617596, "grad_norm": 30.68980598449707, "learning_rate": 2.8057211250748058e-05, "loss": 10.8253, "step": 22050 }, { "epoch": 3.9631956912028725, "grad_norm": 31.10498809814453, "learning_rate": 2.805122681029324e-05, "loss": 11.4169, "step": 22075 }, { "epoch": 3.9676840215439855, "grad_norm": 30.547962188720703, "learning_rate": 2.8045242369838422e-05, "loss": 11.2333, "step": 22100 }, { "epoch": 3.972172351885099, "grad_norm": 30.325082778930664, "learning_rate": 2.8039257929383605e-05, "loss": 11.3395, "step": 22125 }, { "epoch": 3.976660682226212, "grad_norm": 30.00259780883789, "learning_rate": 2.8033273488928784e-05, "loss": 11.2044, "step": 22150 }, { "epoch": 3.9811490125673252, "grad_norm": 27.535524368286133, "learning_rate": 2.8027289048473966e-05, "loss": 10.92, "step": 22175 }, { "epoch": 3.985637342908438, "grad_norm": 31.112247467041016, "learning_rate": 2.8021304608019152e-05, "loss": 11.1473, "step": 22200 }, { "epoch": 3.990125673249551, "grad_norm": 30.036909103393555, "learning_rate": 2.8015320167564334e-05, "loss": 11.5283, "step": 22225 }, { "epoch": 3.994614003590664, "grad_norm": 30.063087463378906, "learning_rate": 2.8009335727109516e-05, "loss": 11.1425, "step": 22250 }, { "epoch": 3.9991023339317775, "grad_norm": 32.6578483581543, "learning_rate": 2.80033512866547e-05, "loss": 10.6965, "step": 22275 }, { "epoch": 4.0, "eval_accuracy": 0.07753995379298494, "eval_f1_macro": 0.009406764821620481, "eval_f1_micro": 0.07753995379298494, "eval_f1_weighted": 0.04766626516109554, "eval_loss": 6.724180221557617, "eval_precision_macro": 0.00894345687991437, "eval_precision_micro": 0.07753995379298494, "eval_precision_weighted": 0.04157378236856482, "eval_recall_macro": 0.014579264668079467, "eval_recall_micro": 0.07753995379298494, "eval_recall_weighted": 0.07753995379298494, "eval_runtime": 86.416, "eval_samples_per_second": 606.057, "eval_steps_per_second": 18.943, "step": 22280 }, { "epoch": 4.003590664272891, "grad_norm": 30.3936824798584, "learning_rate": 2.799736684619988e-05, "loss": 9.4773, "step": 22300 }, { "epoch": 4.008078994614004, "grad_norm": 30.763669967651367, "learning_rate": 2.7991382405745064e-05, "loss": 9.1388, "step": 22325 }, { "epoch": 4.012567324955117, "grad_norm": 30.83111572265625, "learning_rate": 2.7985397965290246e-05, "loss": 9.1059, "step": 22350 }, { "epoch": 4.01705565529623, "grad_norm": 32.58699035644531, "learning_rate": 2.7979413524835428e-05, "loss": 9.1599, "step": 22375 }, { "epoch": 4.021543985637343, "grad_norm": 32.16946792602539, "learning_rate": 2.797342908438061e-05, "loss": 8.8678, "step": 22400 }, { "epoch": 4.026032315978456, "grad_norm": 32.695838928222656, "learning_rate": 2.7967444643925793e-05, "loss": 8.5109, "step": 22425 }, { "epoch": 4.0305206463195695, "grad_norm": 32.195003509521484, "learning_rate": 2.7961460203470975e-05, "loss": 8.8435, "step": 22450 }, { "epoch": 4.0350089766606825, "grad_norm": 33.23640060424805, "learning_rate": 2.795547576301616e-05, "loss": 8.9362, "step": 22475 }, { "epoch": 4.039497307001795, "grad_norm": 36.865997314453125, "learning_rate": 2.7949491322561343e-05, "loss": 9.0649, "step": 22500 }, { "epoch": 4.043985637342908, "grad_norm": 35.41594696044922, "learning_rate": 2.7943506882106526e-05, "loss": 8.9695, "step": 22525 }, { "epoch": 4.048473967684021, "grad_norm": 35.198551177978516, "learning_rate": 2.7937522441651705e-05, "loss": 9.3542, "step": 22550 }, { "epoch": 4.052962298025134, "grad_norm": 49.4534912109375, "learning_rate": 2.7931538001196887e-05, "loss": 8.6442, "step": 22575 }, { "epoch": 4.057450628366248, "grad_norm": 35.323726654052734, "learning_rate": 2.792555356074207e-05, "loss": 8.5507, "step": 22600 }, { "epoch": 4.061938958707361, "grad_norm": Infinity, "learning_rate": 2.7919808497905447e-05, "loss": 9.0441, "step": 22625 }, { "epoch": 4.066427289048474, "grad_norm": 35.7750244140625, "learning_rate": 2.791382405745063e-05, "loss": 9.1965, "step": 22650 }, { "epoch": 4.070915619389587, "grad_norm": 31.913360595703125, "learning_rate": 2.790783961699581e-05, "loss": 9.1348, "step": 22675 }, { "epoch": 4.0754039497307, "grad_norm": 33.979190826416016, "learning_rate": 2.7901855176540994e-05, "loss": 9.1416, "step": 22700 }, { "epoch": 4.079892280071813, "grad_norm": 33.557029724121094, "learning_rate": 2.7895870736086176e-05, "loss": 8.8399, "step": 22725 }, { "epoch": 4.084380610412927, "grad_norm": 35.37779998779297, "learning_rate": 2.788988629563136e-05, "loss": 9.2904, "step": 22750 }, { "epoch": 4.08886894075404, "grad_norm": 33.334224700927734, "learning_rate": 2.788390185517654e-05, "loss": 9.0412, "step": 22775 }, { "epoch": 4.093357271095153, "grad_norm": 38.393653869628906, "learning_rate": 2.7877917414721726e-05, "loss": 8.8758, "step": 22800 }, { "epoch": 4.097845601436266, "grad_norm": 34.724517822265625, "learning_rate": 2.787193297426691e-05, "loss": 8.9632, "step": 22825 }, { "epoch": 4.102333931777379, "grad_norm": 35.026126861572266, "learning_rate": 2.7865948533812088e-05, "loss": 8.81, "step": 22850 }, { "epoch": 4.1068222621184916, "grad_norm": 33.23841094970703, "learning_rate": 2.785996409335727e-05, "loss": 8.965, "step": 22875 }, { "epoch": 4.111310592459605, "grad_norm": 33.344581604003906, "learning_rate": 2.7853979652902453e-05, "loss": 8.6121, "step": 22900 }, { "epoch": 4.115798922800718, "grad_norm": 33.311065673828125, "learning_rate": 2.7847995212447638e-05, "loss": 8.863, "step": 22925 }, { "epoch": 4.120287253141831, "grad_norm": 31.99666404724121, "learning_rate": 2.784201077199282e-05, "loss": 9.2711, "step": 22950 }, { "epoch": 4.124775583482944, "grad_norm": 35.421077728271484, "learning_rate": 2.7836026331538003e-05, "loss": 8.8117, "step": 22975 }, { "epoch": 4.129263913824057, "grad_norm": 35.499202728271484, "learning_rate": 2.7830041891083185e-05, "loss": 9.0647, "step": 23000 }, { "epoch": 4.13375224416517, "grad_norm": 39.84804916381836, "learning_rate": 2.7824057450628368e-05, "loss": 8.9921, "step": 23025 }, { "epoch": 4.138240574506284, "grad_norm": 35.68635559082031, "learning_rate": 2.7818073010173547e-05, "loss": 9.0054, "step": 23050 }, { "epoch": 4.142728904847397, "grad_norm": 34.515098571777344, "learning_rate": 2.7812088569718732e-05, "loss": 9.2403, "step": 23075 }, { "epoch": 4.14721723518851, "grad_norm": 35.22542190551758, "learning_rate": 2.7806104129263915e-05, "loss": 9.1803, "step": 23100 }, { "epoch": 4.151705565529623, "grad_norm": 31.101097106933594, "learning_rate": 2.7800119688809097e-05, "loss": 9.1061, "step": 23125 }, { "epoch": 4.156193895870736, "grad_norm": 35.81389236450195, "learning_rate": 2.779413524835428e-05, "loss": 9.2282, "step": 23150 }, { "epoch": 4.160682226211849, "grad_norm": 33.05430603027344, "learning_rate": 2.7788150807899462e-05, "loss": 8.9339, "step": 23175 }, { "epoch": 4.165170556552963, "grad_norm": 32.21403884887695, "learning_rate": 2.7782166367444644e-05, "loss": 9.0769, "step": 23200 }, { "epoch": 4.169658886894076, "grad_norm": 38.616085052490234, "learning_rate": 2.777618192698983e-05, "loss": 9.2644, "step": 23225 }, { "epoch": 4.174147217235189, "grad_norm": 34.82571029663086, "learning_rate": 2.777019748653501e-05, "loss": 9.2723, "step": 23250 }, { "epoch": 4.1786355475763015, "grad_norm": 37.125797271728516, "learning_rate": 2.776421304608019e-05, "loss": 9.2647, "step": 23275 }, { "epoch": 4.1831238779174145, "grad_norm": 36.201927185058594, "learning_rate": 2.7758228605625374e-05, "loss": 9.5391, "step": 23300 }, { "epoch": 4.187612208258527, "grad_norm": 34.90190505981445, "learning_rate": 2.7752244165170556e-05, "loss": 9.2669, "step": 23325 }, { "epoch": 4.192100538599641, "grad_norm": 36.72137451171875, "learning_rate": 2.7746259724715742e-05, "loss": 9.3072, "step": 23350 }, { "epoch": 4.196588868940754, "grad_norm": 34.933372497558594, "learning_rate": 2.7740275284260924e-05, "loss": 9.2713, "step": 23375 }, { "epoch": 4.201077199281867, "grad_norm": 37.9987907409668, "learning_rate": 2.7734290843806107e-05, "loss": 9.0352, "step": 23400 }, { "epoch": 4.20556552962298, "grad_norm": 33.95653533935547, "learning_rate": 2.772830640335129e-05, "loss": 9.1703, "step": 23425 }, { "epoch": 4.210053859964093, "grad_norm": 32.79034423828125, "learning_rate": 2.7722321962896468e-05, "loss": 9.0717, "step": 23450 }, { "epoch": 4.214542190305206, "grad_norm": 41.263702392578125, "learning_rate": 2.771633752244165e-05, "loss": 9.0279, "step": 23475 }, { "epoch": 4.21903052064632, "grad_norm": 34.632225036621094, "learning_rate": 2.7710353081986836e-05, "loss": 9.0236, "step": 23500 }, { "epoch": 4.223518850987433, "grad_norm": 34.72397232055664, "learning_rate": 2.770436864153202e-05, "loss": 9.2389, "step": 23525 }, { "epoch": 4.228007181328546, "grad_norm": 34.320003509521484, "learning_rate": 2.76983842010772e-05, "loss": 9.0942, "step": 23550 }, { "epoch": 4.232495511669659, "grad_norm": 35.2785758972168, "learning_rate": 2.7692399760622383e-05, "loss": 8.7822, "step": 23575 }, { "epoch": 4.236983842010772, "grad_norm": 40.83307647705078, "learning_rate": 2.7686415320167565e-05, "loss": 9.2271, "step": 23600 }, { "epoch": 4.241472172351885, "grad_norm": 34.236122131347656, "learning_rate": 2.7680430879712748e-05, "loss": 9.0967, "step": 23625 }, { "epoch": 4.2459605026929985, "grad_norm": 34.03813171386719, "learning_rate": 2.767444643925793e-05, "loss": 9.1916, "step": 23650 }, { "epoch": 4.2504488330341115, "grad_norm": 32.90471267700195, "learning_rate": 2.7668461998803112e-05, "loss": 8.9725, "step": 23675 }, { "epoch": 4.254937163375224, "grad_norm": 37.31569290161133, "learning_rate": 2.7662477558348295e-05, "loss": 9.5137, "step": 23700 }, { "epoch": 4.259425493716337, "grad_norm": 33.96034240722656, "learning_rate": 2.7656493117893477e-05, "loss": 9.1435, "step": 23725 }, { "epoch": 4.26391382405745, "grad_norm": 37.626258850097656, "learning_rate": 2.765050867743866e-05, "loss": 9.1109, "step": 23750 }, { "epoch": 4.268402154398563, "grad_norm": 37.14412307739258, "learning_rate": 2.7644524236983842e-05, "loss": 8.9028, "step": 23775 }, { "epoch": 4.272890484739677, "grad_norm": 33.2732048034668, "learning_rate": 2.7638539796529028e-05, "loss": 8.9961, "step": 23800 }, { "epoch": 4.27737881508079, "grad_norm": 35.71903991699219, "learning_rate": 2.7632555356074207e-05, "loss": 9.3034, "step": 23825 }, { "epoch": 4.281867145421903, "grad_norm": 34.583213806152344, "learning_rate": 2.762657091561939e-05, "loss": 9.2041, "step": 23850 }, { "epoch": 4.286355475763016, "grad_norm": 36.03817367553711, "learning_rate": 2.762058647516457e-05, "loss": 9.2503, "step": 23875 }, { "epoch": 4.290843806104129, "grad_norm": 34.202823638916016, "learning_rate": 2.7614602034709754e-05, "loss": 9.2398, "step": 23900 }, { "epoch": 4.295332136445243, "grad_norm": 35.64631652832031, "learning_rate": 2.760861759425494e-05, "loss": 9.139, "step": 23925 }, { "epoch": 4.299820466786356, "grad_norm": 34.361637115478516, "learning_rate": 2.7602633153800122e-05, "loss": 9.0898, "step": 23950 }, { "epoch": 4.304308797127469, "grad_norm": 32.614646911621094, "learning_rate": 2.7596648713345304e-05, "loss": 9.4455, "step": 23975 }, { "epoch": 4.308797127468582, "grad_norm": 36.456077575683594, "learning_rate": 2.7590664272890487e-05, "loss": 9.2413, "step": 24000 }, { "epoch": 4.313285457809695, "grad_norm": 33.43761444091797, "learning_rate": 2.7584679832435666e-05, "loss": 9.1153, "step": 24025 }, { "epoch": 4.317773788150808, "grad_norm": 34.84223556518555, "learning_rate": 2.7578695391980848e-05, "loss": 9.2057, "step": 24050 }, { "epoch": 4.3222621184919205, "grad_norm": 31.044452667236328, "learning_rate": 2.7572710951526034e-05, "loss": 9.3385, "step": 24075 }, { "epoch": 4.326750448833034, "grad_norm": 38.18600845336914, "learning_rate": 2.7566726511071216e-05, "loss": 9.0842, "step": 24100 }, { "epoch": 4.331238779174147, "grad_norm": 34.68734359741211, "learning_rate": 2.75607420706164e-05, "loss": 9.4446, "step": 24125 }, { "epoch": 4.33572710951526, "grad_norm": 38.530601501464844, "learning_rate": 2.755475763016158e-05, "loss": 9.0487, "step": 24150 }, { "epoch": 4.340215439856373, "grad_norm": 35.827022552490234, "learning_rate": 2.7548773189706763e-05, "loss": 9.1242, "step": 24175 }, { "epoch": 4.344703770197486, "grad_norm": 37.25276184082031, "learning_rate": 2.7542788749251945e-05, "loss": 9.1769, "step": 24200 }, { "epoch": 4.3491921005386, "grad_norm": 35.8741340637207, "learning_rate": 2.7536804308797128e-05, "loss": 9.256, "step": 24225 }, { "epoch": 4.353680430879713, "grad_norm": 34.161651611328125, "learning_rate": 2.753081986834231e-05, "loss": 9.3959, "step": 24250 }, { "epoch": 4.358168761220826, "grad_norm": 36.703941345214844, "learning_rate": 2.7524835427887492e-05, "loss": 9.6069, "step": 24275 }, { "epoch": 4.362657091561939, "grad_norm": 33.90925216674805, "learning_rate": 2.7518850987432675e-05, "loss": 9.2081, "step": 24300 }, { "epoch": 4.367145421903052, "grad_norm": 36.48859786987305, "learning_rate": 2.7512866546977857e-05, "loss": 9.2767, "step": 24325 }, { "epoch": 4.371633752244165, "grad_norm": 36.00957489013672, "learning_rate": 2.7506882106523043e-05, "loss": 9.2949, "step": 24350 }, { "epoch": 4.376122082585279, "grad_norm": 33.388736724853516, "learning_rate": 2.7500897666068225e-05, "loss": 9.4621, "step": 24375 }, { "epoch": 4.380610412926392, "grad_norm": 32.6502571105957, "learning_rate": 2.7494913225613408e-05, "loss": 9.2408, "step": 24400 }, { "epoch": 4.385098743267505, "grad_norm": 36.0883903503418, "learning_rate": 2.7488928785158587e-05, "loss": 9.3558, "step": 24425 }, { "epoch": 4.3895870736086176, "grad_norm": 33.08795928955078, "learning_rate": 2.748294434470377e-05, "loss": 9.1737, "step": 24450 }, { "epoch": 4.3940754039497305, "grad_norm": 37.87990188598633, "learning_rate": 2.747695990424895e-05, "loss": 9.2635, "step": 24475 }, { "epoch": 4.3985637342908435, "grad_norm": 32.306396484375, "learning_rate": 2.7470975463794137e-05, "loss": 9.4797, "step": 24500 }, { "epoch": 4.403052064631957, "grad_norm": 34.42149353027344, "learning_rate": 2.746499102333932e-05, "loss": 8.8528, "step": 24525 }, { "epoch": 4.40754039497307, "grad_norm": 33.147850036621094, "learning_rate": 2.7459006582884502e-05, "loss": 9.3153, "step": 24550 }, { "epoch": 4.412028725314183, "grad_norm": 36.34206771850586, "learning_rate": 2.7453022142429684e-05, "loss": 9.1607, "step": 24575 }, { "epoch": 4.416517055655296, "grad_norm": 36.275413513183594, "learning_rate": 2.7447037701974867e-05, "loss": 9.2555, "step": 24600 }, { "epoch": 4.421005385996409, "grad_norm": 34.83110427856445, "learning_rate": 2.7441053261520046e-05, "loss": 9.4131, "step": 24625 }, { "epoch": 4.425493716337522, "grad_norm": 35.73281478881836, "learning_rate": 2.743506882106523e-05, "loss": 9.7517, "step": 24650 }, { "epoch": 4.429982046678636, "grad_norm": 32.646751403808594, "learning_rate": 2.7429084380610414e-05, "loss": 9.452, "step": 24675 }, { "epoch": 4.434470377019749, "grad_norm": 42.54426956176758, "learning_rate": 2.7423099940155596e-05, "loss": 9.3777, "step": 24700 }, { "epoch": 4.438958707360862, "grad_norm": 35.09437942504883, "learning_rate": 2.741711549970078e-05, "loss": 9.3665, "step": 24725 }, { "epoch": 4.443447037701975, "grad_norm": 36.45936965942383, "learning_rate": 2.741113105924596e-05, "loss": 9.4285, "step": 24750 }, { "epoch": 4.447935368043088, "grad_norm": 34.06489181518555, "learning_rate": 2.7405146618791146e-05, "loss": 9.1473, "step": 24775 }, { "epoch": 4.452423698384201, "grad_norm": 38.4737663269043, "learning_rate": 2.739916217833633e-05, "loss": 9.5141, "step": 24800 }, { "epoch": 4.456912028725315, "grad_norm": 35.27596664428711, "learning_rate": 2.7393177737881508e-05, "loss": 9.3386, "step": 24825 }, { "epoch": 4.4614003590664275, "grad_norm": 39.01841735839844, "learning_rate": 2.738719329742669e-05, "loss": 9.2959, "step": 24850 }, { "epoch": 4.4658886894075405, "grad_norm": 40.175697326660156, "learning_rate": 2.7381208856971873e-05, "loss": 9.3482, "step": 24875 }, { "epoch": 4.470377019748653, "grad_norm": 37.285396575927734, "learning_rate": 2.7375224416517055e-05, "loss": 9.0562, "step": 24900 }, { "epoch": 4.474865350089766, "grad_norm": 37.979305267333984, "learning_rate": 2.736923997606224e-05, "loss": 9.4161, "step": 24925 }, { "epoch": 4.479353680430879, "grad_norm": 34.52471160888672, "learning_rate": 2.7363255535607423e-05, "loss": 9.1926, "step": 24950 }, { "epoch": 4.483842010771993, "grad_norm": 32.52268600463867, "learning_rate": 2.7357271095152605e-05, "loss": 9.5568, "step": 24975 }, { "epoch": 4.488330341113106, "grad_norm": 34.64008712768555, "learning_rate": 2.7351286654697788e-05, "loss": 9.34, "step": 25000 }, { "epoch": 4.492818671454219, "grad_norm": 35.43095397949219, "learning_rate": 2.7345302214242967e-05, "loss": 9.6012, "step": 25025 }, { "epoch": 4.497307001795332, "grad_norm": 34.24216079711914, "learning_rate": 2.733931777378815e-05, "loss": 9.545, "step": 25050 }, { "epoch": 4.501795332136445, "grad_norm": 36.410186767578125, "learning_rate": 2.7333333333333335e-05, "loss": 9.5178, "step": 25075 }, { "epoch": 4.506283662477558, "grad_norm": 33.58375549316406, "learning_rate": 2.7327348892878517e-05, "loss": 9.0259, "step": 25100 }, { "epoch": 4.510771992818672, "grad_norm": 33.377079010009766, "learning_rate": 2.73213644524237e-05, "loss": 9.1557, "step": 25125 }, { "epoch": 4.515260323159785, "grad_norm": 37.322166442871094, "learning_rate": 2.7315380011968882e-05, "loss": 9.2679, "step": 25150 }, { "epoch": 4.519748653500898, "grad_norm": 35.399192810058594, "learning_rate": 2.7309395571514064e-05, "loss": 9.3599, "step": 25175 }, { "epoch": 4.524236983842011, "grad_norm": 34.6229362487793, "learning_rate": 2.7303411131059247e-05, "loss": 9.2008, "step": 25200 }, { "epoch": 4.528725314183124, "grad_norm": 38.43641662597656, "learning_rate": 2.729742669060443e-05, "loss": 9.5365, "step": 25225 }, { "epoch": 4.533213644524237, "grad_norm": 36.315940856933594, "learning_rate": 2.729144225014961e-05, "loss": 9.2328, "step": 25250 }, { "epoch": 4.53770197486535, "grad_norm": 36.93431091308594, "learning_rate": 2.7285457809694794e-05, "loss": 9.1937, "step": 25275 }, { "epoch": 4.542190305206463, "grad_norm": 34.52630615234375, "learning_rate": 2.7279473369239976e-05, "loss": 9.3224, "step": 25300 }, { "epoch": 4.546678635547576, "grad_norm": 37.09843826293945, "learning_rate": 2.727348892878516e-05, "loss": 9.5531, "step": 25325 }, { "epoch": 4.551166965888689, "grad_norm": 35.45225143432617, "learning_rate": 2.7267504488330344e-05, "loss": 9.3112, "step": 25350 }, { "epoch": 4.555655296229802, "grad_norm": 36.52423858642578, "learning_rate": 2.7261520047875526e-05, "loss": 9.2307, "step": 25375 }, { "epoch": 4.560143626570916, "grad_norm": 31.571231842041016, "learning_rate": 2.725553560742071e-05, "loss": 9.0741, "step": 25400 }, { "epoch": 4.564631956912029, "grad_norm": 35.70735549926758, "learning_rate": 2.7249551166965888e-05, "loss": 9.4122, "step": 25425 }, { "epoch": 4.569120287253142, "grad_norm": 35.43241882324219, "learning_rate": 2.724356672651107e-05, "loss": 9.4357, "step": 25450 }, { "epoch": 4.573608617594255, "grad_norm": 35.756832122802734, "learning_rate": 2.7237582286056253e-05, "loss": 9.3782, "step": 25475 }, { "epoch": 4.578096947935368, "grad_norm": 33.91000747680664, "learning_rate": 2.7231597845601438e-05, "loss": 9.7165, "step": 25500 }, { "epoch": 4.582585278276481, "grad_norm": 34.89963912963867, "learning_rate": 2.722561340514662e-05, "loss": 9.5026, "step": 25525 }, { "epoch": 4.587073608617594, "grad_norm": 35.42002868652344, "learning_rate": 2.7219628964691803e-05, "loss": 9.6544, "step": 25550 }, { "epoch": 4.591561938958708, "grad_norm": 35.01460647583008, "learning_rate": 2.7213644524236985e-05, "loss": 9.3751, "step": 25575 }, { "epoch": 4.596050269299821, "grad_norm": 32.873260498046875, "learning_rate": 2.7207660083782164e-05, "loss": 9.1189, "step": 25600 }, { "epoch": 4.600538599640934, "grad_norm": 38.0374641418457, "learning_rate": 2.7201675643327347e-05, "loss": 9.4421, "step": 25625 }, { "epoch": 4.6050269299820465, "grad_norm": 32.76029586791992, "learning_rate": 2.7195691202872532e-05, "loss": 9.2211, "step": 25650 }, { "epoch": 4.6095152603231595, "grad_norm": 35.879295349121094, "learning_rate": 2.7189706762417715e-05, "loss": 9.4768, "step": 25675 }, { "epoch": 4.614003590664273, "grad_norm": 34.31226348876953, "learning_rate": 2.7183722321962897e-05, "loss": 9.1382, "step": 25700 }, { "epoch": 4.618491921005386, "grad_norm": 33.70473861694336, "learning_rate": 2.717773788150808e-05, "loss": 9.1704, "step": 25725 }, { "epoch": 4.622980251346499, "grad_norm": 36.1688232421875, "learning_rate": 2.7171753441053262e-05, "loss": 9.4746, "step": 25750 }, { "epoch": 4.627468581687612, "grad_norm": 35.33478927612305, "learning_rate": 2.7165769000598448e-05, "loss": 9.2567, "step": 25775 }, { "epoch": 4.631956912028725, "grad_norm": 35.50520324707031, "learning_rate": 2.7159784560143627e-05, "loss": 9.5426, "step": 25800 }, { "epoch": 4.636445242369838, "grad_norm": 34.68144989013672, "learning_rate": 2.715380011968881e-05, "loss": 9.4237, "step": 25825 }, { "epoch": 4.640933572710951, "grad_norm": 32.5733528137207, "learning_rate": 2.714781567923399e-05, "loss": 9.2993, "step": 25850 }, { "epoch": 4.645421903052065, "grad_norm": 34.17429733276367, "learning_rate": 2.714207061639737e-05, "loss": 9.4056, "step": 25875 }, { "epoch": 4.649910233393178, "grad_norm": 49.32793045043945, "learning_rate": 2.713608617594255e-05, "loss": 9.3828, "step": 25900 }, { "epoch": 4.654398563734291, "grad_norm": 35.83115768432617, "learning_rate": 2.713010173548773e-05, "loss": 9.4779, "step": 25925 }, { "epoch": 4.658886894075404, "grad_norm": 35.35591125488281, "learning_rate": 2.7124117295032915e-05, "loss": 9.4844, "step": 25950 }, { "epoch": 4.663375224416517, "grad_norm": 35.725494384765625, "learning_rate": 2.7118132854578098e-05, "loss": 9.1772, "step": 25975 }, { "epoch": 4.667863554757631, "grad_norm": 34.3475227355957, "learning_rate": 2.711214841412328e-05, "loss": 9.5834, "step": 26000 }, { "epoch": 4.6723518850987436, "grad_norm": 35.19342041015625, "learning_rate": 2.7106163973668463e-05, "loss": 9.1603, "step": 26025 }, { "epoch": 4.6768402154398565, "grad_norm": 37.154518127441406, "learning_rate": 2.7100179533213645e-05, "loss": 9.5956, "step": 26050 }, { "epoch": 4.6813285457809695, "grad_norm": 36.49668884277344, "learning_rate": 2.7094195092758827e-05, "loss": 9.5274, "step": 26075 }, { "epoch": 4.685816876122082, "grad_norm": 34.92998504638672, "learning_rate": 2.7088210652304013e-05, "loss": 9.5255, "step": 26100 }, { "epoch": 4.690305206463195, "grad_norm": 32.61775207519531, "learning_rate": 2.7082226211849192e-05, "loss": 9.4236, "step": 26125 }, { "epoch": 4.694793536804308, "grad_norm": 35.2857666015625, "learning_rate": 2.7076241771394374e-05, "loss": 9.4578, "step": 26150 }, { "epoch": 4.699281867145422, "grad_norm": 37.08427429199219, "learning_rate": 2.7070257330939557e-05, "loss": 9.5587, "step": 26175 }, { "epoch": 4.703770197486535, "grad_norm": 33.42496109008789, "learning_rate": 2.706427289048474e-05, "loss": 9.6455, "step": 26200 }, { "epoch": 4.708258527827648, "grad_norm": 38.109561920166016, "learning_rate": 2.7058288450029925e-05, "loss": 9.4372, "step": 26225 }, { "epoch": 4.712746858168761, "grad_norm": 34.73807907104492, "learning_rate": 2.7052304009575107e-05, "loss": 9.2668, "step": 26250 }, { "epoch": 4.717235188509874, "grad_norm": 35.39613723754883, "learning_rate": 2.704631956912029e-05, "loss": 9.3931, "step": 26275 }, { "epoch": 4.721723518850988, "grad_norm": 35.8447380065918, "learning_rate": 2.7040335128665472e-05, "loss": 9.5386, "step": 26300 }, { "epoch": 4.726211849192101, "grad_norm": 38.25541305541992, "learning_rate": 2.703435068821065e-05, "loss": 9.2727, "step": 26325 }, { "epoch": 4.730700179533214, "grad_norm": 35.903507232666016, "learning_rate": 2.7028366247755833e-05, "loss": 9.7039, "step": 26350 }, { "epoch": 4.735188509874327, "grad_norm": 35.6234130859375, "learning_rate": 2.702238180730102e-05, "loss": 9.6165, "step": 26375 }, { "epoch": 4.73967684021544, "grad_norm": 37.08405303955078, "learning_rate": 2.70163973668462e-05, "loss": 9.6926, "step": 26400 }, { "epoch": 4.744165170556553, "grad_norm": 32.01731491088867, "learning_rate": 2.7010412926391384e-05, "loss": 9.3151, "step": 26425 }, { "epoch": 4.748653500897666, "grad_norm": 37.30953598022461, "learning_rate": 2.7004428485936566e-05, "loss": 9.4753, "step": 26450 }, { "epoch": 4.753141831238779, "grad_norm": 37.31596755981445, "learning_rate": 2.699844404548175e-05, "loss": 9.4824, "step": 26475 }, { "epoch": 4.757630161579892, "grad_norm": 35.827213287353516, "learning_rate": 2.699245960502693e-05, "loss": 9.5776, "step": 26500 }, { "epoch": 4.762118491921005, "grad_norm": 39.54668045043945, "learning_rate": 2.6986475164572113e-05, "loss": 9.563, "step": 26525 }, { "epoch": 4.766606822262118, "grad_norm": 32.41488265991211, "learning_rate": 2.6980490724117296e-05, "loss": 9.3826, "step": 26550 }, { "epoch": 4.771095152603231, "grad_norm": 36.029666900634766, "learning_rate": 2.6974506283662478e-05, "loss": 9.512, "step": 26575 }, { "epoch": 4.775583482944345, "grad_norm": 32.85836410522461, "learning_rate": 2.696852184320766e-05, "loss": 9.4177, "step": 26600 }, { "epoch": 4.780071813285458, "grad_norm": 32.541988372802734, "learning_rate": 2.6962537402752843e-05, "loss": 9.3833, "step": 26625 }, { "epoch": 4.784560143626571, "grad_norm": 35.42625045776367, "learning_rate": 2.695655296229803e-05, "loss": 9.5967, "step": 26650 }, { "epoch": 4.789048473967684, "grad_norm": 39.130592346191406, "learning_rate": 2.695056852184321e-05, "loss": 9.4185, "step": 26675 }, { "epoch": 4.793536804308797, "grad_norm": 37.135032653808594, "learning_rate": 2.694458408138839e-05, "loss": 9.4166, "step": 26700 }, { "epoch": 4.79802513464991, "grad_norm": 32.95221710205078, "learning_rate": 2.6938599640933572e-05, "loss": 9.5325, "step": 26725 }, { "epoch": 4.802513464991024, "grad_norm": 34.23844528198242, "learning_rate": 2.6932615200478754e-05, "loss": 9.5948, "step": 26750 }, { "epoch": 4.807001795332137, "grad_norm": 36.37213134765625, "learning_rate": 2.6926630760023937e-05, "loss": 9.1881, "step": 26775 }, { "epoch": 4.81149012567325, "grad_norm": 37.12689971923828, "learning_rate": 2.6920646319569123e-05, "loss": 9.4443, "step": 26800 }, { "epoch": 4.815978456014363, "grad_norm": 30.90703582763672, "learning_rate": 2.6914661879114305e-05, "loss": 9.1284, "step": 26825 }, { "epoch": 4.8204667863554755, "grad_norm": 34.2583122253418, "learning_rate": 2.6908677438659487e-05, "loss": 9.1908, "step": 26850 }, { "epoch": 4.8249551166965885, "grad_norm": 36.532203674316406, "learning_rate": 2.690269299820467e-05, "loss": 9.3999, "step": 26875 }, { "epoch": 4.829443447037702, "grad_norm": 36.42616271972656, "learning_rate": 2.689670855774985e-05, "loss": 9.448, "step": 26900 }, { "epoch": 4.833931777378815, "grad_norm": 37.477928161621094, "learning_rate": 2.689072411729503e-05, "loss": 9.5414, "step": 26925 }, { "epoch": 4.838420107719928, "grad_norm": 36.44997024536133, "learning_rate": 2.6884739676840217e-05, "loss": 9.406, "step": 26950 }, { "epoch": 4.842908438061041, "grad_norm": 34.89653396606445, "learning_rate": 2.68787552363854e-05, "loss": 9.8373, "step": 26975 }, { "epoch": 4.847396768402154, "grad_norm": 34.84752655029297, "learning_rate": 2.687277079593058e-05, "loss": 9.3907, "step": 27000 }, { "epoch": 4.851885098743267, "grad_norm": 33.79581069946289, "learning_rate": 2.6866786355475764e-05, "loss": 9.4444, "step": 27025 }, { "epoch": 4.856373429084381, "grad_norm": 34.37635040283203, "learning_rate": 2.6860801915020946e-05, "loss": 9.5671, "step": 27050 }, { "epoch": 4.860861759425494, "grad_norm": 35.371822357177734, "learning_rate": 2.685481747456613e-05, "loss": 9.5015, "step": 27075 }, { "epoch": 4.865350089766607, "grad_norm": 38.23295211791992, "learning_rate": 2.684883303411131e-05, "loss": 9.3564, "step": 27100 }, { "epoch": 4.86983842010772, "grad_norm": 36.58891296386719, "learning_rate": 2.6842848593656493e-05, "loss": 9.6106, "step": 27125 }, { "epoch": 4.874326750448833, "grad_norm": 38.73398208618164, "learning_rate": 2.6836864153201676e-05, "loss": 9.5011, "step": 27150 }, { "epoch": 4.878815080789947, "grad_norm": 34.134403228759766, "learning_rate": 2.6830879712746858e-05, "loss": 9.5416, "step": 27175 }, { "epoch": 4.88330341113106, "grad_norm": 34.43739318847656, "learning_rate": 2.682489527229204e-05, "loss": 9.4023, "step": 27200 }, { "epoch": 4.8877917414721725, "grad_norm": 33.59444808959961, "learning_rate": 2.6818910831837226e-05, "loss": 9.7006, "step": 27225 }, { "epoch": 4.8922800718132855, "grad_norm": 37.26764678955078, "learning_rate": 2.681292639138241e-05, "loss": 9.5618, "step": 27250 }, { "epoch": 4.8967684021543985, "grad_norm": 34.99287033081055, "learning_rate": 2.680694195092759e-05, "loss": 9.181, "step": 27275 }, { "epoch": 4.901256732495511, "grad_norm": 37.341121673583984, "learning_rate": 2.680095751047277e-05, "loss": 9.0808, "step": 27300 }, { "epoch": 4.905745062836624, "grad_norm": 31.948301315307617, "learning_rate": 2.6794973070017952e-05, "loss": 9.2991, "step": 27325 }, { "epoch": 4.910233393177738, "grad_norm": 31.787092208862305, "learning_rate": 2.6788988629563134e-05, "loss": 9.7063, "step": 27350 }, { "epoch": 4.914721723518851, "grad_norm": 33.72126007080078, "learning_rate": 2.678300418910832e-05, "loss": 9.445, "step": 27375 }, { "epoch": 4.919210053859964, "grad_norm": 35.92157745361328, "learning_rate": 2.6777019748653503e-05, "loss": 9.8848, "step": 27400 }, { "epoch": 4.923698384201077, "grad_norm": 35.00507354736328, "learning_rate": 2.6771035308198685e-05, "loss": 9.4444, "step": 27425 }, { "epoch": 4.92818671454219, "grad_norm": 35.75861358642578, "learning_rate": 2.6765050867743867e-05, "loss": 9.8597, "step": 27450 }, { "epoch": 4.932675044883304, "grad_norm": 37.223167419433594, "learning_rate": 2.675906642728905e-05, "loss": 9.5199, "step": 27475 }, { "epoch": 4.937163375224417, "grad_norm": 34.89140319824219, "learning_rate": 2.675308198683423e-05, "loss": 9.5326, "step": 27500 }, { "epoch": 4.94165170556553, "grad_norm": 38.68606948852539, "learning_rate": 2.6747097546379414e-05, "loss": 9.6149, "step": 27525 }, { "epoch": 4.946140035906643, "grad_norm": 35.76506805419922, "learning_rate": 2.6741113105924597e-05, "loss": 9.2212, "step": 27550 }, { "epoch": 4.950628366247756, "grad_norm": 34.05699920654297, "learning_rate": 2.673512866546978e-05, "loss": 9.3154, "step": 27575 }, { "epoch": 4.955116696588869, "grad_norm": 35.53427505493164, "learning_rate": 2.672914422501496e-05, "loss": 9.6563, "step": 27600 }, { "epoch": 4.959605026929982, "grad_norm": 33.76486587524414, "learning_rate": 2.6723159784560144e-05, "loss": 9.5415, "step": 27625 }, { "epoch": 4.9640933572710955, "grad_norm": 33.02145767211914, "learning_rate": 2.671717534410533e-05, "loss": 9.3559, "step": 27650 }, { "epoch": 4.968581687612208, "grad_norm": 38.21705627441406, "learning_rate": 2.6711190903650512e-05, "loss": 9.6168, "step": 27675 }, { "epoch": 4.973070017953321, "grad_norm": 37.642417907714844, "learning_rate": 2.670520646319569e-05, "loss": 9.5134, "step": 27700 }, { "epoch": 4.977558348294434, "grad_norm": 33.83686828613281, "learning_rate": 2.6699222022740873e-05, "loss": 9.4817, "step": 27725 }, { "epoch": 4.982046678635547, "grad_norm": 34.98296356201172, "learning_rate": 2.6693237582286056e-05, "loss": 9.6689, "step": 27750 }, { "epoch": 4.986535008976661, "grad_norm": 35.06865692138672, "learning_rate": 2.6687253141831238e-05, "loss": 9.4976, "step": 27775 }, { "epoch": 4.991023339317774, "grad_norm": 34.293113708496094, "learning_rate": 2.6681268701376424e-05, "loss": 9.3905, "step": 27800 }, { "epoch": 4.995511669658887, "grad_norm": 34.20943832397461, "learning_rate": 2.6675284260921606e-05, "loss": 9.6572, "step": 27825 }, { "epoch": 5.0, "grad_norm": 61.015777587890625, "learning_rate": 2.666929982046679e-05, "loss": 9.5296, "step": 27850 }, { "epoch": 5.0, "eval_accuracy": 0.06898592786359384, "eval_f1_macro": 0.011309476538890082, "eval_f1_micro": 0.06898592786359384, "eval_f1_weighted": 0.047429091612135786, "eval_loss": 6.66681432723999, "eval_precision_macro": 0.010315927310867975, "eval_precision_micro": 0.06898592786359384, "eval_precision_weighted": 0.041722656687824905, "eval_recall_macro": 0.016669465188724426, "eval_recall_micro": 0.06898592786359384, "eval_recall_weighted": 0.06898592786359384, "eval_runtime": 83.3116, "eval_samples_per_second": 628.64, "eval_steps_per_second": 19.649, "step": 27850 } ], "logging_steps": 25, "max_steps": 139250, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.8218004536284e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }