{
"best_metric": 6.66681432723999,
"best_model_checkpoint": "ModernBERT-base-dnb/checkpoint-27850",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 27850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004488330341113106,
"grad_norm": 21.10218048095703,
"learning_rate": 5.3859964093357274e-08,
"loss": 20.235,
"step": 25
},
{
"epoch": 0.008976660682226212,
"grad_norm": 21.044858932495117,
"learning_rate": 1.0556552962298026e-07,
"loss": 20.2822,
"step": 50
},
{
"epoch": 0.013464991023339317,
"grad_norm": 15.749917984008789,
"learning_rate": 1.5942549371633754e-07,
"loss": 20.2111,
"step": 75
},
{
"epoch": 0.017953321364452424,
"grad_norm": 53.41726303100586,
"learning_rate": 2.1113105924596052e-07,
"loss": 20.219,
"step": 100
},
{
"epoch": 0.02244165170556553,
"grad_norm": 19.08318328857422,
"learning_rate": 2.649910233393178e-07,
"loss": 20.2098,
"step": 125
},
{
"epoch": 0.026929982046678635,
"grad_norm": 17.500648498535156,
"learning_rate": 3.188509874326751e-07,
"loss": 20.1833,
"step": 150
},
{
"epoch": 0.03141831238779174,
"grad_norm": 15.890292167663574,
"learning_rate": 3.7271095152603236e-07,
"loss": 20.1506,
"step": 175
},
{
"epoch": 0.03590664272890485,
"grad_norm": 15.574706077575684,
"learning_rate": 4.265709156193896e-07,
"loss": 20.2048,
"step": 200
},
{
"epoch": 0.04039497307001795,
"grad_norm": 16.508468627929688,
"learning_rate": 4.804308797127469e-07,
"loss": 20.1701,
"step": 225
},
{
"epoch": 0.04488330341113106,
"grad_norm": 14.636826515197754,
"learning_rate": 5.342908438061041e-07,
"loss": 20.0568,
"step": 250
},
{
"epoch": 0.04937163375224417,
"grad_norm": 12.061074256896973,
"learning_rate": 5.881508078994614e-07,
"loss": 20.0871,
"step": 275
},
{
"epoch": 0.05385996409335727,
"grad_norm": 13.407825469970703,
"learning_rate": 6.420107719928187e-07,
"loss": 20.116,
"step": 300
},
{
"epoch": 0.05834829443447038,
"grad_norm": 14.116663932800293,
"learning_rate": 6.95870736086176e-07,
"loss": 20.1185,
"step": 325
},
{
"epoch": 0.06283662477558348,
"grad_norm": 13.583087921142578,
"learning_rate": 7.497307001795332e-07,
"loss": 20.1201,
"step": 350
},
{
"epoch": 0.06732495511669659,
"grad_norm": 16.15342903137207,
"learning_rate": 8.035906642728905e-07,
"loss": 20.0418,
"step": 375
},
{
"epoch": 0.0718132854578097,
"grad_norm": 11.18100643157959,
"learning_rate": 8.574506283662477e-07,
"loss": 20.0282,
"step": 400
},
{
"epoch": 0.0763016157989228,
"grad_norm": 11.952834129333496,
"learning_rate": 9.113105924596051e-07,
"loss": 20.032,
"step": 425
},
{
"epoch": 0.0807899461400359,
"grad_norm": 11.063940048217773,
"learning_rate": 9.651705565529624e-07,
"loss": 20.0012,
"step": 450
},
{
"epoch": 0.08527827648114901,
"grad_norm": 11.023772239685059,
"learning_rate": 1.0190305206463197e-06,
"loss": 20.0264,
"step": 475
},
{
"epoch": 0.08976660682226212,
"grad_norm": 11.103007316589355,
"learning_rate": 1.072890484739677e-06,
"loss": 19.964,
"step": 500
},
{
"epoch": 0.09425493716337523,
"grad_norm": 12.964080810546875,
"learning_rate": 1.126750448833034e-06,
"loss": 20.0389,
"step": 525
},
{
"epoch": 0.09874326750448834,
"grad_norm": 11.313638687133789,
"learning_rate": 1.1806104129263915e-06,
"loss": 20.0112,
"step": 550
},
{
"epoch": 0.10323159784560143,
"grad_norm": 9.363356590270996,
"learning_rate": 1.2344703770197488e-06,
"loss": 19.9996,
"step": 575
},
{
"epoch": 0.10771992818671454,
"grad_norm": 11.197516441345215,
"learning_rate": 1.2883303411131059e-06,
"loss": 19.9399,
"step": 600
},
{
"epoch": 0.11220825852782765,
"grad_norm": 9.123501777648926,
"learning_rate": 1.3421903052064631e-06,
"loss": 20.0128,
"step": 625
},
{
"epoch": 0.11669658886894076,
"grad_norm": 14.137397766113281,
"learning_rate": 1.3960502692998206e-06,
"loss": 20.0196,
"step": 650
},
{
"epoch": 0.12118491921005387,
"grad_norm": 9.076166152954102,
"learning_rate": 1.449910233393178e-06,
"loss": 19.9431,
"step": 675
},
{
"epoch": 0.12567324955116696,
"grad_norm": 11.227516174316406,
"learning_rate": 1.503770197486535e-06,
"loss": 19.892,
"step": 700
},
{
"epoch": 0.13016157989228008,
"grad_norm": 9.613359451293945,
"learning_rate": 1.5576301615798923e-06,
"loss": 19.9289,
"step": 725
},
{
"epoch": 0.13464991023339318,
"grad_norm": 9.49264907836914,
"learning_rate": 1.6114901256732495e-06,
"loss": 19.886,
"step": 750
},
{
"epoch": 0.13913824057450627,
"grad_norm": 11.677379608154297,
"learning_rate": 1.6653500897666068e-06,
"loss": 19.8904,
"step": 775
},
{
"epoch": 0.1436265709156194,
"grad_norm": 11.741113662719727,
"learning_rate": 1.719210053859964e-06,
"loss": 19.901,
"step": 800
},
{
"epoch": 0.1481149012567325,
"grad_norm": 12.074057579040527,
"learning_rate": 1.7730700179533214e-06,
"loss": 19.903,
"step": 825
},
{
"epoch": 0.1526032315978456,
"grad_norm": 17.792566299438477,
"learning_rate": 1.8269299820466787e-06,
"loss": 19.8621,
"step": 850
},
{
"epoch": 0.1570915619389587,
"grad_norm": 10.892045021057129,
"learning_rate": 1.880789946140036e-06,
"loss": 19.8345,
"step": 875
},
{
"epoch": 0.1615798922800718,
"grad_norm": 11.912057876586914,
"learning_rate": 1.9346499102333932e-06,
"loss": 19.8252,
"step": 900
},
{
"epoch": 0.16606822262118492,
"grad_norm": 11.374829292297363,
"learning_rate": 1.9885098743267503e-06,
"loss": 19.8219,
"step": 925
},
{
"epoch": 0.17055655296229802,
"grad_norm": 11.979461669921875,
"learning_rate": 2.0423698384201078e-06,
"loss": 19.7829,
"step": 950
},
{
"epoch": 0.17504488330341114,
"grad_norm": 12.24255657196045,
"learning_rate": 2.096229802513465e-06,
"loss": 19.7962,
"step": 975
},
{
"epoch": 0.17953321364452424,
"grad_norm": 11.375764846801758,
"learning_rate": 2.150089766606822e-06,
"loss": 19.7298,
"step": 1000
},
{
"epoch": 0.18402154398563733,
"grad_norm": 12.321856498718262,
"learning_rate": 2.20394973070018e-06,
"loss": 19.712,
"step": 1025
},
{
"epoch": 0.18850987432675045,
"grad_norm": 12.325765609741211,
"learning_rate": 2.257809694793537e-06,
"loss": 19.6841,
"step": 1050
},
{
"epoch": 0.19299820466786355,
"grad_norm": 12.724334716796875,
"learning_rate": 2.3116696588868944e-06,
"loss": 19.6671,
"step": 1075
},
{
"epoch": 0.19748653500897667,
"grad_norm": 12.078253746032715,
"learning_rate": 2.3655296229802515e-06,
"loss": 19.6296,
"step": 1100
},
{
"epoch": 0.20197486535008977,
"grad_norm": 13.096288681030273,
"learning_rate": 2.4193895870736085e-06,
"loss": 19.5341,
"step": 1125
},
{
"epoch": 0.20646319569120286,
"grad_norm": 13.682557106018066,
"learning_rate": 2.473249551166966e-06,
"loss": 19.6408,
"step": 1150
},
{
"epoch": 0.21095152603231598,
"grad_norm": 13.981725692749023,
"learning_rate": 2.527109515260323e-06,
"loss": 19.6481,
"step": 1175
},
{
"epoch": 0.21543985637342908,
"grad_norm": 13.742826461791992,
"learning_rate": 2.58096947935368e-06,
"loss": 19.5424,
"step": 1200
},
{
"epoch": 0.2199281867145422,
"grad_norm": 16.546175003051758,
"learning_rate": 2.634829443447038e-06,
"loss": 19.5218,
"step": 1225
},
{
"epoch": 0.2244165170556553,
"grad_norm": 15.267694473266602,
"learning_rate": 2.688689407540395e-06,
"loss": 19.5041,
"step": 1250
},
{
"epoch": 0.2289048473967684,
"grad_norm": 15.6527681350708,
"learning_rate": 2.7425493716337522e-06,
"loss": 19.6472,
"step": 1275
},
{
"epoch": 0.2333931777378815,
"grad_norm": 17.371788024902344,
"learning_rate": 2.7964093357271097e-06,
"loss": 19.4608,
"step": 1300
},
{
"epoch": 0.2378815080789946,
"grad_norm": 15.061119079589844,
"learning_rate": 2.8502692998204668e-06,
"loss": 19.4584,
"step": 1325
},
{
"epoch": 0.24236983842010773,
"grad_norm": 13.703615188598633,
"learning_rate": 2.90197486535009e-06,
"loss": 19.4636,
"step": 1350
},
{
"epoch": 0.24685816876122083,
"grad_norm": 13.755026817321777,
"learning_rate": 2.9558348294434473e-06,
"loss": 19.391,
"step": 1375
},
{
"epoch": 0.2513464991023339,
"grad_norm": 15.358574867248535,
"learning_rate": 3.0096947935368044e-06,
"loss": 19.3758,
"step": 1400
},
{
"epoch": 0.25583482944344704,
"grad_norm": 14.703276634216309,
"learning_rate": 3.063554757630162e-06,
"loss": 19.357,
"step": 1425
},
{
"epoch": 0.26032315978456017,
"grad_norm": 14.63382625579834,
"learning_rate": 3.117414721723519e-06,
"loss": 19.2794,
"step": 1450
},
{
"epoch": 0.26481149012567323,
"grad_norm": 15.199682235717773,
"learning_rate": 3.171274685816876e-06,
"loss": 19.2949,
"step": 1475
},
{
"epoch": 0.26929982046678635,
"grad_norm": 14.768528938293457,
"learning_rate": 3.2251346499102335e-06,
"loss": 19.3093,
"step": 1500
},
{
"epoch": 0.2737881508078995,
"grad_norm": 14.896871566772461,
"learning_rate": 3.2789946140035906e-06,
"loss": 19.2305,
"step": 1525
},
{
"epoch": 0.27827648114901254,
"grad_norm": 15.565362930297852,
"learning_rate": 3.3328545780969477e-06,
"loss": 19.2403,
"step": 1550
},
{
"epoch": 0.28276481149012567,
"grad_norm": 16.003311157226562,
"learning_rate": 3.3867145421903056e-06,
"loss": 19.324,
"step": 1575
},
{
"epoch": 0.2872531418312388,
"grad_norm": 15.933990478515625,
"learning_rate": 3.4405745062836626e-06,
"loss": 19.194,
"step": 1600
},
{
"epoch": 0.2917414721723519,
"grad_norm": 16.488842010498047,
"learning_rate": 3.49443447037702e-06,
"loss": 19.3091,
"step": 1625
},
{
"epoch": 0.296229802513465,
"grad_norm": 15.880293846130371,
"learning_rate": 3.548294434470377e-06,
"loss": 19.1561,
"step": 1650
},
{
"epoch": 0.3007181328545781,
"grad_norm": 15.783681869506836,
"learning_rate": 3.6021543985637343e-06,
"loss": 19.0354,
"step": 1675
},
{
"epoch": 0.3052064631956912,
"grad_norm": 17.726228713989258,
"learning_rate": 3.6560143626570918e-06,
"loss": 19.0885,
"step": 1700
},
{
"epoch": 0.3096947935368043,
"grad_norm": 15.17348575592041,
"learning_rate": 3.709874326750449e-06,
"loss": 19.2063,
"step": 1725
},
{
"epoch": 0.3141831238779174,
"grad_norm": 16.843894958496094,
"learning_rate": 3.763734290843806e-06,
"loss": 18.9388,
"step": 1750
},
{
"epoch": 0.31867145421903054,
"grad_norm": 23.23088264465332,
"learning_rate": 3.817594254937163e-06,
"loss": 19.1281,
"step": 1775
},
{
"epoch": 0.3231597845601436,
"grad_norm": 15.54453182220459,
"learning_rate": 3.8714542190305205e-06,
"loss": 19.1134,
"step": 1800
},
{
"epoch": 0.3276481149012567,
"grad_norm": 16.050777435302734,
"learning_rate": 3.925314183123878e-06,
"loss": 18.9336,
"step": 1825
},
{
"epoch": 0.33213644524236985,
"grad_norm": 15.230576515197754,
"learning_rate": 3.979174147217235e-06,
"loss": 19.063,
"step": 1850
},
{
"epoch": 0.33662477558348297,
"grad_norm": 16.179035186767578,
"learning_rate": 4.0330341113105925e-06,
"loss": 18.9994,
"step": 1875
},
{
"epoch": 0.34111310592459604,
"grad_norm": 16.51089096069336,
"learning_rate": 4.0868940754039504e-06,
"loss": 18.9635,
"step": 1900
},
{
"epoch": 0.34560143626570916,
"grad_norm": 15.68458080291748,
"learning_rate": 4.140754039497307e-06,
"loss": 18.9526,
"step": 1925
},
{
"epoch": 0.3500897666068223,
"grad_norm": 16.986204147338867,
"learning_rate": 4.1946140035906646e-06,
"loss": 19.042,
"step": 1950
},
{
"epoch": 0.35457809694793535,
"grad_norm": 15.371708869934082,
"learning_rate": 4.248473967684022e-06,
"loss": 18.8415,
"step": 1975
},
{
"epoch": 0.3590664272890485,
"grad_norm": 15.591832160949707,
"learning_rate": 4.302333931777379e-06,
"loss": 18.8101,
"step": 2000
},
{
"epoch": 0.3635547576301616,
"grad_norm": 15.236952781677246,
"learning_rate": 4.356193895870736e-06,
"loss": 18.8519,
"step": 2025
},
{
"epoch": 0.36804308797127466,
"grad_norm": 16.580678939819336,
"learning_rate": 4.410053859964094e-06,
"loss": 18.8845,
"step": 2050
},
{
"epoch": 0.3725314183123878,
"grad_norm": 16.18427085876465,
"learning_rate": 4.463913824057451e-06,
"loss": 18.7093,
"step": 2075
},
{
"epoch": 0.3770197486535009,
"grad_norm": 18.2146053314209,
"learning_rate": 4.517773788150808e-06,
"loss": 18.777,
"step": 2100
},
{
"epoch": 0.38150807899461403,
"grad_norm": 16.10597038269043,
"learning_rate": 4.571633752244166e-06,
"loss": 18.7469,
"step": 2125
},
{
"epoch": 0.3859964093357271,
"grad_norm": 15.864044189453125,
"learning_rate": 4.625493716337523e-06,
"loss": 18.8742,
"step": 2150
},
{
"epoch": 0.3904847396768402,
"grad_norm": 18.99787139892578,
"learning_rate": 4.67935368043088e-06,
"loss": 18.7297,
"step": 2175
},
{
"epoch": 0.39497307001795334,
"grad_norm": 16.34602928161621,
"learning_rate": 4.733213644524237e-06,
"loss": 18.6701,
"step": 2200
},
{
"epoch": 0.3994614003590664,
"grad_norm": 16.323881149291992,
"learning_rate": 4.787073608617595e-06,
"loss": 18.7605,
"step": 2225
},
{
"epoch": 0.40394973070017953,
"grad_norm": 16.251663208007812,
"learning_rate": 4.840933572710951e-06,
"loss": 18.736,
"step": 2250
},
{
"epoch": 0.40843806104129265,
"grad_norm": 16.65889549255371,
"learning_rate": 4.894793536804309e-06,
"loss": 18.7178,
"step": 2275
},
{
"epoch": 0.4129263913824057,
"grad_norm": 16.258752822875977,
"learning_rate": 4.948653500897667e-06,
"loss": 18.5482,
"step": 2300
},
{
"epoch": 0.41741472172351884,
"grad_norm": 15.998190879821777,
"learning_rate": 5.002513464991023e-06,
"loss": 18.7417,
"step": 2325
},
{
"epoch": 0.42190305206463197,
"grad_norm": 14.841017723083496,
"learning_rate": 5.056373429084381e-06,
"loss": 18.6716,
"step": 2350
},
{
"epoch": 0.4263913824057451,
"grad_norm": 15.91247272491455,
"learning_rate": 5.110233393177738e-06,
"loss": 18.688,
"step": 2375
},
{
"epoch": 0.43087971274685816,
"grad_norm": 17.436525344848633,
"learning_rate": 5.164093357271095e-06,
"loss": 18.5867,
"step": 2400
},
{
"epoch": 0.4353680430879713,
"grad_norm": 16.675212860107422,
"learning_rate": 5.217953321364452e-06,
"loss": 18.7864,
"step": 2425
},
{
"epoch": 0.4398563734290844,
"grad_norm": 16.56376075744629,
"learning_rate": 5.27181328545781e-06,
"loss": 18.469,
"step": 2450
},
{
"epoch": 0.44434470377019747,
"grad_norm": 16.11998176574707,
"learning_rate": 5.325673249551166e-06,
"loss": 18.528,
"step": 2475
},
{
"epoch": 0.4488330341113106,
"grad_norm": 16.21501922607422,
"learning_rate": 5.379533213644524e-06,
"loss": 18.5634,
"step": 2500
},
{
"epoch": 0.4533213644524237,
"grad_norm": 17.891937255859375,
"learning_rate": 5.433393177737882e-06,
"loss": 18.5037,
"step": 2525
},
{
"epoch": 0.4578096947935368,
"grad_norm": 18.845378875732422,
"learning_rate": 5.4872531418312385e-06,
"loss": 18.4701,
"step": 2550
},
{
"epoch": 0.4622980251346499,
"grad_norm": 14.12865924835205,
"learning_rate": 5.541113105924596e-06,
"loss": 18.5924,
"step": 2575
},
{
"epoch": 0.466786355475763,
"grad_norm": 19.014360427856445,
"learning_rate": 5.5949730700179534e-06,
"loss": 18.4597,
"step": 2600
},
{
"epoch": 0.47127468581687615,
"grad_norm": 16.36025047302246,
"learning_rate": 5.6488330341113105e-06,
"loss": 18.6834,
"step": 2625
},
{
"epoch": 0.4757630161579892,
"grad_norm": 16.406009674072266,
"learning_rate": 5.7026929982046676e-06,
"loss": 18.4796,
"step": 2650
},
{
"epoch": 0.48025134649910234,
"grad_norm": 16.516311645507812,
"learning_rate": 5.7565529622980255e-06,
"loss": 18.408,
"step": 2675
},
{
"epoch": 0.48473967684021546,
"grad_norm": 15.522378921508789,
"learning_rate": 5.8104129263913826e-06,
"loss": 18.664,
"step": 2700
},
{
"epoch": 0.48922800718132853,
"grad_norm": 25.351648330688477,
"learning_rate": 5.86427289048474e-06,
"loss": 18.4765,
"step": 2725
},
{
"epoch": 0.49371633752244165,
"grad_norm": 15.928828239440918,
"learning_rate": 5.9181328545780975e-06,
"loss": 18.5569,
"step": 2750
},
{
"epoch": 0.4982046678635548,
"grad_norm": 14.958181381225586,
"learning_rate": 5.971992818671455e-06,
"loss": 18.4793,
"step": 2775
},
{
"epoch": 0.5026929982046678,
"grad_norm": 16.40975570678711,
"learning_rate": 6.025852782764812e-06,
"loss": 18.3546,
"step": 2800
},
{
"epoch": 0.507181328545781,
"grad_norm": 15.906976699829102,
"learning_rate": 6.079712746858169e-06,
"loss": 18.4363,
"step": 2825
},
{
"epoch": 0.5116696588868941,
"grad_norm": 15.144857406616211,
"learning_rate": 6.133572710951527e-06,
"loss": 18.3984,
"step": 2850
},
{
"epoch": 0.5161579892280072,
"grad_norm": 16.463176727294922,
"learning_rate": 6.187432675044883e-06,
"loss": 18.2991,
"step": 2875
},
{
"epoch": 0.5206463195691203,
"grad_norm": 15.463627815246582,
"learning_rate": 6.241292639138241e-06,
"loss": 18.694,
"step": 2900
},
{
"epoch": 0.5251346499102334,
"grad_norm": 17.69695281982422,
"learning_rate": 6.295152603231599e-06,
"loss": 18.2619,
"step": 2925
},
{
"epoch": 0.5296229802513465,
"grad_norm": 17.653112411499023,
"learning_rate": 6.349012567324955e-06,
"loss": 18.19,
"step": 2950
},
{
"epoch": 0.5341113105924596,
"grad_norm": 16.84458351135254,
"learning_rate": 6.402872531418313e-06,
"loss": 18.3045,
"step": 2975
},
{
"epoch": 0.5385996409335727,
"grad_norm": 16.69748878479004,
"learning_rate": 6.45673249551167e-06,
"loss": 18.2229,
"step": 3000
},
{
"epoch": 0.5430879712746858,
"grad_norm": 20.153520584106445,
"learning_rate": 6.510592459605027e-06,
"loss": 18.3158,
"step": 3025
},
{
"epoch": 0.547576301615799,
"grad_norm": 18.79121208190918,
"learning_rate": 6.564452423698384e-06,
"loss": 18.3017,
"step": 3050
},
{
"epoch": 0.552064631956912,
"grad_norm": 16.678478240966797,
"learning_rate": 6.618312387791742e-06,
"loss": 18.39,
"step": 3075
},
{
"epoch": 0.5565529622980251,
"grad_norm": 22.23566246032715,
"learning_rate": 6.672172351885098e-06,
"loss": 18.3635,
"step": 3100
},
{
"epoch": 0.5610412926391383,
"grad_norm": 17.27984619140625,
"learning_rate": 6.726032315978456e-06,
"loss": 18.1439,
"step": 3125
},
{
"epoch": 0.5655296229802513,
"grad_norm": 16.337604522705078,
"learning_rate": 6.779892280071814e-06,
"loss": 18.378,
"step": 3150
},
{
"epoch": 0.5700179533213644,
"grad_norm": 16.56863784790039,
"learning_rate": 6.83375224416517e-06,
"loss": 18.192,
"step": 3175
},
{
"epoch": 0.5745062836624776,
"grad_norm": 17.132043838500977,
"learning_rate": 6.887612208258528e-06,
"loss": 18.3463,
"step": 3200
},
{
"epoch": 0.5789946140035906,
"grad_norm": 17.860429763793945,
"learning_rate": 6.941472172351885e-06,
"loss": 18.2454,
"step": 3225
},
{
"epoch": 0.5834829443447038,
"grad_norm": 14.994222640991211,
"learning_rate": 6.995332136445242e-06,
"loss": 18.3204,
"step": 3250
},
{
"epoch": 0.5879712746858169,
"grad_norm": 16.782873153686523,
"learning_rate": 7.049192100538599e-06,
"loss": 18.2429,
"step": 3275
},
{
"epoch": 0.59245960502693,
"grad_norm": 15.746573448181152,
"learning_rate": 7.103052064631957e-06,
"loss": 18.0945,
"step": 3300
},
{
"epoch": 0.5969479353680431,
"grad_norm": 17.83595848083496,
"learning_rate": 7.156912028725314e-06,
"loss": 18.3612,
"step": 3325
},
{
"epoch": 0.6014362657091562,
"grad_norm": 17.314441680908203,
"learning_rate": 7.2107719928186714e-06,
"loss": 18.1783,
"step": 3350
},
{
"epoch": 0.6059245960502693,
"grad_norm": 16.076663970947266,
"learning_rate": 7.264631956912029e-06,
"loss": 18.3625,
"step": 3375
},
{
"epoch": 0.6104129263913824,
"grad_norm": 16.62413215637207,
"learning_rate": 7.318491921005386e-06,
"loss": 18.3717,
"step": 3400
},
{
"epoch": 0.6149012567324955,
"grad_norm": 19.139238357543945,
"learning_rate": 7.3723518850987435e-06,
"loss": 18.0023,
"step": 3425
},
{
"epoch": 0.6193895870736086,
"grad_norm": 15.575067520141602,
"learning_rate": 7.4262118491921005e-06,
"loss": 18.3083,
"step": 3450
},
{
"epoch": 0.6238779174147218,
"grad_norm": 18.650287628173828,
"learning_rate": 7.4800718132854585e-06,
"loss": 18.2143,
"step": 3475
},
{
"epoch": 0.6283662477558348,
"grad_norm": 18.52598762512207,
"learning_rate": 7.5339317773788155e-06,
"loss": 18.0261,
"step": 3500
},
{
"epoch": 0.6328545780969479,
"grad_norm": 17.653348922729492,
"learning_rate": 7.587791741472173e-06,
"loss": 17.919,
"step": 3525
},
{
"epoch": 0.6373429084380611,
"grad_norm": 17.140901565551758,
"learning_rate": 7.641651705565529e-06,
"loss": 18.0068,
"step": 3550
},
{
"epoch": 0.6418312387791741,
"grad_norm": 16.913959503173828,
"learning_rate": 7.695511669658888e-06,
"loss": 18.1295,
"step": 3575
},
{
"epoch": 0.6463195691202872,
"grad_norm": 18.763505935668945,
"learning_rate": 7.749371633752245e-06,
"loss": 17.8775,
"step": 3600
},
{
"epoch": 0.6508078994614004,
"grad_norm": 16.92535400390625,
"learning_rate": 7.803231597845602e-06,
"loss": 18.1007,
"step": 3625
},
{
"epoch": 0.6552962298025135,
"grad_norm": 19.66353988647461,
"learning_rate": 7.857091561938959e-06,
"loss": 17.9173,
"step": 3650
},
{
"epoch": 0.6597845601436265,
"grad_norm": 17.924484252929688,
"learning_rate": 7.910951526032318e-06,
"loss": 17.9126,
"step": 3675
},
{
"epoch": 0.6642728904847397,
"grad_norm": 18.674997329711914,
"learning_rate": 7.964811490125673e-06,
"loss": 18.0945,
"step": 3700
},
{
"epoch": 0.6687612208258528,
"grad_norm": 17.291763305664062,
"learning_rate": 8.01867145421903e-06,
"loss": 18.1315,
"step": 3725
},
{
"epoch": 0.6732495511669659,
"grad_norm": 18.881961822509766,
"learning_rate": 8.072531418312387e-06,
"loss": 17.8323,
"step": 3750
},
{
"epoch": 0.677737881508079,
"grad_norm": 18.964895248413086,
"learning_rate": 8.126391382405746e-06,
"loss": 17.8395,
"step": 3775
},
{
"epoch": 0.6822262118491921,
"grad_norm": 16.793581008911133,
"learning_rate": 8.180251346499103e-06,
"loss": 18.0268,
"step": 3800
},
{
"epoch": 0.6867145421903053,
"grad_norm": 16.940654754638672,
"learning_rate": 8.23411131059246e-06,
"loss": 17.9372,
"step": 3825
},
{
"epoch": 0.6912028725314183,
"grad_norm": 16.80086898803711,
"learning_rate": 8.287971274685817e-06,
"loss": 17.7917,
"step": 3850
},
{
"epoch": 0.6956912028725314,
"grad_norm": 21.22157859802246,
"learning_rate": 8.341831238779174e-06,
"loss": 18.1762,
"step": 3875
},
{
"epoch": 0.7001795332136446,
"grad_norm": 18.476032257080078,
"learning_rate": 8.395691202872531e-06,
"loss": 17.7716,
"step": 3900
},
{
"epoch": 0.7046678635547576,
"grad_norm": 15.498908996582031,
"learning_rate": 8.447396768402154e-06,
"loss": 18.0372,
"step": 3925
},
{
"epoch": 0.7091561938958707,
"grad_norm": 18.895370483398438,
"learning_rate": 8.501256732495513e-06,
"loss": 18.0482,
"step": 3950
},
{
"epoch": 0.7136445242369839,
"grad_norm": 21.163408279418945,
"learning_rate": 8.55511669658887e-06,
"loss": 17.745,
"step": 3975
},
{
"epoch": 0.718132854578097,
"grad_norm": 15.291622161865234,
"learning_rate": 8.608976660682225e-06,
"loss": 17.9657,
"step": 4000
},
{
"epoch": 0.72262118491921,
"grad_norm": 19.777557373046875,
"learning_rate": 8.662836624775583e-06,
"loss": 17.7042,
"step": 4025
},
{
"epoch": 0.7271095152603232,
"grad_norm": 18.978593826293945,
"learning_rate": 8.716696588868941e-06,
"loss": 17.7297,
"step": 4050
},
{
"epoch": 0.7315978456014363,
"grad_norm": 21.442642211914062,
"learning_rate": 8.770556552962298e-06,
"loss": 17.8813,
"step": 4075
},
{
"epoch": 0.7360861759425493,
"grad_norm": 17.907501220703125,
"learning_rate": 8.824416517055655e-06,
"loss": 17.7841,
"step": 4100
},
{
"epoch": 0.7405745062836625,
"grad_norm": 18.066967010498047,
"learning_rate": 8.878276481149014e-06,
"loss": 17.8453,
"step": 4125
},
{
"epoch": 0.7450628366247756,
"grad_norm": 20.10198974609375,
"learning_rate": 8.93213644524237e-06,
"loss": 17.8353,
"step": 4150
},
{
"epoch": 0.7495511669658886,
"grad_norm": 16.845745086669922,
"learning_rate": 8.985996409335727e-06,
"loss": 17.8432,
"step": 4175
},
{
"epoch": 0.7540394973070018,
"grad_norm": 18.617088317871094,
"learning_rate": 9.039856373429084e-06,
"loss": 17.8202,
"step": 4200
},
{
"epoch": 0.7585278276481149,
"grad_norm": 19.578916549682617,
"learning_rate": 9.093716337522442e-06,
"loss": 17.7985,
"step": 4225
},
{
"epoch": 0.7630161579892281,
"grad_norm": 16.22163200378418,
"learning_rate": 9.1475763016158e-06,
"loss": 17.9493,
"step": 4250
},
{
"epoch": 0.7675044883303411,
"grad_norm": 18.09850311279297,
"learning_rate": 9.201436265709157e-06,
"loss": 17.7751,
"step": 4275
},
{
"epoch": 0.7719928186714542,
"grad_norm": 18.09538459777832,
"learning_rate": 9.255296229802514e-06,
"loss": 17.7249,
"step": 4300
},
{
"epoch": 0.7764811490125674,
"grad_norm": 15.956615447998047,
"learning_rate": 9.30915619389587e-06,
"loss": 17.9555,
"step": 4325
},
{
"epoch": 0.7809694793536804,
"grad_norm": 17.795026779174805,
"learning_rate": 9.363016157989228e-06,
"loss": 17.7529,
"step": 4350
},
{
"epoch": 0.7854578096947935,
"grad_norm": 18.07413101196289,
"learning_rate": 9.416876122082585e-06,
"loss": 17.5449,
"step": 4375
},
{
"epoch": 0.7899461400359067,
"grad_norm": 19.664108276367188,
"learning_rate": 9.470736086175944e-06,
"loss": 17.6976,
"step": 4400
},
{
"epoch": 0.7944344703770198,
"grad_norm": 21.336183547973633,
"learning_rate": 9.5245960502693e-06,
"loss": 17.7112,
"step": 4425
},
{
"epoch": 0.7989228007181328,
"grad_norm": 16.216899871826172,
"learning_rate": 9.578456014362658e-06,
"loss": 17.6466,
"step": 4450
},
{
"epoch": 0.803411131059246,
"grad_norm": 17.492589950561523,
"learning_rate": 9.632315978456013e-06,
"loss": 17.823,
"step": 4475
},
{
"epoch": 0.8078994614003591,
"grad_norm": 19.598114013671875,
"learning_rate": 9.686175942549372e-06,
"loss": 17.6314,
"step": 4500
},
{
"epoch": 0.8123877917414721,
"grad_norm": 18.45696258544922,
"learning_rate": 9.740035906642729e-06,
"loss": 17.6494,
"step": 4525
},
{
"epoch": 0.8168761220825853,
"grad_norm": 17.067623138427734,
"learning_rate": 9.793895870736086e-06,
"loss": 17.5437,
"step": 4550
},
{
"epoch": 0.8213644524236984,
"grad_norm": 22.662578582763672,
"learning_rate": 9.847755834829445e-06,
"loss": 17.6709,
"step": 4575
},
{
"epoch": 0.8258527827648114,
"grad_norm": 17.072893142700195,
"learning_rate": 9.901615798922802e-06,
"loss": 17.7612,
"step": 4600
},
{
"epoch": 0.8303411131059246,
"grad_norm": 18.79728889465332,
"learning_rate": 9.955475763016157e-06,
"loss": 17.7251,
"step": 4625
},
{
"epoch": 0.8348294434470377,
"grad_norm": 18.312700271606445,
"learning_rate": 1.0009335727109514e-05,
"loss": 17.5846,
"step": 4650
},
{
"epoch": 0.8393177737881508,
"grad_norm": 18.2193660736084,
"learning_rate": 1.0063195691202873e-05,
"loss": 17.463,
"step": 4675
},
{
"epoch": 0.8438061041292639,
"grad_norm": 20.095277786254883,
"learning_rate": 1.011705565529623e-05,
"loss": 17.1827,
"step": 4700
},
{
"epoch": 0.848294434470377,
"grad_norm": 20.12647247314453,
"learning_rate": 1.0170915619389587e-05,
"loss": 17.4883,
"step": 4725
},
{
"epoch": 0.8527827648114902,
"grad_norm": 18.37622833251953,
"learning_rate": 1.0224775583482946e-05,
"loss": 17.5654,
"step": 4750
},
{
"epoch": 0.8572710951526032,
"grad_norm": 18.199060440063477,
"learning_rate": 1.0278635547576303e-05,
"loss": 17.526,
"step": 4775
},
{
"epoch": 0.8617594254937163,
"grad_norm": 20.442859649658203,
"learning_rate": 1.0332495511669658e-05,
"loss": 17.6952,
"step": 4800
},
{
"epoch": 0.8662477558348295,
"grad_norm": 18.540332794189453,
"learning_rate": 1.0386355475763015e-05,
"loss": 17.5526,
"step": 4825
},
{
"epoch": 0.8707360861759426,
"grad_norm": 19.37755012512207,
"learning_rate": 1.0440215439856374e-05,
"loss": 17.463,
"step": 4850
},
{
"epoch": 0.8752244165170556,
"grad_norm": 19.95784568786621,
"learning_rate": 1.0494075403949731e-05,
"loss": 17.671,
"step": 4875
},
{
"epoch": 0.8797127468581688,
"grad_norm": 17.712932586669922,
"learning_rate": 1.0547935368043088e-05,
"loss": 17.4896,
"step": 4900
},
{
"epoch": 0.8842010771992819,
"grad_norm": 19.43058967590332,
"learning_rate": 1.0601795332136445e-05,
"loss": 17.5935,
"step": 4925
},
{
"epoch": 0.8886894075403949,
"grad_norm": 20.811893463134766,
"learning_rate": 1.0655655296229803e-05,
"loss": 17.5003,
"step": 4950
},
{
"epoch": 0.8931777378815081,
"grad_norm": 21.070432662963867,
"learning_rate": 1.070951526032316e-05,
"loss": 17.479,
"step": 4975
},
{
"epoch": 0.8976660682226212,
"grad_norm": 17.394657135009766,
"learning_rate": 1.0763375224416517e-05,
"loss": 17.5536,
"step": 5000
},
{
"epoch": 0.9021543985637342,
"grad_norm": 19.602218627929688,
"learning_rate": 1.0817235188509875e-05,
"loss": 17.4437,
"step": 5025
},
{
"epoch": 0.9066427289048474,
"grad_norm": 21.4727783203125,
"learning_rate": 1.0871095152603232e-05,
"loss": 17.3718,
"step": 5050
},
{
"epoch": 0.9111310592459605,
"grad_norm": 18.7939510345459,
"learning_rate": 1.092495511669659e-05,
"loss": 17.1782,
"step": 5075
},
{
"epoch": 0.9156193895870736,
"grad_norm": 21.373146057128906,
"learning_rate": 1.0978815080789945e-05,
"loss": 17.0867,
"step": 5100
},
{
"epoch": 0.9201077199281867,
"grad_norm": 22.122276306152344,
"learning_rate": 1.1032675044883304e-05,
"loss": 17.5058,
"step": 5125
},
{
"epoch": 0.9245960502692998,
"grad_norm": 20.753555297851562,
"learning_rate": 1.108653500897666e-05,
"loss": 16.9196,
"step": 5150
},
{
"epoch": 0.9290843806104129,
"grad_norm": 20.26563262939453,
"learning_rate": 1.1140394973070018e-05,
"loss": 17.3307,
"step": 5175
},
{
"epoch": 0.933572710951526,
"grad_norm": 19.536109924316406,
"learning_rate": 1.1194254937163377e-05,
"loss": 17.2833,
"step": 5200
},
{
"epoch": 0.9380610412926391,
"grad_norm": 21.98984146118164,
"learning_rate": 1.1248114901256734e-05,
"loss": 17.3079,
"step": 5225
},
{
"epoch": 0.9425493716337523,
"grad_norm": 20.069507598876953,
"learning_rate": 1.1301974865350089e-05,
"loss": 17.3164,
"step": 5250
},
{
"epoch": 0.9470377019748654,
"grad_norm": 19.031282424926758,
"learning_rate": 1.1355834829443446e-05,
"loss": 17.4307,
"step": 5275
},
{
"epoch": 0.9515260323159784,
"grad_norm": 21.127609252929688,
"learning_rate": 1.1409694793536805e-05,
"loss": 17.0626,
"step": 5300
},
{
"epoch": 0.9560143626570916,
"grad_norm": 22.808317184448242,
"learning_rate": 1.1463554757630162e-05,
"loss": 17.2784,
"step": 5325
},
{
"epoch": 0.9605026929982047,
"grad_norm": 20.546794891357422,
"learning_rate": 1.1517414721723519e-05,
"loss": 17.1925,
"step": 5350
},
{
"epoch": 0.9649910233393177,
"grad_norm": 18.644824981689453,
"learning_rate": 1.1571274685816878e-05,
"loss": 17.1864,
"step": 5375
},
{
"epoch": 0.9694793536804309,
"grad_norm": 19.968189239501953,
"learning_rate": 1.1625134649910235e-05,
"loss": 17.1675,
"step": 5400
},
{
"epoch": 0.973967684021544,
"grad_norm": 19.67746925354004,
"learning_rate": 1.167899461400359e-05,
"loss": 17.1256,
"step": 5425
},
{
"epoch": 0.9784560143626571,
"grad_norm": 21.387107849121094,
"learning_rate": 1.1732854578096947e-05,
"loss": 17.411,
"step": 5450
},
{
"epoch": 0.9829443447037702,
"grad_norm": 17.536405563354492,
"learning_rate": 1.1786714542190306e-05,
"loss": 17.1895,
"step": 5475
},
{
"epoch": 0.9874326750448833,
"grad_norm": 21.705129623413086,
"learning_rate": 1.1840574506283663e-05,
"loss": 17.3247,
"step": 5500
},
{
"epoch": 0.9919210053859964,
"grad_norm": 20.925796508789062,
"learning_rate": 1.189443447037702e-05,
"loss": 17.2336,
"step": 5525
},
{
"epoch": 0.9964093357271095,
"grad_norm": 19.501977920532227,
"learning_rate": 1.1948294434470377e-05,
"loss": 16.939,
"step": 5550
},
{
"epoch": 1.0,
"eval_accuracy": 0.024535543123365092,
"eval_f1_macro": 0.00018617887832660077,
"eval_f1_micro": 0.024535543123365092,
"eval_f1_weighted": 0.005391520089441958,
"eval_loss": 8.812005996704102,
"eval_precision_macro": 0.0002191599995913754,
"eval_precision_micro": 0.024535543123365092,
"eval_precision_weighted": 0.004721591271214293,
"eval_recall_macro": 0.000766160477322559,
"eval_recall_micro": 0.024535543123365092,
"eval_recall_weighted": 0.024535543123365092,
"eval_runtime": 128.9376,
"eval_samples_per_second": 406.189,
"eval_steps_per_second": 12.696,
"step": 5570
},
{
"epoch": 1.0008976660682227,
"grad_norm": 19.746585845947266,
"learning_rate": 1.2002154398563734e-05,
"loss": 16.8127,
"step": 5575
},
{
"epoch": 1.0053859964093357,
"grad_norm": 20.92815589904785,
"learning_rate": 1.2056014362657091e-05,
"loss": 16.6227,
"step": 5600
},
{
"epoch": 1.0098743267504489,
"grad_norm": 23.12251091003418,
"learning_rate": 1.2109874326750448e-05,
"loss": 16.8028,
"step": 5625
},
{
"epoch": 1.014362657091562,
"grad_norm": 21.773548126220703,
"learning_rate": 1.2163734290843807e-05,
"loss": 16.9285,
"step": 5650
},
{
"epoch": 1.018850987432675,
"grad_norm": 18.216033935546875,
"learning_rate": 1.2217594254937164e-05,
"loss": 16.8515,
"step": 5675
},
{
"epoch": 1.0233393177737882,
"grad_norm": 20.353927612304688,
"learning_rate": 1.2271454219030521e-05,
"loss": 16.9787,
"step": 5700
},
{
"epoch": 1.0278276481149013,
"grad_norm": 22.886110305786133,
"learning_rate": 1.2325314183123877e-05,
"loss": 16.7614,
"step": 5725
},
{
"epoch": 1.0323159784560143,
"grad_norm": 21.366548538208008,
"learning_rate": 1.2379174147217235e-05,
"loss": 16.3866,
"step": 5750
},
{
"epoch": 1.0368043087971275,
"grad_norm": 23.675683975219727,
"learning_rate": 1.2433034111310593e-05,
"loss": 16.3334,
"step": 5775
},
{
"epoch": 1.0412926391382407,
"grad_norm": 22.998641967773438,
"learning_rate": 1.248689407540395e-05,
"loss": 16.7228,
"step": 5800
},
{
"epoch": 1.0457809694793536,
"grad_norm": 20.504121780395508,
"learning_rate": 1.2540754039497308e-05,
"loss": 16.5914,
"step": 5825
},
{
"epoch": 1.0502692998204668,
"grad_norm": 22.66668128967285,
"learning_rate": 1.2594614003590665e-05,
"loss": 16.5995,
"step": 5850
},
{
"epoch": 1.05475763016158,
"grad_norm": 20.194726943969727,
"learning_rate": 1.264847396768402e-05,
"loss": 16.7589,
"step": 5875
},
{
"epoch": 1.059245960502693,
"grad_norm": 21.407981872558594,
"learning_rate": 1.2702333931777378e-05,
"loss": 16.6117,
"step": 5900
},
{
"epoch": 1.063734290843806,
"grad_norm": 20.662927627563477,
"learning_rate": 1.2756193895870737e-05,
"loss": 16.5679,
"step": 5925
},
{
"epoch": 1.0682226211849193,
"grad_norm": 24.050336837768555,
"learning_rate": 1.2810053859964094e-05,
"loss": 16.8247,
"step": 5950
},
{
"epoch": 1.0727109515260322,
"grad_norm": 20.72054100036621,
"learning_rate": 1.286391382405745e-05,
"loss": 16.623,
"step": 5975
},
{
"epoch": 1.0771992818671454,
"grad_norm": 23.1834659576416,
"learning_rate": 1.291777378815081e-05,
"loss": 16.4884,
"step": 6000
},
{
"epoch": 1.0816876122082586,
"grad_norm": 21.00957679748535,
"learning_rate": 1.2971633752244167e-05,
"loss": 16.3869,
"step": 6025
},
{
"epoch": 1.0861759425493716,
"grad_norm": 22.43168067932129,
"learning_rate": 1.3025493716337522e-05,
"loss": 16.3575,
"step": 6050
},
{
"epoch": 1.0906642728904847,
"grad_norm": 21.6562557220459,
"learning_rate": 1.3079353680430879e-05,
"loss": 16.4085,
"step": 6075
},
{
"epoch": 1.095152603231598,
"grad_norm": 23.325424194335938,
"learning_rate": 1.3133213644524238e-05,
"loss": 16.5846,
"step": 6100
},
{
"epoch": 1.0996409335727109,
"grad_norm": 24.215314865112305,
"learning_rate": 1.3187073608617595e-05,
"loss": 16.4518,
"step": 6125
},
{
"epoch": 1.104129263913824,
"grad_norm": 24.384559631347656,
"learning_rate": 1.3240933572710952e-05,
"loss": 16.5581,
"step": 6150
},
{
"epoch": 1.1086175942549372,
"grad_norm": 24.343595504760742,
"learning_rate": 1.3294793536804309e-05,
"loss": 16.2876,
"step": 6175
},
{
"epoch": 1.1131059245960502,
"grad_norm": 21.597131729125977,
"learning_rate": 1.3348653500897666e-05,
"loss": 16.4586,
"step": 6200
},
{
"epoch": 1.1175942549371634,
"grad_norm": 22.356834411621094,
"learning_rate": 1.3402513464991023e-05,
"loss": 16.2934,
"step": 6225
},
{
"epoch": 1.1220825852782765,
"grad_norm": 22.678932189941406,
"learning_rate": 1.345637342908438e-05,
"loss": 16.5327,
"step": 6250
},
{
"epoch": 1.1265709156193895,
"grad_norm": 19.975004196166992,
"learning_rate": 1.3510233393177739e-05,
"loss": 16.4558,
"step": 6275
},
{
"epoch": 1.1310592459605027,
"grad_norm": 23.107633590698242,
"learning_rate": 1.3564093357271096e-05,
"loss": 16.4774,
"step": 6300
},
{
"epoch": 1.1355475763016158,
"grad_norm": 23.048038482666016,
"learning_rate": 1.3617953321364453e-05,
"loss": 16.8229,
"step": 6325
},
{
"epoch": 1.140035906642729,
"grad_norm": 22.7868595123291,
"learning_rate": 1.3671813285457809e-05,
"loss": 16.5661,
"step": 6350
},
{
"epoch": 1.144524236983842,
"grad_norm": 24.512683868408203,
"learning_rate": 1.3725673249551167e-05,
"loss": 16.4614,
"step": 6375
},
{
"epoch": 1.1490125673249552,
"grad_norm": 22.137468338012695,
"learning_rate": 1.3779533213644524e-05,
"loss": 16.6419,
"step": 6400
},
{
"epoch": 1.1535008976660683,
"grad_norm": 25.348499298095703,
"learning_rate": 1.3833393177737881e-05,
"loss": 16.2941,
"step": 6425
},
{
"epoch": 1.1579892280071813,
"grad_norm": 20.657936096191406,
"learning_rate": 1.388725314183124e-05,
"loss": 16.4759,
"step": 6450
},
{
"epoch": 1.1624775583482945,
"grad_norm": 21.39447021484375,
"learning_rate": 1.3941113105924597e-05,
"loss": 16.6561,
"step": 6475
},
{
"epoch": 1.1669658886894076,
"grad_norm": 23.087963104248047,
"learning_rate": 1.3994973070017953e-05,
"loss": 15.8167,
"step": 6500
},
{
"epoch": 1.1714542190305206,
"grad_norm": 23.6542911529541,
"learning_rate": 1.404883303411131e-05,
"loss": 16.531,
"step": 6525
},
{
"epoch": 1.1759425493716338,
"grad_norm": 23.05323028564453,
"learning_rate": 1.4102692998204668e-05,
"loss": 16.3313,
"step": 6550
},
{
"epoch": 1.180430879712747,
"grad_norm": 22.49639320373535,
"learning_rate": 1.4156552962298026e-05,
"loss": 16.2067,
"step": 6575
},
{
"epoch": 1.18491921005386,
"grad_norm": 27.224279403686523,
"learning_rate": 1.4210412926391383e-05,
"loss": 16.502,
"step": 6600
},
{
"epoch": 1.189407540394973,
"grad_norm": 21.412261962890625,
"learning_rate": 1.4264272890484741e-05,
"loss": 16.6798,
"step": 6625
},
{
"epoch": 1.1938958707360863,
"grad_norm": 23.425609588623047,
"learning_rate": 1.4318132854578098e-05,
"loss": 16.2533,
"step": 6650
},
{
"epoch": 1.1983842010771992,
"grad_norm": 23.98543357849121,
"learning_rate": 1.4371992818671454e-05,
"loss": 16.2683,
"step": 6675
},
{
"epoch": 1.2028725314183124,
"grad_norm": 24.748369216918945,
"learning_rate": 1.4425852782764811e-05,
"loss": 16.4797,
"step": 6700
},
{
"epoch": 1.2073608617594256,
"grad_norm": 20.175334930419922,
"learning_rate": 1.447971274685817e-05,
"loss": 16.493,
"step": 6725
},
{
"epoch": 1.2118491921005385,
"grad_norm": 23.000167846679688,
"learning_rate": 1.4533572710951527e-05,
"loss": 16.1375,
"step": 6750
},
{
"epoch": 1.2163375224416517,
"grad_norm": 21.749601364135742,
"learning_rate": 1.4587432675044884e-05,
"loss": 16.4148,
"step": 6775
},
{
"epoch": 1.220825852782765,
"grad_norm": 23.57693099975586,
"learning_rate": 1.464129263913824e-05,
"loss": 15.9781,
"step": 6800
},
{
"epoch": 1.2253141831238779,
"grad_norm": 22.823196411132812,
"learning_rate": 1.4695152603231598e-05,
"loss": 16.6314,
"step": 6825
},
{
"epoch": 1.229802513464991,
"grad_norm": 22.367694854736328,
"learning_rate": 1.4749012567324955e-05,
"loss": 16.5215,
"step": 6850
},
{
"epoch": 1.2342908438061042,
"grad_norm": 33.2826042175293,
"learning_rate": 1.4802872531418312e-05,
"loss": 16.4813,
"step": 6875
},
{
"epoch": 1.2387791741472172,
"grad_norm": 23.414485931396484,
"learning_rate": 1.485673249551167e-05,
"loss": 15.8688,
"step": 6900
},
{
"epoch": 1.2432675044883303,
"grad_norm": 24.831056594848633,
"learning_rate": 1.4910592459605028e-05,
"loss": 16.1143,
"step": 6925
},
{
"epoch": 1.2477558348294435,
"grad_norm": 23.415950775146484,
"learning_rate": 1.4964452423698385e-05,
"loss": 15.7436,
"step": 6950
},
{
"epoch": 1.2522441651705565,
"grad_norm": 22.253082275390625,
"learning_rate": 1.5018312387791742e-05,
"loss": 16.0852,
"step": 6975
},
{
"epoch": 1.2567324955116697,
"grad_norm": 22.159162521362305,
"learning_rate": 1.5072172351885099e-05,
"loss": 16.3732,
"step": 7000
},
{
"epoch": 1.2612208258527828,
"grad_norm": 22.717971801757812,
"learning_rate": 1.5126032315978456e-05,
"loss": 16.1097,
"step": 7025
},
{
"epoch": 1.2657091561938958,
"grad_norm": 22.539794921875,
"learning_rate": 1.5179892280071813e-05,
"loss": 16.131,
"step": 7050
},
{
"epoch": 1.270197486535009,
"grad_norm": 25.072383880615234,
"learning_rate": 1.523375224416517e-05,
"loss": 15.9651,
"step": 7075
},
{
"epoch": 1.2746858168761221,
"grad_norm": 22.601781845092773,
"learning_rate": 1.5287612208258526e-05,
"loss": 16.0353,
"step": 7100
},
{
"epoch": 1.279174147217235,
"grad_norm": 21.910064697265625,
"learning_rate": 1.5341472172351888e-05,
"loss": 16.1811,
"step": 7125
},
{
"epoch": 1.2836624775583483,
"grad_norm": 23.791175842285156,
"learning_rate": 1.5395332136445243e-05,
"loss": 16.0497,
"step": 7150
},
{
"epoch": 1.2881508078994615,
"grad_norm": 24.051387786865234,
"learning_rate": 1.5449192100538602e-05,
"loss": 16.0254,
"step": 7175
},
{
"epoch": 1.2926391382405744,
"grad_norm": 20.40333366394043,
"learning_rate": 1.5503052064631957e-05,
"loss": 16.1743,
"step": 7200
},
{
"epoch": 1.2971274685816876,
"grad_norm": 21.65686798095703,
"learning_rate": 1.5556912028725313e-05,
"loss": 16.0881,
"step": 7225
},
{
"epoch": 1.3016157989228008,
"grad_norm": 23.0731201171875,
"learning_rate": 1.561077199281867e-05,
"loss": 15.7084,
"step": 7250
},
{
"epoch": 1.3061041292639137,
"grad_norm": 23.546977996826172,
"learning_rate": 1.5664631956912027e-05,
"loss": 15.8303,
"step": 7275
},
{
"epoch": 1.310592459605027,
"grad_norm": 24.602670669555664,
"learning_rate": 1.571849192100539e-05,
"loss": 16.0991,
"step": 7300
},
{
"epoch": 1.31508078994614,
"grad_norm": 23.653459548950195,
"learning_rate": 1.5772351885098744e-05,
"loss": 16.011,
"step": 7325
},
{
"epoch": 1.319569120287253,
"grad_norm": 23.47325325012207,
"learning_rate": 1.5826211849192103e-05,
"loss": 16.1551,
"step": 7350
},
{
"epoch": 1.3240574506283662,
"grad_norm": 26.003053665161133,
"learning_rate": 1.588007181328546e-05,
"loss": 15.7585,
"step": 7375
},
{
"epoch": 1.3285457809694794,
"grad_norm": 24.2227840423584,
"learning_rate": 1.5933931777378814e-05,
"loss": 16.1166,
"step": 7400
},
{
"epoch": 1.3330341113105924,
"grad_norm": 23.852928161621094,
"learning_rate": 1.5987791741472173e-05,
"loss": 16.1642,
"step": 7425
},
{
"epoch": 1.3375224416517055,
"grad_norm": 20.22197914123535,
"learning_rate": 1.6041651705565528e-05,
"loss": 16.2129,
"step": 7450
},
{
"epoch": 1.3420107719928187,
"grad_norm": 23.04417610168457,
"learning_rate": 1.609551166965889e-05,
"loss": 15.9037,
"step": 7475
},
{
"epoch": 1.3464991023339317,
"grad_norm": 22.43314552307129,
"learning_rate": 1.6149371633752246e-05,
"loss": 15.9117,
"step": 7500
},
{
"epoch": 1.3509874326750448,
"grad_norm": 23.691787719726562,
"learning_rate": 1.62032315978456e-05,
"loss": 16.1126,
"step": 7525
},
{
"epoch": 1.355475763016158,
"grad_norm": 22.891239166259766,
"learning_rate": 1.625709156193896e-05,
"loss": 15.8261,
"step": 7550
},
{
"epoch": 1.359964093357271,
"grad_norm": 22.000856399536133,
"learning_rate": 1.6310951526032315e-05,
"loss": 16.0308,
"step": 7575
},
{
"epoch": 1.3644524236983842,
"grad_norm": 23.784557342529297,
"learning_rate": 1.6364811490125674e-05,
"loss": 15.7405,
"step": 7600
},
{
"epoch": 1.3689407540394973,
"grad_norm": 23.533695220947266,
"learning_rate": 1.641867145421903e-05,
"loss": 15.9525,
"step": 7625
},
{
"epoch": 1.3734290843806103,
"grad_norm": 22.5810604095459,
"learning_rate": 1.647253141831239e-05,
"loss": 15.9572,
"step": 7650
},
{
"epoch": 1.3779174147217235,
"grad_norm": 22.96384048461914,
"learning_rate": 1.6526391382405747e-05,
"loss": 16.0959,
"step": 7675
},
{
"epoch": 1.3824057450628366,
"grad_norm": 22.757946014404297,
"learning_rate": 1.6580251346499102e-05,
"loss": 16.0045,
"step": 7700
},
{
"epoch": 1.3868940754039496,
"grad_norm": 24.446062088012695,
"learning_rate": 1.663411131059246e-05,
"loss": 16.0387,
"step": 7725
},
{
"epoch": 1.3913824057450628,
"grad_norm": 24.733076095581055,
"learning_rate": 1.6687971274685816e-05,
"loss": 15.6993,
"step": 7750
},
{
"epoch": 1.395870736086176,
"grad_norm": 22.095415115356445,
"learning_rate": 1.6741831238779175e-05,
"loss": 16.108,
"step": 7775
},
{
"epoch": 1.400359066427289,
"grad_norm": 22.528247833251953,
"learning_rate": 1.679569120287253e-05,
"loss": 15.9896,
"step": 7800
},
{
"epoch": 1.404847396768402,
"grad_norm": 21.990081787109375,
"learning_rate": 1.684955116696589e-05,
"loss": 15.8753,
"step": 7825
},
{
"epoch": 1.4093357271095153,
"grad_norm": 24.167387008666992,
"learning_rate": 1.6903411131059248e-05,
"loss": 15.7562,
"step": 7850
},
{
"epoch": 1.4138240574506284,
"grad_norm": 23.88982391357422,
"learning_rate": 1.6957271095152603e-05,
"loss": 15.8713,
"step": 7875
},
{
"epoch": 1.4183123877917414,
"grad_norm": 26.82301902770996,
"learning_rate": 1.7011131059245962e-05,
"loss": 15.5179,
"step": 7900
},
{
"epoch": 1.4228007181328546,
"grad_norm": 25.797740936279297,
"learning_rate": 1.7064991023339317e-05,
"loss": 15.8012,
"step": 7925
},
{
"epoch": 1.4272890484739678,
"grad_norm": 24.005008697509766,
"learning_rate": 1.7118850987432676e-05,
"loss": 16.0349,
"step": 7950
},
{
"epoch": 1.4317773788150807,
"grad_norm": 21.801897048950195,
"learning_rate": 1.717271095152603e-05,
"loss": 16.2094,
"step": 7975
},
{
"epoch": 1.436265709156194,
"grad_norm": 22.728696823120117,
"learning_rate": 1.7226570915619387e-05,
"loss": 15.8239,
"step": 8000
},
{
"epoch": 1.440754039497307,
"grad_norm": 23.855932235717773,
"learning_rate": 1.728043087971275e-05,
"loss": 15.7778,
"step": 8025
},
{
"epoch": 1.44524236983842,
"grad_norm": 24.114036560058594,
"learning_rate": 1.7334290843806104e-05,
"loss": 15.9458,
"step": 8050
},
{
"epoch": 1.4497307001795332,
"grad_norm": 23.884950637817383,
"learning_rate": 1.7388150807899463e-05,
"loss": 15.7334,
"step": 8075
},
{
"epoch": 1.4542190305206464,
"grad_norm": 26.62238311767578,
"learning_rate": 1.744201077199282e-05,
"loss": 15.4899,
"step": 8100
},
{
"epoch": 1.4587073608617596,
"grad_norm": 25.22521209716797,
"learning_rate": 1.7495870736086177e-05,
"loss": 15.6142,
"step": 8125
},
{
"epoch": 1.4631956912028725,
"grad_norm": 24.73517417907715,
"learning_rate": 1.7549730700179533e-05,
"loss": 16.046,
"step": 8150
},
{
"epoch": 1.4676840215439857,
"grad_norm": 25.76804542541504,
"learning_rate": 1.7603590664272888e-05,
"loss": 15.5229,
"step": 8175
},
{
"epoch": 1.4721723518850989,
"grad_norm": 24.096242904663086,
"learning_rate": 1.765745062836625e-05,
"loss": 15.485,
"step": 8200
},
{
"epoch": 1.4766606822262118,
"grad_norm": 23.282133102416992,
"learning_rate": 1.7711310592459606e-05,
"loss": 15.4915,
"step": 8225
},
{
"epoch": 1.481149012567325,
"grad_norm": 23.70339012145996,
"learning_rate": 1.7765170556552964e-05,
"loss": 15.778,
"step": 8250
},
{
"epoch": 1.4856373429084382,
"grad_norm": 24.140331268310547,
"learning_rate": 1.781903052064632e-05,
"loss": 15.9428,
"step": 8275
},
{
"epoch": 1.4901256732495511,
"grad_norm": 22.932546615600586,
"learning_rate": 1.7872890484739675e-05,
"loss": 15.6304,
"step": 8300
},
{
"epoch": 1.4946140035906643,
"grad_norm": 24.020971298217773,
"learning_rate": 1.7926750448833034e-05,
"loss": 15.4193,
"step": 8325
},
{
"epoch": 1.4991023339317775,
"grad_norm": 24.903371810913086,
"learning_rate": 1.798061041292639e-05,
"loss": 15.3212,
"step": 8350
},
{
"epoch": 1.5035906642728905,
"grad_norm": 24.483036041259766,
"learning_rate": 1.803447037701975e-05,
"loss": 15.5371,
"step": 8375
},
{
"epoch": 1.5080789946140036,
"grad_norm": 24.4531192779541,
"learning_rate": 1.8088330341113107e-05,
"loss": 15.6667,
"step": 8400
},
{
"epoch": 1.5125673249551168,
"grad_norm": 23.508136749267578,
"learning_rate": 1.8142190305206466e-05,
"loss": 15.545,
"step": 8425
},
{
"epoch": 1.5170556552962298,
"grad_norm": 25.5224666595459,
"learning_rate": 1.819605026929982e-05,
"loss": 15.8223,
"step": 8450
},
{
"epoch": 1.521543985637343,
"grad_norm": 23.785808563232422,
"learning_rate": 1.8249910233393176e-05,
"loss": 15.736,
"step": 8475
},
{
"epoch": 1.5260323159784561,
"grad_norm": 22.968332290649414,
"learning_rate": 1.8303770197486535e-05,
"loss": 15.5985,
"step": 8500
},
{
"epoch": 1.530520646319569,
"grad_norm": 24.91457176208496,
"learning_rate": 1.835763016157989e-05,
"loss": 15.5723,
"step": 8525
},
{
"epoch": 1.5350089766606823,
"grad_norm": 27.051095962524414,
"learning_rate": 1.8411490125673253e-05,
"loss": 15.5436,
"step": 8550
},
{
"epoch": 1.5394973070017954,
"grad_norm": 26.1645565032959,
"learning_rate": 1.8465350089766608e-05,
"loss": 15.7189,
"step": 8575
},
{
"epoch": 1.5439856373429084,
"grad_norm": 25.47484016418457,
"learning_rate": 1.8519210053859967e-05,
"loss": 15.4637,
"step": 8600
},
{
"epoch": 1.5484739676840216,
"grad_norm": 21.521570205688477,
"learning_rate": 1.8573070017953322e-05,
"loss": 15.7744,
"step": 8625
},
{
"epoch": 1.5529622980251347,
"grad_norm": 22.680315017700195,
"learning_rate": 1.8626929982046677e-05,
"loss": 15.516,
"step": 8650
},
{
"epoch": 1.5574506283662477,
"grad_norm": 22.149436950683594,
"learning_rate": 1.8680789946140036e-05,
"loss": 16.09,
"step": 8675
},
{
"epoch": 1.5619389587073609,
"grad_norm": 24.99411392211914,
"learning_rate": 1.873464991023339e-05,
"loss": 15.5241,
"step": 8700
},
{
"epoch": 1.566427289048474,
"grad_norm": 24.49349021911621,
"learning_rate": 1.8788509874326754e-05,
"loss": 15.5149,
"step": 8725
},
{
"epoch": 1.570915619389587,
"grad_norm": 24.748638153076172,
"learning_rate": 1.884236983842011e-05,
"loss": 15.4012,
"step": 8750
},
{
"epoch": 1.5754039497307002,
"grad_norm": 23.619789123535156,
"learning_rate": 1.8896229802513465e-05,
"loss": 15.4603,
"step": 8775
},
{
"epoch": 1.5798922800718134,
"grad_norm": 24.02398681640625,
"learning_rate": 1.8950089766606823e-05,
"loss": 15.7808,
"step": 8800
},
{
"epoch": 1.5843806104129263,
"grad_norm": 24.2972354888916,
"learning_rate": 1.900394973070018e-05,
"loss": 15.1804,
"step": 8825
},
{
"epoch": 1.5888689407540395,
"grad_norm": 24.516998291015625,
"learning_rate": 1.9057809694793537e-05,
"loss": 15.318,
"step": 8850
},
{
"epoch": 1.5933572710951527,
"grad_norm": 24.47681999206543,
"learning_rate": 1.9111669658886893e-05,
"loss": 15.5455,
"step": 8875
},
{
"epoch": 1.5978456014362656,
"grad_norm": 26.061948776245117,
"learning_rate": 1.9165529622980255e-05,
"loss": 15.204,
"step": 8900
},
{
"epoch": 1.6023339317773788,
"grad_norm": 25.155284881591797,
"learning_rate": 1.9217235188509875e-05,
"loss": 15.7975,
"step": 8925
},
{
"epoch": 1.606822262118492,
"grad_norm": 26.721513748168945,
"learning_rate": 1.9271095152603233e-05,
"loss": 14.8189,
"step": 8950
},
{
"epoch": 1.611310592459605,
"grad_norm": 24.048892974853516,
"learning_rate": 1.932495511669659e-05,
"loss": 15.4657,
"step": 8975
},
{
"epoch": 1.6157989228007181,
"grad_norm": 21.87297248840332,
"learning_rate": 1.9378815080789948e-05,
"loss": 15.2708,
"step": 9000
},
{
"epoch": 1.6202872531418313,
"grad_norm": 23.78717613220215,
"learning_rate": 1.9432675044883303e-05,
"loss": 15.2762,
"step": 9025
},
{
"epoch": 1.6247755834829443,
"grad_norm": 25.389694213867188,
"learning_rate": 1.948653500897666e-05,
"loss": 15.4709,
"step": 9050
},
{
"epoch": 1.6292639138240574,
"grad_norm": 25.06108283996582,
"learning_rate": 1.9540394973070017e-05,
"loss": 15.4012,
"step": 9075
},
{
"epoch": 1.6337522441651706,
"grad_norm": 22.665700912475586,
"learning_rate": 1.9594254937163376e-05,
"loss": 15.7225,
"step": 9100
},
{
"epoch": 1.6382405745062836,
"grad_norm": 24.0644474029541,
"learning_rate": 1.9648114901256735e-05,
"loss": 15.5653,
"step": 9125
},
{
"epoch": 1.6427289048473968,
"grad_norm": 25.258146286010742,
"learning_rate": 1.970197486535009e-05,
"loss": 15.5544,
"step": 9150
},
{
"epoch": 1.64721723518851,
"grad_norm": 26.202850341796875,
"learning_rate": 1.975583482944345e-05,
"loss": 15.1562,
"step": 9175
},
{
"epoch": 1.6517055655296229,
"grad_norm": 25.502126693725586,
"learning_rate": 1.9809694793536804e-05,
"loss": 15.5075,
"step": 9200
},
{
"epoch": 1.656193895870736,
"grad_norm": 22.884952545166016,
"learning_rate": 1.9863554757630163e-05,
"loss": 15.386,
"step": 9225
},
{
"epoch": 1.6606822262118492,
"grad_norm": 22.87488555908203,
"learning_rate": 1.9917414721723518e-05,
"loss": 15.1854,
"step": 9250
},
{
"epoch": 1.6651705565529622,
"grad_norm": 25.1315975189209,
"learning_rate": 1.9971274685816877e-05,
"loss": 15.0536,
"step": 9275
},
{
"epoch": 1.6696588868940754,
"grad_norm": 23.088226318359375,
"learning_rate": 2.0025134649910236e-05,
"loss": 15.6098,
"step": 9300
},
{
"epoch": 1.6741472172351886,
"grad_norm": 24.46171760559082,
"learning_rate": 2.007899461400359e-05,
"loss": 15.2449,
"step": 9325
},
{
"epoch": 1.6786355475763015,
"grad_norm": 25.243085861206055,
"learning_rate": 2.013285457809695e-05,
"loss": 15.3234,
"step": 9350
},
{
"epoch": 1.6831238779174147,
"grad_norm": 24.72486686706543,
"learning_rate": 2.0186714542190305e-05,
"loss": 15.641,
"step": 9375
},
{
"epoch": 1.6876122082585279,
"grad_norm": 23.12143898010254,
"learning_rate": 2.024057450628366e-05,
"loss": 15.2531,
"step": 9400
},
{
"epoch": 1.6921005385996408,
"grad_norm": 24.512834548950195,
"learning_rate": 2.029443447037702e-05,
"loss": 15.5208,
"step": 9425
},
{
"epoch": 1.696588868940754,
"grad_norm": 25.56024742126465,
"learning_rate": 2.0348294434470378e-05,
"loss": 15.0794,
"step": 9450
},
{
"epoch": 1.7010771992818672,
"grad_norm": 25.564701080322266,
"learning_rate": 2.0402154398563737e-05,
"loss": 15.0259,
"step": 9475
},
{
"epoch": 1.7055655296229801,
"grad_norm": 25.182714462280273,
"learning_rate": 2.0456014362657092e-05,
"loss": 15.3297,
"step": 9500
},
{
"epoch": 1.7100538599640933,
"grad_norm": 25.756427764892578,
"learning_rate": 2.050987432675045e-05,
"loss": 15.0274,
"step": 9525
},
{
"epoch": 1.7145421903052065,
"grad_norm": 24.414350509643555,
"learning_rate": 2.0563734290843806e-05,
"loss": 15.004,
"step": 9550
},
{
"epoch": 1.7190305206463194,
"grad_norm": 26.023277282714844,
"learning_rate": 2.0617594254937162e-05,
"loss": 14.8821,
"step": 9575
},
{
"epoch": 1.7235188509874326,
"grad_norm": 24.01046371459961,
"learning_rate": 2.067145421903052e-05,
"loss": 15.2589,
"step": 9600
},
{
"epoch": 1.7280071813285458,
"grad_norm": 24.23836898803711,
"learning_rate": 2.0725314183123876e-05,
"loss": 15.1177,
"step": 9625
},
{
"epoch": 1.7324955116696588,
"grad_norm": 23.774337768554688,
"learning_rate": 2.0779174147217238e-05,
"loss": 15.1478,
"step": 9650
},
{
"epoch": 1.736983842010772,
"grad_norm": 28.614397048950195,
"learning_rate": 2.0833034111310593e-05,
"loss": 15.1796,
"step": 9675
},
{
"epoch": 1.7414721723518851,
"grad_norm": 26.42593765258789,
"learning_rate": 2.0886894075403952e-05,
"loss": 15.0701,
"step": 9700
},
{
"epoch": 1.745960502692998,
"grad_norm": 23.472248077392578,
"learning_rate": 2.0940754039497308e-05,
"loss": 15.3539,
"step": 9725
},
{
"epoch": 1.7504488330341115,
"grad_norm": 23.4112491607666,
"learning_rate": 2.0994614003590663e-05,
"loss": 15.2332,
"step": 9750
},
{
"epoch": 1.7549371633752244,
"grad_norm": 21.964303970336914,
"learning_rate": 2.1048473967684022e-05,
"loss": 15.1311,
"step": 9775
},
{
"epoch": 1.7594254937163374,
"grad_norm": 25.997272491455078,
"learning_rate": 2.1102333931777377e-05,
"loss": 15.4253,
"step": 9800
},
{
"epoch": 1.7639138240574508,
"grad_norm": 24.534364700317383,
"learning_rate": 2.115619389587074e-05,
"loss": 14.8151,
"step": 9825
},
{
"epoch": 1.7684021543985637,
"grad_norm": 25.785430908203125,
"learning_rate": 2.1210053859964095e-05,
"loss": 15.1881,
"step": 9850
},
{
"epoch": 1.7728904847396767,
"grad_norm": 24.27193832397461,
"learning_rate": 2.126391382405745e-05,
"loss": 15.0705,
"step": 9875
},
{
"epoch": 1.77737881508079,
"grad_norm": 24.99488067626953,
"learning_rate": 2.131777378815081e-05,
"loss": 15.1269,
"step": 9900
},
{
"epoch": 1.781867145421903,
"grad_norm": 25.080209732055664,
"learning_rate": 2.1371633752244164e-05,
"loss": 15.12,
"step": 9925
},
{
"epoch": 1.786355475763016,
"grad_norm": 25.579904556274414,
"learning_rate": 2.1425493716337523e-05,
"loss": 14.9893,
"step": 9950
},
{
"epoch": 1.7908438061041294,
"grad_norm": 25.11918067932129,
"learning_rate": 2.1479353680430878e-05,
"loss": 15.1663,
"step": 9975
},
{
"epoch": 1.7953321364452424,
"grad_norm": 27.383655548095703,
"learning_rate": 2.153321364452424e-05,
"loss": 15.1548,
"step": 10000
},
{
"epoch": 1.7998204667863553,
"grad_norm": 24.2135009765625,
"learning_rate": 2.1587073608617596e-05,
"loss": 15.0468,
"step": 10025
},
{
"epoch": 1.8043087971274687,
"grad_norm": 26.53235626220703,
"learning_rate": 2.164093357271095e-05,
"loss": 14.9113,
"step": 10050
},
{
"epoch": 1.8087971274685817,
"grad_norm": 25.139854431152344,
"learning_rate": 2.169479353680431e-05,
"loss": 15.2685,
"step": 10075
},
{
"epoch": 1.8132854578096946,
"grad_norm": 26.078100204467773,
"learning_rate": 2.1748653500897665e-05,
"loss": 15.0769,
"step": 10100
},
{
"epoch": 1.817773788150808,
"grad_norm": 32.14773941040039,
"learning_rate": 2.1802513464991024e-05,
"loss": 15.0157,
"step": 10125
},
{
"epoch": 1.822262118491921,
"grad_norm": 25.352624893188477,
"learning_rate": 2.185637342908438e-05,
"loss": 15.2303,
"step": 10150
},
{
"epoch": 1.826750448833034,
"grad_norm": 24.74574851989746,
"learning_rate": 2.1910233393177738e-05,
"loss": 14.5637,
"step": 10175
},
{
"epoch": 1.8312387791741473,
"grad_norm": 26.362592697143555,
"learning_rate": 2.1964093357271097e-05,
"loss": 14.9059,
"step": 10200
},
{
"epoch": 1.8357271095152603,
"grad_norm": 24.987171173095703,
"learning_rate": 2.2017953321364452e-05,
"loss": 15.069,
"step": 10225
},
{
"epoch": 1.8402154398563735,
"grad_norm": 24.836288452148438,
"learning_rate": 2.207181328545781e-05,
"loss": 15.0462,
"step": 10250
},
{
"epoch": 1.8447037701974867,
"grad_norm": 24.79768180847168,
"learning_rate": 2.2125673249551166e-05,
"loss": 14.8612,
"step": 10275
},
{
"epoch": 1.8491921005385996,
"grad_norm": 25.61474609375,
"learning_rate": 2.2179533213644525e-05,
"loss": 14.7445,
"step": 10300
},
{
"epoch": 1.8536804308797128,
"grad_norm": 25.009479522705078,
"learning_rate": 2.223339317773788e-05,
"loss": 14.9331,
"step": 10325
},
{
"epoch": 1.858168761220826,
"grad_norm": 25.85749053955078,
"learning_rate": 2.228725314183124e-05,
"loss": 14.8491,
"step": 10350
},
{
"epoch": 1.862657091561939,
"grad_norm": 24.728235244750977,
"learning_rate": 2.2341113105924598e-05,
"loss": 14.8084,
"step": 10375
},
{
"epoch": 1.867145421903052,
"grad_norm": 23.449575424194336,
"learning_rate": 2.2394973070017954e-05,
"loss": 14.8635,
"step": 10400
},
{
"epoch": 1.8716337522441653,
"grad_norm": 23.53273582458496,
"learning_rate": 2.2448833034111312e-05,
"loss": 15.0607,
"step": 10425
},
{
"epoch": 1.8761220825852782,
"grad_norm": 26.236675262451172,
"learning_rate": 2.2502692998204668e-05,
"loss": 14.7338,
"step": 10450
},
{
"epoch": 1.8806104129263914,
"grad_norm": 24.960784912109375,
"learning_rate": 2.2556552962298026e-05,
"loss": 14.831,
"step": 10475
},
{
"epoch": 1.8850987432675046,
"grad_norm": 23.77855682373047,
"learning_rate": 2.2610412926391382e-05,
"loss": 14.9616,
"step": 10500
},
{
"epoch": 1.8895870736086176,
"grad_norm": 25.975210189819336,
"learning_rate": 2.266427289048474e-05,
"loss": 14.5186,
"step": 10525
},
{
"epoch": 1.8940754039497307,
"grad_norm": 26.122711181640625,
"learning_rate": 2.27181328545781e-05,
"loss": 14.3817,
"step": 10550
},
{
"epoch": 1.898563734290844,
"grad_norm": 25.613475799560547,
"learning_rate": 2.2771992818671455e-05,
"loss": 14.5428,
"step": 10575
},
{
"epoch": 1.9030520646319569,
"grad_norm": 24.9304141998291,
"learning_rate": 2.2825852782764813e-05,
"loss": 14.8628,
"step": 10600
},
{
"epoch": 1.90754039497307,
"grad_norm": 25.525495529174805,
"learning_rate": 2.287971274685817e-05,
"loss": 14.7881,
"step": 10625
},
{
"epoch": 1.9120287253141832,
"grad_norm": 24.550325393676758,
"learning_rate": 2.2933572710951524e-05,
"loss": 14.5237,
"step": 10650
},
{
"epoch": 1.9165170556552962,
"grad_norm": 26.814821243286133,
"learning_rate": 2.2987432675044883e-05,
"loss": 14.9076,
"step": 10675
},
{
"epoch": 1.9210053859964094,
"grad_norm": 25.589099884033203,
"learning_rate": 2.3041292639138242e-05,
"loss": 14.9983,
"step": 10700
},
{
"epoch": 1.9254937163375225,
"grad_norm": 26.260356903076172,
"learning_rate": 2.30951526032316e-05,
"loss": 14.4078,
"step": 10725
},
{
"epoch": 1.9299820466786355,
"grad_norm": 40.02426528930664,
"learning_rate": 2.3149012567324956e-05,
"loss": 14.6382,
"step": 10750
},
{
"epoch": 1.9344703770197487,
"grad_norm": 24.463035583496094,
"learning_rate": 2.3202872531418315e-05,
"loss": 14.251,
"step": 10775
},
{
"epoch": 1.9389587073608618,
"grad_norm": 26.021873474121094,
"learning_rate": 2.325673249551167e-05,
"loss": 14.783,
"step": 10800
},
{
"epoch": 1.9434470377019748,
"grad_norm": 25.914993286132812,
"learning_rate": 2.3310592459605025e-05,
"loss": 14.6037,
"step": 10825
},
{
"epoch": 1.947935368043088,
"grad_norm": 24.850980758666992,
"learning_rate": 2.3364452423698384e-05,
"loss": 14.692,
"step": 10850
},
{
"epoch": 1.9524236983842012,
"grad_norm": 23.075193405151367,
"learning_rate": 2.341831238779174e-05,
"loss": 14.4149,
"step": 10875
},
{
"epoch": 1.9569120287253141,
"grad_norm": 26.311481475830078,
"learning_rate": 2.34721723518851e-05,
"loss": 14.5615,
"step": 10900
},
{
"epoch": 1.9614003590664273,
"grad_norm": 24.902671813964844,
"learning_rate": 2.3526032315978457e-05,
"loss": 14.267,
"step": 10925
},
{
"epoch": 1.9658886894075405,
"grad_norm": 24.723201751708984,
"learning_rate": 2.3579892280071816e-05,
"loss": 14.5657,
"step": 10950
},
{
"epoch": 1.9703770197486534,
"grad_norm": 26.1663818359375,
"learning_rate": 2.363375224416517e-05,
"loss": 14.6454,
"step": 10975
},
{
"epoch": 1.9748653500897666,
"grad_norm": 24.78443145751953,
"learning_rate": 2.3687612208258527e-05,
"loss": 14.4444,
"step": 11000
},
{
"epoch": 1.9793536804308798,
"grad_norm": 24.568164825439453,
"learning_rate": 2.3741472172351885e-05,
"loss": 14.6314,
"step": 11025
},
{
"epoch": 1.9838420107719927,
"grad_norm": 26.20634651184082,
"learning_rate": 2.379533213644524e-05,
"loss": 14.6824,
"step": 11050
},
{
"epoch": 1.988330341113106,
"grad_norm": 24.453754425048828,
"learning_rate": 2.3849192100538603e-05,
"loss": 14.7177,
"step": 11075
},
{
"epoch": 1.992818671454219,
"grad_norm": 25.506359100341797,
"learning_rate": 2.3903052064631958e-05,
"loss": 14.5953,
"step": 11100
},
{
"epoch": 1.997307001795332,
"grad_norm": 24.724069595336914,
"learning_rate": 2.3956912028725314e-05,
"loss": 14.5515,
"step": 11125
},
{
"epoch": 2.0,
"eval_accuracy": 0.059114429190613486,
"eval_f1_macro": 0.0023152878504535197,
"eval_f1_micro": 0.059114429190613486,
"eval_f1_weighted": 0.026248109088727177,
"eval_loss": 7.744897365570068,
"eval_precision_macro": 0.002216938480351738,
"eval_precision_micro": 0.059114429190613486,
"eval_precision_weighted": 0.02310532518025774,
"eval_recall_macro": 0.005032399335473846,
"eval_recall_micro": 0.059114429190613486,
"eval_recall_weighted": 0.059114429190613486,
"eval_runtime": 86.2961,
"eval_samples_per_second": 606.899,
"eval_steps_per_second": 18.97,
"step": 11140
},
{
"epoch": 2.0017953321364454,
"grad_norm": 25.532371520996094,
"learning_rate": 2.4010771992818672e-05,
"loss": 14.3659,
"step": 11150
},
{
"epoch": 2.0062836624775584,
"grad_norm": 25.71830177307129,
"learning_rate": 2.4064631956912028e-05,
"loss": 13.8419,
"step": 11175
},
{
"epoch": 2.0107719928186714,
"grad_norm": 27.925411224365234,
"learning_rate": 2.4118491921005386e-05,
"loss": 13.9401,
"step": 11200
},
{
"epoch": 2.0152603231597848,
"grad_norm": 26.441532135009766,
"learning_rate": 2.4172351885098742e-05,
"loss": 14.0487,
"step": 11225
},
{
"epoch": 2.0197486535008977,
"grad_norm": 25.631881713867188,
"learning_rate": 2.4226211849192104e-05,
"loss": 13.4916,
"step": 11250
},
{
"epoch": 2.0242369838420107,
"grad_norm": 25.339025497436523,
"learning_rate": 2.428007181328546e-05,
"loss": 13.5516,
"step": 11275
},
{
"epoch": 2.028725314183124,
"grad_norm": 26.991966247558594,
"learning_rate": 2.4333931777378815e-05,
"loss": 13.4598,
"step": 11300
},
{
"epoch": 2.033213644524237,
"grad_norm": 25.9316463470459,
"learning_rate": 2.4387791741472174e-05,
"loss": 13.7771,
"step": 11325
},
{
"epoch": 2.03770197486535,
"grad_norm": 27.35523796081543,
"learning_rate": 2.444165170556553e-05,
"loss": 13.721,
"step": 11350
},
{
"epoch": 2.0421903052064634,
"grad_norm": 27.451637268066406,
"learning_rate": 2.4495511669658888e-05,
"loss": 13.7572,
"step": 11375
},
{
"epoch": 2.0466786355475763,
"grad_norm": 27.497739791870117,
"learning_rate": 2.4549371633752243e-05,
"loss": 13.7687,
"step": 11400
},
{
"epoch": 2.0511669658886893,
"grad_norm": 26.42055892944336,
"learning_rate": 2.4603231597845602e-05,
"loss": 13.8483,
"step": 11425
},
{
"epoch": 2.0556552962298027,
"grad_norm": 26.251361846923828,
"learning_rate": 2.465709156193896e-05,
"loss": 13.4564,
"step": 11450
},
{
"epoch": 2.0601436265709157,
"grad_norm": 27.7249813079834,
"learning_rate": 2.4710951526032316e-05,
"loss": 13.6193,
"step": 11475
},
{
"epoch": 2.0646319569120286,
"grad_norm": 29.7418155670166,
"learning_rate": 2.4764811490125675e-05,
"loss": 13.6154,
"step": 11500
},
{
"epoch": 2.069120287253142,
"grad_norm": 28.159162521362305,
"learning_rate": 2.481867145421903e-05,
"loss": 13.505,
"step": 11525
},
{
"epoch": 2.073608617594255,
"grad_norm": 27.0701904296875,
"learning_rate": 2.487253141831239e-05,
"loss": 13.6783,
"step": 11550
},
{
"epoch": 2.078096947935368,
"grad_norm": 28.18494987487793,
"learning_rate": 2.4924236983842012e-05,
"loss": 13.6024,
"step": 11575
},
{
"epoch": 2.0825852782764813,
"grad_norm": 25.40494155883789,
"learning_rate": 2.4978096947935367e-05,
"loss": 13.7101,
"step": 11600
},
{
"epoch": 2.0870736086175943,
"grad_norm": 28.17936897277832,
"learning_rate": 2.5031956912028726e-05,
"loss": 13.8797,
"step": 11625
},
{
"epoch": 2.0915619389587072,
"grad_norm": 28.881277084350586,
"learning_rate": 2.5085816876122085e-05,
"loss": 13.6777,
"step": 11650
},
{
"epoch": 2.0960502692998206,
"grad_norm": 25.790342330932617,
"learning_rate": 2.513967684021544e-05,
"loss": 13.5791,
"step": 11675
},
{
"epoch": 2.1005385996409336,
"grad_norm": 28.37506866455078,
"learning_rate": 2.51935368043088e-05,
"loss": 13.4982,
"step": 11700
},
{
"epoch": 2.1050269299820465,
"grad_norm": 33.875404357910156,
"learning_rate": 2.5247396768402154e-05,
"loss": 13.5064,
"step": 11725
},
{
"epoch": 2.10951526032316,
"grad_norm": 28.881078720092773,
"learning_rate": 2.530125673249551e-05,
"loss": 13.6496,
"step": 11750
},
{
"epoch": 2.114003590664273,
"grad_norm": 26.983850479125977,
"learning_rate": 2.535511669658887e-05,
"loss": 13.3992,
"step": 11775
},
{
"epoch": 2.118491921005386,
"grad_norm": 26.257688522338867,
"learning_rate": 2.5408976660682227e-05,
"loss": 13.7699,
"step": 11800
},
{
"epoch": 2.1229802513464993,
"grad_norm": 28.320302963256836,
"learning_rate": 2.5462836624775586e-05,
"loss": 13.3924,
"step": 11825
},
{
"epoch": 2.127468581687612,
"grad_norm": 28.05795669555664,
"learning_rate": 2.551669658886894e-05,
"loss": 13.5392,
"step": 11850
},
{
"epoch": 2.131956912028725,
"grad_norm": 29.30341911315918,
"learning_rate": 2.55705565529623e-05,
"loss": 13.4195,
"step": 11875
},
{
"epoch": 2.1364452423698386,
"grad_norm": 27.965492248535156,
"learning_rate": 2.5624416517055655e-05,
"loss": 13.6041,
"step": 11900
},
{
"epoch": 2.1409335727109515,
"grad_norm": 29.342981338500977,
"learning_rate": 2.567827648114901e-05,
"loss": 13.4952,
"step": 11925
},
{
"epoch": 2.1454219030520645,
"grad_norm": 29.504013061523438,
"learning_rate": 2.573213644524237e-05,
"loss": 13.3822,
"step": 11950
},
{
"epoch": 2.149910233393178,
"grad_norm": 25.68410301208496,
"learning_rate": 2.578599640933573e-05,
"loss": 13.6285,
"step": 11975
},
{
"epoch": 2.154398563734291,
"grad_norm": 27.036991119384766,
"learning_rate": 2.5839856373429087e-05,
"loss": 13.9489,
"step": 12000
},
{
"epoch": 2.158886894075404,
"grad_norm": 28.70158576965332,
"learning_rate": 2.5893716337522443e-05,
"loss": 13.6128,
"step": 12025
},
{
"epoch": 2.163375224416517,
"grad_norm": 27.817323684692383,
"learning_rate": 2.59475763016158e-05,
"loss": 13.8509,
"step": 12050
},
{
"epoch": 2.16786355475763,
"grad_norm": 26.909086227416992,
"learning_rate": 2.6001436265709157e-05,
"loss": 13.4432,
"step": 12075
},
{
"epoch": 2.172351885098743,
"grad_norm": 27.109466552734375,
"learning_rate": 2.6055296229802512e-05,
"loss": 13.3693,
"step": 12100
},
{
"epoch": 2.1768402154398565,
"grad_norm": 29.08690643310547,
"learning_rate": 2.610915619389587e-05,
"loss": 13.7364,
"step": 12125
},
{
"epoch": 2.1813285457809695,
"grad_norm": 28.68939971923828,
"learning_rate": 2.616301615798923e-05,
"loss": 13.7631,
"step": 12150
},
{
"epoch": 2.1858168761220824,
"grad_norm": 28.95443344116211,
"learning_rate": 2.621687612208259e-05,
"loss": 14.0335,
"step": 12175
},
{
"epoch": 2.190305206463196,
"grad_norm": 29.304248809814453,
"learning_rate": 2.6270736086175944e-05,
"loss": 13.6967,
"step": 12200
},
{
"epoch": 2.1947935368043088,
"grad_norm": 27.95583152770996,
"learning_rate": 2.63245960502693e-05,
"loss": 13.5719,
"step": 12225
},
{
"epoch": 2.1992818671454217,
"grad_norm": 27.92197608947754,
"learning_rate": 2.6378456014362658e-05,
"loss": 13.6523,
"step": 12250
},
{
"epoch": 2.203770197486535,
"grad_norm": 29.322330474853516,
"learning_rate": 2.6432315978456013e-05,
"loss": 13.5422,
"step": 12275
},
{
"epoch": 2.208258527827648,
"grad_norm": 29.324125289916992,
"learning_rate": 2.6486175942549372e-05,
"loss": 13.639,
"step": 12300
},
{
"epoch": 2.212746858168761,
"grad_norm": 27.53671646118164,
"learning_rate": 2.654003590664273e-05,
"loss": 13.7083,
"step": 12325
},
{
"epoch": 2.2172351885098744,
"grad_norm": 28.272226333618164,
"learning_rate": 2.659389587073609e-05,
"loss": 13.7521,
"step": 12350
},
{
"epoch": 2.2217235188509874,
"grad_norm": 28.756206512451172,
"learning_rate": 2.6647755834829445e-05,
"loss": 13.3446,
"step": 12375
},
{
"epoch": 2.2262118491921004,
"grad_norm": 27.521116256713867,
"learning_rate": 2.67016157989228e-05,
"loss": 13.3676,
"step": 12400
},
{
"epoch": 2.2307001795332138,
"grad_norm": 28.232725143432617,
"learning_rate": 2.675547576301616e-05,
"loss": 13.7248,
"step": 12425
},
{
"epoch": 2.2351885098743267,
"grad_norm": 27.95871353149414,
"learning_rate": 2.6809335727109514e-05,
"loss": 13.3986,
"step": 12450
},
{
"epoch": 2.2396768402154397,
"grad_norm": 26.93558692932129,
"learning_rate": 2.6863195691202873e-05,
"loss": 13.5971,
"step": 12475
},
{
"epoch": 2.244165170556553,
"grad_norm": 27.357070922851562,
"learning_rate": 2.6914901256732496e-05,
"loss": 13.8007,
"step": 12500
},
{
"epoch": 2.248653500897666,
"grad_norm": 34.84161376953125,
"learning_rate": 2.6968761220825855e-05,
"loss": 13.3402,
"step": 12525
},
{
"epoch": 2.253141831238779,
"grad_norm": 29.713102340698242,
"learning_rate": 2.702262118491921e-05,
"loss": 13.4515,
"step": 12550
},
{
"epoch": 2.2576301615798924,
"grad_norm": 31.844457626342773,
"learning_rate": 2.707648114901257e-05,
"loss": 13.4538,
"step": 12575
},
{
"epoch": 2.2621184919210053,
"grad_norm": 31.339860916137695,
"learning_rate": 2.7130341113105924e-05,
"loss": 13.7536,
"step": 12600
},
{
"epoch": 2.2666068222621183,
"grad_norm": 27.18288803100586,
"learning_rate": 2.7184201077199283e-05,
"loss": 13.361,
"step": 12625
},
{
"epoch": 2.2710951526032317,
"grad_norm": 25.645360946655273,
"learning_rate": 2.723806104129264e-05,
"loss": 13.7802,
"step": 12650
},
{
"epoch": 2.2755834829443446,
"grad_norm": 28.508298873901367,
"learning_rate": 2.7291921005385997e-05,
"loss": 13.4987,
"step": 12675
},
{
"epoch": 2.280071813285458,
"grad_norm": 26.898292541503906,
"learning_rate": 2.7345780969479356e-05,
"loss": 13.6794,
"step": 12700
},
{
"epoch": 2.284560143626571,
"grad_norm": 40.84425354003906,
"learning_rate": 2.739964093357271e-05,
"loss": 13.3866,
"step": 12725
},
{
"epoch": 2.289048473967684,
"grad_norm": 27.576169967651367,
"learning_rate": 2.745350089766607e-05,
"loss": 13.7029,
"step": 12750
},
{
"epoch": 2.293536804308797,
"grad_norm": 27.815526962280273,
"learning_rate": 2.7507360861759426e-05,
"loss": 13.6855,
"step": 12775
},
{
"epoch": 2.2980251346499103,
"grad_norm": 26.595399856567383,
"learning_rate": 2.7561220825852784e-05,
"loss": 13.3868,
"step": 12800
},
{
"epoch": 2.3025134649910233,
"grad_norm": 27.15950584411621,
"learning_rate": 2.761508078994614e-05,
"loss": 13.3696,
"step": 12825
},
{
"epoch": 2.3070017953321367,
"grad_norm": 28.6210994720459,
"learning_rate": 2.7668940754039495e-05,
"loss": 13.5515,
"step": 12850
},
{
"epoch": 2.3114901256732496,
"grad_norm": 27.74658203125,
"learning_rate": 2.7722800718132857e-05,
"loss": 13.0361,
"step": 12875
},
{
"epoch": 2.3159784560143626,
"grad_norm": 26.844989776611328,
"learning_rate": 2.7776660682226213e-05,
"loss": 13.5466,
"step": 12900
},
{
"epoch": 2.3204667863554755,
"grad_norm": 27.64177703857422,
"learning_rate": 2.783052064631957e-05,
"loss": 13.8139,
"step": 12925
},
{
"epoch": 2.324955116696589,
"grad_norm": 28.158784866333008,
"learning_rate": 2.7884380610412927e-05,
"loss": 13.7636,
"step": 12950
},
{
"epoch": 2.329443447037702,
"grad_norm": 28.323238372802734,
"learning_rate": 2.7938240574506286e-05,
"loss": 13.3848,
"step": 12975
},
{
"epoch": 2.3339317773788153,
"grad_norm": 28.48469352722168,
"learning_rate": 2.799210053859964e-05,
"loss": 13.6163,
"step": 13000
},
{
"epoch": 2.3384201077199283,
"grad_norm": 26.27099609375,
"learning_rate": 2.8045960502692996e-05,
"loss": 13.3823,
"step": 13025
},
{
"epoch": 2.342908438061041,
"grad_norm": 27.050186157226562,
"learning_rate": 2.8099820466786355e-05,
"loss": 13.6565,
"step": 13050
},
{
"epoch": 2.347396768402154,
"grad_norm": 26.83416748046875,
"learning_rate": 2.8153680430879714e-05,
"loss": 13.5032,
"step": 13075
},
{
"epoch": 2.3518850987432676,
"grad_norm": 25.751502990722656,
"learning_rate": 2.8207540394973073e-05,
"loss": 13.5242,
"step": 13100
},
{
"epoch": 2.3563734290843805,
"grad_norm": 27.54896354675293,
"learning_rate": 2.8261400359066428e-05,
"loss": 13.6656,
"step": 13125
},
{
"epoch": 2.360861759425494,
"grad_norm": 29.93552017211914,
"learning_rate": 2.8315260323159787e-05,
"loss": 13.7181,
"step": 13150
},
{
"epoch": 2.365350089766607,
"grad_norm": 34.247100830078125,
"learning_rate": 2.8369120287253142e-05,
"loss": 13.3388,
"step": 13175
},
{
"epoch": 2.36983842010772,
"grad_norm": 27.253677368164062,
"learning_rate": 2.8422980251346498e-05,
"loss": 12.8927,
"step": 13200
},
{
"epoch": 2.374326750448833,
"grad_norm": 26.714345932006836,
"learning_rate": 2.8476840215439856e-05,
"loss": 13.1986,
"step": 13225
},
{
"epoch": 2.378815080789946,
"grad_norm": 28.791046142578125,
"learning_rate": 2.8530700179533215e-05,
"loss": 13.3162,
"step": 13250
},
{
"epoch": 2.383303411131059,
"grad_norm": 27.82441520690918,
"learning_rate": 2.8584560143626574e-05,
"loss": 13.6409,
"step": 13275
},
{
"epoch": 2.3877917414721725,
"grad_norm": 27.760778427124023,
"learning_rate": 2.863842010771993e-05,
"loss": 13.4591,
"step": 13300
},
{
"epoch": 2.3922800718132855,
"grad_norm": 35.298912048339844,
"learning_rate": 2.8692280071813285e-05,
"loss": 13.5868,
"step": 13325
},
{
"epoch": 2.3967684021543985,
"grad_norm": 29.174081802368164,
"learning_rate": 2.8746140035906643e-05,
"loss": 12.9569,
"step": 13350
},
{
"epoch": 2.401256732495512,
"grad_norm": 28.78097152709961,
"learning_rate": 2.88e-05,
"loss": 13.405,
"step": 13375
},
{
"epoch": 2.405745062836625,
"grad_norm": 28.48590660095215,
"learning_rate": 2.8853859964093357e-05,
"loss": 13.8227,
"step": 13400
},
{
"epoch": 2.4102333931777378,
"grad_norm": 27.466550827026367,
"learning_rate": 2.8907719928186716e-05,
"loss": 13.3373,
"step": 13425
},
{
"epoch": 2.414721723518851,
"grad_norm": 26.298185348510742,
"learning_rate": 2.8961579892280075e-05,
"loss": 13.3942,
"step": 13450
},
{
"epoch": 2.419210053859964,
"grad_norm": 27.673166275024414,
"learning_rate": 2.901543985637343e-05,
"loss": 13.3092,
"step": 13475
},
{
"epoch": 2.423698384201077,
"grad_norm": 27.58799171447754,
"learning_rate": 2.9069299820466786e-05,
"loss": 13.2923,
"step": 13500
},
{
"epoch": 2.4281867145421905,
"grad_norm": 28.616209030151367,
"learning_rate": 2.9123159784560144e-05,
"loss": 13.7892,
"step": 13525
},
{
"epoch": 2.4326750448833034,
"grad_norm": 27.34395980834961,
"learning_rate": 2.91770197486535e-05,
"loss": 13.4703,
"step": 13550
},
{
"epoch": 2.4371633752244164,
"grad_norm": 27.241291046142578,
"learning_rate": 2.923087971274686e-05,
"loss": 13.6906,
"step": 13575
},
{
"epoch": 2.44165170556553,
"grad_norm": 31.22068214416504,
"learning_rate": 2.9284739676840217e-05,
"loss": 13.0538,
"step": 13600
},
{
"epoch": 2.4461400359066428,
"grad_norm": 27.56983184814453,
"learning_rate": 2.9338599640933573e-05,
"loss": 13.4391,
"step": 13625
},
{
"epoch": 2.4506283662477557,
"grad_norm": 27.46451187133789,
"learning_rate": 2.939245960502693e-05,
"loss": 13.4247,
"step": 13650
},
{
"epoch": 2.455116696588869,
"grad_norm": 27.22041893005371,
"learning_rate": 2.9446319569120287e-05,
"loss": 13.2423,
"step": 13675
},
{
"epoch": 2.459605026929982,
"grad_norm": 44.078704833984375,
"learning_rate": 2.9500179533213646e-05,
"loss": 12.9909,
"step": 13700
},
{
"epoch": 2.464093357271095,
"grad_norm": 28.11593246459961,
"learning_rate": 2.9554039497307e-05,
"loss": 13.2222,
"step": 13725
},
{
"epoch": 2.4685816876122084,
"grad_norm": 28.899824142456055,
"learning_rate": 2.960789946140036e-05,
"loss": 13.4138,
"step": 13750
},
{
"epoch": 2.4730700179533214,
"grad_norm": 27.567039489746094,
"learning_rate": 2.966175942549372e-05,
"loss": 13.5078,
"step": 13775
},
{
"epoch": 2.4775583482944343,
"grad_norm": 26.155046463012695,
"learning_rate": 2.9715619389587074e-05,
"loss": 13.2964,
"step": 13800
},
{
"epoch": 2.4820466786355477,
"grad_norm": 26.821226119995117,
"learning_rate": 2.9769479353680433e-05,
"loss": 13.7765,
"step": 13825
},
{
"epoch": 2.4865350089766607,
"grad_norm": 28.220781326293945,
"learning_rate": 2.9823339317773788e-05,
"loss": 13.6587,
"step": 13850
},
{
"epoch": 2.4910233393177736,
"grad_norm": 29.53750228881836,
"learning_rate": 2.9877199281867147e-05,
"loss": 13.3947,
"step": 13875
},
{
"epoch": 2.495511669658887,
"grad_norm": 26.887174606323242,
"learning_rate": 2.9931059245960502e-05,
"loss": 13.1447,
"step": 13900
},
{
"epoch": 2.5,
"grad_norm": 27.31348419189453,
"learning_rate": 2.998491921005386e-05,
"loss": 13.2417,
"step": 13925
},
{
"epoch": 2.504488330341113,
"grad_norm": 30.33110809326172,
"learning_rate": 2.9995691202872533e-05,
"loss": 13.3828,
"step": 13950
},
{
"epoch": 2.5089766606822264,
"grad_norm": 27.644296646118164,
"learning_rate": 2.9989706762417715e-05,
"loss": 13.1784,
"step": 13975
},
{
"epoch": 2.5134649910233393,
"grad_norm": 29.11074447631836,
"learning_rate": 2.9983722321962897e-05,
"loss": 13.2607,
"step": 14000
},
{
"epoch": 2.5179533213644523,
"grad_norm": 27.747520446777344,
"learning_rate": 2.997773788150808e-05,
"loss": 13.5471,
"step": 14025
},
{
"epoch": 2.5224416517055657,
"grad_norm": 26.138446807861328,
"learning_rate": 2.9971753441053262e-05,
"loss": 13.6022,
"step": 14050
},
{
"epoch": 2.5269299820466786,
"grad_norm": 27.328989028930664,
"learning_rate": 2.9965769000598445e-05,
"loss": 13.1897,
"step": 14075
},
{
"epoch": 2.5314183123877916,
"grad_norm": 26.998350143432617,
"learning_rate": 2.9959784560143627e-05,
"loss": 13.4755,
"step": 14100
},
{
"epoch": 2.535906642728905,
"grad_norm": 27.311878204345703,
"learning_rate": 2.995380011968881e-05,
"loss": 12.9735,
"step": 14125
},
{
"epoch": 2.540394973070018,
"grad_norm": 25.830198287963867,
"learning_rate": 2.994781567923399e-05,
"loss": 13.352,
"step": 14150
},
{
"epoch": 2.5448833034111313,
"grad_norm": 26.87948989868164,
"learning_rate": 2.9941831238779177e-05,
"loss": 13.4053,
"step": 14175
},
{
"epoch": 2.5493716337522443,
"grad_norm": 28.717430114746094,
"learning_rate": 2.993584679832436e-05,
"loss": 13.0502,
"step": 14200
},
{
"epoch": 2.5538599640933572,
"grad_norm": 26.697654724121094,
"learning_rate": 2.992986235786954e-05,
"loss": 13.109,
"step": 14225
},
{
"epoch": 2.55834829443447,
"grad_norm": 27.923320770263672,
"learning_rate": 2.992387791741472e-05,
"loss": 13.0734,
"step": 14250
},
{
"epoch": 2.5628366247755836,
"grad_norm": 28.750234603881836,
"learning_rate": 2.9917893476959903e-05,
"loss": 13.2513,
"step": 14275
},
{
"epoch": 2.5673249551166966,
"grad_norm": 28.91152000427246,
"learning_rate": 2.9911909036505086e-05,
"loss": 13.0385,
"step": 14300
},
{
"epoch": 2.57181328545781,
"grad_norm": 26.92072105407715,
"learning_rate": 2.990592459605027e-05,
"loss": 13.289,
"step": 14325
},
{
"epoch": 2.576301615798923,
"grad_norm": 26.42039680480957,
"learning_rate": 2.9899940155595454e-05,
"loss": 13.3365,
"step": 14350
},
{
"epoch": 2.580789946140036,
"grad_norm": 26.49629783630371,
"learning_rate": 2.9893955715140636e-05,
"loss": 13.0466,
"step": 14375
},
{
"epoch": 2.585278276481149,
"grad_norm": 26.710182189941406,
"learning_rate": 2.988797127468582e-05,
"loss": 13.196,
"step": 14400
},
{
"epoch": 2.5897666068222622,
"grad_norm": 28.95528793334961,
"learning_rate": 2.9881986834230998e-05,
"loss": 13.2839,
"step": 14425
},
{
"epoch": 2.594254937163375,
"grad_norm": 27.436601638793945,
"learning_rate": 2.9876002393776183e-05,
"loss": 13.195,
"step": 14450
},
{
"epoch": 2.5987432675044886,
"grad_norm": 27.884984970092773,
"learning_rate": 2.9870017953321366e-05,
"loss": 13.3211,
"step": 14475
},
{
"epoch": 2.6032315978456015,
"grad_norm": 28.28389549255371,
"learning_rate": 2.9864033512866548e-05,
"loss": 13.0379,
"step": 14500
},
{
"epoch": 2.6077199281867145,
"grad_norm": 33.531089782714844,
"learning_rate": 2.985804907241173e-05,
"loss": 13.297,
"step": 14525
},
{
"epoch": 2.6122082585278275,
"grad_norm": 27.326101303100586,
"learning_rate": 2.9852064631956913e-05,
"loss": 13.3307,
"step": 14550
},
{
"epoch": 2.616696588868941,
"grad_norm": 26.402788162231445,
"learning_rate": 2.9846080191502095e-05,
"loss": 13.2087,
"step": 14575
},
{
"epoch": 2.621184919210054,
"grad_norm": 28.52970314025879,
"learning_rate": 2.984009575104728e-05,
"loss": 13.0077,
"step": 14600
},
{
"epoch": 2.625673249551167,
"grad_norm": 26.127384185791016,
"learning_rate": 2.983411131059246e-05,
"loss": 12.8619,
"step": 14625
},
{
"epoch": 2.63016157989228,
"grad_norm": 26.900188446044922,
"learning_rate": 2.9828126870137642e-05,
"loss": 13.0219,
"step": 14650
},
{
"epoch": 2.634649910233393,
"grad_norm": 28.075593948364258,
"learning_rate": 2.9822142429682825e-05,
"loss": 13.0576,
"step": 14675
},
{
"epoch": 2.639138240574506,
"grad_norm": 27.4871883392334,
"learning_rate": 2.9816157989228007e-05,
"loss": 13.3393,
"step": 14700
},
{
"epoch": 2.6436265709156195,
"grad_norm": 26.82506561279297,
"learning_rate": 2.981017354877319e-05,
"loss": 13.2037,
"step": 14725
},
{
"epoch": 2.6481149012567324,
"grad_norm": 27.90208625793457,
"learning_rate": 2.9804189108318375e-05,
"loss": 13.2741,
"step": 14750
},
{
"epoch": 2.652603231597846,
"grad_norm": 27.409181594848633,
"learning_rate": 2.9798204667863557e-05,
"loss": 13.0042,
"step": 14775
},
{
"epoch": 2.657091561938959,
"grad_norm": 26.863079071044922,
"learning_rate": 2.979222022740874e-05,
"loss": 12.9851,
"step": 14800
},
{
"epoch": 2.6615798922800717,
"grad_norm": 27.66518211364746,
"learning_rate": 2.978623578695392e-05,
"loss": 13.1473,
"step": 14825
},
{
"epoch": 2.6660682226211847,
"grad_norm": 31.207706451416016,
"learning_rate": 2.97802513464991e-05,
"loss": 13.6109,
"step": 14850
},
{
"epoch": 2.670556552962298,
"grad_norm": 26.27522087097168,
"learning_rate": 2.9774266906044287e-05,
"loss": 12.7777,
"step": 14875
},
{
"epoch": 2.675044883303411,
"grad_norm": 28.05002784729004,
"learning_rate": 2.976828246558947e-05,
"loss": 13.4278,
"step": 14900
},
{
"epoch": 2.6795332136445245,
"grad_norm": 27.554943084716797,
"learning_rate": 2.976229802513465e-05,
"loss": 13.4759,
"step": 14925
},
{
"epoch": 2.6840215439856374,
"grad_norm": 28.544275283813477,
"learning_rate": 2.9756313584679834e-05,
"loss": 13.3619,
"step": 14950
},
{
"epoch": 2.6885098743267504,
"grad_norm": 26.328645706176758,
"learning_rate": 2.9750329144225016e-05,
"loss": 13.3435,
"step": 14975
},
{
"epoch": 2.6929982046678633,
"grad_norm": 28.869354248046875,
"learning_rate": 2.97443447037702e-05,
"loss": 12.8979,
"step": 15000
},
{
"epoch": 2.6974865350089767,
"grad_norm": 26.356056213378906,
"learning_rate": 2.973836026331538e-05,
"loss": 13.2308,
"step": 15025
},
{
"epoch": 2.7019748653500897,
"grad_norm": 26.76312828063965,
"learning_rate": 2.9732375822860563e-05,
"loss": 13.4595,
"step": 15050
},
{
"epoch": 2.706463195691203,
"grad_norm": 26.2951717376709,
"learning_rate": 2.9726391382405746e-05,
"loss": 12.9706,
"step": 15075
},
{
"epoch": 2.710951526032316,
"grad_norm": 26.80390739440918,
"learning_rate": 2.9720406941950928e-05,
"loss": 13.0039,
"step": 15100
},
{
"epoch": 2.715439856373429,
"grad_norm": 27.611963272094727,
"learning_rate": 2.971442250149611e-05,
"loss": 12.9145,
"step": 15125
},
{
"epoch": 2.719928186714542,
"grad_norm": 28.494123458862305,
"learning_rate": 2.9708438061041293e-05,
"loss": 13.2181,
"step": 15150
},
{
"epoch": 2.7244165170556554,
"grad_norm": 26.697126388549805,
"learning_rate": 2.970245362058648e-05,
"loss": 13.3677,
"step": 15175
},
{
"epoch": 2.7289048473967683,
"grad_norm": 27.672060012817383,
"learning_rate": 2.9696469180131657e-05,
"loss": 12.834,
"step": 15200
},
{
"epoch": 2.7333931777378817,
"grad_norm": 28.86951446533203,
"learning_rate": 2.969048473967684e-05,
"loss": 13.2779,
"step": 15225
},
{
"epoch": 2.7378815080789947,
"grad_norm": 26.693321228027344,
"learning_rate": 2.9684500299222022e-05,
"loss": 13.1227,
"step": 15250
},
{
"epoch": 2.7423698384201076,
"grad_norm": 27.52298927307129,
"learning_rate": 2.9678515858767205e-05,
"loss": 12.6953,
"step": 15275
},
{
"epoch": 2.7468581687612206,
"grad_norm": 26.291044235229492,
"learning_rate": 2.9672531418312387e-05,
"loss": 13.1016,
"step": 15300
},
{
"epoch": 2.751346499102334,
"grad_norm": 27.562095642089844,
"learning_rate": 2.9666546977857573e-05,
"loss": 13.1168,
"step": 15325
},
{
"epoch": 2.755834829443447,
"grad_norm": 26.268095016479492,
"learning_rate": 2.9660562537402755e-05,
"loss": 13.0068,
"step": 15350
},
{
"epoch": 2.7603231597845603,
"grad_norm": 27.220062255859375,
"learning_rate": 2.9654578096947937e-05,
"loss": 12.8645,
"step": 15375
},
{
"epoch": 2.7648114901256733,
"grad_norm": 27.46253776550293,
"learning_rate": 2.9648593656493116e-05,
"loss": 13.3971,
"step": 15400
},
{
"epoch": 2.7692998204667862,
"grad_norm": 26.30550193786621,
"learning_rate": 2.96426092160383e-05,
"loss": 12.8522,
"step": 15425
},
{
"epoch": 2.773788150807899,
"grad_norm": 27.612834930419922,
"learning_rate": 2.9636624775583484e-05,
"loss": 13.5025,
"step": 15450
},
{
"epoch": 2.7782764811490126,
"grad_norm": 26.21208953857422,
"learning_rate": 2.9630640335128667e-05,
"loss": 12.552,
"step": 15475
},
{
"epoch": 2.7827648114901256,
"grad_norm": 27.3443546295166,
"learning_rate": 2.962465589467385e-05,
"loss": 13.1858,
"step": 15500
},
{
"epoch": 2.787253141831239,
"grad_norm": 27.457931518554688,
"learning_rate": 2.961867145421903e-05,
"loss": 13.2157,
"step": 15525
},
{
"epoch": 2.791741472172352,
"grad_norm": 28.71920394897461,
"learning_rate": 2.9612687013764214e-05,
"loss": 13.213,
"step": 15550
},
{
"epoch": 2.796229802513465,
"grad_norm": 25.985244750976562,
"learning_rate": 2.9606702573309396e-05,
"loss": 12.9935,
"step": 15575
},
{
"epoch": 2.800718132854578,
"grad_norm": 25.949575424194336,
"learning_rate": 2.960071813285458e-05,
"loss": 12.9924,
"step": 15600
},
{
"epoch": 2.8052064631956912,
"grad_norm": 26.65997314453125,
"learning_rate": 2.959473369239976e-05,
"loss": 13.1186,
"step": 15625
},
{
"epoch": 2.809694793536804,
"grad_norm": 26.854761123657227,
"learning_rate": 2.9588749251944943e-05,
"loss": 13.0178,
"step": 15650
},
{
"epoch": 2.8141831238779176,
"grad_norm": 26.749004364013672,
"learning_rate": 2.9582764811490126e-05,
"loss": 13.0751,
"step": 15675
},
{
"epoch": 2.8186714542190305,
"grad_norm": 26.282302856445312,
"learning_rate": 2.9576780371035308e-05,
"loss": 12.8906,
"step": 15700
},
{
"epoch": 2.8231597845601435,
"grad_norm": 26.395767211914062,
"learning_rate": 2.957079593058049e-05,
"loss": 13.3278,
"step": 15725
},
{
"epoch": 2.827648114901257,
"grad_norm": 26.95943832397461,
"learning_rate": 2.9564811490125676e-05,
"loss": 12.9831,
"step": 15750
},
{
"epoch": 2.83213644524237,
"grad_norm": 28.028095245361328,
"learning_rate": 2.955882704967086e-05,
"loss": 12.932,
"step": 15775
},
{
"epoch": 2.836624775583483,
"grad_norm": 27.706012725830078,
"learning_rate": 2.9552842609216038e-05,
"loss": 12.7754,
"step": 15800
},
{
"epoch": 2.841113105924596,
"grad_norm": 27.81289291381836,
"learning_rate": 2.954685816876122e-05,
"loss": 12.8489,
"step": 15825
},
{
"epoch": 2.845601436265709,
"grad_norm": 26.71699333190918,
"learning_rate": 2.9540873728306402e-05,
"loss": 12.6091,
"step": 15850
},
{
"epoch": 2.850089766606822,
"grad_norm": 65.94219207763672,
"learning_rate": 2.9534889287851588e-05,
"loss": 13.0053,
"step": 15875
},
{
"epoch": 2.8545780969479355,
"grad_norm": 32.33103561401367,
"learning_rate": 2.952890484739677e-05,
"loss": 12.4834,
"step": 15900
},
{
"epoch": 2.8590664272890485,
"grad_norm": 25.59375,
"learning_rate": 2.9522920406941953e-05,
"loss": 13.0441,
"step": 15925
},
{
"epoch": 2.8635547576301614,
"grad_norm": 26.32273292541504,
"learning_rate": 2.9516935966487135e-05,
"loss": 12.701,
"step": 15950
},
{
"epoch": 2.868043087971275,
"grad_norm": 27.84789276123047,
"learning_rate": 2.9510951526032317e-05,
"loss": 13.1712,
"step": 15975
},
{
"epoch": 2.872531418312388,
"grad_norm": 27.111125946044922,
"learning_rate": 2.9504967085577496e-05,
"loss": 12.7789,
"step": 16000
},
{
"epoch": 2.8770197486535007,
"grad_norm": 26.045406341552734,
"learning_rate": 2.9498982645122682e-05,
"loss": 12.7077,
"step": 16025
},
{
"epoch": 2.881508078994614,
"grad_norm": 26.169029235839844,
"learning_rate": 2.9492998204667865e-05,
"loss": 13.0239,
"step": 16050
},
{
"epoch": 2.885996409335727,
"grad_norm": 26.920217514038086,
"learning_rate": 2.9487013764213047e-05,
"loss": 12.978,
"step": 16075
},
{
"epoch": 2.89048473967684,
"grad_norm": 26.622011184692383,
"learning_rate": 2.948102932375823e-05,
"loss": 12.825,
"step": 16100
},
{
"epoch": 2.8949730700179535,
"grad_norm": 26.462886810302734,
"learning_rate": 2.947504488330341e-05,
"loss": 12.8208,
"step": 16125
},
{
"epoch": 2.8994614003590664,
"grad_norm": 26.220985412597656,
"learning_rate": 2.9469060442848594e-05,
"loss": 13.3692,
"step": 16150
},
{
"epoch": 2.9039497307001794,
"grad_norm": 27.57528305053711,
"learning_rate": 2.946307600239378e-05,
"loss": 12.7889,
"step": 16175
},
{
"epoch": 2.9084380610412928,
"grad_norm": 27.193159103393555,
"learning_rate": 2.945709156193896e-05,
"loss": 12.9411,
"step": 16200
},
{
"epoch": 2.9129263913824057,
"grad_norm": 28.573688507080078,
"learning_rate": 2.945110712148414e-05,
"loss": 13.0207,
"step": 16225
},
{
"epoch": 2.917414721723519,
"grad_norm": 27.21942710876465,
"learning_rate": 2.9445122681029323e-05,
"loss": 12.8121,
"step": 16250
},
{
"epoch": 2.921903052064632,
"grad_norm": 25.42641258239746,
"learning_rate": 2.9439138240574506e-05,
"loss": 13.0534,
"step": 16275
},
{
"epoch": 2.926391382405745,
"grad_norm": 26.955564498901367,
"learning_rate": 2.943315380011969e-05,
"loss": 12.6557,
"step": 16300
},
{
"epoch": 2.930879712746858,
"grad_norm": 26.791296005249023,
"learning_rate": 2.9427169359664874e-05,
"loss": 12.6982,
"step": 16325
},
{
"epoch": 2.9353680430879714,
"grad_norm": 27.43919563293457,
"learning_rate": 2.9421184919210056e-05,
"loss": 13.1458,
"step": 16350
},
{
"epoch": 2.9398563734290843,
"grad_norm": 26.005870819091797,
"learning_rate": 2.941520047875524e-05,
"loss": 13.3099,
"step": 16375
},
{
"epoch": 2.9443447037701977,
"grad_norm": 26.166765213012695,
"learning_rate": 2.9409216038300418e-05,
"loss": 12.7118,
"step": 16400
},
{
"epoch": 2.9488330341113107,
"grad_norm": 26.198945999145508,
"learning_rate": 2.94032315978456e-05,
"loss": 13.0126,
"step": 16425
},
{
"epoch": 2.9533213644524237,
"grad_norm": 27.599916458129883,
"learning_rate": 2.9397247157390786e-05,
"loss": 12.4839,
"step": 16450
},
{
"epoch": 2.9578096947935366,
"grad_norm": 26.379606246948242,
"learning_rate": 2.9391262716935968e-05,
"loss": 13.1094,
"step": 16475
},
{
"epoch": 2.96229802513465,
"grad_norm": 26.30647850036621,
"learning_rate": 2.938527827648115e-05,
"loss": 13.0026,
"step": 16500
},
{
"epoch": 2.966786355475763,
"grad_norm": 27.161256790161133,
"learning_rate": 2.9379293836026333e-05,
"loss": 12.795,
"step": 16525
},
{
"epoch": 2.9712746858168764,
"grad_norm": 27.510034561157227,
"learning_rate": 2.9373309395571515e-05,
"loss": 12.4387,
"step": 16550
},
{
"epoch": 2.9757630161579893,
"grad_norm": 28.14108657836914,
"learning_rate": 2.9367324955116697e-05,
"loss": 13.1286,
"step": 16575
},
{
"epoch": 2.9802513464991023,
"grad_norm": 28.018766403198242,
"learning_rate": 2.936134051466188e-05,
"loss": 12.9119,
"step": 16600
},
{
"epoch": 2.9847396768402152,
"grad_norm": 27.52519416809082,
"learning_rate": 2.9355356074207062e-05,
"loss": 13.2577,
"step": 16625
},
{
"epoch": 2.9892280071813286,
"grad_norm": 26.498538970947266,
"learning_rate": 2.9349371633752245e-05,
"loss": 12.8444,
"step": 16650
},
{
"epoch": 2.9937163375224416,
"grad_norm": 27.386394500732422,
"learning_rate": 2.9343387193297427e-05,
"loss": 12.9318,
"step": 16675
},
{
"epoch": 2.998204667863555,
"grad_norm": 29.109481811523438,
"learning_rate": 2.933740275284261e-05,
"loss": 13.3042,
"step": 16700
},
{
"epoch": 3.0,
"eval_accuracy": 0.07543963492639337,
"eval_f1_macro": 0.005658449303781104,
"eval_f1_micro": 0.07543963492639337,
"eval_f1_weighted": 0.04143719976295692,
"eval_loss": 7.06182861328125,
"eval_precision_macro": 0.005690112768572151,
"eval_precision_micro": 0.07543963492639337,
"eval_precision_weighted": 0.0367941063687332,
"eval_recall_macro": 0.009608965585832425,
"eval_recall_micro": 0.07543963492639337,
"eval_recall_weighted": 0.07543963492639337,
"eval_runtime": 86.5339,
"eval_samples_per_second": 605.231,
"eval_steps_per_second": 18.917,
"step": 16710
},
{
"epoch": 3.002692998204668,
"grad_norm": 26.483478546142578,
"learning_rate": 2.933141831238779e-05,
"loss": 11.5651,
"step": 16725
},
{
"epoch": 3.007181328545781,
"grad_norm": 28.335594177246094,
"learning_rate": 2.9325433871932977e-05,
"loss": 11.4552,
"step": 16750
},
{
"epoch": 3.011669658886894,
"grad_norm": 26.723102569580078,
"learning_rate": 2.931944943147816e-05,
"loss": 11.1541,
"step": 16775
},
{
"epoch": 3.0161579892280073,
"grad_norm": 28.930675506591797,
"learning_rate": 2.931346499102334e-05,
"loss": 11.2597,
"step": 16800
},
{
"epoch": 3.02064631956912,
"grad_norm": 30.39067268371582,
"learning_rate": 2.930748055056852e-05,
"loss": 11.1364,
"step": 16825
},
{
"epoch": 3.025134649910233,
"grad_norm": 29.515583038330078,
"learning_rate": 2.9301496110113703e-05,
"loss": 11.1398,
"step": 16850
},
{
"epoch": 3.0296229802513466,
"grad_norm": 29.533111572265625,
"learning_rate": 2.929551166965889e-05,
"loss": 11.3146,
"step": 16875
},
{
"epoch": 3.0341113105924595,
"grad_norm": 28.315011978149414,
"learning_rate": 2.928952722920407e-05,
"loss": 10.9237,
"step": 16900
},
{
"epoch": 3.0385996409335725,
"grad_norm": 27.643081665039062,
"learning_rate": 2.9283542788749254e-05,
"loss": 11.0213,
"step": 16925
},
{
"epoch": 3.043087971274686,
"grad_norm": 30.351112365722656,
"learning_rate": 2.9277558348294436e-05,
"loss": 11.2419,
"step": 16950
},
{
"epoch": 3.047576301615799,
"grad_norm": 31.334726333618164,
"learning_rate": 2.927157390783962e-05,
"loss": 11.1816,
"step": 16975
},
{
"epoch": 3.0520646319569122,
"grad_norm": 28.574382781982422,
"learning_rate": 2.9265589467384798e-05,
"loss": 10.8629,
"step": 17000
},
{
"epoch": 3.056552962298025,
"grad_norm": 30.646869659423828,
"learning_rate": 2.9259605026929983e-05,
"loss": 11.0935,
"step": 17025
},
{
"epoch": 3.061041292639138,
"grad_norm": 38.04651641845703,
"learning_rate": 2.9253620586475166e-05,
"loss": 11.3436,
"step": 17050
},
{
"epoch": 3.0655296229802516,
"grad_norm": 29.01982307434082,
"learning_rate": 2.9247636146020348e-05,
"loss": 11.0168,
"step": 17075
},
{
"epoch": 3.0700179533213645,
"grad_norm": 31.702123641967773,
"learning_rate": 2.924165170556553e-05,
"loss": 11.1176,
"step": 17100
},
{
"epoch": 3.0745062836624775,
"grad_norm": 31.976844787597656,
"learning_rate": 2.9235667265110713e-05,
"loss": 11.5311,
"step": 17125
},
{
"epoch": 3.078994614003591,
"grad_norm": 29.563053131103516,
"learning_rate": 2.9229682824655895e-05,
"loss": 10.9299,
"step": 17150
},
{
"epoch": 3.083482944344704,
"grad_norm": 31.436248779296875,
"learning_rate": 2.9223698384201077e-05,
"loss": 11.1627,
"step": 17175
},
{
"epoch": 3.087971274685817,
"grad_norm": 30.475858688354492,
"learning_rate": 2.921771394374626e-05,
"loss": 11.3677,
"step": 17200
},
{
"epoch": 3.09245960502693,
"grad_norm": 29.236719131469727,
"learning_rate": 2.9211729503291442e-05,
"loss": 11.2419,
"step": 17225
},
{
"epoch": 3.096947935368043,
"grad_norm": 32.13743209838867,
"learning_rate": 2.9205745062836625e-05,
"loss": 11.1695,
"step": 17250
},
{
"epoch": 3.101436265709156,
"grad_norm": 31.184057235717773,
"learning_rate": 2.9199760622381807e-05,
"loss": 10.8847,
"step": 17275
},
{
"epoch": 3.1059245960502695,
"grad_norm": 35.40129852294922,
"learning_rate": 2.9193776181926993e-05,
"loss": 11.4488,
"step": 17300
},
{
"epoch": 3.1104129263913824,
"grad_norm": 31.04747772216797,
"learning_rate": 2.9187791741472175e-05,
"loss": 11.3415,
"step": 17325
},
{
"epoch": 3.1149012567324954,
"grad_norm": 30.742427825927734,
"learning_rate": 2.9181807301017357e-05,
"loss": 11.0896,
"step": 17350
},
{
"epoch": 3.119389587073609,
"grad_norm": 29.326475143432617,
"learning_rate": 2.9175822860562536e-05,
"loss": 11.2907,
"step": 17375
},
{
"epoch": 3.1238779174147218,
"grad_norm": 33.5991325378418,
"learning_rate": 2.916983842010772e-05,
"loss": 11.0482,
"step": 17400
},
{
"epoch": 3.1283662477558347,
"grad_norm": 32.41011428833008,
"learning_rate": 2.91638539796529e-05,
"loss": 10.9863,
"step": 17425
},
{
"epoch": 3.132854578096948,
"grad_norm": 29.50647735595703,
"learning_rate": 2.9157869539198087e-05,
"loss": 11.4484,
"step": 17450
},
{
"epoch": 3.137342908438061,
"grad_norm": 30.07097625732422,
"learning_rate": 2.915188509874327e-05,
"loss": 10.8256,
"step": 17475
},
{
"epoch": 3.141831238779174,
"grad_norm": 28.87929344177246,
"learning_rate": 2.914590065828845e-05,
"loss": 10.7541,
"step": 17500
},
{
"epoch": 3.1463195691202874,
"grad_norm": 34.39598083496094,
"learning_rate": 2.9139916217833634e-05,
"loss": 11.3925,
"step": 17525
},
{
"epoch": 3.1508078994614004,
"grad_norm": 30.967477798461914,
"learning_rate": 2.9133931777378816e-05,
"loss": 11.0977,
"step": 17550
},
{
"epoch": 3.1552962298025133,
"grad_norm": 32.6968879699707,
"learning_rate": 2.9127947336923995e-05,
"loss": 10.9783,
"step": 17575
},
{
"epoch": 3.1597845601436267,
"grad_norm": 30.194917678833008,
"learning_rate": 2.9122202274087372e-05,
"loss": 11.0674,
"step": 17600
},
{
"epoch": 3.1642728904847397,
"grad_norm": 29.895421981811523,
"learning_rate": 2.9116217833632558e-05,
"loss": 11.3506,
"step": 17625
},
{
"epoch": 3.1687612208258527,
"grad_norm": 30.785797119140625,
"learning_rate": 2.911023339317774e-05,
"loss": 11.044,
"step": 17650
},
{
"epoch": 3.173249551166966,
"grad_norm": 31.407691955566406,
"learning_rate": 2.9104248952722923e-05,
"loss": 10.9295,
"step": 17675
},
{
"epoch": 3.177737881508079,
"grad_norm": 29.658754348754883,
"learning_rate": 2.9098264512268102e-05,
"loss": 11.2406,
"step": 17700
},
{
"epoch": 3.182226211849192,
"grad_norm": 30.043371200561523,
"learning_rate": 2.9092280071813284e-05,
"loss": 10.9478,
"step": 17725
},
{
"epoch": 3.1867145421903054,
"grad_norm": 31.360021591186523,
"learning_rate": 2.908629563135847e-05,
"loss": 10.8657,
"step": 17750
},
{
"epoch": 3.1912028725314183,
"grad_norm": 31.64422035217285,
"learning_rate": 2.9080311190903652e-05,
"loss": 10.8207,
"step": 17775
},
{
"epoch": 3.1956912028725313,
"grad_norm": 30.953533172607422,
"learning_rate": 2.9074326750448835e-05,
"loss": 11.0474,
"step": 17800
},
{
"epoch": 3.2001795332136447,
"grad_norm": 29.29545783996582,
"learning_rate": 2.9068342309994017e-05,
"loss": 11.4172,
"step": 17825
},
{
"epoch": 3.2046678635547576,
"grad_norm": 28.73203468322754,
"learning_rate": 2.90623578695392e-05,
"loss": 11.0947,
"step": 17850
},
{
"epoch": 3.2091561938958706,
"grad_norm": 29.092605590820312,
"learning_rate": 2.905637342908438e-05,
"loss": 10.7874,
"step": 17875
},
{
"epoch": 3.213644524236984,
"grad_norm": 30.759441375732422,
"learning_rate": 2.9050388988629564e-05,
"loss": 11.1644,
"step": 17900
},
{
"epoch": 3.218132854578097,
"grad_norm": 31.628297805786133,
"learning_rate": 2.9044404548174746e-05,
"loss": 11.6035,
"step": 17925
},
{
"epoch": 3.22262118491921,
"grad_norm": 32.346553802490234,
"learning_rate": 2.903842010771993e-05,
"loss": 10.9969,
"step": 17950
},
{
"epoch": 3.2271095152603233,
"grad_norm": 29.345993041992188,
"learning_rate": 2.903243566726511e-05,
"loss": 11.2449,
"step": 17975
},
{
"epoch": 3.2315978456014363,
"grad_norm": 36.96156311035156,
"learning_rate": 2.9026451226810293e-05,
"loss": 11.157,
"step": 18000
},
{
"epoch": 3.236086175942549,
"grad_norm": 31.43854522705078,
"learning_rate": 2.9020466786355476e-05,
"loss": 11.2116,
"step": 18025
},
{
"epoch": 3.2405745062836626,
"grad_norm": 31.491018295288086,
"learning_rate": 2.901448234590066e-05,
"loss": 10.9847,
"step": 18050
},
{
"epoch": 3.2450628366247756,
"grad_norm": 31.342721939086914,
"learning_rate": 2.900849790544584e-05,
"loss": 11.0106,
"step": 18075
},
{
"epoch": 3.2495511669658885,
"grad_norm": 31.182981491088867,
"learning_rate": 2.9002513464991023e-05,
"loss": 11.2681,
"step": 18100
},
{
"epoch": 3.254039497307002,
"grad_norm": 31.756725311279297,
"learning_rate": 2.8996529024536205e-05,
"loss": 11.1072,
"step": 18125
},
{
"epoch": 3.258527827648115,
"grad_norm": 28.509653091430664,
"learning_rate": 2.8990544584081388e-05,
"loss": 11.184,
"step": 18150
},
{
"epoch": 3.263016157989228,
"grad_norm": 31.49736785888672,
"learning_rate": 2.8984560143626573e-05,
"loss": 11.2929,
"step": 18175
},
{
"epoch": 3.2675044883303412,
"grad_norm": 29.734132766723633,
"learning_rate": 2.8978575703171756e-05,
"loss": 11.1909,
"step": 18200
},
{
"epoch": 3.271992818671454,
"grad_norm": 31.356077194213867,
"learning_rate": 2.8972591262716938e-05,
"loss": 11.468,
"step": 18225
},
{
"epoch": 3.276481149012567,
"grad_norm": 28.490388870239258,
"learning_rate": 2.896660682226212e-05,
"loss": 11.3047,
"step": 18250
},
{
"epoch": 3.2809694793536806,
"grad_norm": 30.780508041381836,
"learning_rate": 2.89606223818073e-05,
"loss": 11.3076,
"step": 18275
},
{
"epoch": 3.2854578096947935,
"grad_norm": 29.654769897460938,
"learning_rate": 2.8954637941352482e-05,
"loss": 11.0864,
"step": 18300
},
{
"epoch": 3.2899461400359065,
"grad_norm": 31.67804718017578,
"learning_rate": 2.8948653500897668e-05,
"loss": 11.2724,
"step": 18325
},
{
"epoch": 3.29443447037702,
"grad_norm": 29.71087646484375,
"learning_rate": 2.894266906044285e-05,
"loss": 11.3839,
"step": 18350
},
{
"epoch": 3.298922800718133,
"grad_norm": 30.625585556030273,
"learning_rate": 2.8936684619988032e-05,
"loss": 11.1948,
"step": 18375
},
{
"epoch": 3.3034111310592458,
"grad_norm": 33.840885162353516,
"learning_rate": 2.8930700179533215e-05,
"loss": 11.5731,
"step": 18400
},
{
"epoch": 3.307899461400359,
"grad_norm": 31.30687713623047,
"learning_rate": 2.8924715739078397e-05,
"loss": 11.3931,
"step": 18425
},
{
"epoch": 3.312387791741472,
"grad_norm": 30.306846618652344,
"learning_rate": 2.891873129862358e-05,
"loss": 11.3634,
"step": 18450
},
{
"epoch": 3.316876122082585,
"grad_norm": 31.10429573059082,
"learning_rate": 2.891274685816876e-05,
"loss": 11.383,
"step": 18475
},
{
"epoch": 3.3213644524236985,
"grad_norm": 31.466232299804688,
"learning_rate": 2.8906762417713944e-05,
"loss": 11.3213,
"step": 18500
},
{
"epoch": 3.3258527827648114,
"grad_norm": 31.928709030151367,
"learning_rate": 2.8900777977259126e-05,
"loss": 11.4474,
"step": 18525
},
{
"epoch": 3.3303411131059244,
"grad_norm": 31.45096778869629,
"learning_rate": 2.889479353680431e-05,
"loss": 11.3677,
"step": 18550
},
{
"epoch": 3.334829443447038,
"grad_norm": 31.321134567260742,
"learning_rate": 2.888880909634949e-05,
"loss": 11.0314,
"step": 18575
},
{
"epoch": 3.3393177737881508,
"grad_norm": 30.80310821533203,
"learning_rate": 2.8882824655894673e-05,
"loss": 11.3432,
"step": 18600
},
{
"epoch": 3.343806104129264,
"grad_norm": 33.093849182128906,
"learning_rate": 2.887684021543986e-05,
"loss": 11.4452,
"step": 18625
},
{
"epoch": 3.348294434470377,
"grad_norm": 30.316701889038086,
"learning_rate": 2.887085577498504e-05,
"loss": 11.1295,
"step": 18650
},
{
"epoch": 3.35278276481149,
"grad_norm": 30.940135955810547,
"learning_rate": 2.886487133453022e-05,
"loss": 11.2617,
"step": 18675
},
{
"epoch": 3.357271095152603,
"grad_norm": 28.96495246887207,
"learning_rate": 2.8858886894075403e-05,
"loss": 11.4463,
"step": 18700
},
{
"epoch": 3.3617594254937164,
"grad_norm": 30.461139678955078,
"learning_rate": 2.8852902453620585e-05,
"loss": 11.3384,
"step": 18725
},
{
"epoch": 3.3662477558348294,
"grad_norm": 30.79012680053711,
"learning_rate": 2.884691801316577e-05,
"loss": 11.3678,
"step": 18750
},
{
"epoch": 3.370736086175943,
"grad_norm": 31.620222091674805,
"learning_rate": 2.8840933572710953e-05,
"loss": 11.0789,
"step": 18775
},
{
"epoch": 3.3752244165170557,
"grad_norm": 29.551908493041992,
"learning_rate": 2.8834949132256136e-05,
"loss": 11.2116,
"step": 18800
},
{
"epoch": 3.3797127468581687,
"grad_norm": 31.130882263183594,
"learning_rate": 2.8828964691801318e-05,
"loss": 11.4108,
"step": 18825
},
{
"epoch": 3.3842010771992816,
"grad_norm": 30.52980613708496,
"learning_rate": 2.88229802513465e-05,
"loss": 11.2848,
"step": 18850
},
{
"epoch": 3.388689407540395,
"grad_norm": 31.423954010009766,
"learning_rate": 2.881699581089168e-05,
"loss": 11.1228,
"step": 18875
},
{
"epoch": 3.393177737881508,
"grad_norm": 30.197856903076172,
"learning_rate": 2.8811011370436865e-05,
"loss": 10.955,
"step": 18900
},
{
"epoch": 3.3976660682226214,
"grad_norm": 29.411909103393555,
"learning_rate": 2.8805026929982048e-05,
"loss": 11.4046,
"step": 18925
},
{
"epoch": 3.4021543985637344,
"grad_norm": 30.500823974609375,
"learning_rate": 2.879904248952723e-05,
"loss": 11.3617,
"step": 18950
},
{
"epoch": 3.4066427289048473,
"grad_norm": 31.399059295654297,
"learning_rate": 2.8793058049072412e-05,
"loss": 11.4427,
"step": 18975
},
{
"epoch": 3.4111310592459603,
"grad_norm": 30.890851974487305,
"learning_rate": 2.8787073608617595e-05,
"loss": 11.4184,
"step": 19000
},
{
"epoch": 3.4156193895870737,
"grad_norm": 31.299579620361328,
"learning_rate": 2.8781089168162777e-05,
"loss": 11.3553,
"step": 19025
},
{
"epoch": 3.4201077199281866,
"grad_norm": 30.379802703857422,
"learning_rate": 2.8775104727707963e-05,
"loss": 11.2289,
"step": 19050
},
{
"epoch": 3.4245960502693,
"grad_norm": 30.748916625976562,
"learning_rate": 2.8769120287253142e-05,
"loss": 10.9974,
"step": 19075
},
{
"epoch": 3.429084380610413,
"grad_norm": 30.164533615112305,
"learning_rate": 2.8763135846798324e-05,
"loss": 11.2552,
"step": 19100
},
{
"epoch": 3.433572710951526,
"grad_norm": 30.67738914489746,
"learning_rate": 2.8757151406343506e-05,
"loss": 11.514,
"step": 19125
},
{
"epoch": 3.438061041292639,
"grad_norm": 31.51483154296875,
"learning_rate": 2.875116696588869e-05,
"loss": 11.4153,
"step": 19150
},
{
"epoch": 3.4425493716337523,
"grad_norm": 32.316654205322266,
"learning_rate": 2.8745182525433875e-05,
"loss": 11.5824,
"step": 19175
},
{
"epoch": 3.4470377019748653,
"grad_norm": 31.41953468322754,
"learning_rate": 2.8739198084979057e-05,
"loss": 11.259,
"step": 19200
},
{
"epoch": 3.4515260323159787,
"grad_norm": 32.805870056152344,
"learning_rate": 2.873321364452424e-05,
"loss": 11.2999,
"step": 19225
},
{
"epoch": 3.4560143626570916,
"grad_norm": 32.010826110839844,
"learning_rate": 2.872722920406942e-05,
"loss": 10.9906,
"step": 19250
},
{
"epoch": 3.4605026929982046,
"grad_norm": 33.595767974853516,
"learning_rate": 2.87212447636146e-05,
"loss": 11.2258,
"step": 19275
},
{
"epoch": 3.464991023339318,
"grad_norm": 31.329288482666016,
"learning_rate": 2.8715260323159783e-05,
"loss": 11.2326,
"step": 19300
},
{
"epoch": 3.469479353680431,
"grad_norm": 31.369930267333984,
"learning_rate": 2.870927588270497e-05,
"loss": 11.2166,
"step": 19325
},
{
"epoch": 3.473967684021544,
"grad_norm": 30.896013259887695,
"learning_rate": 2.870329144225015e-05,
"loss": 11.1363,
"step": 19350
},
{
"epoch": 3.4784560143626573,
"grad_norm": 31.08727264404297,
"learning_rate": 2.8697307001795333e-05,
"loss": 11.4698,
"step": 19375
},
{
"epoch": 3.4829443447037702,
"grad_norm": 28.412425994873047,
"learning_rate": 2.8691322561340516e-05,
"loss": 11.0108,
"step": 19400
},
{
"epoch": 3.487432675044883,
"grad_norm": 31.50676155090332,
"learning_rate": 2.8685338120885698e-05,
"loss": 10.8741,
"step": 19425
},
{
"epoch": 3.4919210053859966,
"grad_norm": 28.88292694091797,
"learning_rate": 2.867935368043088e-05,
"loss": 11.1386,
"step": 19450
},
{
"epoch": 3.4964093357271095,
"grad_norm": 30.06254005432129,
"learning_rate": 2.8673369239976063e-05,
"loss": 11.1929,
"step": 19475
},
{
"epoch": 3.5008976660682225,
"grad_norm": 34.148311614990234,
"learning_rate": 2.8667384799521245e-05,
"loss": 11.339,
"step": 19500
},
{
"epoch": 3.505385996409336,
"grad_norm": 33.28491973876953,
"learning_rate": 2.8661400359066428e-05,
"loss": 11.1095,
"step": 19525
},
{
"epoch": 3.509874326750449,
"grad_norm": 33.306026458740234,
"learning_rate": 2.865541591861161e-05,
"loss": 11.0814,
"step": 19550
},
{
"epoch": 3.514362657091562,
"grad_norm": 31.115873336791992,
"learning_rate": 2.8649431478156792e-05,
"loss": 11.2325,
"step": 19575
},
{
"epoch": 3.5188509874326748,
"grad_norm": 31.66822052001953,
"learning_rate": 2.8643447037701978e-05,
"loss": 11.1676,
"step": 19600
},
{
"epoch": 3.523339317773788,
"grad_norm": 29.544313430786133,
"learning_rate": 2.863746259724716e-05,
"loss": 11.1635,
"step": 19625
},
{
"epoch": 3.527827648114901,
"grad_norm": 34.17205810546875,
"learning_rate": 2.8631478156792343e-05,
"loss": 11.7224,
"step": 19650
},
{
"epoch": 3.5323159784560145,
"grad_norm": 32.336727142333984,
"learning_rate": 2.8625493716337522e-05,
"loss": 11.3425,
"step": 19675
},
{
"epoch": 3.5368043087971275,
"grad_norm": 32.560447692871094,
"learning_rate": 2.8619509275882704e-05,
"loss": 10.9515,
"step": 19700
},
{
"epoch": 3.5412926391382404,
"grad_norm": 30.652273178100586,
"learning_rate": 2.8613524835427886e-05,
"loss": 11.2001,
"step": 19725
},
{
"epoch": 3.545780969479354,
"grad_norm": 30.610469818115234,
"learning_rate": 2.8607540394973072e-05,
"loss": 11.5682,
"step": 19750
},
{
"epoch": 3.550269299820467,
"grad_norm": 30.808074951171875,
"learning_rate": 2.8601555954518255e-05,
"loss": 11.453,
"step": 19775
},
{
"epoch": 3.5547576301615798,
"grad_norm": 31.674997329711914,
"learning_rate": 2.8595571514063437e-05,
"loss": 11.2786,
"step": 19800
},
{
"epoch": 3.559245960502693,
"grad_norm": 30.62029457092285,
"learning_rate": 2.858958707360862e-05,
"loss": 11.7177,
"step": 19825
},
{
"epoch": 3.563734290843806,
"grad_norm": 31.6383113861084,
"learning_rate": 2.8583602633153798e-05,
"loss": 11.2354,
"step": 19850
},
{
"epoch": 3.568222621184919,
"grad_norm": 32.37112045288086,
"learning_rate": 2.857761819269898e-05,
"loss": 11.2081,
"step": 19875
},
{
"epoch": 3.5727109515260325,
"grad_norm": 29.828189849853516,
"learning_rate": 2.8571873129862358e-05,
"loss": 11.0898,
"step": 19900
},
{
"epoch": 3.5771992818671454,
"grad_norm": 30.648529052734375,
"learning_rate": 2.8565888689407543e-05,
"loss": 10.8965,
"step": 19925
},
{
"epoch": 3.5816876122082584,
"grad_norm": 41.849483489990234,
"learning_rate": 2.8559904248952726e-05,
"loss": 11.0246,
"step": 19950
},
{
"epoch": 3.5861759425493718,
"grad_norm": 31.274961471557617,
"learning_rate": 2.8553919808497905e-05,
"loss": 11.4183,
"step": 19975
},
{
"epoch": 3.5906642728904847,
"grad_norm": 30.798633575439453,
"learning_rate": 2.8547935368043087e-05,
"loss": 11.1237,
"step": 20000
},
{
"epoch": 3.5951526032315977,
"grad_norm": 28.889543533325195,
"learning_rate": 2.854195092758827e-05,
"loss": 11.3662,
"step": 20025
},
{
"epoch": 3.599640933572711,
"grad_norm": 29.90560531616211,
"learning_rate": 2.8535966487133455e-05,
"loss": 11.0125,
"step": 20050
},
{
"epoch": 3.604129263913824,
"grad_norm": 30.499813079833984,
"learning_rate": 2.8529982046678638e-05,
"loss": 11.4072,
"step": 20075
},
{
"epoch": 3.608617594254937,
"grad_norm": 30.493555068969727,
"learning_rate": 2.852399760622382e-05,
"loss": 11.2318,
"step": 20100
},
{
"epoch": 3.6131059245960504,
"grad_norm": 31.599319458007812,
"learning_rate": 2.8518013165769002e-05,
"loss": 11.062,
"step": 20125
},
{
"epoch": 3.6175942549371634,
"grad_norm": 30.47945213317871,
"learning_rate": 2.8512028725314185e-05,
"loss": 11.4558,
"step": 20150
},
{
"epoch": 3.6220825852782763,
"grad_norm": 32.14970779418945,
"learning_rate": 2.8506044284859364e-05,
"loss": 10.5179,
"step": 20175
},
{
"epoch": 3.6265709156193897,
"grad_norm": 31.17921257019043,
"learning_rate": 2.850005984440455e-05,
"loss": 10.968,
"step": 20200
},
{
"epoch": 3.6310592459605027,
"grad_norm": 31.30777931213379,
"learning_rate": 2.8494075403949732e-05,
"loss": 11.336,
"step": 20225
},
{
"epoch": 3.635547576301616,
"grad_norm": 30.182174682617188,
"learning_rate": 2.8488090963494914e-05,
"loss": 11.2265,
"step": 20250
},
{
"epoch": 3.640035906642729,
"grad_norm": 31.07087516784668,
"learning_rate": 2.8482106523040096e-05,
"loss": 11.2847,
"step": 20275
},
{
"epoch": 3.644524236983842,
"grad_norm": 28.739133834838867,
"learning_rate": 2.847612208258528e-05,
"loss": 11.427,
"step": 20300
},
{
"epoch": 3.649012567324955,
"grad_norm": 31.23784637451172,
"learning_rate": 2.847013764213046e-05,
"loss": 11.0143,
"step": 20325
},
{
"epoch": 3.6535008976660683,
"grad_norm": 30.830699920654297,
"learning_rate": 2.8464153201675647e-05,
"loss": 11.2841,
"step": 20350
},
{
"epoch": 3.6579892280071813,
"grad_norm": 30.827341079711914,
"learning_rate": 2.8458168761220826e-05,
"loss": 10.9722,
"step": 20375
},
{
"epoch": 3.6624775583482947,
"grad_norm": 29.842851638793945,
"learning_rate": 2.8452184320766008e-05,
"loss": 11.2758,
"step": 20400
},
{
"epoch": 3.6669658886894076,
"grad_norm": 32.061363220214844,
"learning_rate": 2.844619988031119e-05,
"loss": 11.0502,
"step": 20425
},
{
"epoch": 3.6714542190305206,
"grad_norm": 31.67589569091797,
"learning_rate": 2.8440215439856373e-05,
"loss": 11.2309,
"step": 20450
},
{
"epoch": 3.6759425493716336,
"grad_norm": 29.2219295501709,
"learning_rate": 2.8434230999401555e-05,
"loss": 11.1079,
"step": 20475
},
{
"epoch": 3.680430879712747,
"grad_norm": 29.494009017944336,
"learning_rate": 2.842824655894674e-05,
"loss": 11.095,
"step": 20500
},
{
"epoch": 3.68491921005386,
"grad_norm": 30.72727394104004,
"learning_rate": 2.8422262118491923e-05,
"loss": 10.9651,
"step": 20525
},
{
"epoch": 3.6894075403949733,
"grad_norm": 32.62581253051758,
"learning_rate": 2.8416277678037106e-05,
"loss": 11.1485,
"step": 20550
},
{
"epoch": 3.6938958707360863,
"grad_norm": 32.17450714111328,
"learning_rate": 2.8410293237582285e-05,
"loss": 11.2383,
"step": 20575
},
{
"epoch": 3.6983842010771992,
"grad_norm": 31.298063278198242,
"learning_rate": 2.8404308797127467e-05,
"loss": 11.4215,
"step": 20600
},
{
"epoch": 3.702872531418312,
"grad_norm": 31.1262149810791,
"learning_rate": 2.8398324356672653e-05,
"loss": 11.0779,
"step": 20625
},
{
"epoch": 3.7073608617594256,
"grad_norm": 31.340126037597656,
"learning_rate": 2.8392339916217835e-05,
"loss": 11.0714,
"step": 20650
},
{
"epoch": 3.7118491921005385,
"grad_norm": 33.29624557495117,
"learning_rate": 2.8386355475763018e-05,
"loss": 11.0074,
"step": 20675
},
{
"epoch": 3.716337522441652,
"grad_norm": 30.880542755126953,
"learning_rate": 2.83803710353082e-05,
"loss": 10.8339,
"step": 20700
},
{
"epoch": 3.720825852782765,
"grad_norm": 29.898832321166992,
"learning_rate": 2.8374386594853382e-05,
"loss": 10.8654,
"step": 20725
},
{
"epoch": 3.725314183123878,
"grad_norm": 29.32884979248047,
"learning_rate": 2.8368402154398565e-05,
"loss": 11.3662,
"step": 20750
},
{
"epoch": 3.729802513464991,
"grad_norm": 32.064762115478516,
"learning_rate": 2.8362417713943747e-05,
"loss": 11.4581,
"step": 20775
},
{
"epoch": 3.734290843806104,
"grad_norm": 32.138267517089844,
"learning_rate": 2.835643327348893e-05,
"loss": 10.9124,
"step": 20800
},
{
"epoch": 3.738779174147217,
"grad_norm": 33.86062240600586,
"learning_rate": 2.8350448833034112e-05,
"loss": 11.0097,
"step": 20825
},
{
"epoch": 3.7432675044883306,
"grad_norm": 30.490970611572266,
"learning_rate": 2.8344464392579294e-05,
"loss": 11.4534,
"step": 20850
},
{
"epoch": 3.7477558348294435,
"grad_norm": 27.865781784057617,
"learning_rate": 2.8338479952124477e-05,
"loss": 11.4486,
"step": 20875
},
{
"epoch": 3.7522441651705565,
"grad_norm": 31.9267520904541,
"learning_rate": 2.833249551166966e-05,
"loss": 11.5519,
"step": 20900
},
{
"epoch": 3.7567324955116694,
"grad_norm": 29.056507110595703,
"learning_rate": 2.8326511071214845e-05,
"loss": 11.4834,
"step": 20925
},
{
"epoch": 3.761220825852783,
"grad_norm": 30.026002883911133,
"learning_rate": 2.8320526630760024e-05,
"loss": 11.3338,
"step": 20950
},
{
"epoch": 3.765709156193896,
"grad_norm": 30.737932205200195,
"learning_rate": 2.8314542190305206e-05,
"loss": 10.9653,
"step": 20975
},
{
"epoch": 3.770197486535009,
"grad_norm": 30.978910446166992,
"learning_rate": 2.8308557749850388e-05,
"loss": 11.4722,
"step": 21000
},
{
"epoch": 3.774685816876122,
"grad_norm": 30.23752212524414,
"learning_rate": 2.830257330939557e-05,
"loss": 11.3043,
"step": 21025
},
{
"epoch": 3.779174147217235,
"grad_norm": 32.29151153564453,
"learning_rate": 2.8296588868940756e-05,
"loss": 11.0943,
"step": 21050
},
{
"epoch": 3.783662477558348,
"grad_norm": 30.46995735168457,
"learning_rate": 2.829060442848594e-05,
"loss": 11.345,
"step": 21075
},
{
"epoch": 3.7881508078994615,
"grad_norm": 32.500823974609375,
"learning_rate": 2.828461998803112e-05,
"loss": 11.2675,
"step": 21100
},
{
"epoch": 3.7926391382405744,
"grad_norm": 31.643070220947266,
"learning_rate": 2.8278635547576303e-05,
"loss": 11.2081,
"step": 21125
},
{
"epoch": 3.797127468581688,
"grad_norm": 31.303314208984375,
"learning_rate": 2.8272651107121482e-05,
"loss": 11.3969,
"step": 21150
},
{
"epoch": 3.8016157989228008,
"grad_norm": 32.96514129638672,
"learning_rate": 2.8266666666666665e-05,
"loss": 11.1525,
"step": 21175
},
{
"epoch": 3.8061041292639137,
"grad_norm": 30.002351760864258,
"learning_rate": 2.826068222621185e-05,
"loss": 10.8658,
"step": 21200
},
{
"epoch": 3.8105924596050267,
"grad_norm": 31.169191360473633,
"learning_rate": 2.8254697785757033e-05,
"loss": 11.2565,
"step": 21225
},
{
"epoch": 3.81508078994614,
"grad_norm": 31.06591033935547,
"learning_rate": 2.8248713345302215e-05,
"loss": 11.2624,
"step": 21250
},
{
"epoch": 3.819569120287253,
"grad_norm": 28.5202579498291,
"learning_rate": 2.8242728904847398e-05,
"loss": 10.9322,
"step": 21275
},
{
"epoch": 3.8240574506283664,
"grad_norm": 30.786962509155273,
"learning_rate": 2.823674446439258e-05,
"loss": 11.0254,
"step": 21300
},
{
"epoch": 3.8285457809694794,
"grad_norm": 30.801992416381836,
"learning_rate": 2.8230760023937762e-05,
"loss": 11.156,
"step": 21325
},
{
"epoch": 3.8330341113105924,
"grad_norm": 28.441688537597656,
"learning_rate": 2.8224775583482945e-05,
"loss": 11.1657,
"step": 21350
},
{
"epoch": 3.8375224416517053,
"grad_norm": 29.77831268310547,
"learning_rate": 2.8218791143028127e-05,
"loss": 11.3975,
"step": 21375
},
{
"epoch": 3.8420107719928187,
"grad_norm": 31.247785568237305,
"learning_rate": 2.821280670257331e-05,
"loss": 11.0109,
"step": 21400
},
{
"epoch": 3.8464991023339317,
"grad_norm": 32.04808807373047,
"learning_rate": 2.8206822262118492e-05,
"loss": 11.0839,
"step": 21425
},
{
"epoch": 3.850987432675045,
"grad_norm": 30.55583953857422,
"learning_rate": 2.8200837821663674e-05,
"loss": 10.8381,
"step": 21450
},
{
"epoch": 3.855475763016158,
"grad_norm": 29.084171295166016,
"learning_rate": 2.819485338120886e-05,
"loss": 11.187,
"step": 21475
},
{
"epoch": 3.859964093357271,
"grad_norm": 31.084972381591797,
"learning_rate": 2.8188868940754042e-05,
"loss": 11.52,
"step": 21500
},
{
"epoch": 3.864452423698384,
"grad_norm": 31.979738235473633,
"learning_rate": 2.8182884500299225e-05,
"loss": 11.3922,
"step": 21525
},
{
"epoch": 3.8689407540394973,
"grad_norm": 28.92717742919922,
"learning_rate": 2.8176900059844404e-05,
"loss": 11.064,
"step": 21550
},
{
"epoch": 3.8734290843806103,
"grad_norm": 29.832292556762695,
"learning_rate": 2.8170915619389586e-05,
"loss": 11.2098,
"step": 21575
},
{
"epoch": 3.8779174147217237,
"grad_norm": 31.74751091003418,
"learning_rate": 2.816493117893477e-05,
"loss": 10.9546,
"step": 21600
},
{
"epoch": 3.8824057450628366,
"grad_norm": 32.31631088256836,
"learning_rate": 2.8158946738479954e-05,
"loss": 11.1283,
"step": 21625
},
{
"epoch": 3.8868940754039496,
"grad_norm": 30.267370223999023,
"learning_rate": 2.8152962298025136e-05,
"loss": 10.9434,
"step": 21650
},
{
"epoch": 3.891382405745063,
"grad_norm": 31.132080078125,
"learning_rate": 2.814697785757032e-05,
"loss": 11.404,
"step": 21675
},
{
"epoch": 3.895870736086176,
"grad_norm": 31.34539794921875,
"learning_rate": 2.81409934171155e-05,
"loss": 11.3978,
"step": 21700
},
{
"epoch": 3.900359066427289,
"grad_norm": 31.39044952392578,
"learning_rate": 2.8135008976660684e-05,
"loss": 11.187,
"step": 21725
},
{
"epoch": 3.9048473967684023,
"grad_norm": 32.7244873046875,
"learning_rate": 2.8129024536205862e-05,
"loss": 11.2598,
"step": 21750
},
{
"epoch": 3.9093357271095153,
"grad_norm": 28.18239974975586,
"learning_rate": 2.8123040095751048e-05,
"loss": 11.3187,
"step": 21775
},
{
"epoch": 3.9138240574506282,
"grad_norm": 31.796775817871094,
"learning_rate": 2.811705565529623e-05,
"loss": 11.0664,
"step": 21800
},
{
"epoch": 3.9183123877917416,
"grad_norm": 30.6005859375,
"learning_rate": 2.8111071214841413e-05,
"loss": 10.987,
"step": 21825
},
{
"epoch": 3.9228007181328546,
"grad_norm": 30.108829498291016,
"learning_rate": 2.8105086774386595e-05,
"loss": 11.0541,
"step": 21850
},
{
"epoch": 3.9272890484739675,
"grad_norm": 31.7265682220459,
"learning_rate": 2.8099102333931778e-05,
"loss": 11.1046,
"step": 21875
},
{
"epoch": 3.931777378815081,
"grad_norm": 32.628074645996094,
"learning_rate": 2.809311789347696e-05,
"loss": 11.2953,
"step": 21900
},
{
"epoch": 3.936265709156194,
"grad_norm": 28.80093765258789,
"learning_rate": 2.8087133453022146e-05,
"loss": 11.5525,
"step": 21925
},
{
"epoch": 3.940754039497307,
"grad_norm": 29.523881912231445,
"learning_rate": 2.8081149012567325e-05,
"loss": 11.2298,
"step": 21950
},
{
"epoch": 3.9452423698384202,
"grad_norm": 30.06547737121582,
"learning_rate": 2.8075164572112507e-05,
"loss": 10.7736,
"step": 21975
},
{
"epoch": 3.949730700179533,
"grad_norm": 29.540449142456055,
"learning_rate": 2.806918013165769e-05,
"loss": 11.0709,
"step": 22000
},
{
"epoch": 3.954219030520646,
"grad_norm": 33.31890869140625,
"learning_rate": 2.8063195691202872e-05,
"loss": 11.0698,
"step": 22025
},
{
"epoch": 3.9587073608617596,
"grad_norm": 30.68980598449707,
"learning_rate": 2.8057211250748058e-05,
"loss": 10.8253,
"step": 22050
},
{
"epoch": 3.9631956912028725,
"grad_norm": 31.10498809814453,
"learning_rate": 2.805122681029324e-05,
"loss": 11.4169,
"step": 22075
},
{
"epoch": 3.9676840215439855,
"grad_norm": 30.547962188720703,
"learning_rate": 2.8045242369838422e-05,
"loss": 11.2333,
"step": 22100
},
{
"epoch": 3.972172351885099,
"grad_norm": 30.325082778930664,
"learning_rate": 2.8039257929383605e-05,
"loss": 11.3395,
"step": 22125
},
{
"epoch": 3.976660682226212,
"grad_norm": 30.00259780883789,
"learning_rate": 2.8033273488928784e-05,
"loss": 11.2044,
"step": 22150
},
{
"epoch": 3.9811490125673252,
"grad_norm": 27.535524368286133,
"learning_rate": 2.8027289048473966e-05,
"loss": 10.92,
"step": 22175
},
{
"epoch": 3.985637342908438,
"grad_norm": 31.112247467041016,
"learning_rate": 2.8021304608019152e-05,
"loss": 11.1473,
"step": 22200
},
{
"epoch": 3.990125673249551,
"grad_norm": 30.036909103393555,
"learning_rate": 2.8015320167564334e-05,
"loss": 11.5283,
"step": 22225
},
{
"epoch": 3.994614003590664,
"grad_norm": 30.063087463378906,
"learning_rate": 2.8009335727109516e-05,
"loss": 11.1425,
"step": 22250
},
{
"epoch": 3.9991023339317775,
"grad_norm": 32.6578483581543,
"learning_rate": 2.80033512866547e-05,
"loss": 10.6965,
"step": 22275
},
{
"epoch": 4.0,
"eval_accuracy": 0.07753995379298494,
"eval_f1_macro": 0.009406764821620481,
"eval_f1_micro": 0.07753995379298494,
"eval_f1_weighted": 0.04766626516109554,
"eval_loss": 6.724180221557617,
"eval_precision_macro": 0.00894345687991437,
"eval_precision_micro": 0.07753995379298494,
"eval_precision_weighted": 0.04157378236856482,
"eval_recall_macro": 0.014579264668079467,
"eval_recall_micro": 0.07753995379298494,
"eval_recall_weighted": 0.07753995379298494,
"eval_runtime": 86.416,
"eval_samples_per_second": 606.057,
"eval_steps_per_second": 18.943,
"step": 22280
},
{
"epoch": 4.003590664272891,
"grad_norm": 30.3936824798584,
"learning_rate": 2.799736684619988e-05,
"loss": 9.4773,
"step": 22300
},
{
"epoch": 4.008078994614004,
"grad_norm": 30.763669967651367,
"learning_rate": 2.7991382405745064e-05,
"loss": 9.1388,
"step": 22325
},
{
"epoch": 4.012567324955117,
"grad_norm": 30.83111572265625,
"learning_rate": 2.7985397965290246e-05,
"loss": 9.1059,
"step": 22350
},
{
"epoch": 4.01705565529623,
"grad_norm": 32.58699035644531,
"learning_rate": 2.7979413524835428e-05,
"loss": 9.1599,
"step": 22375
},
{
"epoch": 4.021543985637343,
"grad_norm": 32.16946792602539,
"learning_rate": 2.797342908438061e-05,
"loss": 8.8678,
"step": 22400
},
{
"epoch": 4.026032315978456,
"grad_norm": 32.695838928222656,
"learning_rate": 2.7967444643925793e-05,
"loss": 8.5109,
"step": 22425
},
{
"epoch": 4.0305206463195695,
"grad_norm": 32.195003509521484,
"learning_rate": 2.7961460203470975e-05,
"loss": 8.8435,
"step": 22450
},
{
"epoch": 4.0350089766606825,
"grad_norm": 33.23640060424805,
"learning_rate": 2.795547576301616e-05,
"loss": 8.9362,
"step": 22475
},
{
"epoch": 4.039497307001795,
"grad_norm": 36.865997314453125,
"learning_rate": 2.7949491322561343e-05,
"loss": 9.0649,
"step": 22500
},
{
"epoch": 4.043985637342908,
"grad_norm": 35.41594696044922,
"learning_rate": 2.7943506882106526e-05,
"loss": 8.9695,
"step": 22525
},
{
"epoch": 4.048473967684021,
"grad_norm": 35.198551177978516,
"learning_rate": 2.7937522441651705e-05,
"loss": 9.3542,
"step": 22550
},
{
"epoch": 4.052962298025134,
"grad_norm": 49.4534912109375,
"learning_rate": 2.7931538001196887e-05,
"loss": 8.6442,
"step": 22575
},
{
"epoch": 4.057450628366248,
"grad_norm": 35.323726654052734,
"learning_rate": 2.792555356074207e-05,
"loss": 8.5507,
"step": 22600
},
{
"epoch": 4.061938958707361,
"grad_norm": null,
"learning_rate": 2.7919808497905447e-05,
"loss": 9.0441,
"step": 22625
},
{
"epoch": 4.066427289048474,
"grad_norm": 35.7750244140625,
"learning_rate": 2.791382405745063e-05,
"loss": 9.1965,
"step": 22650
},
{
"epoch": 4.070915619389587,
"grad_norm": 31.913360595703125,
"learning_rate": 2.790783961699581e-05,
"loss": 9.1348,
"step": 22675
},
{
"epoch": 4.0754039497307,
"grad_norm": 33.979190826416016,
"learning_rate": 2.7901855176540994e-05,
"loss": 9.1416,
"step": 22700
},
{
"epoch": 4.079892280071813,
"grad_norm": 33.557029724121094,
"learning_rate": 2.7895870736086176e-05,
"loss": 8.8399,
"step": 22725
},
{
"epoch": 4.084380610412927,
"grad_norm": 35.37779998779297,
"learning_rate": 2.788988629563136e-05,
"loss": 9.2904,
"step": 22750
},
{
"epoch": 4.08886894075404,
"grad_norm": 33.334224700927734,
"learning_rate": 2.788390185517654e-05,
"loss": 9.0412,
"step": 22775
},
{
"epoch": 4.093357271095153,
"grad_norm": 38.393653869628906,
"learning_rate": 2.7877917414721726e-05,
"loss": 8.8758,
"step": 22800
},
{
"epoch": 4.097845601436266,
"grad_norm": 34.724517822265625,
"learning_rate": 2.787193297426691e-05,
"loss": 8.9632,
"step": 22825
},
{
"epoch": 4.102333931777379,
"grad_norm": 35.026126861572266,
"learning_rate": 2.7865948533812088e-05,
"loss": 8.81,
"step": 22850
},
{
"epoch": 4.1068222621184916,
"grad_norm": 33.23841094970703,
"learning_rate": 2.785996409335727e-05,
"loss": 8.965,
"step": 22875
},
{
"epoch": 4.111310592459605,
"grad_norm": 33.344581604003906,
"learning_rate": 2.7853979652902453e-05,
"loss": 8.6121,
"step": 22900
},
{
"epoch": 4.115798922800718,
"grad_norm": 33.311065673828125,
"learning_rate": 2.7847995212447638e-05,
"loss": 8.863,
"step": 22925
},
{
"epoch": 4.120287253141831,
"grad_norm": 31.99666404724121,
"learning_rate": 2.784201077199282e-05,
"loss": 9.2711,
"step": 22950
},
{
"epoch": 4.124775583482944,
"grad_norm": 35.421077728271484,
"learning_rate": 2.7836026331538003e-05,
"loss": 8.8117,
"step": 22975
},
{
"epoch": 4.129263913824057,
"grad_norm": 35.499202728271484,
"learning_rate": 2.7830041891083185e-05,
"loss": 9.0647,
"step": 23000
},
{
"epoch": 4.13375224416517,
"grad_norm": 39.84804916381836,
"learning_rate": 2.7824057450628368e-05,
"loss": 8.9921,
"step": 23025
},
{
"epoch": 4.138240574506284,
"grad_norm": 35.68635559082031,
"learning_rate": 2.7818073010173547e-05,
"loss": 9.0054,
"step": 23050
},
{
"epoch": 4.142728904847397,
"grad_norm": 34.515098571777344,
"learning_rate": 2.7812088569718732e-05,
"loss": 9.2403,
"step": 23075
},
{
"epoch": 4.14721723518851,
"grad_norm": 35.22542190551758,
"learning_rate": 2.7806104129263915e-05,
"loss": 9.1803,
"step": 23100
},
{
"epoch": 4.151705565529623,
"grad_norm": 31.101097106933594,
"learning_rate": 2.7800119688809097e-05,
"loss": 9.1061,
"step": 23125
},
{
"epoch": 4.156193895870736,
"grad_norm": 35.81389236450195,
"learning_rate": 2.779413524835428e-05,
"loss": 9.2282,
"step": 23150
},
{
"epoch": 4.160682226211849,
"grad_norm": 33.05430603027344,
"learning_rate": 2.7788150807899462e-05,
"loss": 8.9339,
"step": 23175
},
{
"epoch": 4.165170556552963,
"grad_norm": 32.21403884887695,
"learning_rate": 2.7782166367444644e-05,
"loss": 9.0769,
"step": 23200
},
{
"epoch": 4.169658886894076,
"grad_norm": 38.616085052490234,
"learning_rate": 2.777618192698983e-05,
"loss": 9.2644,
"step": 23225
},
{
"epoch": 4.174147217235189,
"grad_norm": 34.82571029663086,
"learning_rate": 2.777019748653501e-05,
"loss": 9.2723,
"step": 23250
},
{
"epoch": 4.1786355475763015,
"grad_norm": 37.125797271728516,
"learning_rate": 2.776421304608019e-05,
"loss": 9.2647,
"step": 23275
},
{
"epoch": 4.1831238779174145,
"grad_norm": 36.201927185058594,
"learning_rate": 2.7758228605625374e-05,
"loss": 9.5391,
"step": 23300
},
{
"epoch": 4.187612208258527,
"grad_norm": 34.90190505981445,
"learning_rate": 2.7752244165170556e-05,
"loss": 9.2669,
"step": 23325
},
{
"epoch": 4.192100538599641,
"grad_norm": 36.72137451171875,
"learning_rate": 2.7746259724715742e-05,
"loss": 9.3072,
"step": 23350
},
{
"epoch": 4.196588868940754,
"grad_norm": 34.933372497558594,
"learning_rate": 2.7740275284260924e-05,
"loss": 9.2713,
"step": 23375
},
{
"epoch": 4.201077199281867,
"grad_norm": 37.9987907409668,
"learning_rate": 2.7734290843806107e-05,
"loss": 9.0352,
"step": 23400
},
{
"epoch": 4.20556552962298,
"grad_norm": 33.95653533935547,
"learning_rate": 2.772830640335129e-05,
"loss": 9.1703,
"step": 23425
},
{
"epoch": 4.210053859964093,
"grad_norm": 32.79034423828125,
"learning_rate": 2.7722321962896468e-05,
"loss": 9.0717,
"step": 23450
},
{
"epoch": 4.214542190305206,
"grad_norm": 41.263702392578125,
"learning_rate": 2.771633752244165e-05,
"loss": 9.0279,
"step": 23475
},
{
"epoch": 4.21903052064632,
"grad_norm": 34.632225036621094,
"learning_rate": 2.7710353081986836e-05,
"loss": 9.0236,
"step": 23500
},
{
"epoch": 4.223518850987433,
"grad_norm": 34.72397232055664,
"learning_rate": 2.770436864153202e-05,
"loss": 9.2389,
"step": 23525
},
{
"epoch": 4.228007181328546,
"grad_norm": 34.320003509521484,
"learning_rate": 2.76983842010772e-05,
"loss": 9.0942,
"step": 23550
},
{
"epoch": 4.232495511669659,
"grad_norm": 35.2785758972168,
"learning_rate": 2.7692399760622383e-05,
"loss": 8.7822,
"step": 23575
},
{
"epoch": 4.236983842010772,
"grad_norm": 40.83307647705078,
"learning_rate": 2.7686415320167565e-05,
"loss": 9.2271,
"step": 23600
},
{
"epoch": 4.241472172351885,
"grad_norm": 34.236122131347656,
"learning_rate": 2.7680430879712748e-05,
"loss": 9.0967,
"step": 23625
},
{
"epoch": 4.2459605026929985,
"grad_norm": 34.03813171386719,
"learning_rate": 2.767444643925793e-05,
"loss": 9.1916,
"step": 23650
},
{
"epoch": 4.2504488330341115,
"grad_norm": 32.90471267700195,
"learning_rate": 2.7668461998803112e-05,
"loss": 8.9725,
"step": 23675
},
{
"epoch": 4.254937163375224,
"grad_norm": 37.31569290161133,
"learning_rate": 2.7662477558348295e-05,
"loss": 9.5137,
"step": 23700
},
{
"epoch": 4.259425493716337,
"grad_norm": 33.96034240722656,
"learning_rate": 2.7656493117893477e-05,
"loss": 9.1435,
"step": 23725
},
{
"epoch": 4.26391382405745,
"grad_norm": 37.626258850097656,
"learning_rate": 2.765050867743866e-05,
"loss": 9.1109,
"step": 23750
},
{
"epoch": 4.268402154398563,
"grad_norm": 37.14412307739258,
"learning_rate": 2.7644524236983842e-05,
"loss": 8.9028,
"step": 23775
},
{
"epoch": 4.272890484739677,
"grad_norm": 33.2732048034668,
"learning_rate": 2.7638539796529028e-05,
"loss": 8.9961,
"step": 23800
},
{
"epoch": 4.27737881508079,
"grad_norm": 35.71903991699219,
"learning_rate": 2.7632555356074207e-05,
"loss": 9.3034,
"step": 23825
},
{
"epoch": 4.281867145421903,
"grad_norm": 34.583213806152344,
"learning_rate": 2.762657091561939e-05,
"loss": 9.2041,
"step": 23850
},
{
"epoch": 4.286355475763016,
"grad_norm": 36.03817367553711,
"learning_rate": 2.762058647516457e-05,
"loss": 9.2503,
"step": 23875
},
{
"epoch": 4.290843806104129,
"grad_norm": 34.202823638916016,
"learning_rate": 2.7614602034709754e-05,
"loss": 9.2398,
"step": 23900
},
{
"epoch": 4.295332136445243,
"grad_norm": 35.64631652832031,
"learning_rate": 2.760861759425494e-05,
"loss": 9.139,
"step": 23925
},
{
"epoch": 4.299820466786356,
"grad_norm": 34.361637115478516,
"learning_rate": 2.7602633153800122e-05,
"loss": 9.0898,
"step": 23950
},
{
"epoch": 4.304308797127469,
"grad_norm": 32.614646911621094,
"learning_rate": 2.7596648713345304e-05,
"loss": 9.4455,
"step": 23975
},
{
"epoch": 4.308797127468582,
"grad_norm": 36.456077575683594,
"learning_rate": 2.7590664272890487e-05,
"loss": 9.2413,
"step": 24000
},
{
"epoch": 4.313285457809695,
"grad_norm": 33.43761444091797,
"learning_rate": 2.7584679832435666e-05,
"loss": 9.1153,
"step": 24025
},
{
"epoch": 4.317773788150808,
"grad_norm": 34.84223556518555,
"learning_rate": 2.7578695391980848e-05,
"loss": 9.2057,
"step": 24050
},
{
"epoch": 4.3222621184919205,
"grad_norm": 31.044452667236328,
"learning_rate": 2.7572710951526034e-05,
"loss": 9.3385,
"step": 24075
},
{
"epoch": 4.326750448833034,
"grad_norm": 38.18600845336914,
"learning_rate": 2.7566726511071216e-05,
"loss": 9.0842,
"step": 24100
},
{
"epoch": 4.331238779174147,
"grad_norm": 34.68734359741211,
"learning_rate": 2.75607420706164e-05,
"loss": 9.4446,
"step": 24125
},
{
"epoch": 4.33572710951526,
"grad_norm": 38.530601501464844,
"learning_rate": 2.755475763016158e-05,
"loss": 9.0487,
"step": 24150
},
{
"epoch": 4.340215439856373,
"grad_norm": 35.827022552490234,
"learning_rate": 2.7548773189706763e-05,
"loss": 9.1242,
"step": 24175
},
{
"epoch": 4.344703770197486,
"grad_norm": 37.25276184082031,
"learning_rate": 2.7542788749251945e-05,
"loss": 9.1769,
"step": 24200
},
{
"epoch": 4.3491921005386,
"grad_norm": 35.8741340637207,
"learning_rate": 2.7536804308797128e-05,
"loss": 9.256,
"step": 24225
},
{
"epoch": 4.353680430879713,
"grad_norm": 34.161651611328125,
"learning_rate": 2.753081986834231e-05,
"loss": 9.3959,
"step": 24250
},
{
"epoch": 4.358168761220826,
"grad_norm": 36.703941345214844,
"learning_rate": 2.7524835427887492e-05,
"loss": 9.6069,
"step": 24275
},
{
"epoch": 4.362657091561939,
"grad_norm": 33.90925216674805,
"learning_rate": 2.7518850987432675e-05,
"loss": 9.2081,
"step": 24300
},
{
"epoch": 4.367145421903052,
"grad_norm": 36.48859786987305,
"learning_rate": 2.7512866546977857e-05,
"loss": 9.2767,
"step": 24325
},
{
"epoch": 4.371633752244165,
"grad_norm": 36.00957489013672,
"learning_rate": 2.7506882106523043e-05,
"loss": 9.2949,
"step": 24350
},
{
"epoch": 4.376122082585279,
"grad_norm": 33.388736724853516,
"learning_rate": 2.7500897666068225e-05,
"loss": 9.4621,
"step": 24375
},
{
"epoch": 4.380610412926392,
"grad_norm": 32.6502571105957,
"learning_rate": 2.7494913225613408e-05,
"loss": 9.2408,
"step": 24400
},
{
"epoch": 4.385098743267505,
"grad_norm": 36.0883903503418,
"learning_rate": 2.7488928785158587e-05,
"loss": 9.3558,
"step": 24425
},
{
"epoch": 4.3895870736086176,
"grad_norm": 33.08795928955078,
"learning_rate": 2.748294434470377e-05,
"loss": 9.1737,
"step": 24450
},
{
"epoch": 4.3940754039497305,
"grad_norm": 37.87990188598633,
"learning_rate": 2.747695990424895e-05,
"loss": 9.2635,
"step": 24475
},
{
"epoch": 4.3985637342908435,
"grad_norm": 32.306396484375,
"learning_rate": 2.7470975463794137e-05,
"loss": 9.4797,
"step": 24500
},
{
"epoch": 4.403052064631957,
"grad_norm": 34.42149353027344,
"learning_rate": 2.746499102333932e-05,
"loss": 8.8528,
"step": 24525
},
{
"epoch": 4.40754039497307,
"grad_norm": 33.147850036621094,
"learning_rate": 2.7459006582884502e-05,
"loss": 9.3153,
"step": 24550
},
{
"epoch": 4.412028725314183,
"grad_norm": 36.34206771850586,
"learning_rate": 2.7453022142429684e-05,
"loss": 9.1607,
"step": 24575
},
{
"epoch": 4.416517055655296,
"grad_norm": 36.275413513183594,
"learning_rate": 2.7447037701974867e-05,
"loss": 9.2555,
"step": 24600
},
{
"epoch": 4.421005385996409,
"grad_norm": 34.83110427856445,
"learning_rate": 2.7441053261520046e-05,
"loss": 9.4131,
"step": 24625
},
{
"epoch": 4.425493716337522,
"grad_norm": 35.73281478881836,
"learning_rate": 2.743506882106523e-05,
"loss": 9.7517,
"step": 24650
},
{
"epoch": 4.429982046678636,
"grad_norm": 32.646751403808594,
"learning_rate": 2.7429084380610414e-05,
"loss": 9.452,
"step": 24675
},
{
"epoch": 4.434470377019749,
"grad_norm": 42.54426956176758,
"learning_rate": 2.7423099940155596e-05,
"loss": 9.3777,
"step": 24700
},
{
"epoch": 4.438958707360862,
"grad_norm": 35.09437942504883,
"learning_rate": 2.741711549970078e-05,
"loss": 9.3665,
"step": 24725
},
{
"epoch": 4.443447037701975,
"grad_norm": 36.45936965942383,
"learning_rate": 2.741113105924596e-05,
"loss": 9.4285,
"step": 24750
},
{
"epoch": 4.447935368043088,
"grad_norm": 34.06489181518555,
"learning_rate": 2.7405146618791146e-05,
"loss": 9.1473,
"step": 24775
},
{
"epoch": 4.452423698384201,
"grad_norm": 38.4737663269043,
"learning_rate": 2.739916217833633e-05,
"loss": 9.5141,
"step": 24800
},
{
"epoch": 4.456912028725315,
"grad_norm": 35.27596664428711,
"learning_rate": 2.7393177737881508e-05,
"loss": 9.3386,
"step": 24825
},
{
"epoch": 4.4614003590664275,
"grad_norm": 39.01841735839844,
"learning_rate": 2.738719329742669e-05,
"loss": 9.2959,
"step": 24850
},
{
"epoch": 4.4658886894075405,
"grad_norm": 40.175697326660156,
"learning_rate": 2.7381208856971873e-05,
"loss": 9.3482,
"step": 24875
},
{
"epoch": 4.470377019748653,
"grad_norm": 37.285396575927734,
"learning_rate": 2.7375224416517055e-05,
"loss": 9.0562,
"step": 24900
},
{
"epoch": 4.474865350089766,
"grad_norm": 37.979305267333984,
"learning_rate": 2.736923997606224e-05,
"loss": 9.4161,
"step": 24925
},
{
"epoch": 4.479353680430879,
"grad_norm": 34.52471160888672,
"learning_rate": 2.7363255535607423e-05,
"loss": 9.1926,
"step": 24950
},
{
"epoch": 4.483842010771993,
"grad_norm": 32.52268600463867,
"learning_rate": 2.7357271095152605e-05,
"loss": 9.5568,
"step": 24975
},
{
"epoch": 4.488330341113106,
"grad_norm": 34.64008712768555,
"learning_rate": 2.7351286654697788e-05,
"loss": 9.34,
"step": 25000
},
{
"epoch": 4.492818671454219,
"grad_norm": 35.43095397949219,
"learning_rate": 2.7345302214242967e-05,
"loss": 9.6012,
"step": 25025
},
{
"epoch": 4.497307001795332,
"grad_norm": 34.24216079711914,
"learning_rate": 2.733931777378815e-05,
"loss": 9.545,
"step": 25050
},
{
"epoch": 4.501795332136445,
"grad_norm": 36.410186767578125,
"learning_rate": 2.7333333333333335e-05,
"loss": 9.5178,
"step": 25075
},
{
"epoch": 4.506283662477558,
"grad_norm": 33.58375549316406,
"learning_rate": 2.7327348892878517e-05,
"loss": 9.0259,
"step": 25100
},
{
"epoch": 4.510771992818672,
"grad_norm": 33.377079010009766,
"learning_rate": 2.73213644524237e-05,
"loss": 9.1557,
"step": 25125
},
{
"epoch": 4.515260323159785,
"grad_norm": 37.322166442871094,
"learning_rate": 2.7315380011968882e-05,
"loss": 9.2679,
"step": 25150
},
{
"epoch": 4.519748653500898,
"grad_norm": 35.399192810058594,
"learning_rate": 2.7309395571514064e-05,
"loss": 9.3599,
"step": 25175
},
{
"epoch": 4.524236983842011,
"grad_norm": 34.6229362487793,
"learning_rate": 2.7303411131059247e-05,
"loss": 9.2008,
"step": 25200
},
{
"epoch": 4.528725314183124,
"grad_norm": 38.43641662597656,
"learning_rate": 2.729742669060443e-05,
"loss": 9.5365,
"step": 25225
},
{
"epoch": 4.533213644524237,
"grad_norm": 36.315940856933594,
"learning_rate": 2.729144225014961e-05,
"loss": 9.2328,
"step": 25250
},
{
"epoch": 4.53770197486535,
"grad_norm": 36.93431091308594,
"learning_rate": 2.7285457809694794e-05,
"loss": 9.1937,
"step": 25275
},
{
"epoch": 4.542190305206463,
"grad_norm": 34.52630615234375,
"learning_rate": 2.7279473369239976e-05,
"loss": 9.3224,
"step": 25300
},
{
"epoch": 4.546678635547576,
"grad_norm": 37.09843826293945,
"learning_rate": 2.727348892878516e-05,
"loss": 9.5531,
"step": 25325
},
{
"epoch": 4.551166965888689,
"grad_norm": 35.45225143432617,
"learning_rate": 2.7267504488330344e-05,
"loss": 9.3112,
"step": 25350
},
{
"epoch": 4.555655296229802,
"grad_norm": 36.52423858642578,
"learning_rate": 2.7261520047875526e-05,
"loss": 9.2307,
"step": 25375
},
{
"epoch": 4.560143626570916,
"grad_norm": 31.571231842041016,
"learning_rate": 2.725553560742071e-05,
"loss": 9.0741,
"step": 25400
},
{
"epoch": 4.564631956912029,
"grad_norm": 35.70735549926758,
"learning_rate": 2.7249551166965888e-05,
"loss": 9.4122,
"step": 25425
},
{
"epoch": 4.569120287253142,
"grad_norm": 35.43241882324219,
"learning_rate": 2.724356672651107e-05,
"loss": 9.4357,
"step": 25450
},
{
"epoch": 4.573608617594255,
"grad_norm": 35.756832122802734,
"learning_rate": 2.7237582286056253e-05,
"loss": 9.3782,
"step": 25475
},
{
"epoch": 4.578096947935368,
"grad_norm": 33.91000747680664,
"learning_rate": 2.7231597845601438e-05,
"loss": 9.7165,
"step": 25500
},
{
"epoch": 4.582585278276481,
"grad_norm": 34.89963912963867,
"learning_rate": 2.722561340514662e-05,
"loss": 9.5026,
"step": 25525
},
{
"epoch": 4.587073608617594,
"grad_norm": 35.42002868652344,
"learning_rate": 2.7219628964691803e-05,
"loss": 9.6544,
"step": 25550
},
{
"epoch": 4.591561938958708,
"grad_norm": 35.01460647583008,
"learning_rate": 2.7213644524236985e-05,
"loss": 9.3751,
"step": 25575
},
{
"epoch": 4.596050269299821,
"grad_norm": 32.873260498046875,
"learning_rate": 2.7207660083782164e-05,
"loss": 9.1189,
"step": 25600
},
{
"epoch": 4.600538599640934,
"grad_norm": 38.0374641418457,
"learning_rate": 2.7201675643327347e-05,
"loss": 9.4421,
"step": 25625
},
{
"epoch": 4.6050269299820465,
"grad_norm": 32.76029586791992,
"learning_rate": 2.7195691202872532e-05,
"loss": 9.2211,
"step": 25650
},
{
"epoch": 4.6095152603231595,
"grad_norm": 35.879295349121094,
"learning_rate": 2.7189706762417715e-05,
"loss": 9.4768,
"step": 25675
},
{
"epoch": 4.614003590664273,
"grad_norm": 34.31226348876953,
"learning_rate": 2.7183722321962897e-05,
"loss": 9.1382,
"step": 25700
},
{
"epoch": 4.618491921005386,
"grad_norm": 33.70473861694336,
"learning_rate": 2.717773788150808e-05,
"loss": 9.1704,
"step": 25725
},
{
"epoch": 4.622980251346499,
"grad_norm": 36.1688232421875,
"learning_rate": 2.7171753441053262e-05,
"loss": 9.4746,
"step": 25750
},
{
"epoch": 4.627468581687612,
"grad_norm": 35.33478927612305,
"learning_rate": 2.7165769000598448e-05,
"loss": 9.2567,
"step": 25775
},
{
"epoch": 4.631956912028725,
"grad_norm": 35.50520324707031,
"learning_rate": 2.7159784560143627e-05,
"loss": 9.5426,
"step": 25800
},
{
"epoch": 4.636445242369838,
"grad_norm": 34.68144989013672,
"learning_rate": 2.715380011968881e-05,
"loss": 9.4237,
"step": 25825
},
{
"epoch": 4.640933572710951,
"grad_norm": 32.5733528137207,
"learning_rate": 2.714781567923399e-05,
"loss": 9.2993,
"step": 25850
},
{
"epoch": 4.645421903052065,
"grad_norm": 34.17429733276367,
"learning_rate": 2.714207061639737e-05,
"loss": 9.4056,
"step": 25875
},
{
"epoch": 4.649910233393178,
"grad_norm": 49.32793045043945,
"learning_rate": 2.713608617594255e-05,
"loss": 9.3828,
"step": 25900
},
{
"epoch": 4.654398563734291,
"grad_norm": 35.83115768432617,
"learning_rate": 2.713010173548773e-05,
"loss": 9.4779,
"step": 25925
},
{
"epoch": 4.658886894075404,
"grad_norm": 35.35591125488281,
"learning_rate": 2.7124117295032915e-05,
"loss": 9.4844,
"step": 25950
},
{
"epoch": 4.663375224416517,
"grad_norm": 35.725494384765625,
"learning_rate": 2.7118132854578098e-05,
"loss": 9.1772,
"step": 25975
},
{
"epoch": 4.667863554757631,
"grad_norm": 34.3475227355957,
"learning_rate": 2.711214841412328e-05,
"loss": 9.5834,
"step": 26000
},
{
"epoch": 4.6723518850987436,
"grad_norm": 35.19342041015625,
"learning_rate": 2.7106163973668463e-05,
"loss": 9.1603,
"step": 26025
},
{
"epoch": 4.6768402154398565,
"grad_norm": 37.154518127441406,
"learning_rate": 2.7100179533213645e-05,
"loss": 9.5956,
"step": 26050
},
{
"epoch": 4.6813285457809695,
"grad_norm": 36.49668884277344,
"learning_rate": 2.7094195092758827e-05,
"loss": 9.5274,
"step": 26075
},
{
"epoch": 4.685816876122082,
"grad_norm": 34.92998504638672,
"learning_rate": 2.7088210652304013e-05,
"loss": 9.5255,
"step": 26100
},
{
"epoch": 4.690305206463195,
"grad_norm": 32.61775207519531,
"learning_rate": 2.7082226211849192e-05,
"loss": 9.4236,
"step": 26125
},
{
"epoch": 4.694793536804308,
"grad_norm": 35.2857666015625,
"learning_rate": 2.7076241771394374e-05,
"loss": 9.4578,
"step": 26150
},
{
"epoch": 4.699281867145422,
"grad_norm": 37.08427429199219,
"learning_rate": 2.7070257330939557e-05,
"loss": 9.5587,
"step": 26175
},
{
"epoch": 4.703770197486535,
"grad_norm": 33.42496109008789,
"learning_rate": 2.706427289048474e-05,
"loss": 9.6455,
"step": 26200
},
{
"epoch": 4.708258527827648,
"grad_norm": 38.109561920166016,
"learning_rate": 2.7058288450029925e-05,
"loss": 9.4372,
"step": 26225
},
{
"epoch": 4.712746858168761,
"grad_norm": 34.73807907104492,
"learning_rate": 2.7052304009575107e-05,
"loss": 9.2668,
"step": 26250
},
{
"epoch": 4.717235188509874,
"grad_norm": 35.39613723754883,
"learning_rate": 2.704631956912029e-05,
"loss": 9.3931,
"step": 26275
},
{
"epoch": 4.721723518850988,
"grad_norm": 35.8447380065918,
"learning_rate": 2.7040335128665472e-05,
"loss": 9.5386,
"step": 26300
},
{
"epoch": 4.726211849192101,
"grad_norm": 38.25541305541992,
"learning_rate": 2.703435068821065e-05,
"loss": 9.2727,
"step": 26325
},
{
"epoch": 4.730700179533214,
"grad_norm": 35.903507232666016,
"learning_rate": 2.7028366247755833e-05,
"loss": 9.7039,
"step": 26350
},
{
"epoch": 4.735188509874327,
"grad_norm": 35.6234130859375,
"learning_rate": 2.702238180730102e-05,
"loss": 9.6165,
"step": 26375
},
{
"epoch": 4.73967684021544,
"grad_norm": 37.08405303955078,
"learning_rate": 2.70163973668462e-05,
"loss": 9.6926,
"step": 26400
},
{
"epoch": 4.744165170556553,
"grad_norm": 32.01731491088867,
"learning_rate": 2.7010412926391384e-05,
"loss": 9.3151,
"step": 26425
},
{
"epoch": 4.748653500897666,
"grad_norm": 37.30953598022461,
"learning_rate": 2.7004428485936566e-05,
"loss": 9.4753,
"step": 26450
},
{
"epoch": 4.753141831238779,
"grad_norm": 37.31596755981445,
"learning_rate": 2.699844404548175e-05,
"loss": 9.4824,
"step": 26475
},
{
"epoch": 4.757630161579892,
"grad_norm": 35.827213287353516,
"learning_rate": 2.699245960502693e-05,
"loss": 9.5776,
"step": 26500
},
{
"epoch": 4.762118491921005,
"grad_norm": 39.54668045043945,
"learning_rate": 2.6986475164572113e-05,
"loss": 9.563,
"step": 26525
},
{
"epoch": 4.766606822262118,
"grad_norm": 32.41488265991211,
"learning_rate": 2.6980490724117296e-05,
"loss": 9.3826,
"step": 26550
},
{
"epoch": 4.771095152603231,
"grad_norm": 36.029666900634766,
"learning_rate": 2.6974506283662478e-05,
"loss": 9.512,
"step": 26575
},
{
"epoch": 4.775583482944345,
"grad_norm": 32.85836410522461,
"learning_rate": 2.696852184320766e-05,
"loss": 9.4177,
"step": 26600
},
{
"epoch": 4.780071813285458,
"grad_norm": 32.541988372802734,
"learning_rate": 2.6962537402752843e-05,
"loss": 9.3833,
"step": 26625
},
{
"epoch": 4.784560143626571,
"grad_norm": 35.42625045776367,
"learning_rate": 2.695655296229803e-05,
"loss": 9.5967,
"step": 26650
},
{
"epoch": 4.789048473967684,
"grad_norm": 39.130592346191406,
"learning_rate": 2.695056852184321e-05,
"loss": 9.4185,
"step": 26675
},
{
"epoch": 4.793536804308797,
"grad_norm": 37.135032653808594,
"learning_rate": 2.694458408138839e-05,
"loss": 9.4166,
"step": 26700
},
{
"epoch": 4.79802513464991,
"grad_norm": 32.95221710205078,
"learning_rate": 2.6938599640933572e-05,
"loss": 9.5325,
"step": 26725
},
{
"epoch": 4.802513464991024,
"grad_norm": 34.23844528198242,
"learning_rate": 2.6932615200478754e-05,
"loss": 9.5948,
"step": 26750
},
{
"epoch": 4.807001795332137,
"grad_norm": 36.37213134765625,
"learning_rate": 2.6926630760023937e-05,
"loss": 9.1881,
"step": 26775
},
{
"epoch": 4.81149012567325,
"grad_norm": 37.12689971923828,
"learning_rate": 2.6920646319569123e-05,
"loss": 9.4443,
"step": 26800
},
{
"epoch": 4.815978456014363,
"grad_norm": 30.90703582763672,
"learning_rate": 2.6914661879114305e-05,
"loss": 9.1284,
"step": 26825
},
{
"epoch": 4.8204667863554755,
"grad_norm": 34.2583122253418,
"learning_rate": 2.6908677438659487e-05,
"loss": 9.1908,
"step": 26850
},
{
"epoch": 4.8249551166965885,
"grad_norm": 36.532203674316406,
"learning_rate": 2.690269299820467e-05,
"loss": 9.3999,
"step": 26875
},
{
"epoch": 4.829443447037702,
"grad_norm": 36.42616271972656,
"learning_rate": 2.689670855774985e-05,
"loss": 9.448,
"step": 26900
},
{
"epoch": 4.833931777378815,
"grad_norm": 37.477928161621094,
"learning_rate": 2.689072411729503e-05,
"loss": 9.5414,
"step": 26925
},
{
"epoch": 4.838420107719928,
"grad_norm": 36.44997024536133,
"learning_rate": 2.6884739676840217e-05,
"loss": 9.406,
"step": 26950
},
{
"epoch": 4.842908438061041,
"grad_norm": 34.89653396606445,
"learning_rate": 2.68787552363854e-05,
"loss": 9.8373,
"step": 26975
},
{
"epoch": 4.847396768402154,
"grad_norm": 34.84752655029297,
"learning_rate": 2.687277079593058e-05,
"loss": 9.3907,
"step": 27000
},
{
"epoch": 4.851885098743267,
"grad_norm": 33.79581069946289,
"learning_rate": 2.6866786355475764e-05,
"loss": 9.4444,
"step": 27025
},
{
"epoch": 4.856373429084381,
"grad_norm": 34.37635040283203,
"learning_rate": 2.6860801915020946e-05,
"loss": 9.5671,
"step": 27050
},
{
"epoch": 4.860861759425494,
"grad_norm": 35.371822357177734,
"learning_rate": 2.685481747456613e-05,
"loss": 9.5015,
"step": 27075
},
{
"epoch": 4.865350089766607,
"grad_norm": 38.23295211791992,
"learning_rate": 2.684883303411131e-05,
"loss": 9.3564,
"step": 27100
},
{
"epoch": 4.86983842010772,
"grad_norm": 36.58891296386719,
"learning_rate": 2.6842848593656493e-05,
"loss": 9.6106,
"step": 27125
},
{
"epoch": 4.874326750448833,
"grad_norm": 38.73398208618164,
"learning_rate": 2.6836864153201676e-05,
"loss": 9.5011,
"step": 27150
},
{
"epoch": 4.878815080789947,
"grad_norm": 34.134403228759766,
"learning_rate": 2.6830879712746858e-05,
"loss": 9.5416,
"step": 27175
},
{
"epoch": 4.88330341113106,
"grad_norm": 34.43739318847656,
"learning_rate": 2.682489527229204e-05,
"loss": 9.4023,
"step": 27200
},
{
"epoch": 4.8877917414721725,
"grad_norm": 33.59444808959961,
"learning_rate": 2.6818910831837226e-05,
"loss": 9.7006,
"step": 27225
},
{
"epoch": 4.8922800718132855,
"grad_norm": 37.26764678955078,
"learning_rate": 2.681292639138241e-05,
"loss": 9.5618,
"step": 27250
},
{
"epoch": 4.8967684021543985,
"grad_norm": 34.99287033081055,
"learning_rate": 2.680694195092759e-05,
"loss": 9.181,
"step": 27275
},
{
"epoch": 4.901256732495511,
"grad_norm": 37.341121673583984,
"learning_rate": 2.680095751047277e-05,
"loss": 9.0808,
"step": 27300
},
{
"epoch": 4.905745062836624,
"grad_norm": 31.948301315307617,
"learning_rate": 2.6794973070017952e-05,
"loss": 9.2991,
"step": 27325
},
{
"epoch": 4.910233393177738,
"grad_norm": 31.787092208862305,
"learning_rate": 2.6788988629563134e-05,
"loss": 9.7063,
"step": 27350
},
{
"epoch": 4.914721723518851,
"grad_norm": 33.72126007080078,
"learning_rate": 2.678300418910832e-05,
"loss": 9.445,
"step": 27375
},
{
"epoch": 4.919210053859964,
"grad_norm": 35.92157745361328,
"learning_rate": 2.6777019748653503e-05,
"loss": 9.8848,
"step": 27400
},
{
"epoch": 4.923698384201077,
"grad_norm": 35.00507354736328,
"learning_rate": 2.6771035308198685e-05,
"loss": 9.4444,
"step": 27425
},
{
"epoch": 4.92818671454219,
"grad_norm": 35.75861358642578,
"learning_rate": 2.6765050867743867e-05,
"loss": 9.8597,
"step": 27450
},
{
"epoch": 4.932675044883304,
"grad_norm": 37.223167419433594,
"learning_rate": 2.675906642728905e-05,
"loss": 9.5199,
"step": 27475
},
{
"epoch": 4.937163375224417,
"grad_norm": 34.89140319824219,
"learning_rate": 2.675308198683423e-05,
"loss": 9.5326,
"step": 27500
},
{
"epoch": 4.94165170556553,
"grad_norm": 38.68606948852539,
"learning_rate": 2.6747097546379414e-05,
"loss": 9.6149,
"step": 27525
},
{
"epoch": 4.946140035906643,
"grad_norm": 35.76506805419922,
"learning_rate": 2.6741113105924597e-05,
"loss": 9.2212,
"step": 27550
},
{
"epoch": 4.950628366247756,
"grad_norm": 34.05699920654297,
"learning_rate": 2.673512866546978e-05,
"loss": 9.3154,
"step": 27575
},
{
"epoch": 4.955116696588869,
"grad_norm": 35.53427505493164,
"learning_rate": 2.672914422501496e-05,
"loss": 9.6563,
"step": 27600
},
{
"epoch": 4.959605026929982,
"grad_norm": 33.76486587524414,
"learning_rate": 2.6723159784560144e-05,
"loss": 9.5415,
"step": 27625
},
{
"epoch": 4.9640933572710955,
"grad_norm": 33.02145767211914,
"learning_rate": 2.671717534410533e-05,
"loss": 9.3559,
"step": 27650
},
{
"epoch": 4.968581687612208,
"grad_norm": 38.21705627441406,
"learning_rate": 2.6711190903650512e-05,
"loss": 9.6168,
"step": 27675
},
{
"epoch": 4.973070017953321,
"grad_norm": 37.642417907714844,
"learning_rate": 2.670520646319569e-05,
"loss": 9.5134,
"step": 27700
},
{
"epoch": 4.977558348294434,
"grad_norm": 33.83686828613281,
"learning_rate": 2.6699222022740873e-05,
"loss": 9.4817,
"step": 27725
},
{
"epoch": 4.982046678635547,
"grad_norm": 34.98296356201172,
"learning_rate": 2.6693237582286056e-05,
"loss": 9.6689,
"step": 27750
},
{
"epoch": 4.986535008976661,
"grad_norm": 35.06865692138672,
"learning_rate": 2.6687253141831238e-05,
"loss": 9.4976,
"step": 27775
},
{
"epoch": 4.991023339317774,
"grad_norm": 34.293113708496094,
"learning_rate": 2.6681268701376424e-05,
"loss": 9.3905,
"step": 27800
},
{
"epoch": 4.995511669658887,
"grad_norm": 34.20943832397461,
"learning_rate": 2.6675284260921606e-05,
"loss": 9.6572,
"step": 27825
},
{
"epoch": 5.0,
"grad_norm": 61.015777587890625,
"learning_rate": 2.666929982046679e-05,
"loss": 9.5296,
"step": 27850
},
{
"epoch": 5.0,
"eval_accuracy": 0.06898592786359384,
"eval_f1_macro": 0.011309476538890082,
"eval_f1_micro": 0.06898592786359384,
"eval_f1_weighted": 0.047429091612135786,
"eval_loss": 6.66681432723999,
"eval_precision_macro": 0.010315927310867975,
"eval_precision_micro": 0.06898592786359384,
"eval_precision_weighted": 0.041722656687824905,
"eval_recall_macro": 0.016669465188724426,
"eval_recall_micro": 0.06898592786359384,
"eval_recall_weighted": 0.06898592786359384,
"eval_runtime": 83.3116,
"eval_samples_per_second": 628.64,
"eval_steps_per_second": 19.649,
"step": 27850
}
],
"logging_steps": 25,
"max_steps": 139250,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.8218004536284e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}