starplus-llama-3.1-8b-gsm8k / trainer_state.json
JakeOh's picture
Upload folder using huggingface_hub
0dabc78 verified
{
"best_global_step": 1396,
"best_metric": 0.6223743557929993,
"best_model_checkpoint": "checkpoints/star_plus-llama-3.1-8b-gsm8k/gsm8k/finetune-llama-3.1-8b-gsm8k-step-3/checkpoint-1396",
"epoch": 0.9016793454858619,
"eval_steps": 349,
"global_step": 3141,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014353380221042056,
"grad_norm": 13.870022843272388,
"learning_rate": 1.1461318051575931e-09,
"loss": 1.1547,
"step": 5
},
{
"epoch": 0.002870676044208411,
"grad_norm": 15.087326577278679,
"learning_rate": 2.5787965616045846e-09,
"loss": 1.1398,
"step": 10
},
{
"epoch": 0.004306014066312616,
"grad_norm": 14.18146075651338,
"learning_rate": 4.011461318051576e-09,
"loss": 1.1402,
"step": 15
},
{
"epoch": 0.005741352088416822,
"grad_norm": 13.690203022341509,
"learning_rate": 5.444126074498567e-09,
"loss": 1.1197,
"step": 20
},
{
"epoch": 0.0071766901105210276,
"grad_norm": 14.994216901060916,
"learning_rate": 6.876790830945558e-09,
"loss": 1.1369,
"step": 25
},
{
"epoch": 0.008612028132625233,
"grad_norm": 14.400896427137175,
"learning_rate": 8.30945558739255e-09,
"loss": 1.1355,
"step": 30
},
{
"epoch": 0.01004736615472944,
"grad_norm": 14.26913515195767,
"learning_rate": 9.742120343839541e-09,
"loss": 1.1442,
"step": 35
},
{
"epoch": 0.011482704176833645,
"grad_norm": 13.797459248853228,
"learning_rate": 1.1174785100286532e-08,
"loss": 1.1198,
"step": 40
},
{
"epoch": 0.01291804219893785,
"grad_norm": 14.471409919274842,
"learning_rate": 1.2607449856733523e-08,
"loss": 1.1203,
"step": 45
},
{
"epoch": 0.014353380221042055,
"grad_norm": 14.748844131710204,
"learning_rate": 1.4040114613180515e-08,
"loss": 1.1226,
"step": 50
},
{
"epoch": 0.01578871824314626,
"grad_norm": 15.729621083463075,
"learning_rate": 1.5472779369627508e-08,
"loss": 1.1524,
"step": 55
},
{
"epoch": 0.017224056265250465,
"grad_norm": 15.386565556660619,
"learning_rate": 1.69054441260745e-08,
"loss": 1.1555,
"step": 60
},
{
"epoch": 0.01865939428735467,
"grad_norm": 13.85550381101084,
"learning_rate": 1.833810888252149e-08,
"loss": 1.1401,
"step": 65
},
{
"epoch": 0.02009473230945888,
"grad_norm": 13.717607018021473,
"learning_rate": 1.977077363896848e-08,
"loss": 1.1384,
"step": 70
},
{
"epoch": 0.021530070331563084,
"grad_norm": 13.565604110956505,
"learning_rate": 2.1203438395415473e-08,
"loss": 1.1018,
"step": 75
},
{
"epoch": 0.02296540835366729,
"grad_norm": 13.867377061312162,
"learning_rate": 2.2636103151862464e-08,
"loss": 1.1124,
"step": 80
},
{
"epoch": 0.024400746375771495,
"grad_norm": 13.724459591900809,
"learning_rate": 2.4068767908309455e-08,
"loss": 1.1398,
"step": 85
},
{
"epoch": 0.0258360843978757,
"grad_norm": 15.266898340074492,
"learning_rate": 2.5501432664756446e-08,
"loss": 1.147,
"step": 90
},
{
"epoch": 0.027271422419979905,
"grad_norm": 13.80077008791852,
"learning_rate": 2.6934097421203438e-08,
"loss": 1.0931,
"step": 95
},
{
"epoch": 0.02870676044208411,
"grad_norm": 14.264049058840255,
"learning_rate": 2.8366762177650426e-08,
"loss": 1.1096,
"step": 100
},
{
"epoch": 0.030142098464188315,
"grad_norm": 13.948028510127017,
"learning_rate": 2.979942693409742e-08,
"loss": 1.1363,
"step": 105
},
{
"epoch": 0.03157743648629252,
"grad_norm": 13.79267645960095,
"learning_rate": 3.123209169054441e-08,
"loss": 1.1159,
"step": 110
},
{
"epoch": 0.03301277450839673,
"grad_norm": 14.588900467039972,
"learning_rate": 3.2664756446991406e-08,
"loss": 1.1057,
"step": 115
},
{
"epoch": 0.03444811253050093,
"grad_norm": 12.835011681719193,
"learning_rate": 3.409742120343839e-08,
"loss": 1.0768,
"step": 120
},
{
"epoch": 0.03588345055260514,
"grad_norm": 12.865687203941038,
"learning_rate": 3.553008595988539e-08,
"loss": 1.0634,
"step": 125
},
{
"epoch": 0.03731878857470934,
"grad_norm": 13.384102130291096,
"learning_rate": 3.696275071633237e-08,
"loss": 1.0633,
"step": 130
},
{
"epoch": 0.03875412659681355,
"grad_norm": 13.041165747177553,
"learning_rate": 3.839541547277937e-08,
"loss": 1.0453,
"step": 135
},
{
"epoch": 0.04018946461891776,
"grad_norm": 13.819322616196978,
"learning_rate": 3.9828080229226356e-08,
"loss": 1.0406,
"step": 140
},
{
"epoch": 0.04162480264102196,
"grad_norm": 15.132519917569722,
"learning_rate": 4.1260744985673354e-08,
"loss": 1.0248,
"step": 145
},
{
"epoch": 0.04306014066312617,
"grad_norm": 16.304556779067884,
"learning_rate": 4.269340974212034e-08,
"loss": 1.0249,
"step": 150
},
{
"epoch": 0.04449547868523037,
"grad_norm": 16.01778148458211,
"learning_rate": 4.4126074498567336e-08,
"loss": 0.9789,
"step": 155
},
{
"epoch": 0.04593081670733458,
"grad_norm": 17.49228077832121,
"learning_rate": 4.555873925501432e-08,
"loss": 0.9715,
"step": 160
},
{
"epoch": 0.04736615472943878,
"grad_norm": 14.72967937422827,
"learning_rate": 4.699140401146132e-08,
"loss": 0.907,
"step": 165
},
{
"epoch": 0.04880149275154299,
"grad_norm": 13.06311838609418,
"learning_rate": 4.8424068767908303e-08,
"loss": 0.8993,
"step": 170
},
{
"epoch": 0.05023683077364719,
"grad_norm": 10.962383047958323,
"learning_rate": 4.98567335243553e-08,
"loss": 0.8467,
"step": 175
},
{
"epoch": 0.0516721687957514,
"grad_norm": 10.794251111033377,
"learning_rate": 5.1289398280802286e-08,
"loss": 0.8314,
"step": 180
},
{
"epoch": 0.05310750681785561,
"grad_norm": 12.739679331085805,
"learning_rate": 5.272206303724928e-08,
"loss": 0.8188,
"step": 185
},
{
"epoch": 0.05454284483995981,
"grad_norm": 16.34256020226391,
"learning_rate": 5.4154727793696275e-08,
"loss": 0.7914,
"step": 190
},
{
"epoch": 0.05597818286206402,
"grad_norm": 10.706894844684937,
"learning_rate": 5.5587392550143266e-08,
"loss": 0.7459,
"step": 195
},
{
"epoch": 0.05741352088416822,
"grad_norm": 13.476028036070423,
"learning_rate": 5.702005730659025e-08,
"loss": 0.7227,
"step": 200
},
{
"epoch": 0.05884885890627243,
"grad_norm": 6.419388332826234,
"learning_rate": 5.845272206303725e-08,
"loss": 0.647,
"step": 205
},
{
"epoch": 0.06028419692837663,
"grad_norm": 5.573797397881519,
"learning_rate": 5.988538681948424e-08,
"loss": 0.6502,
"step": 210
},
{
"epoch": 0.06171953495048084,
"grad_norm": 3.3893623971550917,
"learning_rate": 6.131805157593123e-08,
"loss": 0.6281,
"step": 215
},
{
"epoch": 0.06315487297258504,
"grad_norm": 2.8756784288209825,
"learning_rate": 6.275071633237822e-08,
"loss": 0.6464,
"step": 220
},
{
"epoch": 0.06459021099468924,
"grad_norm": 2.6960562601658853,
"learning_rate": 6.418338108882521e-08,
"loss": 0.6342,
"step": 225
},
{
"epoch": 0.06602554901679346,
"grad_norm": 2.672645623610309,
"learning_rate": 6.56160458452722e-08,
"loss": 0.6306,
"step": 230
},
{
"epoch": 0.06746088703889766,
"grad_norm": 2.7155506472775435,
"learning_rate": 6.70487106017192e-08,
"loss": 0.6313,
"step": 235
},
{
"epoch": 0.06889622506100186,
"grad_norm": 2.4926643277894285,
"learning_rate": 6.848137535816619e-08,
"loss": 0.6371,
"step": 240
},
{
"epoch": 0.07033156308310608,
"grad_norm": 2.5535696596499373,
"learning_rate": 6.991404011461318e-08,
"loss": 0.6337,
"step": 245
},
{
"epoch": 0.07176690110521028,
"grad_norm": 2.305504274318628,
"learning_rate": 7.134670487106017e-08,
"loss": 0.6252,
"step": 250
},
{
"epoch": 0.07320223912731448,
"grad_norm": 2.259445659814349,
"learning_rate": 7.277936962750716e-08,
"loss": 0.6223,
"step": 255
},
{
"epoch": 0.07463757714941868,
"grad_norm": 2.4454411408983514,
"learning_rate": 7.421203438395415e-08,
"loss": 0.646,
"step": 260
},
{
"epoch": 0.0760729151715229,
"grad_norm": 2.442391967235382,
"learning_rate": 7.564469914040114e-08,
"loss": 0.6105,
"step": 265
},
{
"epoch": 0.0775082531936271,
"grad_norm": 2.173170940546,
"learning_rate": 7.707736389684814e-08,
"loss": 0.6173,
"step": 270
},
{
"epoch": 0.0789435912157313,
"grad_norm": 2.2310208210940825,
"learning_rate": 7.851002865329513e-08,
"loss": 0.6244,
"step": 275
},
{
"epoch": 0.08037892923783552,
"grad_norm": 2.1247720503861687,
"learning_rate": 7.994269340974212e-08,
"loss": 0.611,
"step": 280
},
{
"epoch": 0.08181426725993972,
"grad_norm": 2.3653542261559144,
"learning_rate": 8.137535816618911e-08,
"loss": 0.6191,
"step": 285
},
{
"epoch": 0.08324960528204392,
"grad_norm": 2.33933439467133,
"learning_rate": 8.28080229226361e-08,
"loss": 0.6104,
"step": 290
},
{
"epoch": 0.08468494330414812,
"grad_norm": 2.3499668034162267,
"learning_rate": 8.424068767908309e-08,
"loss": 0.6117,
"step": 295
},
{
"epoch": 0.08612028132625234,
"grad_norm": 2.2312767888077643,
"learning_rate": 8.567335243553008e-08,
"loss": 0.6137,
"step": 300
},
{
"epoch": 0.08755561934835654,
"grad_norm": 2.258014565016843,
"learning_rate": 8.710601719197707e-08,
"loss": 0.6165,
"step": 305
},
{
"epoch": 0.08899095737046074,
"grad_norm": 2.361801401311493,
"learning_rate": 8.853868194842407e-08,
"loss": 0.6201,
"step": 310
},
{
"epoch": 0.09042629539256494,
"grad_norm": 2.0790761899486148,
"learning_rate": 8.997134670487106e-08,
"loss": 0.5957,
"step": 315
},
{
"epoch": 0.09186163341466916,
"grad_norm": 2.223817923234156,
"learning_rate": 9.140401146131805e-08,
"loss": 0.5882,
"step": 320
},
{
"epoch": 0.09329697143677336,
"grad_norm": 2.2127486325154764,
"learning_rate": 9.283667621776504e-08,
"loss": 0.6082,
"step": 325
},
{
"epoch": 0.09473230945887756,
"grad_norm": 2.0643057499306114,
"learning_rate": 9.426934097421203e-08,
"loss": 0.5952,
"step": 330
},
{
"epoch": 0.09616764748098178,
"grad_norm": 2.2854134236761046,
"learning_rate": 9.570200573065902e-08,
"loss": 0.5981,
"step": 335
},
{
"epoch": 0.09760298550308598,
"grad_norm": 2.1937418634526926,
"learning_rate": 9.713467048710601e-08,
"loss": 0.6301,
"step": 340
},
{
"epoch": 0.09903832352519018,
"grad_norm": 2.3914797327148474,
"learning_rate": 9.8567335243553e-08,
"loss": 0.6095,
"step": 345
},
{
"epoch": 0.10018659394287355,
"eval_loss": 0.629633367061615,
"eval_runtime": 284.7819,
"eval_samples_per_second": 134.226,
"eval_steps_per_second": 2.1,
"step": 349
},
{
"epoch": 0.10047366154729438,
"grad_norm": 2.2241062219621788,
"learning_rate": 1e-07,
"loss": 0.5995,
"step": 350
},
{
"epoch": 0.1019089995693986,
"grad_norm": 2.3025600042484196,
"learning_rate": 9.984051036682614e-08,
"loss": 0.6244,
"step": 355
},
{
"epoch": 0.1033443375915028,
"grad_norm": 2.1673308778669664,
"learning_rate": 9.96810207336523e-08,
"loss": 0.6057,
"step": 360
},
{
"epoch": 0.104779675613607,
"grad_norm": 2.148853032332558,
"learning_rate": 9.952153110047846e-08,
"loss": 0.5998,
"step": 365
},
{
"epoch": 0.10621501363571122,
"grad_norm": 2.0349810685950698,
"learning_rate": 9.936204146730463e-08,
"loss": 0.6053,
"step": 370
},
{
"epoch": 0.10765035165781542,
"grad_norm": 2.174540606431127,
"learning_rate": 9.920255183413077e-08,
"loss": 0.5961,
"step": 375
},
{
"epoch": 0.10908568967991962,
"grad_norm": 2.177994561683314,
"learning_rate": 9.904306220095693e-08,
"loss": 0.5995,
"step": 380
},
{
"epoch": 0.11052102770202382,
"grad_norm": 2.1071224070808854,
"learning_rate": 9.88835725677831e-08,
"loss": 0.6316,
"step": 385
},
{
"epoch": 0.11195636572412804,
"grad_norm": 2.051925662287215,
"learning_rate": 9.872408293460924e-08,
"loss": 0.6034,
"step": 390
},
{
"epoch": 0.11339170374623224,
"grad_norm": 2.0760748827078883,
"learning_rate": 9.85645933014354e-08,
"loss": 0.5925,
"step": 395
},
{
"epoch": 0.11482704176833644,
"grad_norm": 2.0510637954672166,
"learning_rate": 9.840510366826155e-08,
"loss": 0.5906,
"step": 400
},
{
"epoch": 0.11626237979044064,
"grad_norm": 2.256307217104805,
"learning_rate": 9.824561403508771e-08,
"loss": 0.6197,
"step": 405
},
{
"epoch": 0.11769771781254486,
"grad_norm": 2.1508326768819,
"learning_rate": 9.808612440191387e-08,
"loss": 0.6059,
"step": 410
},
{
"epoch": 0.11913305583464906,
"grad_norm": 2.158467198244223,
"learning_rate": 9.792663476874002e-08,
"loss": 0.6287,
"step": 415
},
{
"epoch": 0.12056839385675326,
"grad_norm": 2.0519012340549607,
"learning_rate": 9.776714513556618e-08,
"loss": 0.5919,
"step": 420
},
{
"epoch": 0.12200373187885748,
"grad_norm": 2.2122386720795206,
"learning_rate": 9.760765550239234e-08,
"loss": 0.6047,
"step": 425
},
{
"epoch": 0.12343906990096168,
"grad_norm": 2.4377341243140855,
"learning_rate": 9.74481658692185e-08,
"loss": 0.6086,
"step": 430
},
{
"epoch": 0.12487440792306588,
"grad_norm": 2.2032974131908736,
"learning_rate": 9.728867623604465e-08,
"loss": 0.5956,
"step": 435
},
{
"epoch": 0.12630974594517008,
"grad_norm": 2.1439507923710908,
"learning_rate": 9.71291866028708e-08,
"loss": 0.6188,
"step": 440
},
{
"epoch": 0.1277450839672743,
"grad_norm": 2.124605788252112,
"learning_rate": 9.696969696969697e-08,
"loss": 0.595,
"step": 445
},
{
"epoch": 0.12918042198937849,
"grad_norm": 2.0145813941505626,
"learning_rate": 9.681020733652312e-08,
"loss": 0.604,
"step": 450
},
{
"epoch": 0.1306157600114827,
"grad_norm": 1.9750273830741611,
"learning_rate": 9.665071770334928e-08,
"loss": 0.602,
"step": 455
},
{
"epoch": 0.13205109803358692,
"grad_norm": 2.132152341731948,
"learning_rate": 9.649122807017543e-08,
"loss": 0.5919,
"step": 460
},
{
"epoch": 0.1334864360556911,
"grad_norm": 2.2780064420605104,
"learning_rate": 9.633173843700159e-08,
"loss": 0.6126,
"step": 465
},
{
"epoch": 0.13492177407779532,
"grad_norm": 2.2739660981275884,
"learning_rate": 9.617224880382775e-08,
"loss": 0.6029,
"step": 470
},
{
"epoch": 0.13635711209989954,
"grad_norm": 2.1742546599852637,
"learning_rate": 9.60127591706539e-08,
"loss": 0.5769,
"step": 475
},
{
"epoch": 0.13779245012200372,
"grad_norm": 2.42381996317855,
"learning_rate": 9.585326953748006e-08,
"loss": 0.5994,
"step": 480
},
{
"epoch": 0.13922778814410794,
"grad_norm": 2.227117458354492,
"learning_rate": 9.569377990430622e-08,
"loss": 0.594,
"step": 485
},
{
"epoch": 0.14066312616621215,
"grad_norm": 2.4643334214114456,
"learning_rate": 9.553429027113237e-08,
"loss": 0.6027,
"step": 490
},
{
"epoch": 0.14209846418831634,
"grad_norm": 2.1983131254869415,
"learning_rate": 9.537480063795853e-08,
"loss": 0.6055,
"step": 495
},
{
"epoch": 0.14353380221042056,
"grad_norm": 2.143709628361645,
"learning_rate": 9.521531100478468e-08,
"loss": 0.6028,
"step": 500
},
{
"epoch": 0.14496914023252475,
"grad_norm": 2.111089939054113,
"learning_rate": 9.505582137161085e-08,
"loss": 0.6142,
"step": 505
},
{
"epoch": 0.14640447825462896,
"grad_norm": 2.185072710633692,
"learning_rate": 9.4896331738437e-08,
"loss": 0.6036,
"step": 510
},
{
"epoch": 0.14783981627673318,
"grad_norm": 2.1749714984974027,
"learning_rate": 9.473684210526315e-08,
"loss": 0.6017,
"step": 515
},
{
"epoch": 0.14927515429883736,
"grad_norm": 2.0595469823971086,
"learning_rate": 9.457735247208931e-08,
"loss": 0.593,
"step": 520
},
{
"epoch": 0.15071049232094158,
"grad_norm": 2.166316240639028,
"learning_rate": 9.441786283891547e-08,
"loss": 0.602,
"step": 525
},
{
"epoch": 0.1521458303430458,
"grad_norm": 2.1081294919011713,
"learning_rate": 9.425837320574163e-08,
"loss": 0.5704,
"step": 530
},
{
"epoch": 0.15358116836514998,
"grad_norm": 2.086486832035454,
"learning_rate": 9.409888357256778e-08,
"loss": 0.5829,
"step": 535
},
{
"epoch": 0.1550165063872542,
"grad_norm": 1.9452488677658146,
"learning_rate": 9.393939393939394e-08,
"loss": 0.5819,
"step": 540
},
{
"epoch": 0.15645184440935841,
"grad_norm": 2.2482994216353247,
"learning_rate": 9.37799043062201e-08,
"loss": 0.595,
"step": 545
},
{
"epoch": 0.1578871824314626,
"grad_norm": 2.1541134403144366,
"learning_rate": 9.362041467304625e-08,
"loss": 0.5947,
"step": 550
},
{
"epoch": 0.15932252045356682,
"grad_norm": 2.106629528341244,
"learning_rate": 9.34609250398724e-08,
"loss": 0.5914,
"step": 555
},
{
"epoch": 0.16075785847567103,
"grad_norm": 2.1549978294974013,
"learning_rate": 9.330143540669855e-08,
"loss": 0.5972,
"step": 560
},
{
"epoch": 0.16219319649777522,
"grad_norm": 2.087927226201651,
"learning_rate": 9.314194577352472e-08,
"loss": 0.601,
"step": 565
},
{
"epoch": 0.16362853451987944,
"grad_norm": 2.2969758412493184,
"learning_rate": 9.298245614035088e-08,
"loss": 0.6121,
"step": 570
},
{
"epoch": 0.16506387254198363,
"grad_norm": 1.9822061570846972,
"learning_rate": 9.282296650717702e-08,
"loss": 0.5901,
"step": 575
},
{
"epoch": 0.16649921056408784,
"grad_norm": 2.096712972417583,
"learning_rate": 9.266347687400318e-08,
"loss": 0.5907,
"step": 580
},
{
"epoch": 0.16793454858619206,
"grad_norm": 2.2566429251485394,
"learning_rate": 9.250398724082935e-08,
"loss": 0.6063,
"step": 585
},
{
"epoch": 0.16936988660829624,
"grad_norm": 2.1805651747312447,
"learning_rate": 9.23444976076555e-08,
"loss": 0.5957,
"step": 590
},
{
"epoch": 0.17080522463040046,
"grad_norm": 2.1247600803710296,
"learning_rate": 9.218500797448165e-08,
"loss": 0.5885,
"step": 595
},
{
"epoch": 0.17224056265250468,
"grad_norm": 2.0815283359286876,
"learning_rate": 9.20255183413078e-08,
"loss": 0.599,
"step": 600
},
{
"epoch": 0.17367590067460886,
"grad_norm": 2.038372892981339,
"learning_rate": 9.186602870813396e-08,
"loss": 0.609,
"step": 605
},
{
"epoch": 0.17511123869671308,
"grad_norm": 2.263660725821227,
"learning_rate": 9.170653907496012e-08,
"loss": 0.5962,
"step": 610
},
{
"epoch": 0.1765465767188173,
"grad_norm": 1.9909834330631233,
"learning_rate": 9.154704944178628e-08,
"loss": 0.6044,
"step": 615
},
{
"epoch": 0.17798191474092148,
"grad_norm": 2.047085721846824,
"learning_rate": 9.138755980861243e-08,
"loss": 0.5861,
"step": 620
},
{
"epoch": 0.1794172527630257,
"grad_norm": 2.142108425915519,
"learning_rate": 9.122807017543859e-08,
"loss": 0.6001,
"step": 625
},
{
"epoch": 0.18085259078512989,
"grad_norm": 2.2269167910002428,
"learning_rate": 9.106858054226475e-08,
"loss": 0.5692,
"step": 630
},
{
"epoch": 0.1822879288072341,
"grad_norm": 2.163281563946126,
"learning_rate": 9.09090909090909e-08,
"loss": 0.5746,
"step": 635
},
{
"epoch": 0.18372326682933832,
"grad_norm": 2.2283513846745393,
"learning_rate": 9.074960127591706e-08,
"loss": 0.6013,
"step": 640
},
{
"epoch": 0.1851586048514425,
"grad_norm": 2.4916531419640378,
"learning_rate": 9.059011164274322e-08,
"loss": 0.584,
"step": 645
},
{
"epoch": 0.18659394287354672,
"grad_norm": 2.2163307550487583,
"learning_rate": 9.043062200956937e-08,
"loss": 0.5997,
"step": 650
},
{
"epoch": 0.18802928089565094,
"grad_norm": 2.1338109129705343,
"learning_rate": 9.027113237639553e-08,
"loss": 0.6148,
"step": 655
},
{
"epoch": 0.18946461891775512,
"grad_norm": 2.1154029002889962,
"learning_rate": 9.011164274322168e-08,
"loss": 0.5934,
"step": 660
},
{
"epoch": 0.19089995693985934,
"grad_norm": 2.1834360253997604,
"learning_rate": 8.995215311004784e-08,
"loss": 0.6034,
"step": 665
},
{
"epoch": 0.19233529496196355,
"grad_norm": 2.224312171540706,
"learning_rate": 8.9792663476874e-08,
"loss": 0.6001,
"step": 670
},
{
"epoch": 0.19377063298406774,
"grad_norm": 2.149180259343769,
"learning_rate": 8.963317384370016e-08,
"loss": 0.5711,
"step": 675
},
{
"epoch": 0.19520597100617196,
"grad_norm": 2.2487709986154703,
"learning_rate": 8.947368421052631e-08,
"loss": 0.6106,
"step": 680
},
{
"epoch": 0.19664130902827615,
"grad_norm": 2.1736142585918863,
"learning_rate": 8.931419457735247e-08,
"loss": 0.5965,
"step": 685
},
{
"epoch": 0.19807664705038036,
"grad_norm": 2.2565465434395997,
"learning_rate": 8.915470494417863e-08,
"loss": 0.5878,
"step": 690
},
{
"epoch": 0.19951198507248458,
"grad_norm": 2.117117032717574,
"learning_rate": 8.899521531100478e-08,
"loss": 0.5795,
"step": 695
},
{
"epoch": 0.2003731878857471,
"eval_loss": 0.6242366433143616,
"eval_runtime": 276.6162,
"eval_samples_per_second": 138.188,
"eval_steps_per_second": 2.162,
"step": 698
},
{
"epoch": 0.20094732309458876,
"grad_norm": 2.282588992438042,
"learning_rate": 8.883572567783094e-08,
"loss": 0.6082,
"step": 700
},
{
"epoch": 0.20238266111669298,
"grad_norm": 2.3530914921224504,
"learning_rate": 8.867623604465709e-08,
"loss": 0.5971,
"step": 705
},
{
"epoch": 0.2038179991387972,
"grad_norm": 2.2866310443011173,
"learning_rate": 8.851674641148325e-08,
"loss": 0.6002,
"step": 710
},
{
"epoch": 0.20525333716090138,
"grad_norm": 2.301531540868706,
"learning_rate": 8.835725677830941e-08,
"loss": 0.5925,
"step": 715
},
{
"epoch": 0.2066886751830056,
"grad_norm": 2.1919572172369985,
"learning_rate": 8.819776714513556e-08,
"loss": 0.6174,
"step": 720
},
{
"epoch": 0.20812401320510981,
"grad_norm": 2.2967058044542794,
"learning_rate": 8.803827751196172e-08,
"loss": 0.5926,
"step": 725
},
{
"epoch": 0.209559351227214,
"grad_norm": 2.1307037968639717,
"learning_rate": 8.787878787878788e-08,
"loss": 0.6104,
"step": 730
},
{
"epoch": 0.21099468924931822,
"grad_norm": 2.209964266498109,
"learning_rate": 8.771929824561403e-08,
"loss": 0.5943,
"step": 735
},
{
"epoch": 0.21243002727142243,
"grad_norm": 2.419009311462197,
"learning_rate": 8.755980861244019e-08,
"loss": 0.5874,
"step": 740
},
{
"epoch": 0.21386536529352662,
"grad_norm": 2.389943282672986,
"learning_rate": 8.740031897926634e-08,
"loss": 0.586,
"step": 745
},
{
"epoch": 0.21530070331563084,
"grad_norm": 2.084899396749322,
"learning_rate": 8.724082934609251e-08,
"loss": 0.5955,
"step": 750
},
{
"epoch": 0.21673604133773502,
"grad_norm": 2.2165347770637687,
"learning_rate": 8.708133971291866e-08,
"loss": 0.5928,
"step": 755
},
{
"epoch": 0.21817137935983924,
"grad_norm": 2.1541567696115766,
"learning_rate": 8.69218500797448e-08,
"loss": 0.5802,
"step": 760
},
{
"epoch": 0.21960671738194346,
"grad_norm": 2.078810823982944,
"learning_rate": 8.676236044657097e-08,
"loss": 0.5851,
"step": 765
},
{
"epoch": 0.22104205540404764,
"grad_norm": 2.205438995615468,
"learning_rate": 8.660287081339713e-08,
"loss": 0.5822,
"step": 770
},
{
"epoch": 0.22247739342615186,
"grad_norm": 2.3681917067545335,
"learning_rate": 8.644338118022329e-08,
"loss": 0.5797,
"step": 775
},
{
"epoch": 0.22391273144825607,
"grad_norm": 2.1787226733938527,
"learning_rate": 8.628389154704943e-08,
"loss": 0.5952,
"step": 780
},
{
"epoch": 0.22534806947036026,
"grad_norm": 2.1341166867892705,
"learning_rate": 8.61244019138756e-08,
"loss": 0.5793,
"step": 785
},
{
"epoch": 0.22678340749246448,
"grad_norm": 2.226823901565418,
"learning_rate": 8.596491228070176e-08,
"loss": 0.5998,
"step": 790
},
{
"epoch": 0.2282187455145687,
"grad_norm": 2.154565895126637,
"learning_rate": 8.58054226475279e-08,
"loss": 0.6018,
"step": 795
},
{
"epoch": 0.22965408353667288,
"grad_norm": 2.1902644679628427,
"learning_rate": 8.564593301435407e-08,
"loss": 0.5846,
"step": 800
},
{
"epoch": 0.2310894215587771,
"grad_norm": 2.132807778812804,
"learning_rate": 8.548644338118021e-08,
"loss": 0.5973,
"step": 805
},
{
"epoch": 0.23252475958088129,
"grad_norm": 2.1541803653195237,
"learning_rate": 8.532695374800639e-08,
"loss": 0.6016,
"step": 810
},
{
"epoch": 0.2339600976029855,
"grad_norm": 2.6732037615470223,
"learning_rate": 8.516746411483253e-08,
"loss": 0.5996,
"step": 815
},
{
"epoch": 0.23539543562508972,
"grad_norm": 2.13269819035701,
"learning_rate": 8.500797448165868e-08,
"loss": 0.5944,
"step": 820
},
{
"epoch": 0.2368307736471939,
"grad_norm": 2.0948220019531134,
"learning_rate": 8.484848484848484e-08,
"loss": 0.5801,
"step": 825
},
{
"epoch": 0.23826611166929812,
"grad_norm": 2.141745313126493,
"learning_rate": 8.4688995215311e-08,
"loss": 0.5912,
"step": 830
},
{
"epoch": 0.23970144969140234,
"grad_norm": 2.185598157293274,
"learning_rate": 8.452950558213716e-08,
"loss": 0.561,
"step": 835
},
{
"epoch": 0.24113678771350652,
"grad_norm": 2.113152259023194,
"learning_rate": 8.437001594896331e-08,
"loss": 0.5767,
"step": 840
},
{
"epoch": 0.24257212573561074,
"grad_norm": 2.1653119179818403,
"learning_rate": 8.421052631578946e-08,
"loss": 0.5994,
"step": 845
},
{
"epoch": 0.24400746375771495,
"grad_norm": 2.232117286703433,
"learning_rate": 8.405103668261563e-08,
"loss": 0.5965,
"step": 850
},
{
"epoch": 0.24544280177981914,
"grad_norm": 2.1224679961554624,
"learning_rate": 8.389154704944178e-08,
"loss": 0.6024,
"step": 855
},
{
"epoch": 0.24687813980192336,
"grad_norm": 2.23125032456125,
"learning_rate": 8.373205741626794e-08,
"loss": 0.6039,
"step": 860
},
{
"epoch": 0.24831347782402755,
"grad_norm": 2.087520038725682,
"learning_rate": 8.357256778309409e-08,
"loss": 0.5846,
"step": 865
},
{
"epoch": 0.24974881584613176,
"grad_norm": 2.3368921148872337,
"learning_rate": 8.341307814992025e-08,
"loss": 0.5902,
"step": 870
},
{
"epoch": 0.25118415386823595,
"grad_norm": 2.1530686965187185,
"learning_rate": 8.325358851674641e-08,
"loss": 0.5749,
"step": 875
},
{
"epoch": 0.25261949189034016,
"grad_norm": 2.1127613396706186,
"learning_rate": 8.309409888357256e-08,
"loss": 0.583,
"step": 880
},
{
"epoch": 0.2540548299124444,
"grad_norm": 2.3444742975266393,
"learning_rate": 8.293460925039872e-08,
"loss": 0.5896,
"step": 885
},
{
"epoch": 0.2554901679345486,
"grad_norm": 2.2978259898614044,
"learning_rate": 8.277511961722488e-08,
"loss": 0.5949,
"step": 890
},
{
"epoch": 0.2569255059566528,
"grad_norm": 2.433573563011829,
"learning_rate": 8.261562998405104e-08,
"loss": 0.5727,
"step": 895
},
{
"epoch": 0.25836084397875697,
"grad_norm": 2.3201307959383177,
"learning_rate": 8.245614035087719e-08,
"loss": 0.6097,
"step": 900
},
{
"epoch": 0.2597961820008612,
"grad_norm": 2.2013387745102238,
"learning_rate": 8.229665071770334e-08,
"loss": 0.5788,
"step": 905
},
{
"epoch": 0.2612315200229654,
"grad_norm": 2.183018300558425,
"learning_rate": 8.21371610845295e-08,
"loss": 0.5812,
"step": 910
},
{
"epoch": 0.2626668580450696,
"grad_norm": 2.3072922091990025,
"learning_rate": 8.197767145135566e-08,
"loss": 0.5943,
"step": 915
},
{
"epoch": 0.26410219606717383,
"grad_norm": 2.3037915303690353,
"learning_rate": 8.181818181818182e-08,
"loss": 0.5748,
"step": 920
},
{
"epoch": 0.26553753408927805,
"grad_norm": 2.2529199926874672,
"learning_rate": 8.165869218500797e-08,
"loss": 0.5817,
"step": 925
},
{
"epoch": 0.2669728721113822,
"grad_norm": 2.0936454029080296,
"learning_rate": 8.149920255183413e-08,
"loss": 0.5886,
"step": 930
},
{
"epoch": 0.2684082101334864,
"grad_norm": 2.204603428252123,
"learning_rate": 8.133971291866029e-08,
"loss": 0.58,
"step": 935
},
{
"epoch": 0.26984354815559064,
"grad_norm": 2.3046758578427844,
"learning_rate": 8.118022328548644e-08,
"loss": 0.5639,
"step": 940
},
{
"epoch": 0.27127888617769486,
"grad_norm": 2.039030706090379,
"learning_rate": 8.10207336523126e-08,
"loss": 0.5747,
"step": 945
},
{
"epoch": 0.27271422419979907,
"grad_norm": 2.4822001236050184,
"learning_rate": 8.086124401913875e-08,
"loss": 0.5814,
"step": 950
},
{
"epoch": 0.27414956222190323,
"grad_norm": 2.072097265053284,
"learning_rate": 8.070175438596491e-08,
"loss": 0.6058,
"step": 955
},
{
"epoch": 0.27558490024400745,
"grad_norm": 2.213534670295135,
"learning_rate": 8.054226475279107e-08,
"loss": 0.5733,
"step": 960
},
{
"epoch": 0.27702023826611166,
"grad_norm": 2.0538875642379124,
"learning_rate": 8.038277511961722e-08,
"loss": 0.58,
"step": 965
},
{
"epoch": 0.2784555762882159,
"grad_norm": 2.656256926455447,
"learning_rate": 8.022328548644338e-08,
"loss": 0.5827,
"step": 970
},
{
"epoch": 0.2798909143103201,
"grad_norm": 2.2918877338477106,
"learning_rate": 8.006379585326954e-08,
"loss": 0.5906,
"step": 975
},
{
"epoch": 0.2813262523324243,
"grad_norm": 2.1437496217506307,
"learning_rate": 7.990430622009568e-08,
"loss": 0.5994,
"step": 980
},
{
"epoch": 0.28276159035452847,
"grad_norm": 2.108637907788069,
"learning_rate": 7.974481658692185e-08,
"loss": 0.5824,
"step": 985
},
{
"epoch": 0.2841969283766327,
"grad_norm": 2.2635800105704416,
"learning_rate": 7.9585326953748e-08,
"loss": 0.5899,
"step": 990
},
{
"epoch": 0.2856322663987369,
"grad_norm": 2.5457571482007864,
"learning_rate": 7.942583732057417e-08,
"loss": 0.5737,
"step": 995
},
{
"epoch": 0.2870676044208411,
"grad_norm": 2.194123282495016,
"learning_rate": 7.926634768740032e-08,
"loss": 0.5867,
"step": 1000
},
{
"epoch": 0.28850294244294533,
"grad_norm": 2.1371261720660537,
"learning_rate": 7.910685805422646e-08,
"loss": 0.6,
"step": 1005
},
{
"epoch": 0.2899382804650495,
"grad_norm": 2.2866199715661497,
"learning_rate": 7.894736842105262e-08,
"loss": 0.5897,
"step": 1010
},
{
"epoch": 0.2913736184871537,
"grad_norm": 2.3979447418720765,
"learning_rate": 7.878787878787878e-08,
"loss": 0.5673,
"step": 1015
},
{
"epoch": 0.2928089565092579,
"grad_norm": 2.2911555208949426,
"learning_rate": 7.862838915470495e-08,
"loss": 0.5967,
"step": 1020
},
{
"epoch": 0.29424429453136214,
"grad_norm": 2.16579462949441,
"learning_rate": 7.846889952153109e-08,
"loss": 0.5821,
"step": 1025
},
{
"epoch": 0.29567963255346635,
"grad_norm": 2.044335696136996,
"learning_rate": 7.830940988835725e-08,
"loss": 0.5696,
"step": 1030
},
{
"epoch": 0.29711497057557057,
"grad_norm": 2.431018134661352,
"learning_rate": 7.814992025518342e-08,
"loss": 0.5837,
"step": 1035
},
{
"epoch": 0.29855030859767473,
"grad_norm": 2.210821229899501,
"learning_rate": 7.799043062200956e-08,
"loss": 0.6103,
"step": 1040
},
{
"epoch": 0.29998564661977895,
"grad_norm": 2.104446504756259,
"learning_rate": 7.783094098883572e-08,
"loss": 0.5858,
"step": 1045
},
{
"epoch": 0.30055978182862064,
"eval_loss": 0.6225999593734741,
"eval_runtime": 276.7691,
"eval_samples_per_second": 138.112,
"eval_steps_per_second": 2.161,
"step": 1047
},
{
"epoch": 0.30142098464188316,
"grad_norm": 2.3677638058778063,
"learning_rate": 7.767145135566187e-08,
"loss": 0.5937,
"step": 1050
},
{
"epoch": 0.3028563226639874,
"grad_norm": 2.206731325058568,
"learning_rate": 7.751196172248805e-08,
"loss": 0.5859,
"step": 1055
},
{
"epoch": 0.3042916606860916,
"grad_norm": 2.1480500318169504,
"learning_rate": 7.735247208931419e-08,
"loss": 0.5868,
"step": 1060
},
{
"epoch": 0.3057269987081958,
"grad_norm": 2.5582545964146983,
"learning_rate": 7.719298245614034e-08,
"loss": 0.5742,
"step": 1065
},
{
"epoch": 0.30716233673029997,
"grad_norm": 2.193868662465662,
"learning_rate": 7.70334928229665e-08,
"loss": 0.5953,
"step": 1070
},
{
"epoch": 0.3085976747524042,
"grad_norm": 2.150662198093994,
"learning_rate": 7.687400318979266e-08,
"loss": 0.5893,
"step": 1075
},
{
"epoch": 0.3100330127745084,
"grad_norm": 2.248241358693427,
"learning_rate": 7.671451355661882e-08,
"loss": 0.5768,
"step": 1080
},
{
"epoch": 0.3114683507966126,
"grad_norm": 2.043693547149444,
"learning_rate": 7.655502392344497e-08,
"loss": 0.5645,
"step": 1085
},
{
"epoch": 0.31290368881871683,
"grad_norm": 2.123014406455518,
"learning_rate": 7.639553429027112e-08,
"loss": 0.5886,
"step": 1090
},
{
"epoch": 0.314339026840821,
"grad_norm": 2.1690683360268803,
"learning_rate": 7.623604465709729e-08,
"loss": 0.5855,
"step": 1095
},
{
"epoch": 0.3157743648629252,
"grad_norm": 2.2483407272993348,
"learning_rate": 7.607655502392344e-08,
"loss": 0.5734,
"step": 1100
},
{
"epoch": 0.3172097028850294,
"grad_norm": 2.2532867903751037,
"learning_rate": 7.59170653907496e-08,
"loss": 0.5905,
"step": 1105
},
{
"epoch": 0.31864504090713364,
"grad_norm": 2.1607546363917143,
"learning_rate": 7.575757575757575e-08,
"loss": 0.5715,
"step": 1110
},
{
"epoch": 0.32008037892923785,
"grad_norm": 2.4129775854563564,
"learning_rate": 7.559808612440191e-08,
"loss": 0.5882,
"step": 1115
},
{
"epoch": 0.32151571695134207,
"grad_norm": 2.2020460031905547,
"learning_rate": 7.543859649122807e-08,
"loss": 0.5755,
"step": 1120
},
{
"epoch": 0.3229510549734462,
"grad_norm": 2.2513751622058233,
"learning_rate": 7.527910685805422e-08,
"loss": 0.573,
"step": 1125
},
{
"epoch": 0.32438639299555044,
"grad_norm": 2.05291200128756,
"learning_rate": 7.511961722488038e-08,
"loss": 0.5725,
"step": 1130
},
{
"epoch": 0.32582173101765466,
"grad_norm": 2.167144338148049,
"learning_rate": 7.496012759170654e-08,
"loss": 0.5952,
"step": 1135
},
{
"epoch": 0.3272570690397589,
"grad_norm": 2.342558025297928,
"learning_rate": 7.48006379585327e-08,
"loss": 0.6007,
"step": 1140
},
{
"epoch": 0.3286924070618631,
"grad_norm": 2.2082285106467037,
"learning_rate": 7.464114832535885e-08,
"loss": 0.5988,
"step": 1145
},
{
"epoch": 0.33012774508396725,
"grad_norm": 2.4965805150161424,
"learning_rate": 7.4481658692185e-08,
"loss": 0.5976,
"step": 1150
},
{
"epoch": 0.33156308310607147,
"grad_norm": 2.199510861589018,
"learning_rate": 7.432216905901117e-08,
"loss": 0.5933,
"step": 1155
},
{
"epoch": 0.3329984211281757,
"grad_norm": 2.213256566440661,
"learning_rate": 7.416267942583732e-08,
"loss": 0.5925,
"step": 1160
},
{
"epoch": 0.3344337591502799,
"grad_norm": 2.267147072610886,
"learning_rate": 7.400318979266348e-08,
"loss": 0.5952,
"step": 1165
},
{
"epoch": 0.3358690971723841,
"grad_norm": 2.384880893465299,
"learning_rate": 7.384370015948963e-08,
"loss": 0.5854,
"step": 1170
},
{
"epoch": 0.33730443519448833,
"grad_norm": 2.3975013590320957,
"learning_rate": 7.368421052631579e-08,
"loss": 0.5896,
"step": 1175
},
{
"epoch": 0.3387397732165925,
"grad_norm": 2.196343857149047,
"learning_rate": 7.352472089314195e-08,
"loss": 0.5949,
"step": 1180
},
{
"epoch": 0.3401751112386967,
"grad_norm": 2.2389824981496416,
"learning_rate": 7.33652312599681e-08,
"loss": 0.5815,
"step": 1185
},
{
"epoch": 0.3416104492608009,
"grad_norm": 2.3137486247453163,
"learning_rate": 7.320574162679426e-08,
"loss": 0.5875,
"step": 1190
},
{
"epoch": 0.34304578728290513,
"grad_norm": 2.4693537666125085,
"learning_rate": 7.304625199362042e-08,
"loss": 0.5959,
"step": 1195
},
{
"epoch": 0.34448112530500935,
"grad_norm": 2.1602741385717725,
"learning_rate": 7.288676236044657e-08,
"loss": 0.5945,
"step": 1200
},
{
"epoch": 0.3459164633271135,
"grad_norm": 2.210588507776581,
"learning_rate": 7.272727272727273e-08,
"loss": 0.5743,
"step": 1205
},
{
"epoch": 0.3473518013492177,
"grad_norm": 2.097615546869791,
"learning_rate": 7.256778309409887e-08,
"loss": 0.5937,
"step": 1210
},
{
"epoch": 0.34878713937132194,
"grad_norm": 2.317699028012793,
"learning_rate": 7.240829346092503e-08,
"loss": 0.5849,
"step": 1215
},
{
"epoch": 0.35022247739342616,
"grad_norm": 2.3091147413758724,
"learning_rate": 7.22488038277512e-08,
"loss": 0.5856,
"step": 1220
},
{
"epoch": 0.3516578154155304,
"grad_norm": 2.2476415769641167,
"learning_rate": 7.208931419457734e-08,
"loss": 0.5574,
"step": 1225
},
{
"epoch": 0.3530931534376346,
"grad_norm": 2.4762397998859704,
"learning_rate": 7.19298245614035e-08,
"loss": 0.5851,
"step": 1230
},
{
"epoch": 0.35452849145973875,
"grad_norm": 2.437984095784635,
"learning_rate": 7.177033492822967e-08,
"loss": 0.5707,
"step": 1235
},
{
"epoch": 0.35596382948184296,
"grad_norm": 2.451803793047694,
"learning_rate": 7.161084529505583e-08,
"loss": 0.609,
"step": 1240
},
{
"epoch": 0.3573991675039472,
"grad_norm": 2.1857999101978476,
"learning_rate": 7.145135566188197e-08,
"loss": 0.5943,
"step": 1245
},
{
"epoch": 0.3588345055260514,
"grad_norm": 2.4116162170012,
"learning_rate": 7.129186602870812e-08,
"loss": 0.5869,
"step": 1250
},
{
"epoch": 0.3602698435481556,
"grad_norm": 2.545797080063671,
"learning_rate": 7.113237639553428e-08,
"loss": 0.5966,
"step": 1255
},
{
"epoch": 0.36170518157025977,
"grad_norm": 2.117352288405697,
"learning_rate": 7.097288676236044e-08,
"loss": 0.5887,
"step": 1260
},
{
"epoch": 0.363140519592364,
"grad_norm": 2.275523882502902,
"learning_rate": 7.08133971291866e-08,
"loss": 0.5975,
"step": 1265
},
{
"epoch": 0.3645758576144682,
"grad_norm": 2.3571318848422798,
"learning_rate": 7.065390749601275e-08,
"loss": 0.5645,
"step": 1270
},
{
"epoch": 0.3660111956365724,
"grad_norm": 2.312480194276115,
"learning_rate": 7.049441786283891e-08,
"loss": 0.5999,
"step": 1275
},
{
"epoch": 0.36744653365867663,
"grad_norm": 2.367840377627457,
"learning_rate": 7.033492822966507e-08,
"loss": 0.5824,
"step": 1280
},
{
"epoch": 0.36888187168078085,
"grad_norm": 2.160147447958279,
"learning_rate": 7.017543859649122e-08,
"loss": 0.5837,
"step": 1285
},
{
"epoch": 0.370317209702885,
"grad_norm": 2.0709776247715954,
"learning_rate": 7.001594896331738e-08,
"loss": 0.5699,
"step": 1290
},
{
"epoch": 0.3717525477249892,
"grad_norm": 2.3103426448215814,
"learning_rate": 6.985645933014353e-08,
"loss": 0.5608,
"step": 1295
},
{
"epoch": 0.37318788574709344,
"grad_norm": 2.4092008720321116,
"learning_rate": 6.96969696969697e-08,
"loss": 0.5814,
"step": 1300
},
{
"epoch": 0.37462322376919766,
"grad_norm": 2.310371459366773,
"learning_rate": 6.953748006379585e-08,
"loss": 0.5894,
"step": 1305
},
{
"epoch": 0.37605856179130187,
"grad_norm": 2.253935903638457,
"learning_rate": 6.9377990430622e-08,
"loss": 0.5728,
"step": 1310
},
{
"epoch": 0.37749389981340603,
"grad_norm": 2.1948403595861765,
"learning_rate": 6.921850079744816e-08,
"loss": 0.5907,
"step": 1315
},
{
"epoch": 0.37892923783551025,
"grad_norm": 2.5444511215863352,
"learning_rate": 6.905901116427432e-08,
"loss": 0.5916,
"step": 1320
},
{
"epoch": 0.38036457585761446,
"grad_norm": 2.4157863054442363,
"learning_rate": 6.889952153110048e-08,
"loss": 0.5966,
"step": 1325
},
{
"epoch": 0.3817999138797187,
"grad_norm": 2.358893882397905,
"learning_rate": 6.874003189792663e-08,
"loss": 0.584,
"step": 1330
},
{
"epoch": 0.3832352519018229,
"grad_norm": 2.4540399070214503,
"learning_rate": 6.858054226475278e-08,
"loss": 0.5682,
"step": 1335
},
{
"epoch": 0.3846705899239271,
"grad_norm": 2.4924888134144942,
"learning_rate": 6.842105263157895e-08,
"loss": 0.585,
"step": 1340
},
{
"epoch": 0.38610592794603127,
"grad_norm": 2.4341634135514805,
"learning_rate": 6.82615629984051e-08,
"loss": 0.5794,
"step": 1345
},
{
"epoch": 0.3875412659681355,
"grad_norm": 2.2137130985517888,
"learning_rate": 6.810207336523126e-08,
"loss": 0.5843,
"step": 1350
},
{
"epoch": 0.3889766039902397,
"grad_norm": 2.259943764213412,
"learning_rate": 6.794258373205741e-08,
"loss": 0.5751,
"step": 1355
},
{
"epoch": 0.3904119420123439,
"grad_norm": 2.313121297434376,
"learning_rate": 6.778309409888357e-08,
"loss": 0.5648,
"step": 1360
},
{
"epoch": 0.39184728003444813,
"grad_norm": 2.1195979632978053,
"learning_rate": 6.762360446570973e-08,
"loss": 0.5757,
"step": 1365
},
{
"epoch": 0.3932826180565523,
"grad_norm": 2.34003269760013,
"learning_rate": 6.746411483253588e-08,
"loss": 0.5839,
"step": 1370
},
{
"epoch": 0.3947179560786565,
"grad_norm": 2.2650000981436014,
"learning_rate": 6.730462519936204e-08,
"loss": 0.574,
"step": 1375
},
{
"epoch": 0.3961532941007607,
"grad_norm": 2.265678124826934,
"learning_rate": 6.71451355661882e-08,
"loss": 0.5925,
"step": 1380
},
{
"epoch": 0.39758863212286494,
"grad_norm": 2.4721732776855694,
"learning_rate": 6.698564593301436e-08,
"loss": 0.5749,
"step": 1385
},
{
"epoch": 0.39902397014496915,
"grad_norm": 2.0341699451050226,
"learning_rate": 6.682615629984051e-08,
"loss": 0.568,
"step": 1390
},
{
"epoch": 0.40045930816707337,
"grad_norm": 2.258266710480253,
"learning_rate": 6.666666666666665e-08,
"loss": 0.579,
"step": 1395
},
{
"epoch": 0.4007463757714942,
"eval_loss": 0.6223743557929993,
"eval_runtime": 276.9327,
"eval_samples_per_second": 138.03,
"eval_steps_per_second": 2.159,
"step": 1396
},
{
"epoch": 0.40189464618917753,
"grad_norm": 2.3590634880789527,
"learning_rate": 6.650717703349283e-08,
"loss": 0.5979,
"step": 1400
},
{
"epoch": 0.40332998421128174,
"grad_norm": 2.2482983012145112,
"learning_rate": 6.634768740031898e-08,
"loss": 0.5725,
"step": 1405
},
{
"epoch": 0.40476532223338596,
"grad_norm": 2.099031799334457,
"learning_rate": 6.618819776714514e-08,
"loss": 0.5702,
"step": 1410
},
{
"epoch": 0.4062006602554902,
"grad_norm": 2.2837745097672766,
"learning_rate": 6.602870813397129e-08,
"loss": 0.5853,
"step": 1415
},
{
"epoch": 0.4076359982775944,
"grad_norm": 2.1686626635780204,
"learning_rate": 6.586921850079745e-08,
"loss": 0.5925,
"step": 1420
},
{
"epoch": 0.40907133629969855,
"grad_norm": 2.3045919749457426,
"learning_rate": 6.570972886762361e-08,
"loss": 0.5678,
"step": 1425
},
{
"epoch": 0.41050667432180277,
"grad_norm": 2.506461569864655,
"learning_rate": 6.555023923444975e-08,
"loss": 0.5734,
"step": 1430
},
{
"epoch": 0.411942012343907,
"grad_norm": 2.216335116055202,
"learning_rate": 6.539074960127592e-08,
"loss": 0.5588,
"step": 1435
},
{
"epoch": 0.4133773503660112,
"grad_norm": 2.2460456566846614,
"learning_rate": 6.523125996810208e-08,
"loss": 0.5805,
"step": 1440
},
{
"epoch": 0.4148126883881154,
"grad_norm": 2.246523877288839,
"learning_rate": 6.507177033492822e-08,
"loss": 0.5595,
"step": 1445
},
{
"epoch": 0.41624802641021963,
"grad_norm": 2.3369923163629736,
"learning_rate": 6.491228070175438e-08,
"loss": 0.583,
"step": 1450
},
{
"epoch": 0.4176833644323238,
"grad_norm": 2.4647497141369445,
"learning_rate": 6.475279106858053e-08,
"loss": 0.5709,
"step": 1455
},
{
"epoch": 0.419118702454428,
"grad_norm": 2.169633790885161,
"learning_rate": 6.45933014354067e-08,
"loss": 0.5698,
"step": 1460
},
{
"epoch": 0.4205540404765322,
"grad_norm": 2.3384413570606712,
"learning_rate": 6.443381180223285e-08,
"loss": 0.5896,
"step": 1465
},
{
"epoch": 0.42198937849863644,
"grad_norm": 2.265985280435675,
"learning_rate": 6.4274322169059e-08,
"loss": 0.5824,
"step": 1470
},
{
"epoch": 0.42342471652074065,
"grad_norm": 2.3198946197532524,
"learning_rate": 6.411483253588516e-08,
"loss": 0.5846,
"step": 1475
},
{
"epoch": 0.42486005454284487,
"grad_norm": 2.1622460803134267,
"learning_rate": 6.395534290271132e-08,
"loss": 0.5901,
"step": 1480
},
{
"epoch": 0.426295392564949,
"grad_norm": 2.1003168882776335,
"learning_rate": 6.379585326953748e-08,
"loss": 0.5648,
"step": 1485
},
{
"epoch": 0.42773073058705324,
"grad_norm": 2.1502708485133595,
"learning_rate": 6.363636363636363e-08,
"loss": 0.5789,
"step": 1490
},
{
"epoch": 0.42916606860915746,
"grad_norm": 2.2104881202400306,
"learning_rate": 6.347687400318978e-08,
"loss": 0.5653,
"step": 1495
},
{
"epoch": 0.4306014066312617,
"grad_norm": 2.3011371767087176,
"learning_rate": 6.331738437001594e-08,
"loss": 0.5766,
"step": 1500
},
{
"epoch": 0.4320367446533659,
"grad_norm": 3.260166860791866,
"learning_rate": 6.31578947368421e-08,
"loss": 0.5731,
"step": 1505
},
{
"epoch": 0.43347208267547005,
"grad_norm": 2.394536391337356,
"learning_rate": 6.299840510366826e-08,
"loss": 0.5752,
"step": 1510
},
{
"epoch": 0.43490742069757427,
"grad_norm": 2.283308722940367,
"learning_rate": 6.283891547049441e-08,
"loss": 0.5743,
"step": 1515
},
{
"epoch": 0.4363427587196785,
"grad_norm": 2.3203679079324635,
"learning_rate": 6.267942583732057e-08,
"loss": 0.5753,
"step": 1520
},
{
"epoch": 0.4377780967417827,
"grad_norm": 2.3222606808309667,
"learning_rate": 6.251993620414673e-08,
"loss": 0.5926,
"step": 1525
},
{
"epoch": 0.4392134347638869,
"grad_norm": 2.207029727095187,
"learning_rate": 6.236044657097288e-08,
"loss": 0.5636,
"step": 1530
},
{
"epoch": 0.4406487727859911,
"grad_norm": 2.1568606265391055,
"learning_rate": 6.220095693779904e-08,
"loss": 0.569,
"step": 1535
},
{
"epoch": 0.4420841108080953,
"grad_norm": 2.317372995242904,
"learning_rate": 6.204146730462519e-08,
"loss": 0.5783,
"step": 1540
},
{
"epoch": 0.4435194488301995,
"grad_norm": 2.162686944734311,
"learning_rate": 6.188197767145136e-08,
"loss": 0.5799,
"step": 1545
},
{
"epoch": 0.4449547868523037,
"grad_norm": 2.4388032101358528,
"learning_rate": 6.172248803827751e-08,
"loss": 0.5839,
"step": 1550
},
{
"epoch": 0.44639012487440793,
"grad_norm": 2.2305867419581293,
"learning_rate": 6.156299840510366e-08,
"loss": 0.5735,
"step": 1555
},
{
"epoch": 0.44782546289651215,
"grad_norm": 2.134988514368576,
"learning_rate": 6.140350877192982e-08,
"loss": 0.5632,
"step": 1560
},
{
"epoch": 0.4492608009186163,
"grad_norm": 2.3800787456946115,
"learning_rate": 6.124401913875598e-08,
"loss": 0.5796,
"step": 1565
},
{
"epoch": 0.4506961389407205,
"grad_norm": 2.1810860707910735,
"learning_rate": 6.108452950558214e-08,
"loss": 0.5788,
"step": 1570
},
{
"epoch": 0.45213147696282474,
"grad_norm": 2.308640019656118,
"learning_rate": 6.092503987240829e-08,
"loss": 0.5839,
"step": 1575
},
{
"epoch": 0.45356681498492896,
"grad_norm": 2.297594960230257,
"learning_rate": 6.076555023923444e-08,
"loss": 0.605,
"step": 1580
},
{
"epoch": 0.45500215300703317,
"grad_norm": 2.1364449207680396,
"learning_rate": 6.060606060606061e-08,
"loss": 0.5867,
"step": 1585
},
{
"epoch": 0.4564374910291374,
"grad_norm": 2.1756934857023666,
"learning_rate": 6.044657097288676e-08,
"loss": 0.5722,
"step": 1590
},
{
"epoch": 0.45787282905124155,
"grad_norm": 2.18990044643342,
"learning_rate": 6.028708133971292e-08,
"loss": 0.5754,
"step": 1595
},
{
"epoch": 0.45930816707334576,
"grad_norm": 2.4364139727048415,
"learning_rate": 6.012759170653907e-08,
"loss": 0.5856,
"step": 1600
},
{
"epoch": 0.46074350509545,
"grad_norm": 2.2360356658373446,
"learning_rate": 5.996810207336523e-08,
"loss": 0.5612,
"step": 1605
},
{
"epoch": 0.4621788431175542,
"grad_norm": 2.190809489755964,
"learning_rate": 5.980861244019139e-08,
"loss": 0.5717,
"step": 1610
},
{
"epoch": 0.4636141811396584,
"grad_norm": 2.359827734351754,
"learning_rate": 5.964912280701754e-08,
"loss": 0.5733,
"step": 1615
},
{
"epoch": 0.46504951916176257,
"grad_norm": 2.21035489055241,
"learning_rate": 5.94896331738437e-08,
"loss": 0.5758,
"step": 1620
},
{
"epoch": 0.4664848571838668,
"grad_norm": 2.4239500111786776,
"learning_rate": 5.933014354066985e-08,
"loss": 0.5709,
"step": 1625
},
{
"epoch": 0.467920195205971,
"grad_norm": 2.3531687327749444,
"learning_rate": 5.917065390749602e-08,
"loss": 0.5769,
"step": 1630
},
{
"epoch": 0.4693555332280752,
"grad_norm": 2.2487850185411267,
"learning_rate": 5.9011164274322166e-08,
"loss": 0.5682,
"step": 1635
},
{
"epoch": 0.47079087125017943,
"grad_norm": 2.309323517203607,
"learning_rate": 5.885167464114832e-08,
"loss": 0.5848,
"step": 1640
},
{
"epoch": 0.47222620927228365,
"grad_norm": 2.214237754922774,
"learning_rate": 5.869218500797448e-08,
"loss": 0.572,
"step": 1645
},
{
"epoch": 0.4736615472943878,
"grad_norm": 2.3622619155735305,
"learning_rate": 5.8532695374800635e-08,
"loss": 0.5569,
"step": 1650
},
{
"epoch": 0.475096885316492,
"grad_norm": 2.5867769507291904,
"learning_rate": 5.8373205741626796e-08,
"loss": 0.5837,
"step": 1655
},
{
"epoch": 0.47653222333859624,
"grad_norm": 2.529755318765141,
"learning_rate": 5.821371610845295e-08,
"loss": 0.5811,
"step": 1660
},
{
"epoch": 0.47796756136070045,
"grad_norm": 2.266910537139947,
"learning_rate": 5.80542264752791e-08,
"loss": 0.578,
"step": 1665
},
{
"epoch": 0.47940289938280467,
"grad_norm": 2.279806038320283,
"learning_rate": 5.7894736842105265e-08,
"loss": 0.5526,
"step": 1670
},
{
"epoch": 0.48083823740490883,
"grad_norm": 2.12918604571782,
"learning_rate": 5.773524720893141e-08,
"loss": 0.5794,
"step": 1675
},
{
"epoch": 0.48227357542701305,
"grad_norm": 2.2284026717270806,
"learning_rate": 5.757575757575758e-08,
"loss": 0.5704,
"step": 1680
},
{
"epoch": 0.48370891344911726,
"grad_norm": 2.4032593629075145,
"learning_rate": 5.741626794258373e-08,
"loss": 0.5459,
"step": 1685
},
{
"epoch": 0.4851442514712215,
"grad_norm": 2.203189127996456,
"learning_rate": 5.725677830940988e-08,
"loss": 0.5997,
"step": 1690
},
{
"epoch": 0.4865795894933257,
"grad_norm": 2.4266862396460627,
"learning_rate": 5.7097288676236043e-08,
"loss": 0.5728,
"step": 1695
},
{
"epoch": 0.4880149275154299,
"grad_norm": 2.1211185101213785,
"learning_rate": 5.69377990430622e-08,
"loss": 0.5727,
"step": 1700
},
{
"epoch": 0.48945026553753407,
"grad_norm": 2.3364430793877244,
"learning_rate": 5.677830940988836e-08,
"loss": 0.58,
"step": 1705
},
{
"epoch": 0.4908856035596383,
"grad_norm": 2.206551955612983,
"learning_rate": 5.661881977671451e-08,
"loss": 0.5696,
"step": 1710
},
{
"epoch": 0.4923209415817425,
"grad_norm": 2.2652638647430687,
"learning_rate": 5.645933014354066e-08,
"loss": 0.5679,
"step": 1715
},
{
"epoch": 0.4937562796038467,
"grad_norm": 2.596656624165877,
"learning_rate": 5.629984051036683e-08,
"loss": 0.5767,
"step": 1720
},
{
"epoch": 0.49519161762595093,
"grad_norm": 2.3164896628154286,
"learning_rate": 5.6140350877192976e-08,
"loss": 0.5628,
"step": 1725
},
{
"epoch": 0.4966269556480551,
"grad_norm": 2.602375169914542,
"learning_rate": 5.5980861244019137e-08,
"loss": 0.5769,
"step": 1730
},
{
"epoch": 0.4980622936701593,
"grad_norm": 2.5046523107168928,
"learning_rate": 5.582137161084529e-08,
"loss": 0.5654,
"step": 1735
},
{
"epoch": 0.4994976316922635,
"grad_norm": 2.2378268554432013,
"learning_rate": 5.566188197767145e-08,
"loss": 0.5619,
"step": 1740
},
{
"epoch": 0.5009329697143677,
"grad_norm": 2.3540069510141444,
"learning_rate": 5.5502392344497606e-08,
"loss": 0.565,
"step": 1745
},
{
"epoch": 0.5009329697143677,
"eval_loss": 0.6235304474830627,
"eval_runtime": 276.9897,
"eval_samples_per_second": 138.002,
"eval_steps_per_second": 2.159,
"step": 1745
},
{
"epoch": 0.5023683077364719,
"grad_norm": 2.322877723800299,
"learning_rate": 5.534290271132376e-08,
"loss": 0.57,
"step": 1750
},
{
"epoch": 0.5038036457585762,
"grad_norm": 2.2404868803436684,
"learning_rate": 5.518341307814992e-08,
"loss": 0.5619,
"step": 1755
},
{
"epoch": 0.5052389837806803,
"grad_norm": 2.427309485938781,
"learning_rate": 5.5023923444976075e-08,
"loss": 0.5568,
"step": 1760
},
{
"epoch": 0.5066743218027846,
"grad_norm": 2.3119777544390323,
"learning_rate": 5.4864433811802236e-08,
"loss": 0.5769,
"step": 1765
},
{
"epoch": 0.5081096598248888,
"grad_norm": 2.876236781539242,
"learning_rate": 5.4704944178628384e-08,
"loss": 0.5479,
"step": 1770
},
{
"epoch": 0.5095449978469929,
"grad_norm": 2.2849069424986106,
"learning_rate": 5.454545454545454e-08,
"loss": 0.5648,
"step": 1775
},
{
"epoch": 0.5109803358690972,
"grad_norm": 2.477418575095315,
"learning_rate": 5.43859649122807e-08,
"loss": 0.573,
"step": 1780
},
{
"epoch": 0.5124156738912014,
"grad_norm": 2.293676037844509,
"learning_rate": 5.4226475279106853e-08,
"loss": 0.5717,
"step": 1785
},
{
"epoch": 0.5138510119133056,
"grad_norm": 2.1498159417130145,
"learning_rate": 5.4066985645933014e-08,
"loss": 0.5574,
"step": 1790
},
{
"epoch": 0.5152863499354098,
"grad_norm": 2.3707187570502937,
"learning_rate": 5.390749601275917e-08,
"loss": 0.5793,
"step": 1795
},
{
"epoch": 0.5167216879575139,
"grad_norm": 2.4610592838296506,
"learning_rate": 5.374800637958532e-08,
"loss": 0.5603,
"step": 1800
},
{
"epoch": 0.5181570259796182,
"grad_norm": 2.235095726599105,
"learning_rate": 5.3588516746411484e-08,
"loss": 0.5732,
"step": 1805
},
{
"epoch": 0.5195923640017224,
"grad_norm": 2.2403487945921796,
"learning_rate": 5.342902711323764e-08,
"loss": 0.5726,
"step": 1810
},
{
"epoch": 0.5210277020238266,
"grad_norm": 2.46672680845345,
"learning_rate": 5.32695374800638e-08,
"loss": 0.5784,
"step": 1815
},
{
"epoch": 0.5224630400459308,
"grad_norm": 2.1910024552090284,
"learning_rate": 5.3110047846889946e-08,
"loss": 0.5778,
"step": 1820
},
{
"epoch": 0.5238983780680351,
"grad_norm": 2.2204944277498426,
"learning_rate": 5.29505582137161e-08,
"loss": 0.572,
"step": 1825
},
{
"epoch": 0.5253337160901392,
"grad_norm": 2.211764134448438,
"learning_rate": 5.279106858054226e-08,
"loss": 0.5563,
"step": 1830
},
{
"epoch": 0.5267690541122434,
"grad_norm": 2.160392217124782,
"learning_rate": 5.2631578947368416e-08,
"loss": 0.5715,
"step": 1835
},
{
"epoch": 0.5282043921343477,
"grad_norm": 2.3173974716804837,
"learning_rate": 5.247208931419458e-08,
"loss": 0.5799,
"step": 1840
},
{
"epoch": 0.5296397301564518,
"grad_norm": 2.4119603113614625,
"learning_rate": 5.231259968102073e-08,
"loss": 0.5715,
"step": 1845
},
{
"epoch": 0.5310750681785561,
"grad_norm": 2.3942922290744852,
"learning_rate": 5.2153110047846885e-08,
"loss": 0.5798,
"step": 1850
},
{
"epoch": 0.5325104062006603,
"grad_norm": 2.4827513279602487,
"learning_rate": 5.1993620414673046e-08,
"loss": 0.5759,
"step": 1855
},
{
"epoch": 0.5339457442227644,
"grad_norm": 2.303837079874495,
"learning_rate": 5.1834130781499194e-08,
"loss": 0.5793,
"step": 1860
},
{
"epoch": 0.5353810822448687,
"grad_norm": 2.3729441241328324,
"learning_rate": 5.167464114832536e-08,
"loss": 0.5647,
"step": 1865
},
{
"epoch": 0.5368164202669728,
"grad_norm": 2.584955282461203,
"learning_rate": 5.151515151515151e-08,
"loss": 0.5635,
"step": 1870
},
{
"epoch": 0.5382517582890771,
"grad_norm": 2.3732511749039014,
"learning_rate": 5.1355661881977677e-08,
"loss": 0.5677,
"step": 1875
},
{
"epoch": 0.5396870963111813,
"grad_norm": 2.3031000575513967,
"learning_rate": 5.1196172248803824e-08,
"loss": 0.5765,
"step": 1880
},
{
"epoch": 0.5411224343332854,
"grad_norm": 2.13233132113133,
"learning_rate": 5.103668261562998e-08,
"loss": 0.5713,
"step": 1885
},
{
"epoch": 0.5425577723553897,
"grad_norm": 2.3633442992361844,
"learning_rate": 5.087719298245614e-08,
"loss": 0.5548,
"step": 1890
},
{
"epoch": 0.5439931103774939,
"grad_norm": 2.369702190598172,
"learning_rate": 5.0717703349282294e-08,
"loss": 0.5665,
"step": 1895
},
{
"epoch": 0.5454284483995981,
"grad_norm": 2.3321818602794377,
"learning_rate": 5.0558213716108454e-08,
"loss": 0.5704,
"step": 1900
},
{
"epoch": 0.5468637864217023,
"grad_norm": 2.380628420225508,
"learning_rate": 5.039872408293461e-08,
"loss": 0.5634,
"step": 1905
},
{
"epoch": 0.5482991244438065,
"grad_norm": 2.416952583768008,
"learning_rate": 5.0239234449760756e-08,
"loss": 0.5816,
"step": 1910
},
{
"epoch": 0.5497344624659107,
"grad_norm": 2.143565389342716,
"learning_rate": 5.0079744816586924e-08,
"loss": 0.5685,
"step": 1915
},
{
"epoch": 0.5511698004880149,
"grad_norm": 2.318881848504763,
"learning_rate": 4.992025518341307e-08,
"loss": 0.5538,
"step": 1920
},
{
"epoch": 0.5526051385101192,
"grad_norm": 2.4470463053460043,
"learning_rate": 4.976076555023923e-08,
"loss": 0.5726,
"step": 1925
},
{
"epoch": 0.5540404765322233,
"grad_norm": 2.3102885996703946,
"learning_rate": 4.960127591706539e-08,
"loss": 0.553,
"step": 1930
},
{
"epoch": 0.5554758145543276,
"grad_norm": 2.485118825226686,
"learning_rate": 4.944178628389155e-08,
"loss": 0.5614,
"step": 1935
},
{
"epoch": 0.5569111525764318,
"grad_norm": 2.4666660309950554,
"learning_rate": 4.92822966507177e-08,
"loss": 0.5767,
"step": 1940
},
{
"epoch": 0.5583464905985359,
"grad_norm": 2.6440619922621194,
"learning_rate": 4.9122807017543856e-08,
"loss": 0.577,
"step": 1945
},
{
"epoch": 0.5597818286206402,
"grad_norm": 2.234037650395611,
"learning_rate": 4.896331738437001e-08,
"loss": 0.5603,
"step": 1950
},
{
"epoch": 0.5612171666427443,
"grad_norm": 2.2239222727962518,
"learning_rate": 4.880382775119617e-08,
"loss": 0.5684,
"step": 1955
},
{
"epoch": 0.5626525046648486,
"grad_norm": 2.2149189102032643,
"learning_rate": 4.8644338118022326e-08,
"loss": 0.564,
"step": 1960
},
{
"epoch": 0.5640878426869528,
"grad_norm": 2.302377689722843,
"learning_rate": 4.8484848484848486e-08,
"loss": 0.5723,
"step": 1965
},
{
"epoch": 0.5655231807090569,
"grad_norm": 2.3609046338014767,
"learning_rate": 4.832535885167464e-08,
"loss": 0.5576,
"step": 1970
},
{
"epoch": 0.5669585187311612,
"grad_norm": 2.3940149734210183,
"learning_rate": 4.8165869218500795e-08,
"loss": 0.5701,
"step": 1975
},
{
"epoch": 0.5683938567532654,
"grad_norm": 2.499184750287661,
"learning_rate": 4.800637958532695e-08,
"loss": 0.5917,
"step": 1980
},
{
"epoch": 0.5698291947753696,
"grad_norm": 2.3892770083720953,
"learning_rate": 4.784688995215311e-08,
"loss": 0.5739,
"step": 1985
},
{
"epoch": 0.5712645327974738,
"grad_norm": 2.342709389162696,
"learning_rate": 4.7687400318979264e-08,
"loss": 0.5687,
"step": 1990
},
{
"epoch": 0.572699870819578,
"grad_norm": 2.3950007221533762,
"learning_rate": 4.7527910685805425e-08,
"loss": 0.5823,
"step": 1995
},
{
"epoch": 0.5741352088416822,
"grad_norm": 2.456899330442452,
"learning_rate": 4.736842105263157e-08,
"loss": 0.5554,
"step": 2000
},
{
"epoch": 0.5755705468637864,
"grad_norm": 2.0195213406503503,
"learning_rate": 4.7208931419457734e-08,
"loss": 0.5498,
"step": 2005
},
{
"epoch": 0.5770058848858907,
"grad_norm": 2.4854469417568144,
"learning_rate": 4.704944178628389e-08,
"loss": 0.5797,
"step": 2010
},
{
"epoch": 0.5784412229079948,
"grad_norm": 2.397518747088301,
"learning_rate": 4.688995215311005e-08,
"loss": 0.5799,
"step": 2015
},
{
"epoch": 0.579876560930099,
"grad_norm": 2.30185261859014,
"learning_rate": 4.67304625199362e-08,
"loss": 0.5716,
"step": 2020
},
{
"epoch": 0.5813118989522033,
"grad_norm": 2.454194414740183,
"learning_rate": 4.657097288676236e-08,
"loss": 0.5714,
"step": 2025
},
{
"epoch": 0.5827472369743074,
"grad_norm": 2.2295797322099564,
"learning_rate": 4.641148325358851e-08,
"loss": 0.56,
"step": 2030
},
{
"epoch": 0.5841825749964117,
"grad_norm": 2.2435929203230836,
"learning_rate": 4.625199362041467e-08,
"loss": 0.5612,
"step": 2035
},
{
"epoch": 0.5856179130185158,
"grad_norm": 2.354108340281971,
"learning_rate": 4.609250398724083e-08,
"loss": 0.5559,
"step": 2040
},
{
"epoch": 0.5870532510406201,
"grad_norm": 2.383408186012462,
"learning_rate": 4.593301435406698e-08,
"loss": 0.5552,
"step": 2045
},
{
"epoch": 0.5884885890627243,
"grad_norm": 2.274522558925121,
"learning_rate": 4.577352472089314e-08,
"loss": 0.5712,
"step": 2050
},
{
"epoch": 0.5899239270848284,
"grad_norm": 2.193942049887206,
"learning_rate": 4.5614035087719296e-08,
"loss": 0.5591,
"step": 2055
},
{
"epoch": 0.5913592651069327,
"grad_norm": 2.5336345097087176,
"learning_rate": 4.545454545454545e-08,
"loss": 0.5589,
"step": 2060
},
{
"epoch": 0.5927946031290369,
"grad_norm": 2.2998619526688633,
"learning_rate": 4.529505582137161e-08,
"loss": 0.5672,
"step": 2065
},
{
"epoch": 0.5942299411511411,
"grad_norm": 2.2622163134776603,
"learning_rate": 4.5135566188197766e-08,
"loss": 0.563,
"step": 2070
},
{
"epoch": 0.5956652791732453,
"grad_norm": 2.3239949997017204,
"learning_rate": 4.497607655502392e-08,
"loss": 0.5764,
"step": 2075
},
{
"epoch": 0.5971006171953495,
"grad_norm": 2.2025498742254337,
"learning_rate": 4.481658692185008e-08,
"loss": 0.5676,
"step": 2080
},
{
"epoch": 0.5985359552174537,
"grad_norm": 2.5241546548407943,
"learning_rate": 4.4657097288676235e-08,
"loss": 0.5602,
"step": 2085
},
{
"epoch": 0.5999712932395579,
"grad_norm": 2.3507845605252493,
"learning_rate": 4.449760765550239e-08,
"loss": 0.565,
"step": 2090
},
{
"epoch": 0.6011195636572413,
"eval_loss": 0.624911904335022,
"eval_runtime": 276.9879,
"eval_samples_per_second": 138.002,
"eval_steps_per_second": 2.159,
"step": 2094
},
{
"epoch": 0.6014066312616622,
"grad_norm": 2.249365554658679,
"learning_rate": 4.4338118022328544e-08,
"loss": 0.5816,
"step": 2095
},
{
"epoch": 0.6028419692837663,
"grad_norm": 2.679508996031371,
"learning_rate": 4.4178628389154705e-08,
"loss": 0.584,
"step": 2100
},
{
"epoch": 0.6042773073058705,
"grad_norm": 2.2991072095507423,
"learning_rate": 4.401913875598086e-08,
"loss": 0.5634,
"step": 2105
},
{
"epoch": 0.6057126453279748,
"grad_norm": 2.8967113990790376,
"learning_rate": 4.385964912280701e-08,
"loss": 0.5705,
"step": 2110
},
{
"epoch": 0.6071479833500789,
"grad_norm": 2.418044587004619,
"learning_rate": 4.370015948963317e-08,
"loss": 0.5786,
"step": 2115
},
{
"epoch": 0.6085833213721832,
"grad_norm": 2.220282737585246,
"learning_rate": 4.354066985645933e-08,
"loss": 0.5708,
"step": 2120
},
{
"epoch": 0.6100186593942873,
"grad_norm": 2.242153390322156,
"learning_rate": 4.338118022328548e-08,
"loss": 0.5587,
"step": 2125
},
{
"epoch": 0.6114539974163916,
"grad_norm": 2.224785684141244,
"learning_rate": 4.3221690590111644e-08,
"loss": 0.5734,
"step": 2130
},
{
"epoch": 0.6128893354384958,
"grad_norm": 2.3268091122198484,
"learning_rate": 4.30622009569378e-08,
"loss": 0.5529,
"step": 2135
},
{
"epoch": 0.6143246734605999,
"grad_norm": 2.363021850053623,
"learning_rate": 4.290271132376395e-08,
"loss": 0.569,
"step": 2140
},
{
"epoch": 0.6157600114827042,
"grad_norm": 2.3431314056786574,
"learning_rate": 4.2743221690590106e-08,
"loss": 0.5602,
"step": 2145
},
{
"epoch": 0.6171953495048084,
"grad_norm": 2.4159747940780245,
"learning_rate": 4.258373205741627e-08,
"loss": 0.5618,
"step": 2150
},
{
"epoch": 0.6186306875269126,
"grad_norm": 2.4207306331146694,
"learning_rate": 4.242424242424242e-08,
"loss": 0.5543,
"step": 2155
},
{
"epoch": 0.6200660255490168,
"grad_norm": 2.1923873922362054,
"learning_rate": 4.226475279106858e-08,
"loss": 0.5555,
"step": 2160
},
{
"epoch": 0.621501363571121,
"grad_norm": 2.338285581454532,
"learning_rate": 4.210526315789473e-08,
"loss": 0.583,
"step": 2165
},
{
"epoch": 0.6229367015932252,
"grad_norm": 2.3753940324053464,
"learning_rate": 4.194577352472089e-08,
"loss": 0.5838,
"step": 2170
},
{
"epoch": 0.6243720396153294,
"grad_norm": 2.2591075922980197,
"learning_rate": 4.1786283891547045e-08,
"loss": 0.5584,
"step": 2175
},
{
"epoch": 0.6258073776374337,
"grad_norm": 2.3674897727831556,
"learning_rate": 4.1626794258373206e-08,
"loss": 0.5726,
"step": 2180
},
{
"epoch": 0.6272427156595378,
"grad_norm": 2.440181667868599,
"learning_rate": 4.146730462519936e-08,
"loss": 0.5575,
"step": 2185
},
{
"epoch": 0.628678053681642,
"grad_norm": 2.3559449056154227,
"learning_rate": 4.130781499202552e-08,
"loss": 0.5791,
"step": 2190
},
{
"epoch": 0.6301133917037463,
"grad_norm": 2.731472032876172,
"learning_rate": 4.114832535885167e-08,
"loss": 0.5657,
"step": 2195
},
{
"epoch": 0.6315487297258504,
"grad_norm": 2.2497218848315987,
"learning_rate": 4.098883572567783e-08,
"loss": 0.5629,
"step": 2200
},
{
"epoch": 0.6329840677479547,
"grad_norm": 2.443671279805305,
"learning_rate": 4.0829346092503984e-08,
"loss": 0.5529,
"step": 2205
},
{
"epoch": 0.6344194057700588,
"grad_norm": 2.7423599812106514,
"learning_rate": 4.0669856459330145e-08,
"loss": 0.574,
"step": 2210
},
{
"epoch": 0.635854743792163,
"grad_norm": 2.4366565097143327,
"learning_rate": 4.05103668261563e-08,
"loss": 0.5677,
"step": 2215
},
{
"epoch": 0.6372900818142673,
"grad_norm": 2.4786368215629597,
"learning_rate": 4.0350877192982454e-08,
"loss": 0.5666,
"step": 2220
},
{
"epoch": 0.6387254198363714,
"grad_norm": 2.1264349737147192,
"learning_rate": 4.019138755980861e-08,
"loss": 0.5657,
"step": 2225
},
{
"epoch": 0.6401607578584757,
"grad_norm": 2.4930450914127635,
"learning_rate": 4.003189792663477e-08,
"loss": 0.5711,
"step": 2230
},
{
"epoch": 0.6415960958805799,
"grad_norm": 2.5188486429575825,
"learning_rate": 3.987240829346092e-08,
"loss": 0.5606,
"step": 2235
},
{
"epoch": 0.6430314339026841,
"grad_norm": 2.4615749950506607,
"learning_rate": 3.9712918660287084e-08,
"loss": 0.5608,
"step": 2240
},
{
"epoch": 0.6444667719247883,
"grad_norm": 2.3168585600756577,
"learning_rate": 3.955342902711323e-08,
"loss": 0.5779,
"step": 2245
},
{
"epoch": 0.6459021099468925,
"grad_norm": 2.431077288205627,
"learning_rate": 3.939393939393939e-08,
"loss": 0.5637,
"step": 2250
},
{
"epoch": 0.6473374479689967,
"grad_norm": 2.4743477577848716,
"learning_rate": 3.9234449760765547e-08,
"loss": 0.5802,
"step": 2255
},
{
"epoch": 0.6487727859911009,
"grad_norm": 2.3956871074752732,
"learning_rate": 3.907496012759171e-08,
"loss": 0.5462,
"step": 2260
},
{
"epoch": 0.6502081240132052,
"grad_norm": 2.232965267462123,
"learning_rate": 3.891547049441786e-08,
"loss": 0.5633,
"step": 2265
},
{
"epoch": 0.6516434620353093,
"grad_norm": 2.313054571588013,
"learning_rate": 3.875598086124402e-08,
"loss": 0.5703,
"step": 2270
},
{
"epoch": 0.6530788000574135,
"grad_norm": 2.5589108956349134,
"learning_rate": 3.859649122807017e-08,
"loss": 0.5655,
"step": 2275
},
{
"epoch": 0.6545141380795177,
"grad_norm": 2.4031051489391952,
"learning_rate": 3.843700159489633e-08,
"loss": 0.5754,
"step": 2280
},
{
"epoch": 0.6559494761016219,
"grad_norm": 2.2922272855048393,
"learning_rate": 3.8277511961722485e-08,
"loss": 0.5676,
"step": 2285
},
{
"epoch": 0.6573848141237262,
"grad_norm": 2.3624459351716216,
"learning_rate": 3.8118022328548646e-08,
"loss": 0.5524,
"step": 2290
},
{
"epoch": 0.6588201521458303,
"grad_norm": 2.370212518166197,
"learning_rate": 3.79585326953748e-08,
"loss": 0.5564,
"step": 2295
},
{
"epoch": 0.6602554901679345,
"grad_norm": 2.2505462367021125,
"learning_rate": 3.7799043062200955e-08,
"loss": 0.5743,
"step": 2300
},
{
"epoch": 0.6616908281900388,
"grad_norm": 2.3819573849600926,
"learning_rate": 3.763955342902711e-08,
"loss": 0.5641,
"step": 2305
},
{
"epoch": 0.6631261662121429,
"grad_norm": 2.262014053722333,
"learning_rate": 3.748006379585327e-08,
"loss": 0.5618,
"step": 2310
},
{
"epoch": 0.6645615042342472,
"grad_norm": 2.3932483107950198,
"learning_rate": 3.7320574162679424e-08,
"loss": 0.5753,
"step": 2315
},
{
"epoch": 0.6659968422563514,
"grad_norm": 2.551394665480823,
"learning_rate": 3.7161084529505585e-08,
"loss": 0.564,
"step": 2320
},
{
"epoch": 0.6674321802784555,
"grad_norm": 2.601979736097557,
"learning_rate": 3.700159489633174e-08,
"loss": 0.5683,
"step": 2325
},
{
"epoch": 0.6688675183005598,
"grad_norm": 2.256444776064737,
"learning_rate": 3.6842105263157894e-08,
"loss": 0.5458,
"step": 2330
},
{
"epoch": 0.670302856322664,
"grad_norm": 2.674757703170623,
"learning_rate": 3.668261562998405e-08,
"loss": 0.5594,
"step": 2335
},
{
"epoch": 0.6717381943447682,
"grad_norm": 2.2518812694654686,
"learning_rate": 3.652312599681021e-08,
"loss": 0.5588,
"step": 2340
},
{
"epoch": 0.6731735323668724,
"grad_norm": 2.4794287433388416,
"learning_rate": 3.636363636363636e-08,
"loss": 0.5646,
"step": 2345
},
{
"epoch": 0.6746088703889767,
"grad_norm": 2.4696337536144326,
"learning_rate": 3.620414673046252e-08,
"loss": 0.5735,
"step": 2350
},
{
"epoch": 0.6760442084110808,
"grad_norm": 2.4381605737624557,
"learning_rate": 3.604465709728867e-08,
"loss": 0.5498,
"step": 2355
},
{
"epoch": 0.677479546433185,
"grad_norm": 2.493543754896731,
"learning_rate": 3.588516746411483e-08,
"loss": 0.5641,
"step": 2360
},
{
"epoch": 0.6789148844552892,
"grad_norm": 2.4237111562016116,
"learning_rate": 3.572567783094099e-08,
"loss": 0.5636,
"step": 2365
},
{
"epoch": 0.6803502224773934,
"grad_norm": 2.5259877154306465,
"learning_rate": 3.556618819776714e-08,
"loss": 0.563,
"step": 2370
},
{
"epoch": 0.6817855604994977,
"grad_norm": 2.4361062944752625,
"learning_rate": 3.54066985645933e-08,
"loss": 0.543,
"step": 2375
},
{
"epoch": 0.6832208985216018,
"grad_norm": 2.518366498082825,
"learning_rate": 3.5247208931419456e-08,
"loss": 0.5739,
"step": 2380
},
{
"epoch": 0.684656236543706,
"grad_norm": 2.4025021753219877,
"learning_rate": 3.508771929824561e-08,
"loss": 0.5607,
"step": 2385
},
{
"epoch": 0.6860915745658103,
"grad_norm": 2.4146630449992537,
"learning_rate": 3.4928229665071765e-08,
"loss": 0.5454,
"step": 2390
},
{
"epoch": 0.6875269125879144,
"grad_norm": 2.3763826608890595,
"learning_rate": 3.4768740031897926e-08,
"loss": 0.5525,
"step": 2395
},
{
"epoch": 0.6889622506100187,
"grad_norm": 2.346702361136207,
"learning_rate": 3.460925039872408e-08,
"loss": 0.5747,
"step": 2400
},
{
"epoch": 0.6903975886321229,
"grad_norm": 2.3609513152625667,
"learning_rate": 3.444976076555024e-08,
"loss": 0.571,
"step": 2405
},
{
"epoch": 0.691832926654227,
"grad_norm": 2.5451074586099827,
"learning_rate": 3.429027113237639e-08,
"loss": 0.5591,
"step": 2410
},
{
"epoch": 0.6932682646763313,
"grad_norm": 2.4807919469343687,
"learning_rate": 3.413078149920255e-08,
"loss": 0.5635,
"step": 2415
},
{
"epoch": 0.6947036026984355,
"grad_norm": 2.3699253800445015,
"learning_rate": 3.3971291866028704e-08,
"loss": 0.5478,
"step": 2420
},
{
"epoch": 0.6961389407205397,
"grad_norm": 2.536150295890729,
"learning_rate": 3.3811802232854865e-08,
"loss": 0.5704,
"step": 2425
},
{
"epoch": 0.6975742787426439,
"grad_norm": 2.3887434323456658,
"learning_rate": 3.365231259968102e-08,
"loss": 0.5636,
"step": 2430
},
{
"epoch": 0.699009616764748,
"grad_norm": 2.42308842897933,
"learning_rate": 3.349282296650718e-08,
"loss": 0.5536,
"step": 2435
},
{
"epoch": 0.7004449547868523,
"grad_norm": 2.478461202673975,
"learning_rate": 3.333333333333333e-08,
"loss": 0.5376,
"step": 2440
},
{
"epoch": 0.7013061576001148,
"eval_loss": 0.624883770942688,
"eval_runtime": 276.8184,
"eval_samples_per_second": 138.087,
"eval_steps_per_second": 2.16,
"step": 2443
},
{
"epoch": 0.7018802928089565,
"grad_norm": 2.6213513088762364,
"learning_rate": 3.317384370015949e-08,
"loss": 0.5952,
"step": 2445
},
{
"epoch": 0.7033156308310607,
"grad_norm": 2.5659887569487565,
"learning_rate": 3.301435406698564e-08,
"loss": 0.5523,
"step": 2450
},
{
"epoch": 0.7047509688531649,
"grad_norm": 2.361371207424685,
"learning_rate": 3.2854864433811803e-08,
"loss": 0.5559,
"step": 2455
},
{
"epoch": 0.7061863068752692,
"grad_norm": 2.4798471285290904,
"learning_rate": 3.269537480063796e-08,
"loss": 0.551,
"step": 2460
},
{
"epoch": 0.7076216448973733,
"grad_norm": 2.2363305539445273,
"learning_rate": 3.253588516746411e-08,
"loss": 0.5579,
"step": 2465
},
{
"epoch": 0.7090569829194775,
"grad_norm": 2.261300566267236,
"learning_rate": 3.2376395534290266e-08,
"loss": 0.5626,
"step": 2470
},
{
"epoch": 0.7104923209415818,
"grad_norm": 2.440848756899625,
"learning_rate": 3.221690590111643e-08,
"loss": 0.5722,
"step": 2475
},
{
"epoch": 0.7119276589636859,
"grad_norm": 2.23072306956738,
"learning_rate": 3.205741626794258e-08,
"loss": 0.5613,
"step": 2480
},
{
"epoch": 0.7133629969857902,
"grad_norm": 2.5653088309028913,
"learning_rate": 3.189792663476874e-08,
"loss": 0.5678,
"step": 2485
},
{
"epoch": 0.7147983350078944,
"grad_norm": 2.222236453599309,
"learning_rate": 3.173843700159489e-08,
"loss": 0.5683,
"step": 2490
},
{
"epoch": 0.7162336730299985,
"grad_norm": 2.301868976256063,
"learning_rate": 3.157894736842105e-08,
"loss": 0.5578,
"step": 2495
},
{
"epoch": 0.7176690110521028,
"grad_norm": 2.2272526230809877,
"learning_rate": 3.1419457735247205e-08,
"loss": 0.5702,
"step": 2500
},
{
"epoch": 0.719104349074207,
"grad_norm": 2.456995665005312,
"learning_rate": 3.1259968102073366e-08,
"loss": 0.5604,
"step": 2505
},
{
"epoch": 0.7205396870963112,
"grad_norm": 2.3490860128288467,
"learning_rate": 3.110047846889952e-08,
"loss": 0.5608,
"step": 2510
},
{
"epoch": 0.7219750251184154,
"grad_norm": 2.352273017737412,
"learning_rate": 3.094098883572568e-08,
"loss": 0.5524,
"step": 2515
},
{
"epoch": 0.7234103631405195,
"grad_norm": 2.415857321587192,
"learning_rate": 3.078149920255183e-08,
"loss": 0.5665,
"step": 2520
},
{
"epoch": 0.7248457011626238,
"grad_norm": 2.446235510745109,
"learning_rate": 3.062200956937799e-08,
"loss": 0.5649,
"step": 2525
},
{
"epoch": 0.726281039184728,
"grad_norm": 2.313314202520151,
"learning_rate": 3.0462519936204144e-08,
"loss": 0.5544,
"step": 2530
},
{
"epoch": 0.7277163772068322,
"grad_norm": 2.2292005323902266,
"learning_rate": 3.0303030303030305e-08,
"loss": 0.5523,
"step": 2535
},
{
"epoch": 0.7291517152289364,
"grad_norm": 2.374986392317948,
"learning_rate": 3.014354066985646e-08,
"loss": 0.5754,
"step": 2540
},
{
"epoch": 0.7305870532510407,
"grad_norm": 2.3305738566656156,
"learning_rate": 2.9984051036682613e-08,
"loss": 0.5518,
"step": 2545
},
{
"epoch": 0.7320223912731448,
"grad_norm": 2.2387038610888617,
"learning_rate": 2.982456140350877e-08,
"loss": 0.5663,
"step": 2550
},
{
"epoch": 0.733457729295249,
"grad_norm": 2.585952380969797,
"learning_rate": 2.9665071770334925e-08,
"loss": 0.5648,
"step": 2555
},
{
"epoch": 0.7348930673173533,
"grad_norm": 2.337232248685897,
"learning_rate": 2.9505582137161083e-08,
"loss": 0.5639,
"step": 2560
},
{
"epoch": 0.7363284053394574,
"grad_norm": 2.466139299880866,
"learning_rate": 2.934609250398724e-08,
"loss": 0.5552,
"step": 2565
},
{
"epoch": 0.7377637433615617,
"grad_norm": 3.101041282908991,
"learning_rate": 2.9186602870813398e-08,
"loss": 0.5704,
"step": 2570
},
{
"epoch": 0.7391990813836659,
"grad_norm": 2.487285631269175,
"learning_rate": 2.902711323763955e-08,
"loss": 0.5477,
"step": 2575
},
{
"epoch": 0.74063441940577,
"grad_norm": 2.5281080819393127,
"learning_rate": 2.8867623604465707e-08,
"loss": 0.5635,
"step": 2580
},
{
"epoch": 0.7420697574278743,
"grad_norm": 2.3789929302011887,
"learning_rate": 2.8708133971291864e-08,
"loss": 0.5448,
"step": 2585
},
{
"epoch": 0.7435050954499784,
"grad_norm": 2.393381150003913,
"learning_rate": 2.8548644338118022e-08,
"loss": 0.5644,
"step": 2590
},
{
"epoch": 0.7449404334720827,
"grad_norm": 2.550473175661344,
"learning_rate": 2.838915470494418e-08,
"loss": 0.5605,
"step": 2595
},
{
"epoch": 0.7463757714941869,
"grad_norm": 2.5353596792783164,
"learning_rate": 2.822966507177033e-08,
"loss": 0.563,
"step": 2600
},
{
"epoch": 0.747811109516291,
"grad_norm": 2.3846607043055914,
"learning_rate": 2.8070175438596488e-08,
"loss": 0.5693,
"step": 2605
},
{
"epoch": 0.7492464475383953,
"grad_norm": 2.6849030076571294,
"learning_rate": 2.7910685805422645e-08,
"loss": 0.5638,
"step": 2610
},
{
"epoch": 0.7506817855604995,
"grad_norm": 2.4740286555102453,
"learning_rate": 2.7751196172248803e-08,
"loss": 0.5597,
"step": 2615
},
{
"epoch": 0.7521171235826037,
"grad_norm": 2.447084260292741,
"learning_rate": 2.759170653907496e-08,
"loss": 0.5578,
"step": 2620
},
{
"epoch": 0.7535524616047079,
"grad_norm": 2.5839932986940117,
"learning_rate": 2.7432216905901118e-08,
"loss": 0.5766,
"step": 2625
},
{
"epoch": 0.7549877996268121,
"grad_norm": 2.3445256188925816,
"learning_rate": 2.727272727272727e-08,
"loss": 0.5672,
"step": 2630
},
{
"epoch": 0.7564231376489163,
"grad_norm": 2.4468131835893896,
"learning_rate": 2.7113237639553427e-08,
"loss": 0.5665,
"step": 2635
},
{
"epoch": 0.7578584756710205,
"grad_norm": 2.5456487718415484,
"learning_rate": 2.6953748006379584e-08,
"loss": 0.5652,
"step": 2640
},
{
"epoch": 0.7592938136931248,
"grad_norm": 2.4023284608519186,
"learning_rate": 2.6794258373205742e-08,
"loss": 0.5596,
"step": 2645
},
{
"epoch": 0.7607291517152289,
"grad_norm": 2.5316031016203135,
"learning_rate": 2.66347687400319e-08,
"loss": 0.5603,
"step": 2650
},
{
"epoch": 0.7621644897373332,
"grad_norm": 2.433666966566567,
"learning_rate": 2.647527910685805e-08,
"loss": 0.5671,
"step": 2655
},
{
"epoch": 0.7635998277594374,
"grad_norm": 2.3587826822508755,
"learning_rate": 2.6315789473684208e-08,
"loss": 0.5504,
"step": 2660
},
{
"epoch": 0.7650351657815415,
"grad_norm": 2.398341019086129,
"learning_rate": 2.6156299840510366e-08,
"loss": 0.5641,
"step": 2665
},
{
"epoch": 0.7664705038036458,
"grad_norm": 2.2174413530817696,
"learning_rate": 2.5996810207336523e-08,
"loss": 0.5546,
"step": 2670
},
{
"epoch": 0.76790584182575,
"grad_norm": 2.293798551862801,
"learning_rate": 2.583732057416268e-08,
"loss": 0.5651,
"step": 2675
},
{
"epoch": 0.7693411798478542,
"grad_norm": 2.6205007545310766,
"learning_rate": 2.5677830940988838e-08,
"loss": 0.5511,
"step": 2680
},
{
"epoch": 0.7707765178699584,
"grad_norm": 2.5273343089178577,
"learning_rate": 2.551834130781499e-08,
"loss": 0.5646,
"step": 2685
},
{
"epoch": 0.7722118558920625,
"grad_norm": 2.4202912870329896,
"learning_rate": 2.5358851674641147e-08,
"loss": 0.5586,
"step": 2690
},
{
"epoch": 0.7736471939141668,
"grad_norm": 2.3235211285214694,
"learning_rate": 2.5199362041467304e-08,
"loss": 0.5715,
"step": 2695
},
{
"epoch": 0.775082531936271,
"grad_norm": 2.4065106135536602,
"learning_rate": 2.5039872408293462e-08,
"loss": 0.5594,
"step": 2700
},
{
"epoch": 0.7765178699583752,
"grad_norm": 2.431036411294595,
"learning_rate": 2.4880382775119616e-08,
"loss": 0.5437,
"step": 2705
},
{
"epoch": 0.7779532079804794,
"grad_norm": 2.449941200842066,
"learning_rate": 2.4720893141945774e-08,
"loss": 0.5458,
"step": 2710
},
{
"epoch": 0.7793885460025836,
"grad_norm": 2.5115374799858907,
"learning_rate": 2.4561403508771928e-08,
"loss": 0.5569,
"step": 2715
},
{
"epoch": 0.7808238840246878,
"grad_norm": 2.3359120202737023,
"learning_rate": 2.4401913875598086e-08,
"loss": 0.5623,
"step": 2720
},
{
"epoch": 0.782259222046792,
"grad_norm": 2.448687571770946,
"learning_rate": 2.4242424242424243e-08,
"loss": 0.5704,
"step": 2725
},
{
"epoch": 0.7836945600688963,
"grad_norm": 2.386097453587479,
"learning_rate": 2.4082934609250398e-08,
"loss": 0.5595,
"step": 2730
},
{
"epoch": 0.7851298980910004,
"grad_norm": 2.29157924213661,
"learning_rate": 2.3923444976076555e-08,
"loss": 0.5493,
"step": 2735
},
{
"epoch": 0.7865652361131046,
"grad_norm": 2.368351844371977,
"learning_rate": 2.3763955342902713e-08,
"loss": 0.5621,
"step": 2740
},
{
"epoch": 0.7880005741352089,
"grad_norm": 2.428279265040824,
"learning_rate": 2.3604465709728867e-08,
"loss": 0.5501,
"step": 2745
},
{
"epoch": 0.789435912157313,
"grad_norm": 2.5354885880092355,
"learning_rate": 2.3444976076555025e-08,
"loss": 0.56,
"step": 2750
},
{
"epoch": 0.7908712501794173,
"grad_norm": 2.394481001467146,
"learning_rate": 2.328548644338118e-08,
"loss": 0.553,
"step": 2755
},
{
"epoch": 0.7923065882015214,
"grad_norm": 2.733847809680854,
"learning_rate": 2.3125996810207336e-08,
"loss": 0.5522,
"step": 2760
},
{
"epoch": 0.7937419262236257,
"grad_norm": 2.390397541327289,
"learning_rate": 2.296650717703349e-08,
"loss": 0.5475,
"step": 2765
},
{
"epoch": 0.7951772642457299,
"grad_norm": 2.3229132156440624,
"learning_rate": 2.2807017543859648e-08,
"loss": 0.5471,
"step": 2770
},
{
"epoch": 0.796612602267834,
"grad_norm": 2.249806012410395,
"learning_rate": 2.2647527910685806e-08,
"loss": 0.5526,
"step": 2775
},
{
"epoch": 0.7980479402899383,
"grad_norm": 2.2913607195571513,
"learning_rate": 2.248803827751196e-08,
"loss": 0.5699,
"step": 2780
},
{
"epoch": 0.7994832783120425,
"grad_norm": 2.3155768396344465,
"learning_rate": 2.2328548644338118e-08,
"loss": 0.5605,
"step": 2785
},
{
"epoch": 0.8009186163341467,
"grad_norm": 2.2266682437603094,
"learning_rate": 2.2169059011164272e-08,
"loss": 0.5633,
"step": 2790
},
{
"epoch": 0.8014927515429884,
"eval_loss": 0.6254069209098816,
"eval_runtime": 276.7807,
"eval_samples_per_second": 138.106,
"eval_steps_per_second": 2.161,
"step": 2792
},
{
"epoch": 0.8023539543562509,
"grad_norm": 2.3313765523766996,
"learning_rate": 2.200956937799043e-08,
"loss": 0.564,
"step": 2795
},
{
"epoch": 0.8037892923783551,
"grad_norm": 2.2750443956794637,
"learning_rate": 2.1850079744816584e-08,
"loss": 0.5425,
"step": 2800
},
{
"epoch": 0.8052246304004593,
"grad_norm": 2.3759701760182956,
"learning_rate": 2.169059011164274e-08,
"loss": 0.5614,
"step": 2805
},
{
"epoch": 0.8066599684225635,
"grad_norm": 2.3995646656798226,
"learning_rate": 2.15311004784689e-08,
"loss": 0.5679,
"step": 2810
},
{
"epoch": 0.8080953064446678,
"grad_norm": 2.413178742154949,
"learning_rate": 2.1371610845295053e-08,
"loss": 0.5364,
"step": 2815
},
{
"epoch": 0.8095306444667719,
"grad_norm": 2.451085540687265,
"learning_rate": 2.121212121212121e-08,
"loss": 0.558,
"step": 2820
},
{
"epoch": 0.8109659824888761,
"grad_norm": 2.5366424198902884,
"learning_rate": 2.1052631578947365e-08,
"loss": 0.5563,
"step": 2825
},
{
"epoch": 0.8124013205109804,
"grad_norm": 2.4480623057702564,
"learning_rate": 2.0893141945773523e-08,
"loss": 0.5533,
"step": 2830
},
{
"epoch": 0.8138366585330845,
"grad_norm": 2.5085180829799225,
"learning_rate": 2.073365231259968e-08,
"loss": 0.5569,
"step": 2835
},
{
"epoch": 0.8152719965551888,
"grad_norm": 2.518306435416413,
"learning_rate": 2.0574162679425834e-08,
"loss": 0.559,
"step": 2840
},
{
"epoch": 0.8167073345772929,
"grad_norm": 2.3775863845429805,
"learning_rate": 2.0414673046251992e-08,
"loss": 0.553,
"step": 2845
},
{
"epoch": 0.8181426725993971,
"grad_norm": 2.2707446927149193,
"learning_rate": 2.025518341307815e-08,
"loss": 0.5459,
"step": 2850
},
{
"epoch": 0.8195780106215014,
"grad_norm": 2.545080295796524,
"learning_rate": 2.0095693779904304e-08,
"loss": 0.5685,
"step": 2855
},
{
"epoch": 0.8210133486436055,
"grad_norm": 2.53691680864861,
"learning_rate": 1.993620414673046e-08,
"loss": 0.555,
"step": 2860
},
{
"epoch": 0.8224486866657098,
"grad_norm": 2.500766862141125,
"learning_rate": 1.9776714513556616e-08,
"loss": 0.5485,
"step": 2865
},
{
"epoch": 0.823884024687814,
"grad_norm": 2.5800550412102474,
"learning_rate": 1.9617224880382773e-08,
"loss": 0.5539,
"step": 2870
},
{
"epoch": 0.8253193627099182,
"grad_norm": 2.459836362956779,
"learning_rate": 1.945773524720893e-08,
"loss": 0.5572,
"step": 2875
},
{
"epoch": 0.8267547007320224,
"grad_norm": 2.4537466248582604,
"learning_rate": 1.9298245614035085e-08,
"loss": 0.5353,
"step": 2880
},
{
"epoch": 0.8281900387541266,
"grad_norm": 2.459607137477631,
"learning_rate": 1.9138755980861243e-08,
"loss": 0.5341,
"step": 2885
},
{
"epoch": 0.8296253767762308,
"grad_norm": 2.6669636316278016,
"learning_rate": 1.89792663476874e-08,
"loss": 0.5671,
"step": 2890
},
{
"epoch": 0.831060714798335,
"grad_norm": 2.5873206815337064,
"learning_rate": 1.8819776714513555e-08,
"loss": 0.5603,
"step": 2895
},
{
"epoch": 0.8324960528204393,
"grad_norm": 2.4057810595787275,
"learning_rate": 1.8660287081339712e-08,
"loss": 0.5296,
"step": 2900
},
{
"epoch": 0.8339313908425434,
"grad_norm": 2.525542516694272,
"learning_rate": 1.850079744816587e-08,
"loss": 0.563,
"step": 2905
},
{
"epoch": 0.8353667288646476,
"grad_norm": 2.304325422653923,
"learning_rate": 1.8341307814992024e-08,
"loss": 0.5491,
"step": 2910
},
{
"epoch": 0.8368020668867518,
"grad_norm": 2.5083207145172124,
"learning_rate": 1.818181818181818e-08,
"loss": 0.5621,
"step": 2915
},
{
"epoch": 0.838237404908856,
"grad_norm": 2.362375542748003,
"learning_rate": 1.8022328548644336e-08,
"loss": 0.5552,
"step": 2920
},
{
"epoch": 0.8396727429309603,
"grad_norm": 2.5039301590611602,
"learning_rate": 1.7862838915470493e-08,
"loss": 0.5574,
"step": 2925
},
{
"epoch": 0.8411080809530644,
"grad_norm": 2.381035985901319,
"learning_rate": 1.770334928229665e-08,
"loss": 0.5576,
"step": 2930
},
{
"epoch": 0.8425434189751686,
"grad_norm": 2.4011187394557454,
"learning_rate": 1.7543859649122805e-08,
"loss": 0.5562,
"step": 2935
},
{
"epoch": 0.8439787569972729,
"grad_norm": 2.423352651833944,
"learning_rate": 1.7384370015948963e-08,
"loss": 0.5467,
"step": 2940
},
{
"epoch": 0.845414095019377,
"grad_norm": 2.377148884585651,
"learning_rate": 1.722488038277512e-08,
"loss": 0.5559,
"step": 2945
},
{
"epoch": 0.8468494330414813,
"grad_norm": 2.625302837686785,
"learning_rate": 1.7065390749601275e-08,
"loss": 0.5651,
"step": 2950
},
{
"epoch": 0.8482847710635855,
"grad_norm": 2.387352810607353,
"learning_rate": 1.6905901116427432e-08,
"loss": 0.5655,
"step": 2955
},
{
"epoch": 0.8497201090856897,
"grad_norm": 2.608468244575993,
"learning_rate": 1.674641148325359e-08,
"loss": 0.5572,
"step": 2960
},
{
"epoch": 0.8511554471077939,
"grad_norm": 2.413659561825893,
"learning_rate": 1.6586921850079744e-08,
"loss": 0.554,
"step": 2965
},
{
"epoch": 0.852590785129898,
"grad_norm": 2.3750806487783143,
"learning_rate": 1.6427432216905902e-08,
"loss": 0.5581,
"step": 2970
},
{
"epoch": 0.8540261231520023,
"grad_norm": 2.279688370144133,
"learning_rate": 1.6267942583732056e-08,
"loss": 0.5547,
"step": 2975
},
{
"epoch": 0.8554614611741065,
"grad_norm": 2.624719049851082,
"learning_rate": 1.6108452950558214e-08,
"loss": 0.5533,
"step": 2980
},
{
"epoch": 0.8568967991962108,
"grad_norm": 2.377843268250057,
"learning_rate": 1.594896331738437e-08,
"loss": 0.5438,
"step": 2985
},
{
"epoch": 0.8583321372183149,
"grad_norm": 2.2847075132530525,
"learning_rate": 1.5789473684210525e-08,
"loss": 0.5335,
"step": 2990
},
{
"epoch": 0.8597674752404191,
"grad_norm": 2.37084129738354,
"learning_rate": 1.5629984051036683e-08,
"loss": 0.5663,
"step": 2995
},
{
"epoch": 0.8612028132625233,
"grad_norm": 2.4355739071215647,
"learning_rate": 1.547049441786284e-08,
"loss": 0.5494,
"step": 3000
},
{
"epoch": 0.8626381512846275,
"grad_norm": 2.470577499040234,
"learning_rate": 1.5311004784688995e-08,
"loss": 0.5655,
"step": 3005
},
{
"epoch": 0.8640734893067318,
"grad_norm": 2.3472749158221506,
"learning_rate": 1.5151515151515152e-08,
"loss": 0.5441,
"step": 3010
},
{
"epoch": 0.8655088273288359,
"grad_norm": 2.435819416568884,
"learning_rate": 1.4992025518341307e-08,
"loss": 0.5422,
"step": 3015
},
{
"epoch": 0.8669441653509401,
"grad_norm": 2.4087778144150036,
"learning_rate": 1.4832535885167463e-08,
"loss": 0.5541,
"step": 3020
},
{
"epoch": 0.8683795033730444,
"grad_norm": 2.5159308315827085,
"learning_rate": 1.467304625199362e-08,
"loss": 0.5572,
"step": 3025
},
{
"epoch": 0.8698148413951485,
"grad_norm": 2.177063646000523,
"learning_rate": 1.4513556618819774e-08,
"loss": 0.5563,
"step": 3030
},
{
"epoch": 0.8712501794172528,
"grad_norm": 2.5037716344512058,
"learning_rate": 1.4354066985645932e-08,
"loss": 0.5628,
"step": 3035
},
{
"epoch": 0.872685517439357,
"grad_norm": 2.402176353413018,
"learning_rate": 1.419457735247209e-08,
"loss": 0.556,
"step": 3040
},
{
"epoch": 0.8741208554614611,
"grad_norm": 2.550729584909021,
"learning_rate": 1.4035087719298244e-08,
"loss": 0.5315,
"step": 3045
},
{
"epoch": 0.8755561934835654,
"grad_norm": 3.0055945543123133,
"learning_rate": 1.3875598086124401e-08,
"loss": 0.5639,
"step": 3050
},
{
"epoch": 0.8769915315056696,
"grad_norm": 2.3539320173290474,
"learning_rate": 1.3716108452950559e-08,
"loss": 0.5564,
"step": 3055
},
{
"epoch": 0.8784268695277738,
"grad_norm": 2.3311951542487046,
"learning_rate": 1.3556618819776713e-08,
"loss": 0.5606,
"step": 3060
},
{
"epoch": 0.879862207549878,
"grad_norm": 2.242631561275903,
"learning_rate": 1.3397129186602871e-08,
"loss": 0.5525,
"step": 3065
},
{
"epoch": 0.8812975455719823,
"grad_norm": 2.339532716408427,
"learning_rate": 1.3237639553429025e-08,
"loss": 0.5419,
"step": 3070
},
{
"epoch": 0.8827328835940864,
"grad_norm": 2.547870617282417,
"learning_rate": 1.3078149920255183e-08,
"loss": 0.5644,
"step": 3075
},
{
"epoch": 0.8841682216161906,
"grad_norm": 2.3303852663788907,
"learning_rate": 1.291866028708134e-08,
"loss": 0.5586,
"step": 3080
},
{
"epoch": 0.8856035596382948,
"grad_norm": 2.4204934502872115,
"learning_rate": 1.2759170653907495e-08,
"loss": 0.553,
"step": 3085
},
{
"epoch": 0.887038897660399,
"grad_norm": 2.696258891027339,
"learning_rate": 1.2599681020733652e-08,
"loss": 0.5762,
"step": 3090
},
{
"epoch": 0.8884742356825033,
"grad_norm": 2.6104754887154256,
"learning_rate": 1.2440191387559808e-08,
"loss": 0.5502,
"step": 3095
},
{
"epoch": 0.8899095737046074,
"grad_norm": 2.2629330882804197,
"learning_rate": 1.2280701754385964e-08,
"loss": 0.5545,
"step": 3100
},
{
"epoch": 0.8913449117267116,
"grad_norm": 2.3232641254149633,
"learning_rate": 1.2121212121212122e-08,
"loss": 0.532,
"step": 3105
},
{
"epoch": 0.8927802497488159,
"grad_norm": 2.342474888159451,
"learning_rate": 1.1961722488038278e-08,
"loss": 0.5704,
"step": 3110
},
{
"epoch": 0.89421558777092,
"grad_norm": 2.424809382809371,
"learning_rate": 1.1802232854864433e-08,
"loss": 0.5537,
"step": 3115
},
{
"epoch": 0.8956509257930243,
"grad_norm": 2.50275110580096,
"learning_rate": 1.164274322169059e-08,
"loss": 0.547,
"step": 3120
},
{
"epoch": 0.8970862638151285,
"grad_norm": 2.3447767193393165,
"learning_rate": 1.1483253588516745e-08,
"loss": 0.5565,
"step": 3125
},
{
"epoch": 0.8985216018372326,
"grad_norm": 2.3166718517750384,
"learning_rate": 1.1323763955342903e-08,
"loss": 0.5425,
"step": 3130
},
{
"epoch": 0.8999569398593369,
"grad_norm": 2.5132920153594536,
"learning_rate": 1.1164274322169059e-08,
"loss": 0.558,
"step": 3135
},
{
"epoch": 0.901392277881441,
"grad_norm": 2.5619104750161146,
"learning_rate": 1.1004784688995215e-08,
"loss": 0.5563,
"step": 3140
},
{
"epoch": 0.9016793454858619,
"eval_loss": 0.6268156170845032,
"eval_runtime": 276.7226,
"eval_samples_per_second": 138.135,
"eval_steps_per_second": 2.161,
"step": 3141
},
{
"epoch": 0.9016793454858619,
"step": 3141,
"total_flos": 7109747952582656.0,
"train_loss": 0.6063995264169328,
"train_runtime": 10128.64,
"train_samples_per_second": 22.011,
"train_steps_per_second": 0.344
}
],
"logging_steps": 5,
"max_steps": 3484,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 349,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 5
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7109747952582656.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}