Image-Text-to-Text
PEFT
Safetensors
conversational
Tridis_HTR_MiniCPM_ABBR / trainer_state.json
magistermilitum's picture
First load
545e8f6 verified
raw
history blame
247 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.4652504589562025,
"eval_steps": 3001,
"global_step": 14100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017484045808200017,
"grad_norm": 22.684425354003906,
"learning_rate": 3.2558139534883724e-06,
"loss": 3.8911,
"step": 10
},
{
"epoch": 0.0034968091616400035,
"grad_norm": 12.553377151489258,
"learning_rate": 7.906976744186048e-06,
"loss": 2.9776,
"step": 20
},
{
"epoch": 0.005245213742460005,
"grad_norm": 12.756866455078125,
"learning_rate": 1.2558139534883723e-05,
"loss": 2.543,
"step": 30
},
{
"epoch": 0.006993618323280007,
"grad_norm": 11.82550048828125,
"learning_rate": 1.7209302325581396e-05,
"loss": 2.1148,
"step": 40
},
{
"epoch": 0.008742022904100009,
"grad_norm": 10.381027221679688,
"learning_rate": 2.186046511627907e-05,
"loss": 1.924,
"step": 50
},
{
"epoch": 0.01049042748492001,
"grad_norm": 13.786709785461426,
"learning_rate": 2.6511627906976743e-05,
"loss": 1.7809,
"step": 60
},
{
"epoch": 0.012238832065740012,
"grad_norm": 9.527298927307129,
"learning_rate": 3.116279069767442e-05,
"loss": 1.5543,
"step": 70
},
{
"epoch": 0.013987236646560014,
"grad_norm": 9.207590103149414,
"learning_rate": 3.58139534883721e-05,
"loss": 1.4604,
"step": 80
},
{
"epoch": 0.015735641227380016,
"grad_norm": 7.972263336181641,
"learning_rate": 3.999765684494172e-05,
"loss": 1.3258,
"step": 90
},
{
"epoch": 0.017484045808200017,
"grad_norm": 6.906246662139893,
"learning_rate": 3.997422529435886e-05,
"loss": 1.3031,
"step": 100
},
{
"epoch": 0.01923245038902002,
"grad_norm": 8.441957473754883,
"learning_rate": 3.9950793743776e-05,
"loss": 1.2696,
"step": 110
},
{
"epoch": 0.02098085496984002,
"grad_norm": 8.918852806091309,
"learning_rate": 3.992736219319314e-05,
"loss": 1.2792,
"step": 120
},
{
"epoch": 0.022729259550660023,
"grad_norm": 8.715033531188965,
"learning_rate": 3.990393064261028e-05,
"loss": 1.1793,
"step": 130
},
{
"epoch": 0.024477664131480024,
"grad_norm": 9.690086364746094,
"learning_rate": 3.9880499092027415e-05,
"loss": 1.2385,
"step": 140
},
{
"epoch": 0.026226068712300026,
"grad_norm": 6.637733459472656,
"learning_rate": 3.985706754144456e-05,
"loss": 1.1732,
"step": 150
},
{
"epoch": 0.027974473293120028,
"grad_norm": 11.725113868713379,
"learning_rate": 3.9833635990861696e-05,
"loss": 1.1724,
"step": 160
},
{
"epoch": 0.02972287787394003,
"grad_norm": 10.116538047790527,
"learning_rate": 3.9810204440278834e-05,
"loss": 1.0775,
"step": 170
},
{
"epoch": 0.03147128245476003,
"grad_norm": 7.076519966125488,
"learning_rate": 3.978677288969598e-05,
"loss": 1.0104,
"step": 180
},
{
"epoch": 0.03321968703558004,
"grad_norm": 8.93696117401123,
"learning_rate": 3.9763341339113115e-05,
"loss": 1.1093,
"step": 190
},
{
"epoch": 0.034968091616400035,
"grad_norm": 7.459012985229492,
"learning_rate": 3.973990978853026e-05,
"loss": 0.9626,
"step": 200
},
{
"epoch": 0.03671649619722004,
"grad_norm": 7.383749961853027,
"learning_rate": 3.9716478237947396e-05,
"loss": 0.9835,
"step": 210
},
{
"epoch": 0.03846490077804004,
"grad_norm": 6.915167808532715,
"learning_rate": 3.969304668736454e-05,
"loss": 0.9152,
"step": 220
},
{
"epoch": 0.040213305358860044,
"grad_norm": 6.553030014038086,
"learning_rate": 3.966961513678168e-05,
"loss": 0.9545,
"step": 230
},
{
"epoch": 0.04196170993968004,
"grad_norm": 7.936176776885986,
"learning_rate": 3.964618358619882e-05,
"loss": 0.9771,
"step": 240
},
{
"epoch": 0.04371011452050005,
"grad_norm": 8.945891380310059,
"learning_rate": 3.962275203561596e-05,
"loss": 0.9664,
"step": 250
},
{
"epoch": 0.045458519101320045,
"grad_norm": 7.30666971206665,
"learning_rate": 3.95993204850331e-05,
"loss": 0.8576,
"step": 260
},
{
"epoch": 0.04720692368214005,
"grad_norm": 6.691559791564941,
"learning_rate": 3.957588893445024e-05,
"loss": 0.8771,
"step": 270
},
{
"epoch": 0.04895532826296005,
"grad_norm": 6.786176681518555,
"learning_rate": 3.9552457383867377e-05,
"loss": 0.9918,
"step": 280
},
{
"epoch": 0.050703732843780054,
"grad_norm": 8.400111198425293,
"learning_rate": 3.952902583328452e-05,
"loss": 0.9256,
"step": 290
},
{
"epoch": 0.05245213742460005,
"grad_norm": 6.029471397399902,
"learning_rate": 3.950559428270166e-05,
"loss": 0.8877,
"step": 300
},
{
"epoch": 0.05420054200542006,
"grad_norm": 5.6731276512146,
"learning_rate": 3.94821627321188e-05,
"loss": 0.7843,
"step": 310
},
{
"epoch": 0.055948946586240056,
"grad_norm": 5.914205074310303,
"learning_rate": 3.945873118153594e-05,
"loss": 0.8157,
"step": 320
},
{
"epoch": 0.05769735116706006,
"grad_norm": 5.39005708694458,
"learning_rate": 3.943529963095308e-05,
"loss": 0.8165,
"step": 330
},
{
"epoch": 0.05944575574788006,
"grad_norm": 8.598004341125488,
"learning_rate": 3.941186808037022e-05,
"loss": 0.8669,
"step": 340
},
{
"epoch": 0.061194160328700065,
"grad_norm": 6.7799763679504395,
"learning_rate": 3.9388436529787364e-05,
"loss": 0.8062,
"step": 350
},
{
"epoch": 0.06294256490952006,
"grad_norm": 6.09038782119751,
"learning_rate": 3.93650049792045e-05,
"loss": 0.7334,
"step": 360
},
{
"epoch": 0.06469096949034006,
"grad_norm": 6.056455612182617,
"learning_rate": 3.9341573428621645e-05,
"loss": 0.8423,
"step": 370
},
{
"epoch": 0.06643937407116007,
"grad_norm": 7.564798355102539,
"learning_rate": 3.931814187803878e-05,
"loss": 0.7508,
"step": 380
},
{
"epoch": 0.06818777865198007,
"grad_norm": 6.5860490798950195,
"learning_rate": 3.929471032745592e-05,
"loss": 0.7571,
"step": 390
},
{
"epoch": 0.06993618323280007,
"grad_norm": 5.904583930969238,
"learning_rate": 3.9271278776873064e-05,
"loss": 0.8628,
"step": 400
},
{
"epoch": 0.07168458781362007,
"grad_norm": 9.70653247833252,
"learning_rate": 3.92478472262902e-05,
"loss": 0.8791,
"step": 410
},
{
"epoch": 0.07343299239444008,
"grad_norm": 6.332978248596191,
"learning_rate": 3.9224415675707345e-05,
"loss": 0.7527,
"step": 420
},
{
"epoch": 0.07518139697526008,
"grad_norm": 5.800631523132324,
"learning_rate": 3.920098412512448e-05,
"loss": 0.7262,
"step": 430
},
{
"epoch": 0.07692980155608008,
"grad_norm": 6.9981489181518555,
"learning_rate": 3.9177552574541626e-05,
"loss": 0.8282,
"step": 440
},
{
"epoch": 0.07867820613690008,
"grad_norm": 7.195876121520996,
"learning_rate": 3.915412102395876e-05,
"loss": 0.8676,
"step": 450
},
{
"epoch": 0.08042661071772009,
"grad_norm": 4.58298397064209,
"learning_rate": 3.913068947337591e-05,
"loss": 0.6943,
"step": 460
},
{
"epoch": 0.08217501529854009,
"grad_norm": 7.165435314178467,
"learning_rate": 3.9107257922793044e-05,
"loss": 0.6899,
"step": 470
},
{
"epoch": 0.08392341987936008,
"grad_norm": 5.472495079040527,
"learning_rate": 3.908382637221019e-05,
"loss": 0.7777,
"step": 480
},
{
"epoch": 0.08567182446018008,
"grad_norm": 5.845532417297363,
"learning_rate": 3.9060394821627325e-05,
"loss": 0.7208,
"step": 490
},
{
"epoch": 0.0874202290410001,
"grad_norm": 6.358067989349365,
"learning_rate": 3.903696327104446e-05,
"loss": 0.7331,
"step": 500
},
{
"epoch": 0.08916863362182009,
"grad_norm": 7.398125648498535,
"learning_rate": 3.901353172046161e-05,
"loss": 0.7571,
"step": 510
},
{
"epoch": 0.09091703820264009,
"grad_norm": 8.365299224853516,
"learning_rate": 3.8990100169878744e-05,
"loss": 0.7833,
"step": 520
},
{
"epoch": 0.09266544278346009,
"grad_norm": 6.759946823120117,
"learning_rate": 3.896666861929589e-05,
"loss": 0.713,
"step": 530
},
{
"epoch": 0.0944138473642801,
"grad_norm": 5.2622270584106445,
"learning_rate": 3.8943237068713025e-05,
"loss": 0.69,
"step": 540
},
{
"epoch": 0.0961622519451001,
"grad_norm": 5.143499851226807,
"learning_rate": 3.891980551813017e-05,
"loss": 0.6665,
"step": 550
},
{
"epoch": 0.0979106565259201,
"grad_norm": 6.446269989013672,
"learning_rate": 3.8896373967547306e-05,
"loss": 0.6641,
"step": 560
},
{
"epoch": 0.0996590611067401,
"grad_norm": 5.429083347320557,
"learning_rate": 3.887294241696445e-05,
"loss": 0.6616,
"step": 570
},
{
"epoch": 0.10140746568756011,
"grad_norm": 6.426352500915527,
"learning_rate": 3.884951086638159e-05,
"loss": 0.7137,
"step": 580
},
{
"epoch": 0.1031558702683801,
"grad_norm": 6.015476226806641,
"learning_rate": 3.882607931579873e-05,
"loss": 0.7473,
"step": 590
},
{
"epoch": 0.1049042748492001,
"grad_norm": 5.670246601104736,
"learning_rate": 3.880264776521587e-05,
"loss": 0.6374,
"step": 600
},
{
"epoch": 0.1066526794300201,
"grad_norm": 6.190732479095459,
"learning_rate": 3.8779216214633006e-05,
"loss": 0.6213,
"step": 610
},
{
"epoch": 0.10840108401084012,
"grad_norm": 5.9032793045043945,
"learning_rate": 3.875578466405014e-05,
"loss": 0.6196,
"step": 620
},
{
"epoch": 0.11014948859166011,
"grad_norm": 5.00473690032959,
"learning_rate": 3.873235311346729e-05,
"loss": 0.6422,
"step": 630
},
{
"epoch": 0.11189789317248011,
"grad_norm": 4.598703384399414,
"learning_rate": 3.8708921562884424e-05,
"loss": 0.6302,
"step": 640
},
{
"epoch": 0.11364629775330011,
"grad_norm": 5.970333099365234,
"learning_rate": 3.868549001230157e-05,
"loss": 0.6547,
"step": 650
},
{
"epoch": 0.11539470233412012,
"grad_norm": 5.355820655822754,
"learning_rate": 3.8662058461718705e-05,
"loss": 0.6128,
"step": 660
},
{
"epoch": 0.11714310691494012,
"grad_norm": 5.05305290222168,
"learning_rate": 3.863862691113584e-05,
"loss": 0.6536,
"step": 670
},
{
"epoch": 0.11889151149576012,
"grad_norm": 4.703378200531006,
"learning_rate": 3.8615195360552986e-05,
"loss": 0.576,
"step": 680
},
{
"epoch": 0.12063991607658012,
"grad_norm": 5.541085720062256,
"learning_rate": 3.8591763809970124e-05,
"loss": 0.6727,
"step": 690
},
{
"epoch": 0.12238832065740013,
"grad_norm": 4.993374824523926,
"learning_rate": 3.856833225938727e-05,
"loss": 0.6802,
"step": 700
},
{
"epoch": 0.12413672523822013,
"grad_norm": 4.485571384429932,
"learning_rate": 3.8544900708804405e-05,
"loss": 0.6061,
"step": 710
},
{
"epoch": 0.12588512981904013,
"grad_norm": 6.4693756103515625,
"learning_rate": 3.852146915822155e-05,
"loss": 0.6032,
"step": 720
},
{
"epoch": 0.12763353439986014,
"grad_norm": 5.537546634674072,
"learning_rate": 3.8498037607638686e-05,
"loss": 0.6899,
"step": 730
},
{
"epoch": 0.12938193898068012,
"grad_norm": 5.683461666107178,
"learning_rate": 3.847460605705583e-05,
"loss": 0.509,
"step": 740
},
{
"epoch": 0.13113034356150013,
"grad_norm": 6.413394451141357,
"learning_rate": 3.845117450647297e-05,
"loss": 0.5851,
"step": 750
},
{
"epoch": 0.13287874814232015,
"grad_norm": 6.22868013381958,
"learning_rate": 3.842774295589011e-05,
"loss": 0.6095,
"step": 760
},
{
"epoch": 0.13462715272314013,
"grad_norm": 4.91422700881958,
"learning_rate": 3.840431140530725e-05,
"loss": 0.6649,
"step": 770
},
{
"epoch": 0.13637555730396014,
"grad_norm": 6.396027088165283,
"learning_rate": 3.8380879854724386e-05,
"loss": 0.6328,
"step": 780
},
{
"epoch": 0.13812396188478013,
"grad_norm": 5.467519283294678,
"learning_rate": 3.835744830414153e-05,
"loss": 0.5906,
"step": 790
},
{
"epoch": 0.13987236646560014,
"grad_norm": 6.788895130157471,
"learning_rate": 3.833401675355867e-05,
"loss": 0.6719,
"step": 800
},
{
"epoch": 0.14162077104642015,
"grad_norm": 8.55156135559082,
"learning_rate": 3.831058520297581e-05,
"loss": 0.6192,
"step": 810
},
{
"epoch": 0.14336917562724014,
"grad_norm": 4.527801513671875,
"learning_rate": 3.828715365239295e-05,
"loss": 0.5647,
"step": 820
},
{
"epoch": 0.14511758020806015,
"grad_norm": 7.3711042404174805,
"learning_rate": 3.826372210181009e-05,
"loss": 0.6221,
"step": 830
},
{
"epoch": 0.14686598478888016,
"grad_norm": 3.927372932434082,
"learning_rate": 3.824029055122723e-05,
"loss": 0.5524,
"step": 840
},
{
"epoch": 0.14861438936970015,
"grad_norm": 6.472529411315918,
"learning_rate": 3.821685900064437e-05,
"loss": 0.5478,
"step": 850
},
{
"epoch": 0.15036279395052016,
"grad_norm": 3.832937240600586,
"learning_rate": 3.819342745006151e-05,
"loss": 0.5765,
"step": 860
},
{
"epoch": 0.15211119853134014,
"grad_norm": 6.222163677215576,
"learning_rate": 3.8169995899478654e-05,
"loss": 0.6352,
"step": 870
},
{
"epoch": 0.15385960311216015,
"grad_norm": 4.043605804443359,
"learning_rate": 3.814656434889579e-05,
"loss": 0.5951,
"step": 880
},
{
"epoch": 0.15560800769298017,
"grad_norm": 6.224709510803223,
"learning_rate": 3.812313279831293e-05,
"loss": 0.6274,
"step": 890
},
{
"epoch": 0.15735641227380015,
"grad_norm": 4.874531269073486,
"learning_rate": 3.809970124773007e-05,
"loss": 0.5995,
"step": 900
},
{
"epoch": 0.15910481685462016,
"grad_norm": 4.65131950378418,
"learning_rate": 3.807626969714721e-05,
"loss": 0.5395,
"step": 910
},
{
"epoch": 0.16085322143544017,
"grad_norm": 6.024080753326416,
"learning_rate": 3.8052838146564354e-05,
"loss": 0.6002,
"step": 920
},
{
"epoch": 0.16260162601626016,
"grad_norm": 5.143209457397461,
"learning_rate": 3.802940659598149e-05,
"loss": 0.5729,
"step": 930
},
{
"epoch": 0.16435003059708017,
"grad_norm": 4.049091815948486,
"learning_rate": 3.8005975045398635e-05,
"loss": 0.5778,
"step": 940
},
{
"epoch": 0.16609843517790016,
"grad_norm": 7.80893611907959,
"learning_rate": 3.798254349481577e-05,
"loss": 0.6218,
"step": 950
},
{
"epoch": 0.16784683975872017,
"grad_norm": 4.759846210479736,
"learning_rate": 3.7959111944232916e-05,
"loss": 0.5375,
"step": 960
},
{
"epoch": 0.16959524433954018,
"grad_norm": 4.4919023513793945,
"learning_rate": 3.793568039365005e-05,
"loss": 0.5064,
"step": 970
},
{
"epoch": 0.17134364892036016,
"grad_norm": 6.188364028930664,
"learning_rate": 3.79122488430672e-05,
"loss": 0.5616,
"step": 980
},
{
"epoch": 0.17309205350118018,
"grad_norm": 4.2530436515808105,
"learning_rate": 3.7888817292484334e-05,
"loss": 0.5562,
"step": 990
},
{
"epoch": 0.1748404580820002,
"grad_norm": 4.9472737312316895,
"learning_rate": 3.786538574190147e-05,
"loss": 0.5224,
"step": 1000
},
{
"epoch": 0.17658886266282017,
"grad_norm": 4.524152755737305,
"learning_rate": 3.7841954191318616e-05,
"loss": 0.5238,
"step": 1010
},
{
"epoch": 0.17833726724364019,
"grad_norm": 4.708081245422363,
"learning_rate": 3.781852264073575e-05,
"loss": 0.6241,
"step": 1020
},
{
"epoch": 0.18008567182446017,
"grad_norm": 3.5008468627929688,
"learning_rate": 3.77950910901529e-05,
"loss": 0.4815,
"step": 1030
},
{
"epoch": 0.18183407640528018,
"grad_norm": 5.174265384674072,
"learning_rate": 3.7771659539570034e-05,
"loss": 0.5692,
"step": 1040
},
{
"epoch": 0.1835824809861002,
"grad_norm": 5.7122883796691895,
"learning_rate": 3.774822798898718e-05,
"loss": 0.5172,
"step": 1050
},
{
"epoch": 0.18533088556692018,
"grad_norm": 6.649969100952148,
"learning_rate": 3.7724796438404315e-05,
"loss": 0.5664,
"step": 1060
},
{
"epoch": 0.1870792901477402,
"grad_norm": 4.619507312774658,
"learning_rate": 3.770136488782146e-05,
"loss": 0.5372,
"step": 1070
},
{
"epoch": 0.1888276947285602,
"grad_norm": 5.630303382873535,
"learning_rate": 3.7677933337238596e-05,
"loss": 0.513,
"step": 1080
},
{
"epoch": 0.1905760993093802,
"grad_norm": 7.488351821899414,
"learning_rate": 3.765450178665574e-05,
"loss": 0.5913,
"step": 1090
},
{
"epoch": 0.1923245038902002,
"grad_norm": 4.957793235778809,
"learning_rate": 3.763107023607287e-05,
"loss": 0.5634,
"step": 1100
},
{
"epoch": 0.19407290847102018,
"grad_norm": 7.087141990661621,
"learning_rate": 3.7607638685490015e-05,
"loss": 0.567,
"step": 1110
},
{
"epoch": 0.1958213130518402,
"grad_norm": 3.770094871520996,
"learning_rate": 3.758420713490715e-05,
"loss": 0.5791,
"step": 1120
},
{
"epoch": 0.1975697176326602,
"grad_norm": 12.021100044250488,
"learning_rate": 3.7560775584324296e-05,
"loss": 0.5493,
"step": 1130
},
{
"epoch": 0.1993181222134802,
"grad_norm": 4.023717880249023,
"learning_rate": 3.753734403374143e-05,
"loss": 0.4724,
"step": 1140
},
{
"epoch": 0.2010665267943002,
"grad_norm": 4.10474157333374,
"learning_rate": 3.751391248315858e-05,
"loss": 0.5078,
"step": 1150
},
{
"epoch": 0.20281493137512022,
"grad_norm": 4.947973728179932,
"learning_rate": 3.7490480932575714e-05,
"loss": 0.5133,
"step": 1160
},
{
"epoch": 0.2045633359559402,
"grad_norm": 4.226052761077881,
"learning_rate": 3.746704938199286e-05,
"loss": 0.5145,
"step": 1170
},
{
"epoch": 0.2063117405367602,
"grad_norm": 3.9101569652557373,
"learning_rate": 3.7443617831409995e-05,
"loss": 0.4885,
"step": 1180
},
{
"epoch": 0.2080601451175802,
"grad_norm": 5.738667964935303,
"learning_rate": 3.742018628082713e-05,
"loss": 0.5903,
"step": 1190
},
{
"epoch": 0.2098085496984002,
"grad_norm": 5.365860462188721,
"learning_rate": 3.7396754730244277e-05,
"loss": 0.5117,
"step": 1200
},
{
"epoch": 0.21155695427922022,
"grad_norm": 4.273809909820557,
"learning_rate": 3.7373323179661414e-05,
"loss": 0.507,
"step": 1210
},
{
"epoch": 0.2133053588600402,
"grad_norm": 4.795403480529785,
"learning_rate": 3.734989162907856e-05,
"loss": 0.4717,
"step": 1220
},
{
"epoch": 0.21505376344086022,
"grad_norm": 4.199695110321045,
"learning_rate": 3.7326460078495695e-05,
"loss": 0.4503,
"step": 1230
},
{
"epoch": 0.21680216802168023,
"grad_norm": 4.74060583114624,
"learning_rate": 3.730302852791284e-05,
"loss": 0.5411,
"step": 1240
},
{
"epoch": 0.21855057260250021,
"grad_norm": 3.1553878784179688,
"learning_rate": 3.7279596977329976e-05,
"loss": 0.4929,
"step": 1250
},
{
"epoch": 0.22029897718332023,
"grad_norm": 6.157911777496338,
"learning_rate": 3.725616542674712e-05,
"loss": 0.4856,
"step": 1260
},
{
"epoch": 0.2220473817641402,
"grad_norm": 3.480355739593506,
"learning_rate": 3.723273387616426e-05,
"loss": 0.4727,
"step": 1270
},
{
"epoch": 0.22379578634496022,
"grad_norm": 4.776696681976318,
"learning_rate": 3.72093023255814e-05,
"loss": 0.5281,
"step": 1280
},
{
"epoch": 0.22554419092578024,
"grad_norm": 4.238858222961426,
"learning_rate": 3.718587077499854e-05,
"loss": 0.4846,
"step": 1290
},
{
"epoch": 0.22729259550660022,
"grad_norm": 6.8187994956970215,
"learning_rate": 3.7162439224415676e-05,
"loss": 0.5232,
"step": 1300
},
{
"epoch": 0.22904100008742023,
"grad_norm": 2.982185125350952,
"learning_rate": 3.713900767383282e-05,
"loss": 0.5057,
"step": 1310
},
{
"epoch": 0.23078940466824024,
"grad_norm": 5.940972805023193,
"learning_rate": 3.711557612324996e-05,
"loss": 0.5363,
"step": 1320
},
{
"epoch": 0.23253780924906023,
"grad_norm": 4.530322074890137,
"learning_rate": 3.70921445726671e-05,
"loss": 0.5113,
"step": 1330
},
{
"epoch": 0.23428621382988024,
"grad_norm": 7.602742671966553,
"learning_rate": 3.706871302208424e-05,
"loss": 0.4607,
"step": 1340
},
{
"epoch": 0.23603461841070023,
"grad_norm": 4.332957744598389,
"learning_rate": 3.704528147150138e-05,
"loss": 0.5148,
"step": 1350
},
{
"epoch": 0.23778302299152024,
"grad_norm": 4.353420734405518,
"learning_rate": 3.702184992091852e-05,
"loss": 0.5027,
"step": 1360
},
{
"epoch": 0.23953142757234025,
"grad_norm": 3.4939002990722656,
"learning_rate": 3.699841837033566e-05,
"loss": 0.462,
"step": 1370
},
{
"epoch": 0.24127983215316023,
"grad_norm": 5.113827705383301,
"learning_rate": 3.69749868197528e-05,
"loss": 0.4807,
"step": 1380
},
{
"epoch": 0.24302823673398025,
"grad_norm": 2.979421377182007,
"learning_rate": 3.6953898424228225e-05,
"loss": 0.4931,
"step": 1390
},
{
"epoch": 0.24477664131480026,
"grad_norm": 4.109902858734131,
"learning_rate": 3.693046687364536e-05,
"loss": 0.4817,
"step": 1400
},
{
"epoch": 0.24652504589562024,
"grad_norm": 3.92783522605896,
"learning_rate": 3.6907035323062506e-05,
"loss": 0.4318,
"step": 1410
},
{
"epoch": 0.24827345047644025,
"grad_norm": 3.3686742782592773,
"learning_rate": 3.6883603772479644e-05,
"loss": 0.5278,
"step": 1420
},
{
"epoch": 0.25002185505726027,
"grad_norm": 3.851233720779419,
"learning_rate": 3.686017222189679e-05,
"loss": 0.4368,
"step": 1430
},
{
"epoch": 0.25177025963808025,
"grad_norm": 4.5934062004089355,
"learning_rate": 3.6836740671313925e-05,
"loss": 0.5209,
"step": 1440
},
{
"epoch": 0.25351866421890024,
"grad_norm": 5.395561695098877,
"learning_rate": 3.681330912073107e-05,
"loss": 0.449,
"step": 1450
},
{
"epoch": 0.2552670687997203,
"grad_norm": 3.0194127559661865,
"learning_rate": 3.6789877570148206e-05,
"loss": 0.4584,
"step": 1460
},
{
"epoch": 0.25701547338054026,
"grad_norm": 6.502100944519043,
"learning_rate": 3.676644601956535e-05,
"loss": 0.539,
"step": 1470
},
{
"epoch": 0.25876387796136024,
"grad_norm": 4.323697090148926,
"learning_rate": 3.674301446898249e-05,
"loss": 0.4571,
"step": 1480
},
{
"epoch": 0.2605122825421803,
"grad_norm": 3.714613914489746,
"learning_rate": 3.671958291839963e-05,
"loss": 0.4525,
"step": 1490
},
{
"epoch": 0.26226068712300027,
"grad_norm": 3.388582706451416,
"learning_rate": 3.669615136781677e-05,
"loss": 0.4457,
"step": 1500
},
{
"epoch": 0.26400909170382025,
"grad_norm": 4.142477989196777,
"learning_rate": 3.6672719817233905e-05,
"loss": 0.4379,
"step": 1510
},
{
"epoch": 0.2657574962846403,
"grad_norm": 5.404989242553711,
"learning_rate": 3.664928826665105e-05,
"loss": 0.4761,
"step": 1520
},
{
"epoch": 0.2675059008654603,
"grad_norm": 2.9752023220062256,
"learning_rate": 3.6625856716068187e-05,
"loss": 0.4431,
"step": 1530
},
{
"epoch": 0.26925430544628026,
"grad_norm": 5.292706489562988,
"learning_rate": 3.660242516548533e-05,
"loss": 0.4439,
"step": 1540
},
{
"epoch": 0.27100271002710025,
"grad_norm": 4.105823516845703,
"learning_rate": 3.657899361490247e-05,
"loss": 0.4925,
"step": 1550
},
{
"epoch": 0.2727511146079203,
"grad_norm": 4.246946334838867,
"learning_rate": 3.655556206431961e-05,
"loss": 0.46,
"step": 1560
},
{
"epoch": 0.27449951918874027,
"grad_norm": 4.617825508117676,
"learning_rate": 3.653213051373675e-05,
"loss": 0.4587,
"step": 1570
},
{
"epoch": 0.27624792376956026,
"grad_norm": 7.036331653594971,
"learning_rate": 3.650869896315389e-05,
"loss": 0.4417,
"step": 1580
},
{
"epoch": 0.2779963283503803,
"grad_norm": 3.7988290786743164,
"learning_rate": 3.648526741257103e-05,
"loss": 0.4307,
"step": 1590
},
{
"epoch": 0.2797447329312003,
"grad_norm": 4.456466197967529,
"learning_rate": 3.6461835861988174e-05,
"loss": 0.5143,
"step": 1600
},
{
"epoch": 0.28149313751202026,
"grad_norm": 3.4820919036865234,
"learning_rate": 3.643840431140531e-05,
"loss": 0.4492,
"step": 1610
},
{
"epoch": 0.2832415420928403,
"grad_norm": 3.1583054065704346,
"learning_rate": 3.641497276082245e-05,
"loss": 0.4656,
"step": 1620
},
{
"epoch": 0.2849899466736603,
"grad_norm": 3.003847360610962,
"learning_rate": 3.639154121023959e-05,
"loss": 0.4477,
"step": 1630
},
{
"epoch": 0.2867383512544803,
"grad_norm": 3.8377931118011475,
"learning_rate": 3.636810965965673e-05,
"loss": 0.437,
"step": 1640
},
{
"epoch": 0.2884867558353003,
"grad_norm": 4.43681526184082,
"learning_rate": 3.6344678109073874e-05,
"loss": 0.4315,
"step": 1650
},
{
"epoch": 0.2902351604161203,
"grad_norm": 3.5097219944000244,
"learning_rate": 3.632124655849101e-05,
"loss": 0.4466,
"step": 1660
},
{
"epoch": 0.2919835649969403,
"grad_norm": 4.560410976409912,
"learning_rate": 3.6297815007908155e-05,
"loss": 0.4282,
"step": 1670
},
{
"epoch": 0.2937319695777603,
"grad_norm": 4.94926643371582,
"learning_rate": 3.627438345732529e-05,
"loss": 0.4744,
"step": 1680
},
{
"epoch": 0.2954803741585803,
"grad_norm": 3.3490183353424072,
"learning_rate": 3.6250951906742436e-05,
"loss": 0.4927,
"step": 1690
},
{
"epoch": 0.2972287787394003,
"grad_norm": 3.6036620140075684,
"learning_rate": 3.622752035615957e-05,
"loss": 0.483,
"step": 1700
},
{
"epoch": 0.2989771833202203,
"grad_norm": 3.800067663192749,
"learning_rate": 3.620408880557671e-05,
"loss": 0.4069,
"step": 1710
},
{
"epoch": 0.3007255879010403,
"grad_norm": 3.0274336338043213,
"learning_rate": 3.6180657254993854e-05,
"loss": 0.4235,
"step": 1720
},
{
"epoch": 0.3024739924818603,
"grad_norm": 6.128991603851318,
"learning_rate": 3.615722570441099e-05,
"loss": 0.4782,
"step": 1730
},
{
"epoch": 0.3042223970626803,
"grad_norm": 3.7391269207000732,
"learning_rate": 3.6133794153828135e-05,
"loss": 0.4981,
"step": 1740
},
{
"epoch": 0.3059708016435003,
"grad_norm": 4.853888034820557,
"learning_rate": 3.611036260324527e-05,
"loss": 0.4605,
"step": 1750
},
{
"epoch": 0.3077192062243203,
"grad_norm": 4.367269515991211,
"learning_rate": 3.608693105266242e-05,
"loss": 0.4695,
"step": 1760
},
{
"epoch": 0.3094676108051403,
"grad_norm": 3.594266414642334,
"learning_rate": 3.6063499502079554e-05,
"loss": 0.4399,
"step": 1770
},
{
"epoch": 0.31121601538596033,
"grad_norm": 2.8320610523223877,
"learning_rate": 3.604006795149669e-05,
"loss": 0.4281,
"step": 1780
},
{
"epoch": 0.3129644199667803,
"grad_norm": 2.9507384300231934,
"learning_rate": 3.601663640091383e-05,
"loss": 0.425,
"step": 1790
},
{
"epoch": 0.3147128245476003,
"grad_norm": 5.069509506225586,
"learning_rate": 3.599320485033097e-05,
"loss": 0.4376,
"step": 1800
},
{
"epoch": 0.31646122912842034,
"grad_norm": 4.728787899017334,
"learning_rate": 3.596977329974811e-05,
"loss": 0.487,
"step": 1810
},
{
"epoch": 0.3182096337092403,
"grad_norm": 4.627148151397705,
"learning_rate": 3.594634174916525e-05,
"loss": 0.3894,
"step": 1820
},
{
"epoch": 0.3199580382900603,
"grad_norm": 9.96601676940918,
"learning_rate": 3.592291019858239e-05,
"loss": 0.4409,
"step": 1830
},
{
"epoch": 0.32170644287088035,
"grad_norm": 5.776546478271484,
"learning_rate": 3.5899478647999535e-05,
"loss": 0.4583,
"step": 1840
},
{
"epoch": 0.32345484745170033,
"grad_norm": 3.2901666164398193,
"learning_rate": 3.587604709741667e-05,
"loss": 0.4464,
"step": 1850
},
{
"epoch": 0.3252032520325203,
"grad_norm": 5.241537094116211,
"learning_rate": 3.5852615546833816e-05,
"loss": 0.4677,
"step": 1860
},
{
"epoch": 0.3269516566133403,
"grad_norm": 4.418180465698242,
"learning_rate": 3.582918399625095e-05,
"loss": 0.4884,
"step": 1870
},
{
"epoch": 0.32870006119416034,
"grad_norm": 3.4376697540283203,
"learning_rate": 3.58057524456681e-05,
"loss": 0.4303,
"step": 1880
},
{
"epoch": 0.3304484657749803,
"grad_norm": 4.297024726867676,
"learning_rate": 3.5782320895085234e-05,
"loss": 0.5045,
"step": 1890
},
{
"epoch": 0.3321968703558003,
"grad_norm": 7.0916948318481445,
"learning_rate": 3.575888934450237e-05,
"loss": 0.4702,
"step": 1900
},
{
"epoch": 0.33394527493662035,
"grad_norm": 8.566019058227539,
"learning_rate": 3.5735457793919515e-05,
"loss": 0.4032,
"step": 1910
},
{
"epoch": 0.33569367951744034,
"grad_norm": 2.6082425117492676,
"learning_rate": 3.571202624333665e-05,
"loss": 0.4511,
"step": 1920
},
{
"epoch": 0.3374420840982603,
"grad_norm": 4.204542636871338,
"learning_rate": 3.5688594692753796e-05,
"loss": 0.3781,
"step": 1930
},
{
"epoch": 0.33919048867908036,
"grad_norm": 3.8807003498077393,
"learning_rate": 3.5665163142170934e-05,
"loss": 0.4176,
"step": 1940
},
{
"epoch": 0.34093889325990034,
"grad_norm": 2.7027812004089355,
"learning_rate": 3.564173159158808e-05,
"loss": 0.3927,
"step": 1950
},
{
"epoch": 0.34268729784072033,
"grad_norm": 3.784552574157715,
"learning_rate": 3.5618300041005215e-05,
"loss": 0.4709,
"step": 1960
},
{
"epoch": 0.34443570242154037,
"grad_norm": 3.2418737411499023,
"learning_rate": 3.559486849042236e-05,
"loss": 0.4252,
"step": 1970
},
{
"epoch": 0.34618410700236035,
"grad_norm": 5.2287397384643555,
"learning_rate": 3.5571436939839496e-05,
"loss": 0.4115,
"step": 1980
},
{
"epoch": 0.34793251158318034,
"grad_norm": 4.299645900726318,
"learning_rate": 3.554800538925664e-05,
"loss": 0.4234,
"step": 1990
},
{
"epoch": 0.3496809161640004,
"grad_norm": 2.8772058486938477,
"learning_rate": 3.552457383867378e-05,
"loss": 0.4709,
"step": 2000
},
{
"epoch": 0.35142932074482036,
"grad_norm": 3.6467995643615723,
"learning_rate": 3.5501142288090914e-05,
"loss": 0.3678,
"step": 2010
},
{
"epoch": 0.35317772532564035,
"grad_norm": 4.426238059997559,
"learning_rate": 3.547771073750806e-05,
"loss": 0.3868,
"step": 2020
},
{
"epoch": 0.35492612990646033,
"grad_norm": 6.251084327697754,
"learning_rate": 3.5454279186925195e-05,
"loss": 0.4048,
"step": 2030
},
{
"epoch": 0.35667453448728037,
"grad_norm": 3.1436846256256104,
"learning_rate": 3.543084763634234e-05,
"loss": 0.369,
"step": 2040
},
{
"epoch": 0.35842293906810035,
"grad_norm": 5.551470756530762,
"learning_rate": 3.540741608575948e-05,
"loss": 0.4341,
"step": 2050
},
{
"epoch": 0.36017134364892034,
"grad_norm": 3.3913917541503906,
"learning_rate": 3.538398453517662e-05,
"loss": 0.3886,
"step": 2060
},
{
"epoch": 0.3619197482297404,
"grad_norm": 2.8911020755767822,
"learning_rate": 3.536055298459376e-05,
"loss": 0.3767,
"step": 2070
},
{
"epoch": 0.36366815281056036,
"grad_norm": 3.6292026042938232,
"learning_rate": 3.53371214340109e-05,
"loss": 0.4193,
"step": 2080
},
{
"epoch": 0.36541655739138035,
"grad_norm": 3.489974021911621,
"learning_rate": 3.531368988342804e-05,
"loss": 0.4379,
"step": 2090
},
{
"epoch": 0.3671649619722004,
"grad_norm": 3.882077217102051,
"learning_rate": 3.529025833284518e-05,
"loss": 0.3757,
"step": 2100
},
{
"epoch": 0.36891336655302037,
"grad_norm": 4.579954147338867,
"learning_rate": 3.526682678226232e-05,
"loss": 0.3671,
"step": 2110
},
{
"epoch": 0.37066177113384036,
"grad_norm": 3.082444906234741,
"learning_rate": 3.524339523167946e-05,
"loss": 0.3885,
"step": 2120
},
{
"epoch": 0.3724101757146604,
"grad_norm": 4.321898460388184,
"learning_rate": 3.52199636810966e-05,
"loss": 0.4543,
"step": 2130
},
{
"epoch": 0.3741585802954804,
"grad_norm": 5.577615737915039,
"learning_rate": 3.519653213051374e-05,
"loss": 0.3491,
"step": 2140
},
{
"epoch": 0.37590698487630037,
"grad_norm": 4.5239057540893555,
"learning_rate": 3.517310057993088e-05,
"loss": 0.4436,
"step": 2150
},
{
"epoch": 0.3776553894571204,
"grad_norm": 4.200013637542725,
"learning_rate": 3.514966902934802e-05,
"loss": 0.4066,
"step": 2160
},
{
"epoch": 0.3794037940379404,
"grad_norm": 2.5660548210144043,
"learning_rate": 3.5126237478765164e-05,
"loss": 0.4033,
"step": 2170
},
{
"epoch": 0.3811521986187604,
"grad_norm": 3.0077526569366455,
"learning_rate": 3.51028059281823e-05,
"loss": 0.4044,
"step": 2180
},
{
"epoch": 0.38290060319958036,
"grad_norm": 3.9082486629486084,
"learning_rate": 3.5079374377599445e-05,
"loss": 0.4027,
"step": 2190
},
{
"epoch": 0.3846490077804004,
"grad_norm": 3.365020751953125,
"learning_rate": 3.505594282701658e-05,
"loss": 0.4271,
"step": 2200
},
{
"epoch": 0.3863974123612204,
"grad_norm": 5.796195983886719,
"learning_rate": 3.5032511276433726e-05,
"loss": 0.4496,
"step": 2210
},
{
"epoch": 0.38814581694204037,
"grad_norm": 4.143885612487793,
"learning_rate": 3.500907972585086e-05,
"loss": 0.3982,
"step": 2220
},
{
"epoch": 0.3898942215228604,
"grad_norm": 3.6130030155181885,
"learning_rate": 3.4985648175268e-05,
"loss": 0.432,
"step": 2230
},
{
"epoch": 0.3916426261036804,
"grad_norm": 4.119210243225098,
"learning_rate": 3.4962216624685144e-05,
"loss": 0.4078,
"step": 2240
},
{
"epoch": 0.3933910306845004,
"grad_norm": 3.4416446685791016,
"learning_rate": 3.493878507410228e-05,
"loss": 0.4556,
"step": 2250
},
{
"epoch": 0.3951394352653204,
"grad_norm": 3.9759068489074707,
"learning_rate": 3.491535352351942e-05,
"loss": 0.4438,
"step": 2260
},
{
"epoch": 0.3968878398461404,
"grad_norm": 5.250323295593262,
"learning_rate": 3.489192197293656e-05,
"loss": 0.3963,
"step": 2270
},
{
"epoch": 0.3986362444269604,
"grad_norm": 2.926793098449707,
"learning_rate": 3.48684904223537e-05,
"loss": 0.3948,
"step": 2280
},
{
"epoch": 0.4003846490077804,
"grad_norm": 3.849525213241577,
"learning_rate": 3.484505887177084e-05,
"loss": 0.4359,
"step": 2290
},
{
"epoch": 0.4021330535886004,
"grad_norm": 5.853473663330078,
"learning_rate": 3.482162732118798e-05,
"loss": 0.4449,
"step": 2300
},
{
"epoch": 0.4038814581694204,
"grad_norm": 2.683713674545288,
"learning_rate": 3.479819577060512e-05,
"loss": 0.4052,
"step": 2310
},
{
"epoch": 0.40562986275024043,
"grad_norm": 4.078883647918701,
"learning_rate": 3.477476422002226e-05,
"loss": 0.4448,
"step": 2320
},
{
"epoch": 0.4073782673310604,
"grad_norm": 2.502694606781006,
"learning_rate": 3.47513326694394e-05,
"loss": 0.3751,
"step": 2330
},
{
"epoch": 0.4091266719118804,
"grad_norm": 3.9655332565307617,
"learning_rate": 3.4727901118856543e-05,
"loss": 0.394,
"step": 2340
},
{
"epoch": 0.4108750764927004,
"grad_norm": 3.2672157287597656,
"learning_rate": 3.470446956827368e-05,
"loss": 0.3714,
"step": 2350
},
{
"epoch": 0.4126234810735204,
"grad_norm": 2.6867640018463135,
"learning_rate": 3.4681038017690825e-05,
"loss": 0.3843,
"step": 2360
},
{
"epoch": 0.4143718856543404,
"grad_norm": 3.3197810649871826,
"learning_rate": 3.465760646710796e-05,
"loss": 0.3949,
"step": 2370
},
{
"epoch": 0.4161202902351604,
"grad_norm": 3.062208652496338,
"learning_rate": 3.4634174916525106e-05,
"loss": 0.3798,
"step": 2380
},
{
"epoch": 0.41786869481598043,
"grad_norm": 3.712489366531372,
"learning_rate": 3.461074336594224e-05,
"loss": 0.3991,
"step": 2390
},
{
"epoch": 0.4196170993968004,
"grad_norm": 4.0003437995910645,
"learning_rate": 3.458731181535938e-05,
"loss": 0.4353,
"step": 2400
},
{
"epoch": 0.4213655039776204,
"grad_norm": 4.571620464324951,
"learning_rate": 3.4563880264776524e-05,
"loss": 0.3888,
"step": 2410
},
{
"epoch": 0.42311390855844044,
"grad_norm": 9.544726371765137,
"learning_rate": 3.454044871419366e-05,
"loss": 0.4344,
"step": 2420
},
{
"epoch": 0.42486231313926043,
"grad_norm": 5.037539958953857,
"learning_rate": 3.4517017163610805e-05,
"loss": 0.3653,
"step": 2430
},
{
"epoch": 0.4266107177200804,
"grad_norm": 3.384692907333374,
"learning_rate": 3.449358561302794e-05,
"loss": 0.3715,
"step": 2440
},
{
"epoch": 0.42835912230090045,
"grad_norm": 3.169987916946411,
"learning_rate": 3.4470154062445087e-05,
"loss": 0.3899,
"step": 2450
},
{
"epoch": 0.43010752688172044,
"grad_norm": 3.1669843196868896,
"learning_rate": 3.4446722511862224e-05,
"loss": 0.4537,
"step": 2460
},
{
"epoch": 0.4318559314625404,
"grad_norm": 3.975206136703491,
"learning_rate": 3.442329096127937e-05,
"loss": 0.4296,
"step": 2470
},
{
"epoch": 0.43360433604336046,
"grad_norm": 3.153317928314209,
"learning_rate": 3.4399859410696505e-05,
"loss": 0.3524,
"step": 2480
},
{
"epoch": 0.43535274062418045,
"grad_norm": 3.307684898376465,
"learning_rate": 3.437642786011365e-05,
"loss": 0.3707,
"step": 2490
},
{
"epoch": 0.43710114520500043,
"grad_norm": 3.744170904159546,
"learning_rate": 3.4352996309530786e-05,
"loss": 0.4054,
"step": 2500
},
{
"epoch": 0.4388495497858204,
"grad_norm": 5.2920427322387695,
"learning_rate": 3.432956475894792e-05,
"loss": 0.3621,
"step": 2510
},
{
"epoch": 0.44059795436664045,
"grad_norm": 4.31833553314209,
"learning_rate": 3.430613320836507e-05,
"loss": 0.4324,
"step": 2520
},
{
"epoch": 0.44234635894746044,
"grad_norm": 6.0344929695129395,
"learning_rate": 3.4282701657782204e-05,
"loss": 0.4201,
"step": 2530
},
{
"epoch": 0.4440947635282804,
"grad_norm": 6.097479820251465,
"learning_rate": 3.425927010719935e-05,
"loss": 0.4248,
"step": 2540
},
{
"epoch": 0.44584316810910046,
"grad_norm": 3.0370521545410156,
"learning_rate": 3.4235838556616486e-05,
"loss": 0.3989,
"step": 2550
},
{
"epoch": 0.44759157268992045,
"grad_norm": 3.0904951095581055,
"learning_rate": 3.421240700603363e-05,
"loss": 0.4062,
"step": 2560
},
{
"epoch": 0.44933997727074043,
"grad_norm": 4.901310920715332,
"learning_rate": 3.418897545545077e-05,
"loss": 0.401,
"step": 2570
},
{
"epoch": 0.45108838185156047,
"grad_norm": 5.094497203826904,
"learning_rate": 3.416554390486791e-05,
"loss": 0.3905,
"step": 2580
},
{
"epoch": 0.45283678643238046,
"grad_norm": 3.1318740844726562,
"learning_rate": 3.414211235428505e-05,
"loss": 0.3795,
"step": 2590
},
{
"epoch": 0.45458519101320044,
"grad_norm": 3.8154094219207764,
"learning_rate": 3.411868080370219e-05,
"loss": 0.389,
"step": 2600
},
{
"epoch": 0.4563335955940205,
"grad_norm": 3.527348756790161,
"learning_rate": 3.409524925311933e-05,
"loss": 0.4058,
"step": 2610
},
{
"epoch": 0.45808200017484046,
"grad_norm": 6.264882564544678,
"learning_rate": 3.4071817702536466e-05,
"loss": 0.3955,
"step": 2620
},
{
"epoch": 0.45983040475566045,
"grad_norm": 2.324648857116699,
"learning_rate": 3.404838615195361e-05,
"loss": 0.3582,
"step": 2630
},
{
"epoch": 0.4615788093364805,
"grad_norm": 4.422379016876221,
"learning_rate": 3.402495460137075e-05,
"loss": 0.3479,
"step": 2640
},
{
"epoch": 0.4633272139173005,
"grad_norm": 3.0725393295288086,
"learning_rate": 3.400152305078789e-05,
"loss": 0.3714,
"step": 2650
},
{
"epoch": 0.46507561849812046,
"grad_norm": 4.242288112640381,
"learning_rate": 3.397809150020503e-05,
"loss": 0.3762,
"step": 2660
},
{
"epoch": 0.46682402307894044,
"grad_norm": 4.661035537719727,
"learning_rate": 3.395465994962217e-05,
"loss": 0.3732,
"step": 2670
},
{
"epoch": 0.4685724276597605,
"grad_norm": 2.518129825592041,
"learning_rate": 3.393122839903931e-05,
"loss": 0.4087,
"step": 2680
},
{
"epoch": 0.47032083224058047,
"grad_norm": 4.576558589935303,
"learning_rate": 3.3907796848456454e-05,
"loss": 0.3828,
"step": 2690
},
{
"epoch": 0.47206923682140045,
"grad_norm": 2.8037259578704834,
"learning_rate": 3.388436529787359e-05,
"loss": 0.3603,
"step": 2700
},
{
"epoch": 0.4738176414022205,
"grad_norm": 4.486359596252441,
"learning_rate": 3.3860933747290735e-05,
"loss": 0.366,
"step": 2710
},
{
"epoch": 0.4755660459830405,
"grad_norm": 3.6302101612091064,
"learning_rate": 3.383750219670787e-05,
"loss": 0.3775,
"step": 2720
},
{
"epoch": 0.47731445056386046,
"grad_norm": 4.8168768882751465,
"learning_rate": 3.381407064612501e-05,
"loss": 0.3573,
"step": 2730
},
{
"epoch": 0.4790628551446805,
"grad_norm": 4.675252437591553,
"learning_rate": 3.3790639095542147e-05,
"loss": 0.3809,
"step": 2740
},
{
"epoch": 0.4808112597255005,
"grad_norm": 2.982111930847168,
"learning_rate": 3.376720754495929e-05,
"loss": 0.3664,
"step": 2750
},
{
"epoch": 0.48255966430632047,
"grad_norm": 3.7463014125823975,
"learning_rate": 3.374377599437643e-05,
"loss": 0.3629,
"step": 2760
},
{
"epoch": 0.4843080688871405,
"grad_norm": 2.5750458240509033,
"learning_rate": 3.372268759885186e-05,
"loss": 0.3459,
"step": 2770
},
{
"epoch": 0.4860564734679605,
"grad_norm": 4.039637565612793,
"learning_rate": 3.3699256048268997e-05,
"loss": 0.3836,
"step": 2780
},
{
"epoch": 0.4878048780487805,
"grad_norm": 4.1245832443237305,
"learning_rate": 3.367582449768614e-05,
"loss": 0.3801,
"step": 2790
},
{
"epoch": 0.4895532826296005,
"grad_norm": 3.3629612922668457,
"learning_rate": 3.365239294710328e-05,
"loss": 0.3474,
"step": 2800
},
{
"epoch": 0.4913016872104205,
"grad_norm": 3.7446513175964355,
"learning_rate": 3.362896139652042e-05,
"loss": 0.4071,
"step": 2810
},
{
"epoch": 0.4930500917912405,
"grad_norm": 2.827909231185913,
"learning_rate": 3.360552984593756e-05,
"loss": 0.3862,
"step": 2820
},
{
"epoch": 0.49479849637206047,
"grad_norm": 6.202451229095459,
"learning_rate": 3.3582098295354696e-05,
"loss": 0.3261,
"step": 2830
},
{
"epoch": 0.4965469009528805,
"grad_norm": 3.2312405109405518,
"learning_rate": 3.355866674477184e-05,
"loss": 0.3866,
"step": 2840
},
{
"epoch": 0.4982953055337005,
"grad_norm": 4.0673699378967285,
"learning_rate": 3.353523519418898e-05,
"loss": 0.3258,
"step": 2850
},
{
"epoch": 0.5000437101145205,
"grad_norm": 2.6515884399414062,
"learning_rate": 3.351180364360612e-05,
"loss": 0.3814,
"step": 2860
},
{
"epoch": 0.5017921146953405,
"grad_norm": 3.774637460708618,
"learning_rate": 3.348837209302326e-05,
"loss": 0.4082,
"step": 2870
},
{
"epoch": 0.5035405192761605,
"grad_norm": 3.6968934535980225,
"learning_rate": 3.34649405424404e-05,
"loss": 0.395,
"step": 2880
},
{
"epoch": 0.5052889238569805,
"grad_norm": 4.876258850097656,
"learning_rate": 3.344150899185754e-05,
"loss": 0.3328,
"step": 2890
},
{
"epoch": 0.5070373284378005,
"grad_norm": 3.079639196395874,
"learning_rate": 3.3418077441274684e-05,
"loss": 0.3278,
"step": 2900
},
{
"epoch": 0.5087857330186205,
"grad_norm": 3.560870409011841,
"learning_rate": 3.339464589069182e-05,
"loss": 0.3698,
"step": 2910
},
{
"epoch": 0.5105341375994406,
"grad_norm": 3.489295482635498,
"learning_rate": 3.3371214340108965e-05,
"loss": 0.3407,
"step": 2920
},
{
"epoch": 0.5122825421802605,
"grad_norm": 3.805683135986328,
"learning_rate": 3.3347782789526095e-05,
"loss": 0.3602,
"step": 2930
},
{
"epoch": 0.5140309467610805,
"grad_norm": 8.44050407409668,
"learning_rate": 3.332435123894324e-05,
"loss": 0.3246,
"step": 2940
},
{
"epoch": 0.5157793513419006,
"grad_norm": 3.361529588699341,
"learning_rate": 3.3300919688360376e-05,
"loss": 0.2978,
"step": 2950
},
{
"epoch": 0.5175277559227205,
"grad_norm": 4.2473039627075195,
"learning_rate": 3.327748813777752e-05,
"loss": 0.3766,
"step": 2960
},
{
"epoch": 0.5192761605035405,
"grad_norm": 4.070904731750488,
"learning_rate": 3.325405658719466e-05,
"loss": 0.393,
"step": 2970
},
{
"epoch": 0.5210245650843606,
"grad_norm": 3.869349956512451,
"learning_rate": 3.32306250366118e-05,
"loss": 0.3478,
"step": 2980
},
{
"epoch": 0.5227729696651805,
"grad_norm": 2.8064253330230713,
"learning_rate": 3.320719348602894e-05,
"loss": 0.3633,
"step": 2990
},
{
"epoch": 0.5245213742460005,
"grad_norm": 3.720731019973755,
"learning_rate": 3.318376193544608e-05,
"loss": 0.3835,
"step": 3000
},
{
"epoch": 0.5246962147040826,
"eval_loss": 0.5097190737724304,
"eval_runtime": 1744.787,
"eval_samples_per_second": 8.3,
"eval_steps_per_second": 1.038,
"step": 3001
},
{
"epoch": 0.5262697788268206,
"grad_norm": 4.681828498840332,
"learning_rate": 3.316033038486322e-05,
"loss": 0.333,
"step": 3010
},
{
"epoch": 0.5280181834076405,
"grad_norm": 2.902214527130127,
"learning_rate": 3.313689883428036e-05,
"loss": 0.3833,
"step": 3020
},
{
"epoch": 0.5297665879884605,
"grad_norm": 2.4706242084503174,
"learning_rate": 3.31134672836975e-05,
"loss": 0.3533,
"step": 3030
},
{
"epoch": 0.5315149925692806,
"grad_norm": 3.9379889965057373,
"learning_rate": 3.309003573311464e-05,
"loss": 0.3583,
"step": 3040
},
{
"epoch": 0.5332633971501005,
"grad_norm": 3.412458896636963,
"learning_rate": 3.306660418253178e-05,
"loss": 0.3696,
"step": 3050
},
{
"epoch": 0.5350118017309206,
"grad_norm": 6.194937705993652,
"learning_rate": 3.304317263194892e-05,
"loss": 0.3401,
"step": 3060
},
{
"epoch": 0.5367602063117405,
"grad_norm": 2.9540324211120605,
"learning_rate": 3.301974108136606e-05,
"loss": 0.3873,
"step": 3070
},
{
"epoch": 0.5385086108925605,
"grad_norm": 2.695261240005493,
"learning_rate": 3.29963095307832e-05,
"loss": 0.3498,
"step": 3080
},
{
"epoch": 0.5402570154733806,
"grad_norm": 3.2079668045043945,
"learning_rate": 3.2972877980200344e-05,
"loss": 0.3478,
"step": 3090
},
{
"epoch": 0.5420054200542005,
"grad_norm": 2.720630407333374,
"learning_rate": 3.294944642961748e-05,
"loss": 0.3858,
"step": 3100
},
{
"epoch": 0.5437538246350205,
"grad_norm": 2.9935555458068848,
"learning_rate": 3.2926014879034626e-05,
"loss": 0.3728,
"step": 3110
},
{
"epoch": 0.5455022292158406,
"grad_norm": 3.6952109336853027,
"learning_rate": 3.290258332845176e-05,
"loss": 0.4133,
"step": 3120
},
{
"epoch": 0.5472506337966605,
"grad_norm": 3.6369428634643555,
"learning_rate": 3.28791517778689e-05,
"loss": 0.3798,
"step": 3130
},
{
"epoch": 0.5489990383774805,
"grad_norm": 4.478968620300293,
"learning_rate": 3.2855720227286044e-05,
"loss": 0.3333,
"step": 3140
},
{
"epoch": 0.5507474429583006,
"grad_norm": 3.080244779586792,
"learning_rate": 3.283228867670318e-05,
"loss": 0.3818,
"step": 3150
},
{
"epoch": 0.5524958475391205,
"grad_norm": 3.321441888809204,
"learning_rate": 3.2808857126120325e-05,
"loss": 0.3889,
"step": 3160
},
{
"epoch": 0.5542442521199406,
"grad_norm": 4.040501117706299,
"learning_rate": 3.278542557553746e-05,
"loss": 0.3487,
"step": 3170
},
{
"epoch": 0.5559926567007606,
"grad_norm": 2.8912744522094727,
"learning_rate": 3.2761994024954606e-05,
"loss": 0.331,
"step": 3180
},
{
"epoch": 0.5577410612815805,
"grad_norm": 3.6850969791412354,
"learning_rate": 3.2738562474371744e-05,
"loss": 0.3921,
"step": 3190
},
{
"epoch": 0.5594894658624006,
"grad_norm": 2.3551599979400635,
"learning_rate": 3.271513092378889e-05,
"loss": 0.3469,
"step": 3200
},
{
"epoch": 0.5612378704432206,
"grad_norm": 3.7826590538024902,
"learning_rate": 3.2691699373206025e-05,
"loss": 0.3607,
"step": 3210
},
{
"epoch": 0.5629862750240405,
"grad_norm": 4.876736164093018,
"learning_rate": 3.266826782262316e-05,
"loss": 0.4509,
"step": 3220
},
{
"epoch": 0.5647346796048606,
"grad_norm": 4.1988959312438965,
"learning_rate": 3.2644836272040306e-05,
"loss": 0.3974,
"step": 3230
},
{
"epoch": 0.5664830841856806,
"grad_norm": 5.847795009613037,
"learning_rate": 3.262140472145744e-05,
"loss": 0.3719,
"step": 3240
},
{
"epoch": 0.5682314887665005,
"grad_norm": 1.9008731842041016,
"learning_rate": 3.259797317087459e-05,
"loss": 0.3163,
"step": 3250
},
{
"epoch": 0.5699798933473206,
"grad_norm": 5.935329437255859,
"learning_rate": 3.2574541620291724e-05,
"loss": 0.383,
"step": 3260
},
{
"epoch": 0.5717282979281406,
"grad_norm": 4.4414448738098145,
"learning_rate": 3.255111006970887e-05,
"loss": 0.3307,
"step": 3270
},
{
"epoch": 0.5734767025089605,
"grad_norm": 3.289113998413086,
"learning_rate": 3.2527678519126005e-05,
"loss": 0.3523,
"step": 3280
},
{
"epoch": 0.5752251070897806,
"grad_norm": 3.165818214416504,
"learning_rate": 3.250424696854315e-05,
"loss": 0.36,
"step": 3290
},
{
"epoch": 0.5769735116706006,
"grad_norm": 2.0032644271850586,
"learning_rate": 3.248081541796029e-05,
"loss": 0.3958,
"step": 3300
},
{
"epoch": 0.5787219162514206,
"grad_norm": 2.1944901943206787,
"learning_rate": 3.245738386737743e-05,
"loss": 0.3303,
"step": 3310
},
{
"epoch": 0.5804703208322406,
"grad_norm": 2.2373247146606445,
"learning_rate": 3.243395231679457e-05,
"loss": 0.3071,
"step": 3320
},
{
"epoch": 0.5822187254130606,
"grad_norm": 3.046159029006958,
"learning_rate": 3.2410520766211705e-05,
"loss": 0.3334,
"step": 3330
},
{
"epoch": 0.5839671299938806,
"grad_norm": 4.304010391235352,
"learning_rate": 3.238708921562885e-05,
"loss": 0.323,
"step": 3340
},
{
"epoch": 0.5857155345747006,
"grad_norm": 4.681334972381592,
"learning_rate": 3.2363657665045986e-05,
"loss": 0.3632,
"step": 3350
},
{
"epoch": 0.5874639391555206,
"grad_norm": 2.211153984069824,
"learning_rate": 3.234022611446313e-05,
"loss": 0.3263,
"step": 3360
},
{
"epoch": 0.5892123437363406,
"grad_norm": 3.0405044555664062,
"learning_rate": 3.231679456388027e-05,
"loss": 0.337,
"step": 3370
},
{
"epoch": 0.5909607483171606,
"grad_norm": 3.744624614715576,
"learning_rate": 3.229336301329741e-05,
"loss": 0.3644,
"step": 3380
},
{
"epoch": 0.5927091528979805,
"grad_norm": 2.4798245429992676,
"learning_rate": 3.226993146271455e-05,
"loss": 0.3328,
"step": 3390
},
{
"epoch": 0.5944575574788006,
"grad_norm": 3.567376136779785,
"learning_rate": 3.2246499912131686e-05,
"loss": 0.3273,
"step": 3400
},
{
"epoch": 0.5962059620596206,
"grad_norm": 3.397534132003784,
"learning_rate": 3.222306836154882e-05,
"loss": 0.3503,
"step": 3410
},
{
"epoch": 0.5979543666404405,
"grad_norm": 5.0991692543029785,
"learning_rate": 3.219963681096597e-05,
"loss": 0.3703,
"step": 3420
},
{
"epoch": 0.5997027712212606,
"grad_norm": 3.5852749347686768,
"learning_rate": 3.2176205260383104e-05,
"loss": 0.3693,
"step": 3430
},
{
"epoch": 0.6014511758020806,
"grad_norm": 2.5035903453826904,
"learning_rate": 3.215277370980025e-05,
"loss": 0.316,
"step": 3440
},
{
"epoch": 0.6031995803829006,
"grad_norm": 3.4612536430358887,
"learning_rate": 3.2129342159217385e-05,
"loss": 0.3087,
"step": 3450
},
{
"epoch": 0.6049479849637206,
"grad_norm": 6.134129524230957,
"learning_rate": 3.210591060863453e-05,
"loss": 0.3416,
"step": 3460
},
{
"epoch": 0.6066963895445406,
"grad_norm": 2.857895612716675,
"learning_rate": 3.2082479058051666e-05,
"loss": 0.3846,
"step": 3470
},
{
"epoch": 0.6084447941253606,
"grad_norm": 4.056922912597656,
"learning_rate": 3.205904750746881e-05,
"loss": 0.3527,
"step": 3480
},
{
"epoch": 0.6101931987061806,
"grad_norm": 3.3907055854797363,
"learning_rate": 3.203561595688595e-05,
"loss": 0.3492,
"step": 3490
},
{
"epoch": 0.6119416032870006,
"grad_norm": 2.8946733474731445,
"learning_rate": 3.201218440630309e-05,
"loss": 0.3297,
"step": 3500
},
{
"epoch": 0.6136900078678206,
"grad_norm": 2.1298012733459473,
"learning_rate": 3.198875285572023e-05,
"loss": 0.356,
"step": 3510
},
{
"epoch": 0.6154384124486406,
"grad_norm": 3.428750514984131,
"learning_rate": 3.1965321305137366e-05,
"loss": 0.3316,
"step": 3520
},
{
"epoch": 0.6171868170294607,
"grad_norm": 2.575652599334717,
"learning_rate": 3.194188975455451e-05,
"loss": 0.3288,
"step": 3530
},
{
"epoch": 0.6189352216102806,
"grad_norm": 3.2835533618927,
"learning_rate": 3.191845820397165e-05,
"loss": 0.372,
"step": 3540
},
{
"epoch": 0.6206836261911006,
"grad_norm": 3.7413949966430664,
"learning_rate": 3.189502665338879e-05,
"loss": 0.3372,
"step": 3550
},
{
"epoch": 0.6224320307719207,
"grad_norm": 3.207977533340454,
"learning_rate": 3.187159510280593e-05,
"loss": 0.3327,
"step": 3560
},
{
"epoch": 0.6241804353527406,
"grad_norm": 2.6386313438415527,
"learning_rate": 3.184816355222307e-05,
"loss": 0.3639,
"step": 3570
},
{
"epoch": 0.6259288399335606,
"grad_norm": 3.423943519592285,
"learning_rate": 3.182473200164021e-05,
"loss": 0.3337,
"step": 3580
},
{
"epoch": 0.6276772445143807,
"grad_norm": 2.5779225826263428,
"learning_rate": 3.1801300451057353e-05,
"loss": 0.3813,
"step": 3590
},
{
"epoch": 0.6294256490952006,
"grad_norm": 3.204908847808838,
"learning_rate": 3.177786890047449e-05,
"loss": 0.2671,
"step": 3600
},
{
"epoch": 0.6311740536760206,
"grad_norm": 2.594581365585327,
"learning_rate": 3.1754437349891635e-05,
"loss": 0.3638,
"step": 3610
},
{
"epoch": 0.6329224582568407,
"grad_norm": 3.348681688308716,
"learning_rate": 3.173100579930877e-05,
"loss": 0.3639,
"step": 3620
},
{
"epoch": 0.6346708628376606,
"grad_norm": 3.975121259689331,
"learning_rate": 3.170757424872591e-05,
"loss": 0.3239,
"step": 3630
},
{
"epoch": 0.6364192674184806,
"grad_norm": 3.4465627670288086,
"learning_rate": 3.168414269814305e-05,
"loss": 0.3359,
"step": 3640
},
{
"epoch": 0.6381676719993007,
"grad_norm": 3.444370985031128,
"learning_rate": 3.166071114756019e-05,
"loss": 0.3387,
"step": 3650
},
{
"epoch": 0.6399160765801206,
"grad_norm": 2.481161117553711,
"learning_rate": 3.1637279596977334e-05,
"loss": 0.3292,
"step": 3660
},
{
"epoch": 0.6416644811609407,
"grad_norm": 3.121288537979126,
"learning_rate": 3.161384804639447e-05,
"loss": 0.3552,
"step": 3670
},
{
"epoch": 0.6434128857417607,
"grad_norm": 4.9208221435546875,
"learning_rate": 3.1590416495811615e-05,
"loss": 0.3309,
"step": 3680
},
{
"epoch": 0.6451612903225806,
"grad_norm": 2.5919346809387207,
"learning_rate": 3.156698494522875e-05,
"loss": 0.3432,
"step": 3690
},
{
"epoch": 0.6469096949034007,
"grad_norm": 3.173069953918457,
"learning_rate": 3.1543553394645896e-05,
"loss": 0.3804,
"step": 3700
},
{
"epoch": 0.6486580994842206,
"grad_norm": 5.001594066619873,
"learning_rate": 3.1520121844063034e-05,
"loss": 0.3335,
"step": 3710
},
{
"epoch": 0.6504065040650406,
"grad_norm": 2.092214822769165,
"learning_rate": 3.149669029348018e-05,
"loss": 0.3362,
"step": 3720
},
{
"epoch": 0.6521549086458607,
"grad_norm": 4.916845798492432,
"learning_rate": 3.1473258742897315e-05,
"loss": 0.3568,
"step": 3730
},
{
"epoch": 0.6539033132266806,
"grad_norm": 2.18415904045105,
"learning_rate": 3.144982719231445e-05,
"loss": 0.3235,
"step": 3740
},
{
"epoch": 0.6556517178075006,
"grad_norm": 2.239564895629883,
"learning_rate": 3.1426395641731596e-05,
"loss": 0.3591,
"step": 3750
},
{
"epoch": 0.6574001223883207,
"grad_norm": 2.547616481781006,
"learning_rate": 3.140296409114873e-05,
"loss": 0.3603,
"step": 3760
},
{
"epoch": 0.6591485269691406,
"grad_norm": 2.3699333667755127,
"learning_rate": 3.137953254056588e-05,
"loss": 0.3346,
"step": 3770
},
{
"epoch": 0.6608969315499607,
"grad_norm": 2.7866456508636475,
"learning_rate": 3.1356100989983014e-05,
"loss": 0.3621,
"step": 3780
},
{
"epoch": 0.6626453361307807,
"grad_norm": 2.5254106521606445,
"learning_rate": 3.133266943940016e-05,
"loss": 0.3184,
"step": 3790
},
{
"epoch": 0.6643937407116006,
"grad_norm": 4.498997211456299,
"learning_rate": 3.1309237888817296e-05,
"loss": 0.301,
"step": 3800
},
{
"epoch": 0.6661421452924207,
"grad_norm": 3.224710702896118,
"learning_rate": 3.128580633823444e-05,
"loss": 0.3203,
"step": 3810
},
{
"epoch": 0.6678905498732407,
"grad_norm": 2.7599270343780518,
"learning_rate": 3.126237478765158e-05,
"loss": 0.3336,
"step": 3820
},
{
"epoch": 0.6696389544540606,
"grad_norm": 4.111194133758545,
"learning_rate": 3.123894323706872e-05,
"loss": 0.3643,
"step": 3830
},
{
"epoch": 0.6713873590348807,
"grad_norm": 9.280810356140137,
"learning_rate": 3.121551168648586e-05,
"loss": 0.3294,
"step": 3840
},
{
"epoch": 0.6731357636157007,
"grad_norm": 3.527141809463501,
"learning_rate": 3.1192080135902995e-05,
"loss": 0.3035,
"step": 3850
},
{
"epoch": 0.6748841681965206,
"grad_norm": 2.8929083347320557,
"learning_rate": 3.116864858532014e-05,
"loss": 0.3047,
"step": 3860
},
{
"epoch": 0.6766325727773407,
"grad_norm": 4.317626476287842,
"learning_rate": 3.1145217034737276e-05,
"loss": 0.352,
"step": 3870
},
{
"epoch": 0.6783809773581607,
"grad_norm": 3.2065703868865967,
"learning_rate": 3.1121785484154413e-05,
"loss": 0.352,
"step": 3880
},
{
"epoch": 0.6801293819389806,
"grad_norm": 2.805239677429199,
"learning_rate": 3.109835393357156e-05,
"loss": 0.3539,
"step": 3890
},
{
"epoch": 0.6818777865198007,
"grad_norm": 3.5220377445220947,
"learning_rate": 3.1074922382988695e-05,
"loss": 0.3313,
"step": 3900
},
{
"epoch": 0.6836261911006207,
"grad_norm": 3.755730628967285,
"learning_rate": 3.105149083240583e-05,
"loss": 0.321,
"step": 3910
},
{
"epoch": 0.6853745956814407,
"grad_norm": 3.296947956085205,
"learning_rate": 3.1028059281822976e-05,
"loss": 0.3512,
"step": 3920
},
{
"epoch": 0.6871230002622607,
"grad_norm": 3.954050064086914,
"learning_rate": 3.100462773124011e-05,
"loss": 0.3563,
"step": 3930
},
{
"epoch": 0.6888714048430807,
"grad_norm": 3.8162853717803955,
"learning_rate": 3.098119618065726e-05,
"loss": 0.3559,
"step": 3940
},
{
"epoch": 0.6906198094239007,
"grad_norm": 3.416830062866211,
"learning_rate": 3.0957764630074394e-05,
"loss": 0.3166,
"step": 3950
},
{
"epoch": 0.6923682140047207,
"grad_norm": 3.054938554763794,
"learning_rate": 3.0936676234549826e-05,
"loss": 0.3129,
"step": 3960
},
{
"epoch": 0.6941166185855407,
"grad_norm": 3.042231798171997,
"learning_rate": 3.091324468396696e-05,
"loss": 0.3109,
"step": 3970
},
{
"epoch": 0.6958650231663607,
"grad_norm": 3.516209125518799,
"learning_rate": 3.088981313338411e-05,
"loss": 0.321,
"step": 3980
},
{
"epoch": 0.6976134277471807,
"grad_norm": 3.2602617740631104,
"learning_rate": 3.0866381582801244e-05,
"loss": 0.3351,
"step": 3990
},
{
"epoch": 0.6993618323280008,
"grad_norm": 2.974976062774658,
"learning_rate": 3.084295003221839e-05,
"loss": 0.3029,
"step": 4000
},
{
"epoch": 0.7011102369088207,
"grad_norm": 4.38007926940918,
"learning_rate": 3.0819518481635525e-05,
"loss": 0.3487,
"step": 4010
},
{
"epoch": 0.7028586414896407,
"grad_norm": 3.0209977626800537,
"learning_rate": 3.079608693105267e-05,
"loss": 0.3643,
"step": 4020
},
{
"epoch": 0.7046070460704607,
"grad_norm": 3.7745320796966553,
"learning_rate": 3.0772655380469806e-05,
"loss": 0.3621,
"step": 4030
},
{
"epoch": 0.7063554506512807,
"grad_norm": 2.3382375240325928,
"learning_rate": 3.074922382988695e-05,
"loss": 0.3187,
"step": 4040
},
{
"epoch": 0.7081038552321007,
"grad_norm": 4.3452558517456055,
"learning_rate": 3.072579227930409e-05,
"loss": 0.3314,
"step": 4050
},
{
"epoch": 0.7098522598129207,
"grad_norm": 4.7545695304870605,
"learning_rate": 3.0702360728721225e-05,
"loss": 0.3112,
"step": 4060
},
{
"epoch": 0.7116006643937407,
"grad_norm": 1.974089503288269,
"learning_rate": 3.067892917813837e-05,
"loss": 0.3323,
"step": 4070
},
{
"epoch": 0.7133490689745607,
"grad_norm": 9.842713356018066,
"learning_rate": 3.0655497627555506e-05,
"loss": 0.3103,
"step": 4080
},
{
"epoch": 0.7150974735553807,
"grad_norm": 2.8412296772003174,
"learning_rate": 3.063206607697264e-05,
"loss": 0.3463,
"step": 4090
},
{
"epoch": 0.7168458781362007,
"grad_norm": 2.391716480255127,
"learning_rate": 3.060863452638979e-05,
"loss": 0.3147,
"step": 4100
},
{
"epoch": 0.7185942827170207,
"grad_norm": 3.2871251106262207,
"learning_rate": 3.0585202975806924e-05,
"loss": 0.329,
"step": 4110
},
{
"epoch": 0.7203426872978407,
"grad_norm": 3.405353307723999,
"learning_rate": 3.056177142522406e-05,
"loss": 0.3005,
"step": 4120
},
{
"epoch": 0.7220910918786607,
"grad_norm": 3.316866636276245,
"learning_rate": 3.0538339874641206e-05,
"loss": 0.3368,
"step": 4130
},
{
"epoch": 0.7238394964594808,
"grad_norm": 2.363496780395508,
"learning_rate": 3.0517251479116634e-05,
"loss": 0.3416,
"step": 4140
},
{
"epoch": 0.7255879010403007,
"grad_norm": 4.142341136932373,
"learning_rate": 3.0493819928533774e-05,
"loss": 0.3067,
"step": 4150
},
{
"epoch": 0.7273363056211207,
"grad_norm": 2.67199444770813,
"learning_rate": 3.0470388377950915e-05,
"loss": 0.3286,
"step": 4160
},
{
"epoch": 0.7290847102019408,
"grad_norm": 1.8342429399490356,
"learning_rate": 3.0446956827368056e-05,
"loss": 0.344,
"step": 4170
},
{
"epoch": 0.7308331147827607,
"grad_norm": 3.3271865844726562,
"learning_rate": 3.0423525276785196e-05,
"loss": 0.3116,
"step": 4180
},
{
"epoch": 0.7325815193635807,
"grad_norm": 3.249887466430664,
"learning_rate": 3.0400093726202337e-05,
"loss": 0.3075,
"step": 4190
},
{
"epoch": 0.7343299239444008,
"grad_norm": 3.5678811073303223,
"learning_rate": 3.0376662175619477e-05,
"loss": 0.3138,
"step": 4200
},
{
"epoch": 0.7360783285252207,
"grad_norm": 3.5020499229431152,
"learning_rate": 3.03555737800949e-05,
"loss": 0.3042,
"step": 4210
},
{
"epoch": 0.7378267331060407,
"grad_norm": 4.059022903442383,
"learning_rate": 3.033214222951204e-05,
"loss": 0.3776,
"step": 4220
},
{
"epoch": 0.7395751376868608,
"grad_norm": 2.5982818603515625,
"learning_rate": 3.0311053833987464e-05,
"loss": 0.3309,
"step": 4230
},
{
"epoch": 0.7413235422676807,
"grad_norm": 2.723339557647705,
"learning_rate": 3.0287622283404605e-05,
"loss": 0.3199,
"step": 4240
},
{
"epoch": 0.7430719468485008,
"grad_norm": 2.9116008281707764,
"learning_rate": 3.0264190732821745e-05,
"loss": 0.3283,
"step": 4250
},
{
"epoch": 0.7448203514293208,
"grad_norm": 2.722931385040283,
"learning_rate": 3.0240759182238886e-05,
"loss": 0.3321,
"step": 4260
},
{
"epoch": 0.7465687560101407,
"grad_norm": 2.779557228088379,
"learning_rate": 3.0217327631656026e-05,
"loss": 0.3497,
"step": 4270
},
{
"epoch": 0.7483171605909608,
"grad_norm": 3.886526346206665,
"learning_rate": 3.0193896081073167e-05,
"loss": 0.3274,
"step": 4280
},
{
"epoch": 0.7500655651717808,
"grad_norm": 2.644566059112549,
"learning_rate": 3.0170464530490308e-05,
"loss": 0.28,
"step": 4290
},
{
"epoch": 0.7518139697526007,
"grad_norm": 3.2041871547698975,
"learning_rate": 3.0147032979907448e-05,
"loss": 0.3403,
"step": 4300
},
{
"epoch": 0.7535623743334208,
"grad_norm": 3.1360421180725098,
"learning_rate": 3.0123601429324585e-05,
"loss": 0.3318,
"step": 4310
},
{
"epoch": 0.7553107789142408,
"grad_norm": 3.71610164642334,
"learning_rate": 3.0100169878741726e-05,
"loss": 0.3425,
"step": 4320
},
{
"epoch": 0.7570591834950607,
"grad_norm": 3.1907827854156494,
"learning_rate": 3.0076738328158867e-05,
"loss": 0.325,
"step": 4330
},
{
"epoch": 0.7588075880758808,
"grad_norm": 3.3181164264678955,
"learning_rate": 3.0053306777576007e-05,
"loss": 0.3117,
"step": 4340
},
{
"epoch": 0.7605559926567007,
"grad_norm": 2.3667750358581543,
"learning_rate": 3.0029875226993148e-05,
"loss": 0.2832,
"step": 4350
},
{
"epoch": 0.7623043972375207,
"grad_norm": 3.1936376094818115,
"learning_rate": 3.000644367641029e-05,
"loss": 0.3561,
"step": 4360
},
{
"epoch": 0.7640528018183408,
"grad_norm": 2.5650253295898438,
"learning_rate": 2.998301212582743e-05,
"loss": 0.3113,
"step": 4370
},
{
"epoch": 0.7658012063991607,
"grad_norm": 4.8962082862854,
"learning_rate": 2.995958057524457e-05,
"loss": 0.3126,
"step": 4380
},
{
"epoch": 0.7675496109799808,
"grad_norm": 2.574734926223755,
"learning_rate": 2.993614902466171e-05,
"loss": 0.3395,
"step": 4390
},
{
"epoch": 0.7692980155608008,
"grad_norm": 2.9846832752227783,
"learning_rate": 2.991271747407885e-05,
"loss": 0.2877,
"step": 4400
},
{
"epoch": 0.7710464201416207,
"grad_norm": 2.8485524654388428,
"learning_rate": 2.988928592349599e-05,
"loss": 0.3293,
"step": 4410
},
{
"epoch": 0.7727948247224408,
"grad_norm": 3.241642475128174,
"learning_rate": 2.986585437291313e-05,
"loss": 0.3082,
"step": 4420
},
{
"epoch": 0.7745432293032608,
"grad_norm": 3.4064993858337402,
"learning_rate": 2.984242282233027e-05,
"loss": 0.3417,
"step": 4430
},
{
"epoch": 0.7762916338840807,
"grad_norm": 2.7267072200775146,
"learning_rate": 2.981899127174741e-05,
"loss": 0.2924,
"step": 4440
},
{
"epoch": 0.7780400384649008,
"grad_norm": 4.6471266746521,
"learning_rate": 2.979555972116455e-05,
"loss": 0.3543,
"step": 4450
},
{
"epoch": 0.7797884430457208,
"grad_norm": 3.8437321186065674,
"learning_rate": 2.977212817058169e-05,
"loss": 0.2831,
"step": 4460
},
{
"epoch": 0.7815368476265407,
"grad_norm": 2.720120429992676,
"learning_rate": 2.974869661999883e-05,
"loss": 0.3661,
"step": 4470
},
{
"epoch": 0.7832852522073608,
"grad_norm": 7.060765743255615,
"learning_rate": 2.9725265069415972e-05,
"loss": 0.3187,
"step": 4480
},
{
"epoch": 0.7850336567881808,
"grad_norm": 3.308164358139038,
"learning_rate": 2.9701833518833113e-05,
"loss": 0.2999,
"step": 4490
},
{
"epoch": 0.7867820613690008,
"grad_norm": 2.223618745803833,
"learning_rate": 2.9678401968250253e-05,
"loss": 0.2943,
"step": 4500
},
{
"epoch": 0.7885304659498208,
"grad_norm": 2.492687463760376,
"learning_rate": 2.9654970417667394e-05,
"loss": 0.2852,
"step": 4510
},
{
"epoch": 0.7902788705306408,
"grad_norm": 2.5673835277557373,
"learning_rate": 2.9631538867084534e-05,
"loss": 0.3437,
"step": 4520
},
{
"epoch": 0.7920272751114608,
"grad_norm": 2.192340850830078,
"learning_rate": 2.960810731650167e-05,
"loss": 0.3324,
"step": 4530
},
{
"epoch": 0.7937756796922808,
"grad_norm": 4.07915735244751,
"learning_rate": 2.9584675765918812e-05,
"loss": 0.2924,
"step": 4540
},
{
"epoch": 0.7955240842731008,
"grad_norm": 2.894836187362671,
"learning_rate": 2.9561244215335953e-05,
"loss": 0.2772,
"step": 4550
},
{
"epoch": 0.7972724888539208,
"grad_norm": 5.969550132751465,
"learning_rate": 2.9537812664753093e-05,
"loss": 0.2964,
"step": 4560
},
{
"epoch": 0.7990208934347408,
"grad_norm": 3.9820101261138916,
"learning_rate": 2.9514381114170234e-05,
"loss": 0.3054,
"step": 4570
},
{
"epoch": 0.8007692980155608,
"grad_norm": 3.4480111598968506,
"learning_rate": 2.9490949563587374e-05,
"loss": 0.3456,
"step": 4580
},
{
"epoch": 0.8025177025963808,
"grad_norm": 2.6892831325531006,
"learning_rate": 2.9467518013004515e-05,
"loss": 0.2845,
"step": 4590
},
{
"epoch": 0.8042661071772008,
"grad_norm": 3.488251209259033,
"learning_rate": 2.9444086462421656e-05,
"loss": 0.311,
"step": 4600
},
{
"epoch": 0.8060145117580209,
"grad_norm": 2.8509933948516846,
"learning_rate": 2.9420654911838796e-05,
"loss": 0.3282,
"step": 4610
},
{
"epoch": 0.8077629163388408,
"grad_norm": 2.416158676147461,
"learning_rate": 2.9397223361255937e-05,
"loss": 0.3229,
"step": 4620
},
{
"epoch": 0.8095113209196608,
"grad_norm": 3.2727701663970947,
"learning_rate": 2.9373791810673077e-05,
"loss": 0.2934,
"step": 4630
},
{
"epoch": 0.8112597255004809,
"grad_norm": 3.619755268096924,
"learning_rate": 2.9350360260090215e-05,
"loss": 0.3367,
"step": 4640
},
{
"epoch": 0.8130081300813008,
"grad_norm": 2.848443031311035,
"learning_rate": 2.9326928709507355e-05,
"loss": 0.3027,
"step": 4650
},
{
"epoch": 0.8147565346621208,
"grad_norm": 2.9893646240234375,
"learning_rate": 2.9303497158924492e-05,
"loss": 0.2942,
"step": 4660
},
{
"epoch": 0.8165049392429408,
"grad_norm": 2.021780014038086,
"learning_rate": 2.9280065608341633e-05,
"loss": 0.3451,
"step": 4670
},
{
"epoch": 0.8182533438237608,
"grad_norm": 2.3608715534210205,
"learning_rate": 2.9256634057758774e-05,
"loss": 0.2945,
"step": 4680
},
{
"epoch": 0.8200017484045808,
"grad_norm": 2.520504951477051,
"learning_rate": 2.9233202507175914e-05,
"loss": 0.3133,
"step": 4690
},
{
"epoch": 0.8217501529854008,
"grad_norm": 2.2169888019561768,
"learning_rate": 2.9209770956593055e-05,
"loss": 0.2961,
"step": 4700
},
{
"epoch": 0.8234985575662208,
"grad_norm": 2.2153303623199463,
"learning_rate": 2.9186339406010192e-05,
"loss": 0.3015,
"step": 4710
},
{
"epoch": 0.8252469621470409,
"grad_norm": 3.4472455978393555,
"learning_rate": 2.9162907855427332e-05,
"loss": 0.2829,
"step": 4720
},
{
"epoch": 0.8269953667278608,
"grad_norm": 2.900556802749634,
"learning_rate": 2.9139476304844473e-05,
"loss": 0.3393,
"step": 4730
},
{
"epoch": 0.8287437713086808,
"grad_norm": 5.161322593688965,
"learning_rate": 2.9116044754261614e-05,
"loss": 0.2681,
"step": 4740
},
{
"epoch": 0.8304921758895009,
"grad_norm": 2.231976270675659,
"learning_rate": 2.9092613203678754e-05,
"loss": 0.3025,
"step": 4750
},
{
"epoch": 0.8322405804703208,
"grad_norm": 3.7248172760009766,
"learning_rate": 2.9069181653095895e-05,
"loss": 0.325,
"step": 4760
},
{
"epoch": 0.8339889850511408,
"grad_norm": 2.6625943183898926,
"learning_rate": 2.9045750102513035e-05,
"loss": 0.3331,
"step": 4770
},
{
"epoch": 0.8357373896319609,
"grad_norm": 4.35618782043457,
"learning_rate": 2.9022318551930176e-05,
"loss": 0.3164,
"step": 4780
},
{
"epoch": 0.8374857942127808,
"grad_norm": 3.5341124534606934,
"learning_rate": 2.8998887001347317e-05,
"loss": 0.3171,
"step": 4790
},
{
"epoch": 0.8392341987936008,
"grad_norm": 2.554603099822998,
"learning_rate": 2.8975455450764457e-05,
"loss": 0.2849,
"step": 4800
},
{
"epoch": 0.8409826033744209,
"grad_norm": 4.446366310119629,
"learning_rate": 2.8952023900181598e-05,
"loss": 0.3221,
"step": 4810
},
{
"epoch": 0.8427310079552408,
"grad_norm": 2.116299629211426,
"learning_rate": 2.8928592349598735e-05,
"loss": 0.2904,
"step": 4820
},
{
"epoch": 0.8444794125360608,
"grad_norm": 2.078610897064209,
"learning_rate": 2.8905160799015876e-05,
"loss": 0.3047,
"step": 4830
},
{
"epoch": 0.8462278171168809,
"grad_norm": 2.4735875129699707,
"learning_rate": 2.8881729248433016e-05,
"loss": 0.29,
"step": 4840
},
{
"epoch": 0.8479762216977008,
"grad_norm": 2.858583688735962,
"learning_rate": 2.8858297697850157e-05,
"loss": 0.3244,
"step": 4850
},
{
"epoch": 0.8497246262785209,
"grad_norm": 2.644150733947754,
"learning_rate": 2.8834866147267297e-05,
"loss": 0.3133,
"step": 4860
},
{
"epoch": 0.8514730308593409,
"grad_norm": 3.146573305130005,
"learning_rate": 2.8811434596684438e-05,
"loss": 0.2714,
"step": 4870
},
{
"epoch": 0.8532214354401608,
"grad_norm": 3.7888834476470947,
"learning_rate": 2.878800304610158e-05,
"loss": 0.2532,
"step": 4880
},
{
"epoch": 0.8549698400209809,
"grad_norm": 2.7797985076904297,
"learning_rate": 2.876457149551872e-05,
"loss": 0.3136,
"step": 4890
},
{
"epoch": 0.8567182446018009,
"grad_norm": 2.3448262214660645,
"learning_rate": 2.874113994493586e-05,
"loss": 0.2965,
"step": 4900
},
{
"epoch": 0.8584666491826208,
"grad_norm": 3.9640755653381348,
"learning_rate": 2.8717708394353e-05,
"loss": 0.2687,
"step": 4910
},
{
"epoch": 0.8602150537634409,
"grad_norm": 2.2898406982421875,
"learning_rate": 2.8694276843770137e-05,
"loss": 0.3079,
"step": 4920
},
{
"epoch": 0.8619634583442609,
"grad_norm": 2.158766508102417,
"learning_rate": 2.8670845293187278e-05,
"loss": 0.3238,
"step": 4930
},
{
"epoch": 0.8637118629250808,
"grad_norm": 2.283958911895752,
"learning_rate": 2.864741374260442e-05,
"loss": 0.2864,
"step": 4940
},
{
"epoch": 0.8654602675059009,
"grad_norm": 2.1821975708007812,
"learning_rate": 2.862398219202156e-05,
"loss": 0.2885,
"step": 4950
},
{
"epoch": 0.8672086720867209,
"grad_norm": 3.1854681968688965,
"learning_rate": 2.86005506414387e-05,
"loss": 0.2939,
"step": 4960
},
{
"epoch": 0.8689570766675409,
"grad_norm": 3.484039306640625,
"learning_rate": 2.857711909085584e-05,
"loss": 0.302,
"step": 4970
},
{
"epoch": 0.8707054812483609,
"grad_norm": 2.4981396198272705,
"learning_rate": 2.855368754027298e-05,
"loss": 0.2504,
"step": 4980
},
{
"epoch": 0.8724538858291808,
"grad_norm": 2.401259422302246,
"learning_rate": 2.853025598969012e-05,
"loss": 0.3036,
"step": 4990
},
{
"epoch": 0.8742022904100009,
"grad_norm": 2.8834569454193115,
"learning_rate": 2.8506824439107262e-05,
"loss": 0.2515,
"step": 5000
},
{
"epoch": 0.8759506949908209,
"grad_norm": 2.483109474182129,
"learning_rate": 2.8483392888524403e-05,
"loss": 0.2575,
"step": 5010
},
{
"epoch": 0.8776990995716408,
"grad_norm": 4.260477542877197,
"learning_rate": 2.8459961337941543e-05,
"loss": 0.322,
"step": 5020
},
{
"epoch": 0.8794475041524609,
"grad_norm": 2.232416868209839,
"learning_rate": 2.843652978735868e-05,
"loss": 0.3196,
"step": 5030
},
{
"epoch": 0.8811959087332809,
"grad_norm": 4.053016185760498,
"learning_rate": 2.841309823677582e-05,
"loss": 0.2948,
"step": 5040
},
{
"epoch": 0.8829443133141008,
"grad_norm": 3.5699994564056396,
"learning_rate": 2.838966668619296e-05,
"loss": 0.2459,
"step": 5050
},
{
"epoch": 0.8846927178949209,
"grad_norm": 2.5316355228424072,
"learning_rate": 2.8366235135610102e-05,
"loss": 0.2907,
"step": 5060
},
{
"epoch": 0.8864411224757409,
"grad_norm": 3.3079192638397217,
"learning_rate": 2.8342803585027243e-05,
"loss": 0.335,
"step": 5070
},
{
"epoch": 0.8881895270565608,
"grad_norm": 3.4846723079681396,
"learning_rate": 2.8319372034444383e-05,
"loss": 0.3312,
"step": 5080
},
{
"epoch": 0.8899379316373809,
"grad_norm": 2.5676932334899902,
"learning_rate": 2.8295940483861524e-05,
"loss": 0.2999,
"step": 5090
},
{
"epoch": 0.8916863362182009,
"grad_norm": 3.0182511806488037,
"learning_rate": 2.8272508933278665e-05,
"loss": 0.291,
"step": 5100
},
{
"epoch": 0.8934347407990209,
"grad_norm": 2.7049145698547363,
"learning_rate": 2.8249077382695805e-05,
"loss": 0.2789,
"step": 5110
},
{
"epoch": 0.8951831453798409,
"grad_norm": 4.516125202178955,
"learning_rate": 2.8225645832112946e-05,
"loss": 0.3439,
"step": 5120
},
{
"epoch": 0.8969315499606609,
"grad_norm": 2.7721059322357178,
"learning_rate": 2.820221428153008e-05,
"loss": 0.2949,
"step": 5130
},
{
"epoch": 0.8986799545414809,
"grad_norm": 3.285956859588623,
"learning_rate": 2.817878273094722e-05,
"loss": 0.2931,
"step": 5140
},
{
"epoch": 0.9004283591223009,
"grad_norm": 2.932749032974243,
"learning_rate": 2.815535118036436e-05,
"loss": 0.223,
"step": 5150
},
{
"epoch": 0.9021767637031209,
"grad_norm": 2.5357871055603027,
"learning_rate": 2.81319196297815e-05,
"loss": 0.2869,
"step": 5160
},
{
"epoch": 0.9039251682839409,
"grad_norm": 4.862621784210205,
"learning_rate": 2.8108488079198642e-05,
"loss": 0.3179,
"step": 5170
},
{
"epoch": 0.9056735728647609,
"grad_norm": 3.4447200298309326,
"learning_rate": 2.8085056528615782e-05,
"loss": 0.3325,
"step": 5180
},
{
"epoch": 0.907421977445581,
"grad_norm": 4.216275691986084,
"learning_rate": 2.8061624978032923e-05,
"loss": 0.3122,
"step": 5190
},
{
"epoch": 0.9091703820264009,
"grad_norm": 2.99465012550354,
"learning_rate": 2.8038193427450064e-05,
"loss": 0.335,
"step": 5200
},
{
"epoch": 0.9109187866072209,
"grad_norm": 2.5309603214263916,
"learning_rate": 2.80147618768672e-05,
"loss": 0.261,
"step": 5210
},
{
"epoch": 0.912667191188041,
"grad_norm": 3.232043504714966,
"learning_rate": 2.799133032628434e-05,
"loss": 0.3165,
"step": 5220
},
{
"epoch": 0.9144155957688609,
"grad_norm": 2.638690948486328,
"learning_rate": 2.7967898775701482e-05,
"loss": 0.303,
"step": 5230
},
{
"epoch": 0.9161640003496809,
"grad_norm": 2.458595037460327,
"learning_rate": 2.7944467225118623e-05,
"loss": 0.263,
"step": 5240
},
{
"epoch": 0.917912404930501,
"grad_norm": 2.526832342147827,
"learning_rate": 2.7921035674535763e-05,
"loss": 0.2814,
"step": 5250
},
{
"epoch": 0.9196608095113209,
"grad_norm": 3.0694055557250977,
"learning_rate": 2.7897604123952904e-05,
"loss": 0.2664,
"step": 5260
},
{
"epoch": 0.9214092140921409,
"grad_norm": 3.2216978073120117,
"learning_rate": 2.7874172573370044e-05,
"loss": 0.2943,
"step": 5270
},
{
"epoch": 0.923157618672961,
"grad_norm": 2.958218574523926,
"learning_rate": 2.7850741022787185e-05,
"loss": 0.2932,
"step": 5280
},
{
"epoch": 0.9249060232537809,
"grad_norm": 3.8881359100341797,
"learning_rate": 2.7827309472204326e-05,
"loss": 0.2968,
"step": 5290
},
{
"epoch": 0.926654427834601,
"grad_norm": 2.983222723007202,
"learning_rate": 2.7803877921621466e-05,
"loss": 0.2567,
"step": 5300
},
{
"epoch": 0.9284028324154209,
"grad_norm": 2.482820749282837,
"learning_rate": 2.7780446371038607e-05,
"loss": 0.3172,
"step": 5310
},
{
"epoch": 0.9301512369962409,
"grad_norm": 2.0078659057617188,
"learning_rate": 2.7757014820455744e-05,
"loss": 0.2927,
"step": 5320
},
{
"epoch": 0.931899641577061,
"grad_norm": 3.8458902835845947,
"learning_rate": 2.7733583269872884e-05,
"loss": 0.2618,
"step": 5330
},
{
"epoch": 0.9336480461578809,
"grad_norm": 4.100974082946777,
"learning_rate": 2.7710151719290025e-05,
"loss": 0.2704,
"step": 5340
},
{
"epoch": 0.9353964507387009,
"grad_norm": 4.017348766326904,
"learning_rate": 2.7686720168707166e-05,
"loss": 0.2909,
"step": 5350
},
{
"epoch": 0.937144855319521,
"grad_norm": 2.4890189170837402,
"learning_rate": 2.7663288618124306e-05,
"loss": 0.3533,
"step": 5360
},
{
"epoch": 0.9388932599003409,
"grad_norm": 2.674192190170288,
"learning_rate": 2.7639857067541447e-05,
"loss": 0.2778,
"step": 5370
},
{
"epoch": 0.9406416644811609,
"grad_norm": 2.096602439880371,
"learning_rate": 2.7616425516958587e-05,
"loss": 0.2865,
"step": 5380
},
{
"epoch": 0.942390069061981,
"grad_norm": 3.5576303005218506,
"learning_rate": 2.7592993966375728e-05,
"loss": 0.2894,
"step": 5390
},
{
"epoch": 0.9441384736428009,
"grad_norm": 1.878792643547058,
"learning_rate": 2.756956241579287e-05,
"loss": 0.2778,
"step": 5400
},
{
"epoch": 0.9458868782236209,
"grad_norm": 3.251866579055786,
"learning_rate": 2.754613086521001e-05,
"loss": 0.2895,
"step": 5410
},
{
"epoch": 0.947635282804441,
"grad_norm": 3.257899522781372,
"learning_rate": 2.752269931462715e-05,
"loss": 0.3125,
"step": 5420
},
{
"epoch": 0.9493836873852609,
"grad_norm": 2.863107442855835,
"learning_rate": 2.7499267764044287e-05,
"loss": 0.2414,
"step": 5430
},
{
"epoch": 0.951132091966081,
"grad_norm": 3.4100799560546875,
"learning_rate": 2.7475836213461428e-05,
"loss": 0.2866,
"step": 5440
},
{
"epoch": 0.952880496546901,
"grad_norm": 2.9259746074676514,
"learning_rate": 2.7452404662878568e-05,
"loss": 0.2912,
"step": 5450
},
{
"epoch": 0.9546289011277209,
"grad_norm": 3.5859031677246094,
"learning_rate": 2.742897311229571e-05,
"loss": 0.2945,
"step": 5460
},
{
"epoch": 0.956377305708541,
"grad_norm": 2.341395378112793,
"learning_rate": 2.740554156171285e-05,
"loss": 0.29,
"step": 5470
},
{
"epoch": 0.958125710289361,
"grad_norm": 4.96193265914917,
"learning_rate": 2.738211001112999e-05,
"loss": 0.2655,
"step": 5480
},
{
"epoch": 0.9598741148701809,
"grad_norm": 2.6068100929260254,
"learning_rate": 2.735867846054713e-05,
"loss": 0.2954,
"step": 5490
},
{
"epoch": 0.961622519451001,
"grad_norm": 2.946169137954712,
"learning_rate": 2.733524690996427e-05,
"loss": 0.2622,
"step": 5500
},
{
"epoch": 0.963370924031821,
"grad_norm": 2.533318042755127,
"learning_rate": 2.731181535938141e-05,
"loss": 0.2773,
"step": 5510
},
{
"epoch": 0.9651193286126409,
"grad_norm": 3.2419304847717285,
"learning_rate": 2.7288383808798552e-05,
"loss": 0.2755,
"step": 5520
},
{
"epoch": 0.966867733193461,
"grad_norm": 6.12715482711792,
"learning_rate": 2.7264952258215693e-05,
"loss": 0.2713,
"step": 5530
},
{
"epoch": 0.968616137774281,
"grad_norm": 6.266939640045166,
"learning_rate": 2.724152070763283e-05,
"loss": 0.2615,
"step": 5540
},
{
"epoch": 0.970364542355101,
"grad_norm": 2.3439266681671143,
"learning_rate": 2.721808915704997e-05,
"loss": 0.2522,
"step": 5550
},
{
"epoch": 0.972112946935921,
"grad_norm": 3.9679603576660156,
"learning_rate": 2.719465760646711e-05,
"loss": 0.2881,
"step": 5560
},
{
"epoch": 0.973861351516741,
"grad_norm": 4.07214879989624,
"learning_rate": 2.7171226055884252e-05,
"loss": 0.31,
"step": 5570
},
{
"epoch": 0.975609756097561,
"grad_norm": 1.8089581727981567,
"learning_rate": 2.7147794505301392e-05,
"loss": 0.2693,
"step": 5580
},
{
"epoch": 0.977358160678381,
"grad_norm": 2.350628137588501,
"learning_rate": 2.7124362954718533e-05,
"loss": 0.2936,
"step": 5590
},
{
"epoch": 0.979106565259201,
"grad_norm": 2.955479383468628,
"learning_rate": 2.7100931404135674e-05,
"loss": 0.2859,
"step": 5600
},
{
"epoch": 0.980854969840021,
"grad_norm": 2.559128522872925,
"learning_rate": 2.7077499853552807e-05,
"loss": 0.2941,
"step": 5610
},
{
"epoch": 0.982603374420841,
"grad_norm": 4.118138313293457,
"learning_rate": 2.7054068302969948e-05,
"loss": 0.2653,
"step": 5620
},
{
"epoch": 0.9843517790016609,
"grad_norm": 2.770746946334839,
"learning_rate": 2.703063675238709e-05,
"loss": 0.2511,
"step": 5630
},
{
"epoch": 0.986100183582481,
"grad_norm": 2.902510404586792,
"learning_rate": 2.700720520180423e-05,
"loss": 0.2458,
"step": 5640
},
{
"epoch": 0.987848588163301,
"grad_norm": 2.2082626819610596,
"learning_rate": 2.698377365122137e-05,
"loss": 0.2704,
"step": 5650
},
{
"epoch": 0.9895969927441209,
"grad_norm": 3.753960371017456,
"learning_rate": 2.696034210063851e-05,
"loss": 0.2993,
"step": 5660
},
{
"epoch": 0.991345397324941,
"grad_norm": 2.009828567504883,
"learning_rate": 2.693691055005565e-05,
"loss": 0.2383,
"step": 5670
},
{
"epoch": 0.993093801905761,
"grad_norm": 2.961215019226074,
"learning_rate": 2.691347899947279e-05,
"loss": 0.2761,
"step": 5680
},
{
"epoch": 0.994842206486581,
"grad_norm": 4.399358749389648,
"learning_rate": 2.6890047448889932e-05,
"loss": 0.2597,
"step": 5690
},
{
"epoch": 0.996590611067401,
"grad_norm": 2.652677059173584,
"learning_rate": 2.6866615898307073e-05,
"loss": 0.2983,
"step": 5700
},
{
"epoch": 0.998339015648221,
"grad_norm": 2.0003232955932617,
"learning_rate": 2.6843184347724213e-05,
"loss": 0.2542,
"step": 5710
},
{
"epoch": 1.000087420229041,
"grad_norm": 2.8039956092834473,
"learning_rate": 2.681975279714135e-05,
"loss": 0.264,
"step": 5720
},
{
"epoch": 1.001835824809861,
"grad_norm": 2.23201322555542,
"learning_rate": 2.679632124655849e-05,
"loss": 0.2583,
"step": 5730
},
{
"epoch": 1.003584229390681,
"grad_norm": 3.3724849224090576,
"learning_rate": 2.677288969597563e-05,
"loss": 0.2173,
"step": 5740
},
{
"epoch": 1.005332633971501,
"grad_norm": 2.6445441246032715,
"learning_rate": 2.6749458145392772e-05,
"loss": 0.2508,
"step": 5750
},
{
"epoch": 1.007081038552321,
"grad_norm": 2.4510304927825928,
"learning_rate": 2.6726026594809913e-05,
"loss": 0.2326,
"step": 5760
},
{
"epoch": 1.008829443133141,
"grad_norm": 2.9268946647644043,
"learning_rate": 2.6702595044227053e-05,
"loss": 0.2106,
"step": 5770
},
{
"epoch": 1.010577847713961,
"grad_norm": 2.347891330718994,
"learning_rate": 2.6679163493644194e-05,
"loss": 0.2641,
"step": 5780
},
{
"epoch": 1.012326252294781,
"grad_norm": 3.352431535720825,
"learning_rate": 2.6655731943061334e-05,
"loss": 0.2473,
"step": 5790
},
{
"epoch": 1.014074656875601,
"grad_norm": 1.9598578214645386,
"learning_rate": 2.6632300392478475e-05,
"loss": 0.2164,
"step": 5800
},
{
"epoch": 1.015823061456421,
"grad_norm": 1.9211621284484863,
"learning_rate": 2.6608868841895616e-05,
"loss": 0.211,
"step": 5810
},
{
"epoch": 1.017571466037241,
"grad_norm": 2.0851757526397705,
"learning_rate": 2.6585437291312756e-05,
"loss": 0.2116,
"step": 5820
},
{
"epoch": 1.019319870618061,
"grad_norm": 2.829580783843994,
"learning_rate": 2.6562005740729893e-05,
"loss": 0.2404,
"step": 5830
},
{
"epoch": 1.021068275198881,
"grad_norm": 3.872819185256958,
"learning_rate": 2.6538574190147034e-05,
"loss": 0.2386,
"step": 5840
},
{
"epoch": 1.022816679779701,
"grad_norm": 2.6188647747039795,
"learning_rate": 2.6515142639564175e-05,
"loss": 0.2521,
"step": 5850
},
{
"epoch": 1.024565084360521,
"grad_norm": 2.390606164932251,
"learning_rate": 2.6491711088981315e-05,
"loss": 0.2205,
"step": 5860
},
{
"epoch": 1.026313488941341,
"grad_norm": 3.985508918762207,
"learning_rate": 2.6468279538398456e-05,
"loss": 0.2025,
"step": 5870
},
{
"epoch": 1.028061893522161,
"grad_norm": 2.431910753250122,
"learning_rate": 2.6444847987815596e-05,
"loss": 0.2647,
"step": 5880
},
{
"epoch": 1.029810298102981,
"grad_norm": 2.83016300201416,
"learning_rate": 2.6421416437232737e-05,
"loss": 0.2238,
"step": 5890
},
{
"epoch": 1.0315587026838011,
"grad_norm": 2.0961086750030518,
"learning_rate": 2.6397984886649878e-05,
"loss": 0.232,
"step": 5900
},
{
"epoch": 1.033307107264621,
"grad_norm": 2.8335044384002686,
"learning_rate": 2.6374553336067018e-05,
"loss": 0.237,
"step": 5910
},
{
"epoch": 1.035055511845441,
"grad_norm": 3.199272871017456,
"learning_rate": 2.635112178548416e-05,
"loss": 0.1832,
"step": 5920
},
{
"epoch": 1.036803916426261,
"grad_norm": 3.307910919189453,
"learning_rate": 2.6327690234901296e-05,
"loss": 0.2298,
"step": 5930
},
{
"epoch": 1.038552321007081,
"grad_norm": 5.532860279083252,
"learning_rate": 2.6304258684318436e-05,
"loss": 0.2429,
"step": 5940
},
{
"epoch": 1.040300725587901,
"grad_norm": 2.590127468109131,
"learning_rate": 2.6280827133735577e-05,
"loss": 0.2157,
"step": 5950
},
{
"epoch": 1.0420491301687211,
"grad_norm": 2.8453683853149414,
"learning_rate": 2.6257395583152718e-05,
"loss": 0.2553,
"step": 5960
},
{
"epoch": 1.043797534749541,
"grad_norm": 1.8200639486312866,
"learning_rate": 2.6233964032569858e-05,
"loss": 0.2335,
"step": 5970
},
{
"epoch": 1.045545939330361,
"grad_norm": 2.9046294689178467,
"learning_rate": 2.6210532481987e-05,
"loss": 0.247,
"step": 5980
},
{
"epoch": 1.047294343911181,
"grad_norm": 5.429454326629639,
"learning_rate": 2.618710093140414e-05,
"loss": 0.2496,
"step": 5990
},
{
"epoch": 1.049042748492001,
"grad_norm": 5.531388282775879,
"learning_rate": 2.616366938082128e-05,
"loss": 0.2569,
"step": 6000
},
{
"epoch": 1.0493924294081651,
"eval_loss": 0.41312137246131897,
"eval_runtime": 1800.1269,
"eval_samples_per_second": 8.045,
"eval_steps_per_second": 1.006,
"step": 6002
},
{
"epoch": 1.0507911530728211,
"grad_norm": 1.860955834388733,
"learning_rate": 2.614023783023842e-05,
"loss": 0.2275,
"step": 6010
},
{
"epoch": 1.0525395576536412,
"grad_norm": 2.9169716835021973,
"learning_rate": 2.611680627965556e-05,
"loss": 0.2302,
"step": 6020
},
{
"epoch": 1.054287962234461,
"grad_norm": 2.0836966037750244,
"learning_rate": 2.6093374729072702e-05,
"loss": 0.2396,
"step": 6030
},
{
"epoch": 1.056036366815281,
"grad_norm": 1.6626900434494019,
"learning_rate": 2.606994317848984e-05,
"loss": 0.2381,
"step": 6040
},
{
"epoch": 1.057784771396101,
"grad_norm": 2.771097421646118,
"learning_rate": 2.604651162790698e-05,
"loss": 0.2185,
"step": 6050
},
{
"epoch": 1.059533175976921,
"grad_norm": 3.489532232284546,
"learning_rate": 2.602308007732412e-05,
"loss": 0.2591,
"step": 6060
},
{
"epoch": 1.0612815805577411,
"grad_norm": 2.6725568771362305,
"learning_rate": 2.599964852674126e-05,
"loss": 0.2444,
"step": 6070
},
{
"epoch": 1.0630299851385612,
"grad_norm": 1.9345289468765259,
"learning_rate": 2.59762169761584e-05,
"loss": 0.2628,
"step": 6080
},
{
"epoch": 1.064778389719381,
"grad_norm": 2.020622491836548,
"learning_rate": 2.595278542557554e-05,
"loss": 0.2586,
"step": 6090
},
{
"epoch": 1.066526794300201,
"grad_norm": 2.4979703426361084,
"learning_rate": 2.592935387499268e-05,
"loss": 0.2441,
"step": 6100
},
{
"epoch": 1.068275198881021,
"grad_norm": 2.5634591579437256,
"learning_rate": 2.5905922324409816e-05,
"loss": 0.2361,
"step": 6110
},
{
"epoch": 1.070023603461841,
"grad_norm": 2.3151049613952637,
"learning_rate": 2.5882490773826957e-05,
"loss": 0.2465,
"step": 6120
},
{
"epoch": 1.0717720080426612,
"grad_norm": 1.7265106439590454,
"learning_rate": 2.5859059223244097e-05,
"loss": 0.2395,
"step": 6130
},
{
"epoch": 1.073520412623481,
"grad_norm": 2.878922462463379,
"learning_rate": 2.5835627672661238e-05,
"loss": 0.2371,
"step": 6140
},
{
"epoch": 1.075268817204301,
"grad_norm": 3.5647659301757812,
"learning_rate": 2.581219612207838e-05,
"loss": 0.218,
"step": 6150
},
{
"epoch": 1.077017221785121,
"grad_norm": 3.4102213382720947,
"learning_rate": 2.578876457149552e-05,
"loss": 0.2137,
"step": 6160
},
{
"epoch": 1.078765626365941,
"grad_norm": 4.656369209289551,
"learning_rate": 2.576533302091266e-05,
"loss": 0.2058,
"step": 6170
},
{
"epoch": 1.0805140309467611,
"grad_norm": 2.1041653156280518,
"learning_rate": 2.57419014703298e-05,
"loss": 0.2067,
"step": 6180
},
{
"epoch": 1.0822624355275812,
"grad_norm": 2.1586110591888428,
"learning_rate": 2.571846991974694e-05,
"loss": 0.2056,
"step": 6190
},
{
"epoch": 1.084010840108401,
"grad_norm": 1.9681655168533325,
"learning_rate": 2.569503836916408e-05,
"loss": 0.184,
"step": 6200
},
{
"epoch": 1.085759244689221,
"grad_norm": 2.8586220741271973,
"learning_rate": 2.5671606818581222e-05,
"loss": 0.2226,
"step": 6210
},
{
"epoch": 1.087507649270041,
"grad_norm": 7.736782073974609,
"learning_rate": 2.564817526799836e-05,
"loss": 0.1919,
"step": 6220
},
{
"epoch": 1.089256053850861,
"grad_norm": 3.2476119995117188,
"learning_rate": 2.56247437174155e-05,
"loss": 0.2489,
"step": 6230
},
{
"epoch": 1.0910044584316811,
"grad_norm": 3.7844748497009277,
"learning_rate": 2.560131216683264e-05,
"loss": 0.2415,
"step": 6240
},
{
"epoch": 1.0927528630125012,
"grad_norm": 1.4511767625808716,
"learning_rate": 2.557788061624978e-05,
"loss": 0.2556,
"step": 6250
},
{
"epoch": 1.094501267593321,
"grad_norm": 4.27903938293457,
"learning_rate": 2.555444906566692e-05,
"loss": 0.2551,
"step": 6260
},
{
"epoch": 1.096249672174141,
"grad_norm": 3.3497371673583984,
"learning_rate": 2.5531017515084062e-05,
"loss": 0.2377,
"step": 6270
},
{
"epoch": 1.097998076754961,
"grad_norm": 2.4137842655181885,
"learning_rate": 2.5507585964501203e-05,
"loss": 0.2338,
"step": 6280
},
{
"epoch": 1.0997464813357811,
"grad_norm": 2.213383913040161,
"learning_rate": 2.5484154413918343e-05,
"loss": 0.2352,
"step": 6290
},
{
"epoch": 1.1014948859166012,
"grad_norm": 2.463801622390747,
"learning_rate": 2.5460722863335484e-05,
"loss": 0.2433,
"step": 6300
},
{
"epoch": 1.1032432904974212,
"grad_norm": 2.349886178970337,
"learning_rate": 2.5437291312752625e-05,
"loss": 0.2307,
"step": 6310
},
{
"epoch": 1.104991695078241,
"grad_norm": 4.470160484313965,
"learning_rate": 2.5413859762169765e-05,
"loss": 0.2147,
"step": 6320
},
{
"epoch": 1.106740099659061,
"grad_norm": 1.9070550203323364,
"learning_rate": 2.5390428211586902e-05,
"loss": 0.1952,
"step": 6330
},
{
"epoch": 1.108488504239881,
"grad_norm": 3.6984667778015137,
"learning_rate": 2.5366996661004043e-05,
"loss": 0.2451,
"step": 6340
},
{
"epoch": 1.1102369088207011,
"grad_norm": 2.5296738147735596,
"learning_rate": 2.5343565110421184e-05,
"loss": 0.2623,
"step": 6350
},
{
"epoch": 1.1119853134015212,
"grad_norm": 1.6782501935958862,
"learning_rate": 2.5320133559838324e-05,
"loss": 0.2121,
"step": 6360
},
{
"epoch": 1.113733717982341,
"grad_norm": 2.8727078437805176,
"learning_rate": 2.5296702009255465e-05,
"loss": 0.2381,
"step": 6370
},
{
"epoch": 1.115482122563161,
"grad_norm": 2.176513671875,
"learning_rate": 2.5273270458672605e-05,
"loss": 0.1952,
"step": 6380
},
{
"epoch": 1.117230527143981,
"grad_norm": 2.2744338512420654,
"learning_rate": 2.5249838908089746e-05,
"loss": 0.2417,
"step": 6390
},
{
"epoch": 1.1189789317248011,
"grad_norm": 3.2771434783935547,
"learning_rate": 2.5226407357506886e-05,
"loss": 0.2233,
"step": 6400
},
{
"epoch": 1.1207273363056212,
"grad_norm": 2.574244499206543,
"learning_rate": 2.5202975806924027e-05,
"loss": 0.2245,
"step": 6410
},
{
"epoch": 1.1224757408864412,
"grad_norm": 2.5185132026672363,
"learning_rate": 2.5181887411399452e-05,
"loss": 0.2339,
"step": 6420
},
{
"epoch": 1.1242241454672612,
"grad_norm": 1.5455431938171387,
"learning_rate": 2.5158455860816592e-05,
"loss": 0.221,
"step": 6430
},
{
"epoch": 1.125972550048081,
"grad_norm": 3.230663299560547,
"learning_rate": 2.513502431023373e-05,
"loss": 0.2431,
"step": 6440
},
{
"epoch": 1.127720954628901,
"grad_norm": 1.7547463178634644,
"learning_rate": 2.511159275965087e-05,
"loss": 0.258,
"step": 6450
},
{
"epoch": 1.1294693592097211,
"grad_norm": 2.806102752685547,
"learning_rate": 2.508816120906801e-05,
"loss": 0.2637,
"step": 6460
},
{
"epoch": 1.1312177637905412,
"grad_norm": 2.752462863922119,
"learning_rate": 2.506472965848515e-05,
"loss": 0.2201,
"step": 6470
},
{
"epoch": 1.1329661683713612,
"grad_norm": 2.401191473007202,
"learning_rate": 2.5041298107902292e-05,
"loss": 0.2458,
"step": 6480
},
{
"epoch": 1.134714572952181,
"grad_norm": 2.487614154815674,
"learning_rate": 2.5017866557319433e-05,
"loss": 0.2311,
"step": 6490
},
{
"epoch": 1.136462977533001,
"grad_norm": 2.6839981079101562,
"learning_rate": 2.4994435006736573e-05,
"loss": 0.2184,
"step": 6500
},
{
"epoch": 1.1382113821138211,
"grad_norm": 2.5567777156829834,
"learning_rate": 2.4971003456153714e-05,
"loss": 0.2579,
"step": 6510
},
{
"epoch": 1.1399597866946412,
"grad_norm": 1.7943795919418335,
"learning_rate": 2.4947571905570854e-05,
"loss": 0.2226,
"step": 6520
},
{
"epoch": 1.1417081912754612,
"grad_norm": 2.5085887908935547,
"learning_rate": 2.4924140354987995e-05,
"loss": 0.2171,
"step": 6530
},
{
"epoch": 1.1434565958562812,
"grad_norm": 3.6210641860961914,
"learning_rate": 2.4900708804405132e-05,
"loss": 0.2351,
"step": 6540
},
{
"epoch": 1.145205000437101,
"grad_norm": 2.5945966243743896,
"learning_rate": 2.4877277253822273e-05,
"loss": 0.2187,
"step": 6550
},
{
"epoch": 1.146953405017921,
"grad_norm": 1.9475432634353638,
"learning_rate": 2.4853845703239413e-05,
"loss": 0.1945,
"step": 6560
},
{
"epoch": 1.1487018095987411,
"grad_norm": 3.4020893573760986,
"learning_rate": 2.4830414152656554e-05,
"loss": 0.2151,
"step": 6570
},
{
"epoch": 1.1504502141795612,
"grad_norm": 3.2796547412872314,
"learning_rate": 2.4806982602073694e-05,
"loss": 0.2289,
"step": 6580
},
{
"epoch": 1.1521986187603812,
"grad_norm": 2.947808265686035,
"learning_rate": 2.4783551051490835e-05,
"loss": 0.2379,
"step": 6590
},
{
"epoch": 1.1539470233412013,
"grad_norm": 1.4934639930725098,
"learning_rate": 2.4760119500907976e-05,
"loss": 0.2332,
"step": 6600
},
{
"epoch": 1.155695427922021,
"grad_norm": 3.527163505554199,
"learning_rate": 2.4736687950325116e-05,
"loss": 0.2276,
"step": 6610
},
{
"epoch": 1.157443832502841,
"grad_norm": 4.331171989440918,
"learning_rate": 2.4713256399742257e-05,
"loss": 0.2408,
"step": 6620
},
{
"epoch": 1.1591922370836611,
"grad_norm": 1.2162501811981201,
"learning_rate": 2.4689824849159397e-05,
"loss": 0.2246,
"step": 6630
},
{
"epoch": 1.1609406416644812,
"grad_norm": 2.465019702911377,
"learning_rate": 2.4666393298576538e-05,
"loss": 0.1932,
"step": 6640
},
{
"epoch": 1.1626890462453012,
"grad_norm": 2.7230145931243896,
"learning_rate": 2.4642961747993675e-05,
"loss": 0.258,
"step": 6650
},
{
"epoch": 1.1644374508261213,
"grad_norm": 2.6885485649108887,
"learning_rate": 2.4619530197410816e-05,
"loss": 0.2287,
"step": 6660
},
{
"epoch": 1.166185855406941,
"grad_norm": 3.2494916915893555,
"learning_rate": 2.4596098646827956e-05,
"loss": 0.2647,
"step": 6670
},
{
"epoch": 1.1679342599877611,
"grad_norm": 1.7186158895492554,
"learning_rate": 2.4572667096245097e-05,
"loss": 0.2115,
"step": 6680
},
{
"epoch": 1.1696826645685812,
"grad_norm": 2.3007144927978516,
"learning_rate": 2.4549235545662237e-05,
"loss": 0.2497,
"step": 6690
},
{
"epoch": 1.1714310691494012,
"grad_norm": 2.522245168685913,
"learning_rate": 2.4525803995079378e-05,
"loss": 0.2215,
"step": 6700
},
{
"epoch": 1.1731794737302212,
"grad_norm": 2.192690134048462,
"learning_rate": 2.450237244449652e-05,
"loss": 0.2056,
"step": 6710
},
{
"epoch": 1.174927878311041,
"grad_norm": 3.8989531993865967,
"learning_rate": 2.447894089391366e-05,
"loss": 0.2096,
"step": 6720
},
{
"epoch": 1.176676282891861,
"grad_norm": 3.636918544769287,
"learning_rate": 2.44555093433308e-05,
"loss": 0.22,
"step": 6730
},
{
"epoch": 1.1784246874726811,
"grad_norm": 2.5938773155212402,
"learning_rate": 2.443207779274794e-05,
"loss": 0.253,
"step": 6740
},
{
"epoch": 1.1801730920535012,
"grad_norm": 2.396374464035034,
"learning_rate": 2.440864624216508e-05,
"loss": 0.2843,
"step": 6750
},
{
"epoch": 1.1819214966343212,
"grad_norm": 2.2090964317321777,
"learning_rate": 2.4385214691582218e-05,
"loss": 0.2352,
"step": 6760
},
{
"epoch": 1.1836699012151413,
"grad_norm": 2.504795551300049,
"learning_rate": 2.4361783140999355e-05,
"loss": 0.2247,
"step": 6770
},
{
"epoch": 1.1854183057959613,
"grad_norm": 3.191880702972412,
"learning_rate": 2.4338351590416496e-05,
"loss": 0.2311,
"step": 6780
},
{
"epoch": 1.1871667103767811,
"grad_norm": 2.5257225036621094,
"learning_rate": 2.4314920039833637e-05,
"loss": 0.2227,
"step": 6790
},
{
"epoch": 1.1889151149576012,
"grad_norm": 1.7567265033721924,
"learning_rate": 2.4291488489250777e-05,
"loss": 0.206,
"step": 6800
},
{
"epoch": 1.1906635195384212,
"grad_norm": 2.0397517681121826,
"learning_rate": 2.4268056938667918e-05,
"loss": 0.2359,
"step": 6810
},
{
"epoch": 1.1924119241192412,
"grad_norm": 2.164275884628296,
"learning_rate": 2.424462538808506e-05,
"loss": 0.2466,
"step": 6820
},
{
"epoch": 1.1941603287000613,
"grad_norm": 3.402735710144043,
"learning_rate": 2.4221193837502196e-05,
"loss": 0.2304,
"step": 6830
},
{
"epoch": 1.195908733280881,
"grad_norm": 6.231383800506592,
"learning_rate": 2.4200105441977627e-05,
"loss": 0.223,
"step": 6840
},
{
"epoch": 1.1976571378617011,
"grad_norm": 2.991027355194092,
"learning_rate": 2.4176673891394768e-05,
"loss": 0.207,
"step": 6850
},
{
"epoch": 1.1994055424425212,
"grad_norm": 3.3107314109802246,
"learning_rate": 2.4153242340811905e-05,
"loss": 0.221,
"step": 6860
},
{
"epoch": 1.2011539470233412,
"grad_norm": 3.051894426345825,
"learning_rate": 2.4129810790229045e-05,
"loss": 0.236,
"step": 6870
},
{
"epoch": 1.2029023516041613,
"grad_norm": 2.3835701942443848,
"learning_rate": 2.4106379239646186e-05,
"loss": 0.2195,
"step": 6880
},
{
"epoch": 1.2046507561849813,
"grad_norm": 3.9972636699676514,
"learning_rate": 2.4082947689063327e-05,
"loss": 0.2198,
"step": 6890
},
{
"epoch": 1.2063991607658011,
"grad_norm": 2.0902743339538574,
"learning_rate": 2.4059516138480467e-05,
"loss": 0.2563,
"step": 6900
},
{
"epoch": 1.2081475653466212,
"grad_norm": 2.5505049228668213,
"learning_rate": 2.4036084587897608e-05,
"loss": 0.2093,
"step": 6910
},
{
"epoch": 1.2098959699274412,
"grad_norm": 3.174210548400879,
"learning_rate": 2.401265303731475e-05,
"loss": 0.221,
"step": 6920
},
{
"epoch": 1.2116443745082612,
"grad_norm": 2.3565521240234375,
"learning_rate": 2.398922148673189e-05,
"loss": 0.224,
"step": 6930
},
{
"epoch": 1.2133927790890813,
"grad_norm": 6.279238224029541,
"learning_rate": 2.396578993614903e-05,
"loss": 0.2333,
"step": 6940
},
{
"epoch": 1.215141183669901,
"grad_norm": 4.914646625518799,
"learning_rate": 2.394235838556617e-05,
"loss": 0.2521,
"step": 6950
},
{
"epoch": 1.2168895882507211,
"grad_norm": 3.3840725421905518,
"learning_rate": 2.3918926834983304e-05,
"loss": 0.2065,
"step": 6960
},
{
"epoch": 1.2186379928315412,
"grad_norm": 2.5615413188934326,
"learning_rate": 2.3895495284400445e-05,
"loss": 0.2363,
"step": 6970
},
{
"epoch": 1.2203863974123612,
"grad_norm": 3.362717628479004,
"learning_rate": 2.3872063733817585e-05,
"loss": 0.2325,
"step": 6980
},
{
"epoch": 1.2221348019931813,
"grad_norm": 2.461860179901123,
"learning_rate": 2.3848632183234726e-05,
"loss": 0.2228,
"step": 6990
},
{
"epoch": 1.2238832065740013,
"grad_norm": 2.791576385498047,
"learning_rate": 2.3825200632651866e-05,
"loss": 0.1829,
"step": 7000
},
{
"epoch": 1.2256316111548213,
"grad_norm": 2.1985511779785156,
"learning_rate": 2.3801769082069007e-05,
"loss": 0.1964,
"step": 7010
},
{
"epoch": 1.2273800157356411,
"grad_norm": 2.528165578842163,
"learning_rate": 2.3778337531486147e-05,
"loss": 0.2327,
"step": 7020
},
{
"epoch": 1.2291284203164612,
"grad_norm": 2.3017685413360596,
"learning_rate": 2.3754905980903288e-05,
"loss": 0.2299,
"step": 7030
},
{
"epoch": 1.2308768248972812,
"grad_norm": 1.9561365842819214,
"learning_rate": 2.3731474430320425e-05,
"loss": 0.2468,
"step": 7040
},
{
"epoch": 1.2326252294781013,
"grad_norm": 3.533801555633545,
"learning_rate": 2.3708042879737566e-05,
"loss": 0.237,
"step": 7050
},
{
"epoch": 1.2343736340589213,
"grad_norm": 1.6506298780441284,
"learning_rate": 2.3684611329154706e-05,
"loss": 0.2262,
"step": 7060
},
{
"epoch": 1.2361220386397411,
"grad_norm": 2.2800042629241943,
"learning_rate": 2.3661179778571847e-05,
"loss": 0.2079,
"step": 7070
},
{
"epoch": 1.2378704432205612,
"grad_norm": 3.2282028198242188,
"learning_rate": 2.3637748227988988e-05,
"loss": 0.2497,
"step": 7080
},
{
"epoch": 1.2396188478013812,
"grad_norm": 9.39627742767334,
"learning_rate": 2.3614316677406128e-05,
"loss": 0.2292,
"step": 7090
},
{
"epoch": 1.2413672523822012,
"grad_norm": 2.449101448059082,
"learning_rate": 2.359088512682327e-05,
"loss": 0.2477,
"step": 7100
},
{
"epoch": 1.2431156569630213,
"grad_norm": 3.1302859783172607,
"learning_rate": 2.356745357624041e-05,
"loss": 0.2285,
"step": 7110
},
{
"epoch": 1.2448640615438413,
"grad_norm": 1.8327531814575195,
"learning_rate": 2.354402202565755e-05,
"loss": 0.2005,
"step": 7120
},
{
"epoch": 1.2466124661246614,
"grad_norm": 4.233156681060791,
"learning_rate": 2.352059047507469e-05,
"loss": 0.236,
"step": 7130
},
{
"epoch": 1.2483608707054812,
"grad_norm": 2.3583102226257324,
"learning_rate": 2.349715892449183e-05,
"loss": 0.2273,
"step": 7140
},
{
"epoch": 1.2501092752863012,
"grad_norm": 2.376291275024414,
"learning_rate": 2.347372737390897e-05,
"loss": 0.1867,
"step": 7150
},
{
"epoch": 1.2518576798671213,
"grad_norm": 3.159830093383789,
"learning_rate": 2.345029582332611e-05,
"loss": 0.2266,
"step": 7160
},
{
"epoch": 1.2536060844479413,
"grad_norm": 2.7414627075195312,
"learning_rate": 2.342686427274325e-05,
"loss": 0.2598,
"step": 7170
},
{
"epoch": 1.2553544890287611,
"grad_norm": 3.401259183883667,
"learning_rate": 2.340343272216039e-05,
"loss": 0.2103,
"step": 7180
},
{
"epoch": 1.2571028936095812,
"grad_norm": 2.1210904121398926,
"learning_rate": 2.338000117157753e-05,
"loss": 0.1867,
"step": 7190
},
{
"epoch": 1.2588512981904012,
"grad_norm": 4.165828704833984,
"learning_rate": 2.335656962099467e-05,
"loss": 0.2408,
"step": 7200
},
{
"epoch": 1.2605997027712212,
"grad_norm": 3.0600874423980713,
"learning_rate": 2.3333138070411812e-05,
"loss": 0.2235,
"step": 7210
},
{
"epoch": 1.2623481073520413,
"grad_norm": 2.238833427429199,
"learning_rate": 2.3309706519828952e-05,
"loss": 0.2295,
"step": 7220
},
{
"epoch": 1.2640965119328613,
"grad_norm": 7.00640344619751,
"learning_rate": 2.3286274969246093e-05,
"loss": 0.224,
"step": 7230
},
{
"epoch": 1.2658449165136814,
"grad_norm": 3.161783218383789,
"learning_rate": 2.3262843418663234e-05,
"loss": 0.2077,
"step": 7240
},
{
"epoch": 1.2675933210945014,
"grad_norm": 6.526487350463867,
"learning_rate": 2.3239411868080374e-05,
"loss": 0.2106,
"step": 7250
},
{
"epoch": 1.2693417256753212,
"grad_norm": 1.9831335544586182,
"learning_rate": 2.321598031749751e-05,
"loss": 0.231,
"step": 7260
},
{
"epoch": 1.2710901302561413,
"grad_norm": 2.8936715126037598,
"learning_rate": 2.3192548766914652e-05,
"loss": 0.2364,
"step": 7270
},
{
"epoch": 1.2728385348369613,
"grad_norm": 2.9134674072265625,
"learning_rate": 2.3169117216331793e-05,
"loss": 0.2153,
"step": 7280
},
{
"epoch": 1.2745869394177813,
"grad_norm": 1.7994840145111084,
"learning_rate": 2.3145685665748933e-05,
"loss": 0.2049,
"step": 7290
},
{
"epoch": 1.2763353439986012,
"grad_norm": 1.5330135822296143,
"learning_rate": 2.3122254115166074e-05,
"loss": 0.202,
"step": 7300
},
{
"epoch": 1.2780837485794212,
"grad_norm": 2.563875436782837,
"learning_rate": 2.3098822564583214e-05,
"loss": 0.2086,
"step": 7310
},
{
"epoch": 1.2798321531602412,
"grad_norm": 3.0079505443573,
"learning_rate": 2.3075391014000355e-05,
"loss": 0.1833,
"step": 7320
},
{
"epoch": 1.2815805577410613,
"grad_norm": 1.683423638343811,
"learning_rate": 2.3051959463417495e-05,
"loss": 0.2167,
"step": 7330
},
{
"epoch": 1.2833289623218813,
"grad_norm": 2.0884647369384766,
"learning_rate": 2.3028527912834636e-05,
"loss": 0.2138,
"step": 7340
},
{
"epoch": 1.2850773669027014,
"grad_norm": 3.214635133743286,
"learning_rate": 2.3005096362251777e-05,
"loss": 0.2095,
"step": 7350
},
{
"epoch": 1.2868257714835214,
"grad_norm": 2.2779738903045654,
"learning_rate": 2.2981664811668917e-05,
"loss": 0.2373,
"step": 7360
},
{
"epoch": 1.2885741760643412,
"grad_norm": 3.165019989013672,
"learning_rate": 2.2958233261086054e-05,
"loss": 0.2027,
"step": 7370
},
{
"epoch": 1.2903225806451613,
"grad_norm": 1.9054561853408813,
"learning_rate": 2.2934801710503195e-05,
"loss": 0.2307,
"step": 7380
},
{
"epoch": 1.2920709852259813,
"grad_norm": 2.0474791526794434,
"learning_rate": 2.2911370159920336e-05,
"loss": 0.2045,
"step": 7390
},
{
"epoch": 1.2938193898068013,
"grad_norm": 13.95895004272461,
"learning_rate": 2.2887938609337476e-05,
"loss": 0.2003,
"step": 7400
},
{
"epoch": 1.2955677943876214,
"grad_norm": 3.3436994552612305,
"learning_rate": 2.2864507058754617e-05,
"loss": 0.2024,
"step": 7410
},
{
"epoch": 1.2973161989684412,
"grad_norm": 2.3914167881011963,
"learning_rate": 2.2841075508171757e-05,
"loss": 0.2,
"step": 7420
},
{
"epoch": 1.2990646035492612,
"grad_norm": 2.7382149696350098,
"learning_rate": 2.2817643957588898e-05,
"loss": 0.2361,
"step": 7430
},
{
"epoch": 1.3008130081300813,
"grad_norm": 1.886077642440796,
"learning_rate": 2.2794212407006032e-05,
"loss": 0.1993,
"step": 7440
},
{
"epoch": 1.3025614127109013,
"grad_norm": 7.261506080627441,
"learning_rate": 2.2770780856423172e-05,
"loss": 0.1806,
"step": 7450
},
{
"epoch": 1.3043098172917214,
"grad_norm": 2.7932260036468506,
"learning_rate": 2.2747349305840313e-05,
"loss": 0.2451,
"step": 7460
},
{
"epoch": 1.3060582218725414,
"grad_norm": 2.1137731075286865,
"learning_rate": 2.2723917755257453e-05,
"loss": 0.2437,
"step": 7470
},
{
"epoch": 1.3078066264533614,
"grad_norm": 2.070944309234619,
"learning_rate": 2.2700486204674594e-05,
"loss": 0.201,
"step": 7480
},
{
"epoch": 1.3095550310341812,
"grad_norm": 2.4869847297668457,
"learning_rate": 2.2677054654091735e-05,
"loss": 0.2218,
"step": 7490
},
{
"epoch": 1.3113034356150013,
"grad_norm": 2.1322641372680664,
"learning_rate": 2.2653623103508875e-05,
"loss": 0.2069,
"step": 7500
},
{
"epoch": 1.3130518401958213,
"grad_norm": 3.1480398178100586,
"learning_rate": 2.2630191552926016e-05,
"loss": 0.2562,
"step": 7510
},
{
"epoch": 1.3148002447766414,
"grad_norm": 2.788144111633301,
"learning_rate": 2.2606760002343156e-05,
"loss": 0.2215,
"step": 7520
},
{
"epoch": 1.3165486493574612,
"grad_norm": 2.7730488777160645,
"learning_rate": 2.2583328451760297e-05,
"loss": 0.1914,
"step": 7530
},
{
"epoch": 1.3182970539382812,
"grad_norm": 2.38083815574646,
"learning_rate": 2.2559896901177438e-05,
"loss": 0.2192,
"step": 7540
},
{
"epoch": 1.3200454585191013,
"grad_norm": 1.9907407760620117,
"learning_rate": 2.2536465350594575e-05,
"loss": 0.1665,
"step": 7550
},
{
"epoch": 1.3217938630999213,
"grad_norm": 2.5785951614379883,
"learning_rate": 2.2513033800011715e-05,
"loss": 0.23,
"step": 7560
},
{
"epoch": 1.3235422676807413,
"grad_norm": 2.255279064178467,
"learning_rate": 2.2489602249428856e-05,
"loss": 0.2671,
"step": 7570
},
{
"epoch": 1.3252906722615614,
"grad_norm": 2.6091926097869873,
"learning_rate": 2.2466170698845997e-05,
"loss": 0.1948,
"step": 7580
},
{
"epoch": 1.3270390768423814,
"grad_norm": 1.9192357063293457,
"learning_rate": 2.2442739148263137e-05,
"loss": 0.2092,
"step": 7590
},
{
"epoch": 1.3287874814232015,
"grad_norm": 2.7502996921539307,
"learning_rate": 2.2419307597680278e-05,
"loss": 0.2184,
"step": 7600
},
{
"epoch": 1.3305358860040213,
"grad_norm": 1.731701374053955,
"learning_rate": 2.2395876047097418e-05,
"loss": 0.226,
"step": 7610
},
{
"epoch": 1.3322842905848413,
"grad_norm": 2.618088960647583,
"learning_rate": 2.237244449651456e-05,
"loss": 0.1989,
"step": 7620
},
{
"epoch": 1.3340326951656614,
"grad_norm": 2.9609458446502686,
"learning_rate": 2.23490129459317e-05,
"loss": 0.2568,
"step": 7630
},
{
"epoch": 1.3357810997464814,
"grad_norm": 2.470890760421753,
"learning_rate": 2.232558139534884e-05,
"loss": 0.2545,
"step": 7640
},
{
"epoch": 1.3375295043273012,
"grad_norm": 4.0039215087890625,
"learning_rate": 2.230214984476598e-05,
"loss": 0.2507,
"step": 7650
},
{
"epoch": 1.3392779089081213,
"grad_norm": 1.8677549362182617,
"learning_rate": 2.2278718294183118e-05,
"loss": 0.2364,
"step": 7660
},
{
"epoch": 1.3410263134889413,
"grad_norm": 2.4219982624053955,
"learning_rate": 2.225528674360026e-05,
"loss": 0.2278,
"step": 7670
},
{
"epoch": 1.3427747180697613,
"grad_norm": 2.556628465652466,
"learning_rate": 2.22318551930174e-05,
"loss": 0.2008,
"step": 7680
},
{
"epoch": 1.3445231226505814,
"grad_norm": 3.106130361557007,
"learning_rate": 2.220842364243454e-05,
"loss": 0.241,
"step": 7690
},
{
"epoch": 1.3462715272314014,
"grad_norm": 5.037795066833496,
"learning_rate": 2.218499209185168e-05,
"loss": 0.1823,
"step": 7700
},
{
"epoch": 1.3480199318122215,
"grad_norm": 2.826275110244751,
"learning_rate": 2.216156054126882e-05,
"loss": 0.2108,
"step": 7710
},
{
"epoch": 1.3497683363930413,
"grad_norm": 2.2898755073547363,
"learning_rate": 2.213812899068596e-05,
"loss": 0.2649,
"step": 7720
},
{
"epoch": 1.3515167409738613,
"grad_norm": 1.984183669090271,
"learning_rate": 2.2114697440103102e-05,
"loss": 0.22,
"step": 7730
},
{
"epoch": 1.3532651455546814,
"grad_norm": 2.6796443462371826,
"learning_rate": 2.2091265889520243e-05,
"loss": 0.2003,
"step": 7740
},
{
"epoch": 1.3550135501355014,
"grad_norm": 2.9810585975646973,
"learning_rate": 2.2067834338937383e-05,
"loss": 0.2152,
"step": 7750
},
{
"epoch": 1.3567619547163214,
"grad_norm": 2.0471839904785156,
"learning_rate": 2.204440278835452e-05,
"loss": 0.1844,
"step": 7760
},
{
"epoch": 1.3585103592971413,
"grad_norm": 2.1709094047546387,
"learning_rate": 2.202097123777166e-05,
"loss": 0.207,
"step": 7770
},
{
"epoch": 1.3602587638779613,
"grad_norm": 3.221278667449951,
"learning_rate": 2.19975396871888e-05,
"loss": 0.2077,
"step": 7780
},
{
"epoch": 1.3620071684587813,
"grad_norm": 2.5238027572631836,
"learning_rate": 2.1974108136605942e-05,
"loss": 0.23,
"step": 7790
},
{
"epoch": 1.3637555730396014,
"grad_norm": 1.5731216669082642,
"learning_rate": 2.1950676586023083e-05,
"loss": 0.2321,
"step": 7800
},
{
"epoch": 1.3655039776204214,
"grad_norm": 4.94499397277832,
"learning_rate": 2.1927245035440223e-05,
"loss": 0.2136,
"step": 7810
},
{
"epoch": 1.3672523822012415,
"grad_norm": 1.9199833869934082,
"learning_rate": 2.1903813484857364e-05,
"loss": 0.2235,
"step": 7820
},
{
"epoch": 1.3690007867820615,
"grad_norm": 2.569610357284546,
"learning_rate": 2.1880381934274504e-05,
"loss": 0.2219,
"step": 7830
},
{
"epoch": 1.3707491913628813,
"grad_norm": 2.2798616886138916,
"learning_rate": 2.1856950383691645e-05,
"loss": 0.1895,
"step": 7840
},
{
"epoch": 1.3724975959437014,
"grad_norm": 2.7358908653259277,
"learning_rate": 2.183586198816707e-05,
"loss": 0.2481,
"step": 7850
},
{
"epoch": 1.3742460005245214,
"grad_norm": 2.1300289630889893,
"learning_rate": 2.181243043758421e-05,
"loss": 0.2229,
"step": 7860
},
{
"epoch": 1.3759944051053414,
"grad_norm": 2.7066380977630615,
"learning_rate": 2.1788998887001348e-05,
"loss": 0.1998,
"step": 7870
},
{
"epoch": 1.3777428096861613,
"grad_norm": 2.4064714908599854,
"learning_rate": 2.1765567336418488e-05,
"loss": 0.2018,
"step": 7880
},
{
"epoch": 1.3794912142669813,
"grad_norm": 2.445901870727539,
"learning_rate": 2.174213578583563e-05,
"loss": 0.2298,
"step": 7890
},
{
"epoch": 1.3812396188478013,
"grad_norm": 2.6251111030578613,
"learning_rate": 2.171870423525277e-05,
"loss": 0.2056,
"step": 7900
},
{
"epoch": 1.3829880234286214,
"grad_norm": 2.2267794609069824,
"learning_rate": 2.169527268466991e-05,
"loss": 0.2266,
"step": 7910
},
{
"epoch": 1.3847364280094414,
"grad_norm": 2.0632987022399902,
"learning_rate": 2.167184113408705e-05,
"loss": 0.2078,
"step": 7920
},
{
"epoch": 1.3864848325902615,
"grad_norm": 1.6934055089950562,
"learning_rate": 2.164840958350419e-05,
"loss": 0.1987,
"step": 7930
},
{
"epoch": 1.3882332371710815,
"grad_norm": 2.452653646469116,
"learning_rate": 2.1624978032921332e-05,
"loss": 0.2715,
"step": 7940
},
{
"epoch": 1.3899816417519013,
"grad_norm": 1.3962805271148682,
"learning_rate": 2.1601546482338472e-05,
"loss": 0.2062,
"step": 7950
},
{
"epoch": 1.3917300463327213,
"grad_norm": 2.5406017303466797,
"learning_rate": 2.1578114931755613e-05,
"loss": 0.2262,
"step": 7960
},
{
"epoch": 1.3934784509135414,
"grad_norm": 1.7772146463394165,
"learning_rate": 2.1554683381172753e-05,
"loss": 0.2201,
"step": 7970
},
{
"epoch": 1.3952268554943614,
"grad_norm": 2.1140542030334473,
"learning_rate": 2.153125183058989e-05,
"loss": 0.2508,
"step": 7980
},
{
"epoch": 1.3969752600751815,
"grad_norm": 2.517038345336914,
"learning_rate": 2.150782028000703e-05,
"loss": 0.2068,
"step": 7990
},
{
"epoch": 1.3987236646560013,
"grad_norm": 2.4555583000183105,
"learning_rate": 2.1484388729424172e-05,
"loss": 0.2419,
"step": 8000
},
{
"epoch": 1.4004720692368213,
"grad_norm": 1.9649275541305542,
"learning_rate": 2.1460957178841312e-05,
"loss": 0.2094,
"step": 8010
},
{
"epoch": 1.4022204738176414,
"grad_norm": 1.9330495595932007,
"learning_rate": 2.1437525628258453e-05,
"loss": 0.222,
"step": 8020
},
{
"epoch": 1.4039688783984614,
"grad_norm": 2.596536636352539,
"learning_rate": 2.1414094077675594e-05,
"loss": 0.2442,
"step": 8030
},
{
"epoch": 1.4057172829792814,
"grad_norm": 2.4878602027893066,
"learning_rate": 2.1390662527092734e-05,
"loss": 0.2341,
"step": 8040
},
{
"epoch": 1.4074656875601015,
"grad_norm": 1.9203938245773315,
"learning_rate": 2.1367230976509875e-05,
"loss": 0.2145,
"step": 8050
},
{
"epoch": 1.4092140921409215,
"grad_norm": 3.3724257946014404,
"learning_rate": 2.1343799425927015e-05,
"loss": 0.2151,
"step": 8060
},
{
"epoch": 1.4109624967217413,
"grad_norm": 3.4099626541137695,
"learning_rate": 2.1320367875344156e-05,
"loss": 0.2314,
"step": 8070
},
{
"epoch": 1.4127109013025614,
"grad_norm": 2.2942490577697754,
"learning_rate": 2.1296936324761293e-05,
"loss": 0.1972,
"step": 8080
},
{
"epoch": 1.4144593058833814,
"grad_norm": 1.607845425605774,
"learning_rate": 2.1273504774178434e-05,
"loss": 0.2223,
"step": 8090
},
{
"epoch": 1.4162077104642015,
"grad_norm": 2.1875293254852295,
"learning_rate": 2.1250073223595574e-05,
"loss": 0.2209,
"step": 8100
},
{
"epoch": 1.4179561150450213,
"grad_norm": 2.5811476707458496,
"learning_rate": 2.122664167301271e-05,
"loss": 0.2046,
"step": 8110
},
{
"epoch": 1.4197045196258413,
"grad_norm": 3.045577049255371,
"learning_rate": 2.1203210122429852e-05,
"loss": 0.2177,
"step": 8120
},
{
"epoch": 1.4214529242066614,
"grad_norm": 3.3778419494628906,
"learning_rate": 2.1179778571846993e-05,
"loss": 0.2511,
"step": 8130
},
{
"epoch": 1.4232013287874814,
"grad_norm": 4.097287654876709,
"learning_rate": 2.1156347021264133e-05,
"loss": 0.2004,
"step": 8140
},
{
"epoch": 1.4249497333683014,
"grad_norm": 3.0460093021392822,
"learning_rate": 2.1132915470681274e-05,
"loss": 0.2308,
"step": 8150
},
{
"epoch": 1.4266981379491215,
"grad_norm": 2.531940221786499,
"learning_rate": 2.110948392009841e-05,
"loss": 0.2029,
"step": 8160
},
{
"epoch": 1.4284465425299415,
"grad_norm": 2.160956382751465,
"learning_rate": 2.108605236951555e-05,
"loss": 0.1643,
"step": 8170
},
{
"epoch": 1.4301949471107616,
"grad_norm": 2.251553535461426,
"learning_rate": 2.1062620818932692e-05,
"loss": 0.2043,
"step": 8180
},
{
"epoch": 1.4319433516915814,
"grad_norm": 1.8432042598724365,
"learning_rate": 2.1039189268349833e-05,
"loss": 0.2204,
"step": 8190
},
{
"epoch": 1.4336917562724014,
"grad_norm": 2.3578741550445557,
"learning_rate": 2.1015757717766973e-05,
"loss": 0.216,
"step": 8200
},
{
"epoch": 1.4354401608532215,
"grad_norm": 2.0462770462036133,
"learning_rate": 2.0992326167184114e-05,
"loss": 0.2136,
"step": 8210
},
{
"epoch": 1.4371885654340415,
"grad_norm": 3.6537725925445557,
"learning_rate": 2.0968894616601255e-05,
"loss": 0.2025,
"step": 8220
},
{
"epoch": 1.4389369700148613,
"grad_norm": 3.5801661014556885,
"learning_rate": 2.0945463066018395e-05,
"loss": 0.2113,
"step": 8230
},
{
"epoch": 1.4406853745956814,
"grad_norm": 1.7767003774642944,
"learning_rate": 2.0922031515435536e-05,
"loss": 0.2428,
"step": 8240
},
{
"epoch": 1.4424337791765014,
"grad_norm": 3.095386028289795,
"learning_rate": 2.0898599964852676e-05,
"loss": 0.1953,
"step": 8250
},
{
"epoch": 1.4441821837573214,
"grad_norm": 2.714571237564087,
"learning_rate": 2.0875168414269817e-05,
"loss": 0.1861,
"step": 8260
},
{
"epoch": 1.4459305883381415,
"grad_norm": 2.4387760162353516,
"learning_rate": 2.0851736863686954e-05,
"loss": 0.2076,
"step": 8270
},
{
"epoch": 1.4476789929189615,
"grad_norm": 2.1680490970611572,
"learning_rate": 2.0828305313104095e-05,
"loss": 0.1952,
"step": 8280
},
{
"epoch": 1.4494273974997816,
"grad_norm": 1.8361495733261108,
"learning_rate": 2.0804873762521235e-05,
"loss": 0.2027,
"step": 8290
},
{
"epoch": 1.4511758020806014,
"grad_norm": 2.529107093811035,
"learning_rate": 2.0781442211938376e-05,
"loss": 0.1926,
"step": 8300
},
{
"epoch": 1.4529242066614214,
"grad_norm": 1.8087151050567627,
"learning_rate": 2.0758010661355516e-05,
"loss": 0.226,
"step": 8310
},
{
"epoch": 1.4546726112422415,
"grad_norm": 1.4918991327285767,
"learning_rate": 2.0734579110772657e-05,
"loss": 0.1934,
"step": 8320
},
{
"epoch": 1.4564210158230615,
"grad_norm": 2.2762157917022705,
"learning_rate": 2.0711147560189798e-05,
"loss": 0.2199,
"step": 8330
},
{
"epoch": 1.4581694204038815,
"grad_norm": 2.103135585784912,
"learning_rate": 2.0687716009606938e-05,
"loss": 0.1884,
"step": 8340
},
{
"epoch": 1.4599178249847014,
"grad_norm": 2.3067610263824463,
"learning_rate": 2.066428445902408e-05,
"loss": 0.2075,
"step": 8350
},
{
"epoch": 1.4616662295655214,
"grad_norm": 1.7336812019348145,
"learning_rate": 2.064085290844122e-05,
"loss": 0.176,
"step": 8360
},
{
"epoch": 1.4634146341463414,
"grad_norm": 2.6093456745147705,
"learning_rate": 2.0617421357858357e-05,
"loss": 0.2267,
"step": 8370
},
{
"epoch": 1.4651630387271615,
"grad_norm": 2.137324571609497,
"learning_rate": 2.0593989807275497e-05,
"loss": 0.2467,
"step": 8380
},
{
"epoch": 1.4669114433079815,
"grad_norm": 2.212411880493164,
"learning_rate": 2.0570558256692638e-05,
"loss": 0.2465,
"step": 8390
},
{
"epoch": 1.4686598478888016,
"grad_norm": 1.9635968208312988,
"learning_rate": 2.0547126706109778e-05,
"loss": 0.1947,
"step": 8400
},
{
"epoch": 1.4704082524696216,
"grad_norm": 1.5587635040283203,
"learning_rate": 2.052369515552692e-05,
"loss": 0.1783,
"step": 8410
},
{
"epoch": 1.4721566570504414,
"grad_norm": 1.5327953100204468,
"learning_rate": 2.050026360494406e-05,
"loss": 0.203,
"step": 8420
},
{
"epoch": 1.4739050616312614,
"grad_norm": 1.5852612257003784,
"learning_rate": 2.04768320543612e-05,
"loss": 0.1994,
"step": 8430
},
{
"epoch": 1.4756534662120815,
"grad_norm": 2.138629913330078,
"learning_rate": 2.045340050377834e-05,
"loss": 0.1943,
"step": 8440
},
{
"epoch": 1.4774018707929015,
"grad_norm": 2.41829252243042,
"learning_rate": 2.042996895319548e-05,
"loss": 0.1848,
"step": 8450
},
{
"epoch": 1.4791502753737213,
"grad_norm": 2.3004097938537598,
"learning_rate": 2.0406537402612622e-05,
"loss": 0.2026,
"step": 8460
},
{
"epoch": 1.4808986799545414,
"grad_norm": 3.455299139022827,
"learning_rate": 2.0383105852029762e-05,
"loss": 0.2194,
"step": 8470
},
{
"epoch": 1.4826470845353614,
"grad_norm": 2.1128129959106445,
"learning_rate": 2.03596743014469e-05,
"loss": 0.2093,
"step": 8480
},
{
"epoch": 1.4843954891161815,
"grad_norm": 3.1031060218811035,
"learning_rate": 2.033624275086404e-05,
"loss": 0.2165,
"step": 8490
},
{
"epoch": 1.4861438936970015,
"grad_norm": 3.0639071464538574,
"learning_rate": 2.031281120028118e-05,
"loss": 0.2182,
"step": 8500
},
{
"epoch": 1.4878922982778215,
"grad_norm": 1.974797010421753,
"learning_rate": 2.028937964969832e-05,
"loss": 0.2176,
"step": 8510
},
{
"epoch": 1.4896407028586416,
"grad_norm": 2.2451913356781006,
"learning_rate": 2.0265948099115462e-05,
"loss": 0.1985,
"step": 8520
},
{
"epoch": 1.4913891074394616,
"grad_norm": 3.2870914936065674,
"learning_rate": 2.0242516548532603e-05,
"loss": 0.2182,
"step": 8530
},
{
"epoch": 1.4931375120202814,
"grad_norm": 1.9064737558364868,
"learning_rate": 2.0219084997949743e-05,
"loss": 0.1819,
"step": 8540
},
{
"epoch": 1.4948859166011015,
"grad_norm": 7.4060564041137695,
"learning_rate": 2.0195653447366884e-05,
"loss": 0.1877,
"step": 8550
},
{
"epoch": 1.4966343211819215,
"grad_norm": 2.1721577644348145,
"learning_rate": 2.0172221896784024e-05,
"loss": 0.2139,
"step": 8560
},
{
"epoch": 1.4983827257627416,
"grad_norm": 3.478092908859253,
"learning_rate": 2.0148790346201165e-05,
"loss": 0.2526,
"step": 8570
},
{
"epoch": 1.5001311303435614,
"grad_norm": 2.2158713340759277,
"learning_rate": 2.0125358795618305e-05,
"loss": 0.1914,
"step": 8580
},
{
"epoch": 1.5018795349243814,
"grad_norm": 1.9559482336044312,
"learning_rate": 2.010192724503544e-05,
"loss": 0.2016,
"step": 8590
},
{
"epoch": 1.5036279395052015,
"grad_norm": 2.525531768798828,
"learning_rate": 2.007849569445258e-05,
"loss": 0.175,
"step": 8600
},
{
"epoch": 1.5053763440860215,
"grad_norm": 2.0414490699768066,
"learning_rate": 2.005506414386972e-05,
"loss": 0.1678,
"step": 8610
},
{
"epoch": 1.5071247486668415,
"grad_norm": 2.622178316116333,
"learning_rate": 2.003163259328686e-05,
"loss": 0.1964,
"step": 8620
},
{
"epoch": 1.5088731532476616,
"grad_norm": 2.0801520347595215,
"learning_rate": 2.0008201042704e-05,
"loss": 0.2044,
"step": 8630
},
{
"epoch": 1.5106215578284816,
"grad_norm": 2.0462727546691895,
"learning_rate": 1.9984769492121142e-05,
"loss": 0.1786,
"step": 8640
},
{
"epoch": 1.5123699624093017,
"grad_norm": 2.199009895324707,
"learning_rate": 1.9961337941538283e-05,
"loss": 0.2076,
"step": 8650
},
{
"epoch": 1.5141183669901215,
"grad_norm": 2.0829432010650635,
"learning_rate": 1.9937906390955423e-05,
"loss": 0.217,
"step": 8660
},
{
"epoch": 1.5158667715709415,
"grad_norm": 1.6419503688812256,
"learning_rate": 1.9914474840372564e-05,
"loss": 0.2066,
"step": 8670
},
{
"epoch": 1.5176151761517616,
"grad_norm": 2.3850531578063965,
"learning_rate": 1.9891043289789705e-05,
"loss": 0.2279,
"step": 8680
},
{
"epoch": 1.5193635807325814,
"grad_norm": 2.983680486679077,
"learning_rate": 1.9867611739206845e-05,
"loss": 0.2123,
"step": 8690
},
{
"epoch": 1.5211119853134014,
"grad_norm": 2.2864255905151367,
"learning_rate": 1.9844180188623986e-05,
"loss": 0.2574,
"step": 8700
},
{
"epoch": 1.5228603898942215,
"grad_norm": 2.297508955001831,
"learning_rate": 1.9820748638041123e-05,
"loss": 0.182,
"step": 8710
},
{
"epoch": 1.5246087944750415,
"grad_norm": 1.8566784858703613,
"learning_rate": 1.9797317087458263e-05,
"loss": 0.2081,
"step": 8720
},
{
"epoch": 1.5263571990558615,
"grad_norm": 2.14294695854187,
"learning_rate": 1.9773885536875404e-05,
"loss": 0.2094,
"step": 8730
},
{
"epoch": 1.5281056036366816,
"grad_norm": 3.8622517585754395,
"learning_rate": 1.9750453986292545e-05,
"loss": 0.1816,
"step": 8740
},
{
"epoch": 1.5298540082175016,
"grad_norm": 3.9352099895477295,
"learning_rate": 1.9727022435709685e-05,
"loss": 0.1895,
"step": 8750
},
{
"epoch": 1.5316024127983217,
"grad_norm": 2.0230891704559326,
"learning_rate": 1.9703590885126826e-05,
"loss": 0.2077,
"step": 8760
},
{
"epoch": 1.5333508173791417,
"grad_norm": 3.11130952835083,
"learning_rate": 1.9680159334543963e-05,
"loss": 0.1958,
"step": 8770
},
{
"epoch": 1.5350992219599615,
"grad_norm": 2.240939140319824,
"learning_rate": 1.9656727783961104e-05,
"loss": 0.1949,
"step": 8780
},
{
"epoch": 1.5368476265407816,
"grad_norm": 3.9747726917266846,
"learning_rate": 1.9633296233378244e-05,
"loss": 0.2115,
"step": 8790
},
{
"epoch": 1.5385960311216016,
"grad_norm": 2.0578691959381104,
"learning_rate": 1.9609864682795385e-05,
"loss": 0.2361,
"step": 8800
},
{
"epoch": 1.5403444357024214,
"grad_norm": 1.8161098957061768,
"learning_rate": 1.9586433132212525e-05,
"loss": 0.1738,
"step": 8810
},
{
"epoch": 1.5420928402832415,
"grad_norm": 2.508936643600464,
"learning_rate": 1.9563001581629666e-05,
"loss": 0.2129,
"step": 8820
},
{
"epoch": 1.5438412448640615,
"grad_norm": 1.8289694786071777,
"learning_rate": 1.9539570031046807e-05,
"loss": 0.1573,
"step": 8830
},
{
"epoch": 1.5455896494448815,
"grad_norm": 2.67386531829834,
"learning_rate": 1.9516138480463947e-05,
"loss": 0.228,
"step": 8840
},
{
"epoch": 1.5473380540257016,
"grad_norm": 2.565060615539551,
"learning_rate": 1.9492706929881088e-05,
"loss": 0.234,
"step": 8850
},
{
"epoch": 1.5490864586065216,
"grad_norm": 2.6208226680755615,
"learning_rate": 1.9469275379298228e-05,
"loss": 0.2135,
"step": 8860
},
{
"epoch": 1.5508348631873417,
"grad_norm": 2.1162941455841064,
"learning_rate": 1.944584382871537e-05,
"loss": 0.2138,
"step": 8870
},
{
"epoch": 1.5525832677681617,
"grad_norm": 3.716646194458008,
"learning_rate": 1.9422412278132506e-05,
"loss": 0.2058,
"step": 8880
},
{
"epoch": 1.5543316723489815,
"grad_norm": 2.363117218017578,
"learning_rate": 1.9398980727549647e-05,
"loss": 0.2185,
"step": 8890
},
{
"epoch": 1.5560800769298015,
"grad_norm": 4.495354175567627,
"learning_rate": 1.9375549176966787e-05,
"loss": 0.2137,
"step": 8900
},
{
"epoch": 1.5578284815106216,
"grad_norm": 1.6843703985214233,
"learning_rate": 1.9352117626383928e-05,
"loss": 0.2047,
"step": 8910
},
{
"epoch": 1.5595768860914414,
"grad_norm": 3.8748202323913574,
"learning_rate": 1.932868607580107e-05,
"loss": 0.2283,
"step": 8920
},
{
"epoch": 1.5613252906722614,
"grad_norm": 2.3094186782836914,
"learning_rate": 1.930525452521821e-05,
"loss": 0.1873,
"step": 8930
},
{
"epoch": 1.5630736952530815,
"grad_norm": 1.842756986618042,
"learning_rate": 1.928182297463535e-05,
"loss": 0.19,
"step": 8940
},
{
"epoch": 1.5648220998339015,
"grad_norm": 2.115356206893921,
"learning_rate": 1.9258391424052487e-05,
"loss": 0.2043,
"step": 8950
},
{
"epoch": 1.5665705044147216,
"grad_norm": 3.2382071018218994,
"learning_rate": 1.9234959873469627e-05,
"loss": 0.2055,
"step": 8960
},
{
"epoch": 1.5683189089955416,
"grad_norm": 2.4569027423858643,
"learning_rate": 1.9211528322886768e-05,
"loss": 0.1677,
"step": 8970
},
{
"epoch": 1.5700673135763616,
"grad_norm": 2.180326461791992,
"learning_rate": 1.918809677230391e-05,
"loss": 0.198,
"step": 8980
},
{
"epoch": 1.5718157181571817,
"grad_norm": 8.215828895568848,
"learning_rate": 1.916466522172105e-05,
"loss": 0.2735,
"step": 8990
},
{
"epoch": 1.5735641227380017,
"grad_norm": 2.064840316772461,
"learning_rate": 1.9143576826196474e-05,
"loss": 0.1926,
"step": 9000
},
{
"epoch": 1.5740886441122477,
"eval_loss": 0.36656469106674194,
"eval_runtime": 1792.8447,
"eval_samples_per_second": 8.078,
"eval_steps_per_second": 1.01,
"step": 9003
},
{
"epoch": 1.5753125273188215,
"grad_norm": 1.91609787940979,
"learning_rate": 1.9120145275613615e-05,
"loss": 0.2178,
"step": 9010
},
{
"epoch": 1.5770609318996416,
"grad_norm": 4.243214130401611,
"learning_rate": 1.9096713725030755e-05,
"loss": 0.2039,
"step": 9020
},
{
"epoch": 1.5788093364804616,
"grad_norm": 2.9057223796844482,
"learning_rate": 1.9073282174447896e-05,
"loss": 0.177,
"step": 9030
},
{
"epoch": 1.5805577410612814,
"grad_norm": 3.835923433303833,
"learning_rate": 1.9049850623865036e-05,
"loss": 0.19,
"step": 9040
},
{
"epoch": 1.5823061456421015,
"grad_norm": 2.18546986579895,
"learning_rate": 1.9026419073282177e-05,
"loss": 0.1639,
"step": 9050
},
{
"epoch": 1.5840545502229215,
"grad_norm": 2.5857343673706055,
"learning_rate": 1.9002987522699317e-05,
"loss": 0.2156,
"step": 9060
},
{
"epoch": 1.5858029548037416,
"grad_norm": 5.416774272918701,
"learning_rate": 1.8979555972116458e-05,
"loss": 0.2085,
"step": 9070
},
{
"epoch": 1.5875513593845616,
"grad_norm": 2.449079751968384,
"learning_rate": 1.89561244215336e-05,
"loss": 0.191,
"step": 9080
},
{
"epoch": 1.5892997639653816,
"grad_norm": 2.1129159927368164,
"learning_rate": 1.8932692870950736e-05,
"loss": 0.2079,
"step": 9090
},
{
"epoch": 1.5910481685462017,
"grad_norm": 2.000885009765625,
"learning_rate": 1.8909261320367876e-05,
"loss": 0.2055,
"step": 9100
},
{
"epoch": 1.5927965731270217,
"grad_norm": 2.0597760677337646,
"learning_rate": 1.8885829769785017e-05,
"loss": 0.1894,
"step": 9110
},
{
"epoch": 1.5945449777078418,
"grad_norm": 2.180694818496704,
"learning_rate": 1.8862398219202158e-05,
"loss": 0.1801,
"step": 9120
},
{
"epoch": 1.5962933822886616,
"grad_norm": 2.4223179817199707,
"learning_rate": 1.8838966668619298e-05,
"loss": 0.1895,
"step": 9130
},
{
"epoch": 1.5980417868694816,
"grad_norm": 1.957263708114624,
"learning_rate": 1.8815535118036435e-05,
"loss": 0.2013,
"step": 9140
},
{
"epoch": 1.5997901914503014,
"grad_norm": 1.6094838380813599,
"learning_rate": 1.8792103567453576e-05,
"loss": 0.2229,
"step": 9150
},
{
"epoch": 1.6015385960311215,
"grad_norm": 1.6934531927108765,
"learning_rate": 1.8768672016870717e-05,
"loss": 0.1792,
"step": 9160
},
{
"epoch": 1.6032870006119415,
"grad_norm": 1.3896024227142334,
"learning_rate": 1.8745240466287857e-05,
"loss": 0.2096,
"step": 9170
},
{
"epoch": 1.6050354051927616,
"grad_norm": 1.9864113330841064,
"learning_rate": 1.8724152070763285e-05,
"loss": 0.2035,
"step": 9180
},
{
"epoch": 1.6067838097735816,
"grad_norm": 1.592942714691162,
"learning_rate": 1.8700720520180426e-05,
"loss": 0.2076,
"step": 9190
},
{
"epoch": 1.6085322143544016,
"grad_norm": 1.9681413173675537,
"learning_rate": 1.8677288969597563e-05,
"loss": 0.1514,
"step": 9200
},
{
"epoch": 1.6102806189352217,
"grad_norm": 1.6271953582763672,
"learning_rate": 1.8653857419014704e-05,
"loss": 0.2308,
"step": 9210
},
{
"epoch": 1.6120290235160417,
"grad_norm": 2.731745481491089,
"learning_rate": 1.8630425868431844e-05,
"loss": 0.1761,
"step": 9220
},
{
"epoch": 1.6137774280968618,
"grad_norm": 1.8629202842712402,
"learning_rate": 1.8606994317848985e-05,
"loss": 0.1986,
"step": 9230
},
{
"epoch": 1.6155258326776816,
"grad_norm": 2.563565731048584,
"learning_rate": 1.8583562767266125e-05,
"loss": 0.2051,
"step": 9240
},
{
"epoch": 1.6172742372585016,
"grad_norm": 3.7784006595611572,
"learning_rate": 1.8560131216683266e-05,
"loss": 0.161,
"step": 9250
},
{
"epoch": 1.6190226418393217,
"grad_norm": 2.4034857749938965,
"learning_rate": 1.8536699666100407e-05,
"loss": 0.1901,
"step": 9260
},
{
"epoch": 1.6207710464201415,
"grad_norm": 1.440220594406128,
"learning_rate": 1.8513268115517547e-05,
"loss": 0.1724,
"step": 9270
},
{
"epoch": 1.6225194510009615,
"grad_norm": 3.3751020431518555,
"learning_rate": 1.8489836564934688e-05,
"loss": 0.2411,
"step": 9280
},
{
"epoch": 1.6242678555817816,
"grad_norm": 1.5174119472503662,
"learning_rate": 1.846640501435183e-05,
"loss": 0.2148,
"step": 9290
},
{
"epoch": 1.6260162601626016,
"grad_norm": 1.9150112867355347,
"learning_rate": 1.8442973463768966e-05,
"loss": 0.1907,
"step": 9300
},
{
"epoch": 1.6277646647434216,
"grad_norm": 2.028003692626953,
"learning_rate": 1.8419541913186106e-05,
"loss": 0.2156,
"step": 9310
},
{
"epoch": 1.6295130693242417,
"grad_norm": 1.975215196609497,
"learning_rate": 1.8396110362603247e-05,
"loss": 0.2151,
"step": 9320
},
{
"epoch": 1.6312614739050617,
"grad_norm": 3.184420347213745,
"learning_rate": 1.8372678812020387e-05,
"loss": 0.1918,
"step": 9330
},
{
"epoch": 1.6330098784858817,
"grad_norm": 2.1042797565460205,
"learning_rate": 1.8349247261437525e-05,
"loss": 0.1825,
"step": 9340
},
{
"epoch": 1.6347582830667018,
"grad_norm": 2.3438453674316406,
"learning_rate": 1.8325815710854665e-05,
"loss": 0.2106,
"step": 9350
},
{
"epoch": 1.6365066876475216,
"grad_norm": 2.2266077995300293,
"learning_rate": 1.8302384160271806e-05,
"loss": 0.2422,
"step": 9360
},
{
"epoch": 1.6382550922283416,
"grad_norm": 2.7527801990509033,
"learning_rate": 1.8278952609688946e-05,
"loss": 0.2066,
"step": 9370
},
{
"epoch": 1.6400034968091617,
"grad_norm": 1.2006065845489502,
"learning_rate": 1.8255521059106087e-05,
"loss": 0.1604,
"step": 9380
},
{
"epoch": 1.6417519013899815,
"grad_norm": 2.272947311401367,
"learning_rate": 1.8232089508523227e-05,
"loss": 0.1771,
"step": 9390
},
{
"epoch": 1.6435003059708015,
"grad_norm": 3.1900084018707275,
"learning_rate": 1.8208657957940368e-05,
"loss": 0.2074,
"step": 9400
},
{
"epoch": 1.6452487105516216,
"grad_norm": 2.1115543842315674,
"learning_rate": 1.818522640735751e-05,
"loss": 0.2119,
"step": 9410
},
{
"epoch": 1.6469971151324416,
"grad_norm": 2.303034543991089,
"learning_rate": 1.816179485677465e-05,
"loss": 0.2049,
"step": 9420
},
{
"epoch": 1.6487455197132617,
"grad_norm": 2.002100944519043,
"learning_rate": 1.813836330619179e-05,
"loss": 0.1819,
"step": 9430
},
{
"epoch": 1.6504939242940817,
"grad_norm": 2.697618246078491,
"learning_rate": 1.811493175560893e-05,
"loss": 0.2178,
"step": 9440
},
{
"epoch": 1.6522423288749017,
"grad_norm": 1.8766717910766602,
"learning_rate": 1.8091500205026068e-05,
"loss": 0.1941,
"step": 9450
},
{
"epoch": 1.6539907334557218,
"grad_norm": 1.7633792161941528,
"learning_rate": 1.8068068654443208e-05,
"loss": 0.1462,
"step": 9460
},
{
"epoch": 1.6557391380365416,
"grad_norm": 2.1197335720062256,
"learning_rate": 1.804463710386035e-05,
"loss": 0.1916,
"step": 9470
},
{
"epoch": 1.6574875426173616,
"grad_norm": 2.715771436691284,
"learning_rate": 1.802120555327749e-05,
"loss": 0.2045,
"step": 9480
},
{
"epoch": 1.6592359471981817,
"grad_norm": 2.6755380630493164,
"learning_rate": 1.799777400269463e-05,
"loss": 0.1736,
"step": 9490
},
{
"epoch": 1.6609843517790015,
"grad_norm": 2.555866003036499,
"learning_rate": 1.797434245211177e-05,
"loss": 0.1701,
"step": 9500
},
{
"epoch": 1.6627327563598215,
"grad_norm": 1.945939540863037,
"learning_rate": 1.79532540565872e-05,
"loss": 0.2216,
"step": 9510
},
{
"epoch": 1.6644811609406416,
"grad_norm": 1.77175772190094,
"learning_rate": 1.7929822506004336e-05,
"loss": 0.2054,
"step": 9520
},
{
"epoch": 1.6662295655214616,
"grad_norm": 2.1837494373321533,
"learning_rate": 1.7906390955421477e-05,
"loss": 0.1917,
"step": 9530
},
{
"epoch": 1.6679779701022817,
"grad_norm": 2.8827102184295654,
"learning_rate": 1.7882959404838617e-05,
"loss": 0.1919,
"step": 9540
},
{
"epoch": 1.6697263746831017,
"grad_norm": 1.9127072095870972,
"learning_rate": 1.7859527854255754e-05,
"loss": 0.1998,
"step": 9550
},
{
"epoch": 1.6714747792639217,
"grad_norm": 3.5764472484588623,
"learning_rate": 1.7836096303672895e-05,
"loss": 0.1751,
"step": 9560
},
{
"epoch": 1.6732231838447418,
"grad_norm": 3.749941110610962,
"learning_rate": 1.7812664753090035e-05,
"loss": 0.2057,
"step": 9570
},
{
"epoch": 1.6749715884255618,
"grad_norm": 2.783621072769165,
"learning_rate": 1.7789233202507176e-05,
"loss": 0.1788,
"step": 9580
},
{
"epoch": 1.6767199930063816,
"grad_norm": 1.676371455192566,
"learning_rate": 1.7765801651924317e-05,
"loss": 0.1926,
"step": 9590
},
{
"epoch": 1.6784683975872017,
"grad_norm": 2.354701519012451,
"learning_rate": 1.7742370101341457e-05,
"loss": 0.2128,
"step": 9600
},
{
"epoch": 1.6802168021680217,
"grad_norm": 1.633273720741272,
"learning_rate": 1.7718938550758598e-05,
"loss": 0.2343,
"step": 9610
},
{
"epoch": 1.6819652067488415,
"grad_norm": 1.3156176805496216,
"learning_rate": 1.769550700017574e-05,
"loss": 0.1748,
"step": 9620
},
{
"epoch": 1.6837136113296616,
"grad_norm": 1.5171828269958496,
"learning_rate": 1.767207544959288e-05,
"loss": 0.1877,
"step": 9630
},
{
"epoch": 1.6854620159104816,
"grad_norm": 2.5515353679656982,
"learning_rate": 1.764864389901002e-05,
"loss": 0.1922,
"step": 9640
},
{
"epoch": 1.6872104204913017,
"grad_norm": 2.472705364227295,
"learning_rate": 1.762521234842716e-05,
"loss": 0.1552,
"step": 9650
},
{
"epoch": 1.6889588250721217,
"grad_norm": 2.7057673931121826,
"learning_rate": 1.7601780797844297e-05,
"loss": 0.1945,
"step": 9660
},
{
"epoch": 1.6907072296529417,
"grad_norm": 2.4016506671905518,
"learning_rate": 1.7578349247261438e-05,
"loss": 0.2261,
"step": 9670
},
{
"epoch": 1.6924556342337618,
"grad_norm": 2.49354887008667,
"learning_rate": 1.755491769667858e-05,
"loss": 0.1836,
"step": 9680
},
{
"epoch": 1.6942040388145818,
"grad_norm": 3.3163530826568604,
"learning_rate": 1.753148614609572e-05,
"loss": 0.2011,
"step": 9690
},
{
"epoch": 1.6959524433954019,
"grad_norm": 1.8435873985290527,
"learning_rate": 1.750805459551286e-05,
"loss": 0.1976,
"step": 9700
},
{
"epoch": 1.6977008479762217,
"grad_norm": 2.1045944690704346,
"learning_rate": 1.748462304493e-05,
"loss": 0.2117,
"step": 9710
},
{
"epoch": 1.6994492525570417,
"grad_norm": 2.2373058795928955,
"learning_rate": 1.746119149434714e-05,
"loss": 0.2231,
"step": 9720
},
{
"epoch": 1.7011976571378618,
"grad_norm": 4.231749534606934,
"learning_rate": 1.743775994376428e-05,
"loss": 0.2033,
"step": 9730
},
{
"epoch": 1.7029460617186816,
"grad_norm": 1.860250473022461,
"learning_rate": 1.7414328393181422e-05,
"loss": 0.1666,
"step": 9740
},
{
"epoch": 1.7046944662995016,
"grad_norm": 1.631990909576416,
"learning_rate": 1.7390896842598563e-05,
"loss": 0.1946,
"step": 9750
},
{
"epoch": 1.7064428708803216,
"grad_norm": 2.109370470046997,
"learning_rate": 1.7367465292015703e-05,
"loss": 0.2146,
"step": 9760
},
{
"epoch": 1.7081912754611417,
"grad_norm": 1.7522424459457397,
"learning_rate": 1.734403374143284e-05,
"loss": 0.2189,
"step": 9770
},
{
"epoch": 1.7099396800419617,
"grad_norm": 2.2851176261901855,
"learning_rate": 1.732060219084998e-05,
"loss": 0.1709,
"step": 9780
},
{
"epoch": 1.7116880846227818,
"grad_norm": 1.994432806968689,
"learning_rate": 1.729717064026712e-05,
"loss": 0.1814,
"step": 9790
},
{
"epoch": 1.7134364892036018,
"grad_norm": 1.5442957878112793,
"learning_rate": 1.7273739089684262e-05,
"loss": 0.1878,
"step": 9800
},
{
"epoch": 1.7151848937844218,
"grad_norm": 1.8015555143356323,
"learning_rate": 1.72503075391014e-05,
"loss": 0.2224,
"step": 9810
},
{
"epoch": 1.7169332983652417,
"grad_norm": 2.210533380508423,
"learning_rate": 1.722687598851854e-05,
"loss": 0.209,
"step": 9820
},
{
"epoch": 1.7186817029460617,
"grad_norm": 1.9179598093032837,
"learning_rate": 1.720344443793568e-05,
"loss": 0.1857,
"step": 9830
},
{
"epoch": 1.7204301075268817,
"grad_norm": 3.421584367752075,
"learning_rate": 1.718001288735282e-05,
"loss": 0.219,
"step": 9840
},
{
"epoch": 1.7221785121077016,
"grad_norm": 2.5188374519348145,
"learning_rate": 1.715658133676996e-05,
"loss": 0.1896,
"step": 9850
},
{
"epoch": 1.7239269166885216,
"grad_norm": 2.2160191535949707,
"learning_rate": 1.7133149786187102e-05,
"loss": 0.1987,
"step": 9860
},
{
"epoch": 1.7256753212693416,
"grad_norm": 2.5851011276245117,
"learning_rate": 1.7109718235604243e-05,
"loss": 0.1861,
"step": 9870
},
{
"epoch": 1.7274237258501617,
"grad_norm": 1.6431199312210083,
"learning_rate": 1.7086286685021383e-05,
"loss": 0.167,
"step": 9880
},
{
"epoch": 1.7291721304309817,
"grad_norm": 2.0865867137908936,
"learning_rate": 1.7062855134438524e-05,
"loss": 0.1806,
"step": 9890
},
{
"epoch": 1.7309205350118018,
"grad_norm": 1.9474307298660278,
"learning_rate": 1.7039423583855665e-05,
"loss": 0.1907,
"step": 9900
},
{
"epoch": 1.7326689395926218,
"grad_norm": 2.301378011703491,
"learning_rate": 1.7015992033272802e-05,
"loss": 0.2028,
"step": 9910
},
{
"epoch": 1.7344173441734418,
"grad_norm": 1.9113527536392212,
"learning_rate": 1.6992560482689942e-05,
"loss": 0.2172,
"step": 9920
},
{
"epoch": 1.7361657487542619,
"grad_norm": 2.14692759513855,
"learning_rate": 1.6969128932107083e-05,
"loss": 0.1846,
"step": 9930
},
{
"epoch": 1.7379141533350817,
"grad_norm": 2.062518358230591,
"learning_rate": 1.6945697381524224e-05,
"loss": 0.1902,
"step": 9940
},
{
"epoch": 1.7396625579159017,
"grad_norm": 3.0750021934509277,
"learning_rate": 1.6922265830941364e-05,
"loss": 0.1944,
"step": 9950
},
{
"epoch": 1.7414109624967218,
"grad_norm": 1.9308764934539795,
"learning_rate": 1.6898834280358505e-05,
"loss": 0.2198,
"step": 9960
},
{
"epoch": 1.7431593670775416,
"grad_norm": 2.22654128074646,
"learning_rate": 1.6875402729775645e-05,
"loss": 0.1913,
"step": 9970
},
{
"epoch": 1.7449077716583616,
"grad_norm": 2.7233572006225586,
"learning_rate": 1.6851971179192786e-05,
"loss": 0.198,
"step": 9980
},
{
"epoch": 1.7466561762391817,
"grad_norm": 2.2208783626556396,
"learning_rate": 1.6828539628609926e-05,
"loss": 0.2084,
"step": 9990
},
{
"epoch": 1.7484045808200017,
"grad_norm": 1.8110119104385376,
"learning_rate": 1.6805108078027067e-05,
"loss": 0.1946,
"step": 10000
},
{
"epoch": 1.7501529854008218,
"grad_norm": 2.9725208282470703,
"learning_rate": 1.6781676527444204e-05,
"loss": 0.179,
"step": 10010
},
{
"epoch": 1.7519013899816418,
"grad_norm": 5.847568511962891,
"learning_rate": 1.6758244976861345e-05,
"loss": 0.2022,
"step": 10020
},
{
"epoch": 1.7536497945624618,
"grad_norm": 2.0036888122558594,
"learning_rate": 1.6734813426278485e-05,
"loss": 0.1727,
"step": 10030
},
{
"epoch": 1.7553981991432819,
"grad_norm": 2.1402735710144043,
"learning_rate": 1.6711381875695626e-05,
"loss": 0.1903,
"step": 10040
},
{
"epoch": 1.757146603724102,
"grad_norm": 4.128479480743408,
"learning_rate": 1.6687950325112767e-05,
"loss": 0.2195,
"step": 10050
},
{
"epoch": 1.7588950083049217,
"grad_norm": 1.9900801181793213,
"learning_rate": 1.6664518774529904e-05,
"loss": 0.1946,
"step": 10060
},
{
"epoch": 1.7606434128857418,
"grad_norm": 1.4300425052642822,
"learning_rate": 1.6641087223947044e-05,
"loss": 0.1835,
"step": 10070
},
{
"epoch": 1.7623918174665618,
"grad_norm": 2.1098504066467285,
"learning_rate": 1.6617655673364185e-05,
"loss": 0.2234,
"step": 10080
},
{
"epoch": 1.7641402220473816,
"grad_norm": 4.656988143920898,
"learning_rate": 1.6594224122781326e-05,
"loss": 0.217,
"step": 10090
},
{
"epoch": 1.7658886266282017,
"grad_norm": 1.80104660987854,
"learning_rate": 1.6570792572198466e-05,
"loss": 0.1836,
"step": 10100
},
{
"epoch": 1.7676370312090217,
"grad_norm": 2.10807466506958,
"learning_rate": 1.6547361021615607e-05,
"loss": 0.2034,
"step": 10110
},
{
"epoch": 1.7693854357898418,
"grad_norm": 2.7385706901550293,
"learning_rate": 1.6523929471032747e-05,
"loss": 0.216,
"step": 10120
},
{
"epoch": 1.7711338403706618,
"grad_norm": 2.4053244590759277,
"learning_rate": 1.6500497920449888e-05,
"loss": 0.1993,
"step": 10130
},
{
"epoch": 1.7728822449514818,
"grad_norm": 2.8163459300994873,
"learning_rate": 1.647706636986703e-05,
"loss": 0.1719,
"step": 10140
},
{
"epoch": 1.7746306495323019,
"grad_norm": 1.9868711233139038,
"learning_rate": 1.645363481928417e-05,
"loss": 0.2597,
"step": 10150
},
{
"epoch": 1.776379054113122,
"grad_norm": 2.1938767433166504,
"learning_rate": 1.643020326870131e-05,
"loss": 0.1863,
"step": 10160
},
{
"epoch": 1.7781274586939417,
"grad_norm": 2.0124146938323975,
"learning_rate": 1.6406771718118447e-05,
"loss": 0.1958,
"step": 10170
},
{
"epoch": 1.7798758632747618,
"grad_norm": 1.6832975149154663,
"learning_rate": 1.6383340167535587e-05,
"loss": 0.206,
"step": 10180
},
{
"epoch": 1.7816242678555818,
"grad_norm": 2.3199076652526855,
"learning_rate": 1.6359908616952728e-05,
"loss": 0.2055,
"step": 10190
},
{
"epoch": 1.7833726724364016,
"grad_norm": 3.9184508323669434,
"learning_rate": 1.633647706636987e-05,
"loss": 0.207,
"step": 10200
},
{
"epoch": 1.7851210770172217,
"grad_norm": 2.7728350162506104,
"learning_rate": 1.631304551578701e-05,
"loss": 0.1892,
"step": 10210
},
{
"epoch": 1.7868694815980417,
"grad_norm": 2.6902334690093994,
"learning_rate": 1.628961396520415e-05,
"loss": 0.1797,
"step": 10220
},
{
"epoch": 1.7886178861788617,
"grad_norm": 1.4976388216018677,
"learning_rate": 1.626618241462129e-05,
"loss": 0.181,
"step": 10230
},
{
"epoch": 1.7903662907596818,
"grad_norm": 2.592813730239868,
"learning_rate": 1.624275086403843e-05,
"loss": 0.192,
"step": 10240
},
{
"epoch": 1.7921146953405018,
"grad_norm": 1.3043900728225708,
"learning_rate": 1.6219319313455568e-05,
"loss": 0.177,
"step": 10250
},
{
"epoch": 1.7938630999213219,
"grad_norm": 3.4276037216186523,
"learning_rate": 1.619588776287271e-05,
"loss": 0.1738,
"step": 10260
},
{
"epoch": 1.795611504502142,
"grad_norm": 3.072364568710327,
"learning_rate": 1.617245621228985e-05,
"loss": 0.211,
"step": 10270
},
{
"epoch": 1.797359909082962,
"grad_norm": 1.6340441703796387,
"learning_rate": 1.614902466170699e-05,
"loss": 0.1852,
"step": 10280
},
{
"epoch": 1.7991083136637818,
"grad_norm": 3.1025967597961426,
"learning_rate": 1.612559311112413e-05,
"loss": 0.1863,
"step": 10290
},
{
"epoch": 1.8008567182446018,
"grad_norm": 2.3155341148376465,
"learning_rate": 1.610216156054127e-05,
"loss": 0.1998,
"step": 10300
},
{
"epoch": 1.8026051228254218,
"grad_norm": 1.6753582954406738,
"learning_rate": 1.6078730009958408e-05,
"loss": 0.1839,
"step": 10310
},
{
"epoch": 1.8043535274062417,
"grad_norm": 1.5715868473052979,
"learning_rate": 1.605529845937555e-05,
"loss": 0.1732,
"step": 10320
},
{
"epoch": 1.8061019319870617,
"grad_norm": 2.3362326622009277,
"learning_rate": 1.603186690879269e-05,
"loss": 0.2014,
"step": 10330
},
{
"epoch": 1.8078503365678817,
"grad_norm": 2.038578510284424,
"learning_rate": 1.600843535820983e-05,
"loss": 0.1737,
"step": 10340
},
{
"epoch": 1.8095987411487018,
"grad_norm": 3.527510643005371,
"learning_rate": 1.598500380762697e-05,
"loss": 0.2329,
"step": 10350
},
{
"epoch": 1.8113471457295218,
"grad_norm": 1.9759219884872437,
"learning_rate": 1.596157225704411e-05,
"loss": 0.1867,
"step": 10360
},
{
"epoch": 1.8130955503103419,
"grad_norm": 1.822554588317871,
"learning_rate": 1.5938140706461252e-05,
"loss": 0.1904,
"step": 10370
},
{
"epoch": 1.814843954891162,
"grad_norm": 2.0749101638793945,
"learning_rate": 1.5914709155878392e-05,
"loss": 0.1875,
"step": 10380
},
{
"epoch": 1.816592359471982,
"grad_norm": 2.470715045928955,
"learning_rate": 1.5891277605295533e-05,
"loss": 0.1864,
"step": 10390
},
{
"epoch": 1.818340764052802,
"grad_norm": 1.7627729177474976,
"learning_rate": 1.5867846054712674e-05,
"loss": 0.1887,
"step": 10400
},
{
"epoch": 1.8200891686336218,
"grad_norm": 1.9679255485534668,
"learning_rate": 1.5844414504129814e-05,
"loss": 0.2045,
"step": 10410
},
{
"epoch": 1.8218375732144418,
"grad_norm": 2.222324848175049,
"learning_rate": 1.582098295354695e-05,
"loss": 0.1988,
"step": 10420
},
{
"epoch": 1.8235859777952617,
"grad_norm": 2.272304058074951,
"learning_rate": 1.5797551402964092e-05,
"loss": 0.1825,
"step": 10430
},
{
"epoch": 1.8253343823760817,
"grad_norm": 2.1010255813598633,
"learning_rate": 1.5774119852381232e-05,
"loss": 0.1801,
"step": 10440
},
{
"epoch": 1.8270827869569017,
"grad_norm": 2.3894827365875244,
"learning_rate": 1.5750688301798373e-05,
"loss": 0.1912,
"step": 10450
},
{
"epoch": 1.8288311915377218,
"grad_norm": 2.351548910140991,
"learning_rate": 1.5727256751215514e-05,
"loss": 0.1814,
"step": 10460
},
{
"epoch": 1.8305795961185418,
"grad_norm": 1.5725765228271484,
"learning_rate": 1.5703825200632654e-05,
"loss": 0.2019,
"step": 10470
},
{
"epoch": 1.8323280006993619,
"grad_norm": 2.2197859287261963,
"learning_rate": 1.5680393650049795e-05,
"loss": 0.2174,
"step": 10480
},
{
"epoch": 1.834076405280182,
"grad_norm": 2.1431429386138916,
"learning_rate": 1.5656962099466932e-05,
"loss": 0.1944,
"step": 10490
},
{
"epoch": 1.835824809861002,
"grad_norm": 2.159376382827759,
"learning_rate": 1.5633530548884073e-05,
"loss": 0.1816,
"step": 10500
},
{
"epoch": 1.837573214441822,
"grad_norm": 2.3646135330200195,
"learning_rate": 1.5610098998301213e-05,
"loss": 0.1694,
"step": 10510
},
{
"epoch": 1.8393216190226418,
"grad_norm": 1.8579986095428467,
"learning_rate": 1.5586667447718354e-05,
"loss": 0.1852,
"step": 10520
},
{
"epoch": 1.8410700236034618,
"grad_norm": 1.8673964738845825,
"learning_rate": 1.5563235897135494e-05,
"loss": 0.215,
"step": 10530
},
{
"epoch": 1.8428184281842819,
"grad_norm": 1.8977408409118652,
"learning_rate": 1.5539804346552635e-05,
"loss": 0.1745,
"step": 10540
},
{
"epoch": 1.8445668327651017,
"grad_norm": 2.263277530670166,
"learning_rate": 1.5516372795969776e-05,
"loss": 0.1934,
"step": 10550
},
{
"epoch": 1.8463152373459217,
"grad_norm": 3.5230796337127686,
"learning_rate": 1.5492941245386913e-05,
"loss": 0.2121,
"step": 10560
},
{
"epoch": 1.8480636419267418,
"grad_norm": 2.2471237182617188,
"learning_rate": 1.5469509694804053e-05,
"loss": 0.1665,
"step": 10570
},
{
"epoch": 1.8498120465075618,
"grad_norm": 1.7764135599136353,
"learning_rate": 1.5446078144221194e-05,
"loss": 0.1789,
"step": 10580
},
{
"epoch": 1.8515604510883819,
"grad_norm": 2.5531044006347656,
"learning_rate": 1.5422646593638334e-05,
"loss": 0.198,
"step": 10590
},
{
"epoch": 1.853308855669202,
"grad_norm": 1.9833929538726807,
"learning_rate": 1.5399215043055475e-05,
"loss": 0.1773,
"step": 10600
},
{
"epoch": 1.855057260250022,
"grad_norm": 3.2580177783966064,
"learning_rate": 1.5375783492472616e-05,
"loss": 0.1999,
"step": 10610
},
{
"epoch": 1.856805664830842,
"grad_norm": 1.653817057609558,
"learning_rate": 1.5352351941889756e-05,
"loss": 0.1903,
"step": 10620
},
{
"epoch": 1.858554069411662,
"grad_norm": 2.687511920928955,
"learning_rate": 1.5328920391306897e-05,
"loss": 0.1955,
"step": 10630
},
{
"epoch": 1.8603024739924818,
"grad_norm": 2.3848729133605957,
"learning_rate": 1.5305488840724037e-05,
"loss": 0.1644,
"step": 10640
},
{
"epoch": 1.8620508785733019,
"grad_norm": 3.581122636795044,
"learning_rate": 1.5282057290141178e-05,
"loss": 0.1731,
"step": 10650
},
{
"epoch": 1.863799283154122,
"grad_norm": 2.3530807495117188,
"learning_rate": 1.5258625739558317e-05,
"loss": 0.1719,
"step": 10660
},
{
"epoch": 1.8655476877349417,
"grad_norm": 3.611687421798706,
"learning_rate": 1.5235194188975457e-05,
"loss": 0.2315,
"step": 10670
},
{
"epoch": 1.8672960923157618,
"grad_norm": 2.133084774017334,
"learning_rate": 1.5211762638392598e-05,
"loss": 0.222,
"step": 10680
},
{
"epoch": 1.8690444968965818,
"grad_norm": 2.017202854156494,
"learning_rate": 1.5188331087809739e-05,
"loss": 0.1537,
"step": 10690
},
{
"epoch": 1.8707929014774018,
"grad_norm": 4.252660751342773,
"learning_rate": 1.5164899537226878e-05,
"loss": 0.2105,
"step": 10700
},
{
"epoch": 1.8725413060582219,
"grad_norm": 1.902491569519043,
"learning_rate": 1.5141467986644018e-05,
"loss": 0.1876,
"step": 10710
},
{
"epoch": 1.874289710639042,
"grad_norm": 5.36565637588501,
"learning_rate": 1.5118036436061159e-05,
"loss": 0.1734,
"step": 10720
},
{
"epoch": 1.876038115219862,
"grad_norm": 1.6138919591903687,
"learning_rate": 1.5094604885478296e-05,
"loss": 0.1812,
"step": 10730
},
{
"epoch": 1.877786519800682,
"grad_norm": 3.4811031818389893,
"learning_rate": 1.5071173334895436e-05,
"loss": 0.1883,
"step": 10740
},
{
"epoch": 1.8795349243815018,
"grad_norm": 2.3118202686309814,
"learning_rate": 1.5047741784312577e-05,
"loss": 0.1933,
"step": 10750
},
{
"epoch": 1.8812833289623219,
"grad_norm": 2.5845682621002197,
"learning_rate": 1.5024310233729718e-05,
"loss": 0.2122,
"step": 10760
},
{
"epoch": 1.883031733543142,
"grad_norm": 1.4849445819854736,
"learning_rate": 1.5000878683146858e-05,
"loss": 0.1763,
"step": 10770
},
{
"epoch": 1.8847801381239617,
"grad_norm": 1.851722240447998,
"learning_rate": 1.4977447132563999e-05,
"loss": 0.1837,
"step": 10780
},
{
"epoch": 1.8865285427047818,
"grad_norm": 1.793611764907837,
"learning_rate": 1.4954015581981138e-05,
"loss": 0.1894,
"step": 10790
},
{
"epoch": 1.8882769472856018,
"grad_norm": 1.6560570001602173,
"learning_rate": 1.4930584031398278e-05,
"loss": 0.2056,
"step": 10800
},
{
"epoch": 1.8900253518664218,
"grad_norm": 2.940868377685547,
"learning_rate": 1.4907152480815419e-05,
"loss": 0.2131,
"step": 10810
},
{
"epoch": 1.8917737564472419,
"grad_norm": 1.374992847442627,
"learning_rate": 1.488372093023256e-05,
"loss": 0.2025,
"step": 10820
},
{
"epoch": 1.893522161028062,
"grad_norm": 4.640373229980469,
"learning_rate": 1.48602893796497e-05,
"loss": 0.2027,
"step": 10830
},
{
"epoch": 1.895270565608882,
"grad_norm": 1.665847659111023,
"learning_rate": 1.4836857829066839e-05,
"loss": 0.1858,
"step": 10840
},
{
"epoch": 1.897018970189702,
"grad_norm": 2.2026145458221436,
"learning_rate": 1.481342627848398e-05,
"loss": 0.1924,
"step": 10850
},
{
"epoch": 1.898767374770522,
"grad_norm": 2.361902952194214,
"learning_rate": 1.478999472790112e-05,
"loss": 0.2083,
"step": 10860
},
{
"epoch": 1.9005157793513419,
"grad_norm": 2.2269675731658936,
"learning_rate": 1.476656317731826e-05,
"loss": 0.1747,
"step": 10870
},
{
"epoch": 1.902264183932162,
"grad_norm": 2.43367338180542,
"learning_rate": 1.4743131626735401e-05,
"loss": 0.1902,
"step": 10880
},
{
"epoch": 1.904012588512982,
"grad_norm": 1.6025736331939697,
"learning_rate": 1.4719700076152542e-05,
"loss": 0.1636,
"step": 10890
},
{
"epoch": 1.9057609930938018,
"grad_norm": 2.627732038497925,
"learning_rate": 1.469626852556968e-05,
"loss": 0.1993,
"step": 10900
},
{
"epoch": 1.9075093976746218,
"grad_norm": 2.630786180496216,
"learning_rate": 1.4672836974986821e-05,
"loss": 0.1601,
"step": 10910
},
{
"epoch": 1.9092578022554418,
"grad_norm": 2.772336006164551,
"learning_rate": 1.4649405424403962e-05,
"loss": 0.2306,
"step": 10920
},
{
"epoch": 1.9110062068362619,
"grad_norm": 2.6443400382995605,
"learning_rate": 1.4625973873821103e-05,
"loss": 0.1774,
"step": 10930
},
{
"epoch": 1.912754611417082,
"grad_norm": 2.066016435623169,
"learning_rate": 1.4602542323238243e-05,
"loss": 0.1781,
"step": 10940
},
{
"epoch": 1.914503015997902,
"grad_norm": 2.2029716968536377,
"learning_rate": 1.4579110772655382e-05,
"loss": 0.1656,
"step": 10950
},
{
"epoch": 1.916251420578722,
"grad_norm": 2.0414412021636963,
"learning_rate": 1.4555679222072523e-05,
"loss": 0.1927,
"step": 10960
},
{
"epoch": 1.917999825159542,
"grad_norm": 2.9510364532470703,
"learning_rate": 1.4532247671489661e-05,
"loss": 0.1812,
"step": 10970
},
{
"epoch": 1.919748229740362,
"grad_norm": 4.375843524932861,
"learning_rate": 1.4508816120906802e-05,
"loss": 0.2294,
"step": 10980
},
{
"epoch": 1.921496634321182,
"grad_norm": 1.7535252571105957,
"learning_rate": 1.4485384570323941e-05,
"loss": 0.1728,
"step": 10990
},
{
"epoch": 1.923245038902002,
"grad_norm": 2.152392864227295,
"learning_rate": 1.4464296174799367e-05,
"loss": 0.1896,
"step": 11000
},
{
"epoch": 1.924993443482822,
"grad_norm": 1.4847190380096436,
"learning_rate": 1.4440864624216508e-05,
"loss": 0.2039,
"step": 11010
},
{
"epoch": 1.9267418480636418,
"grad_norm": 2.027345895767212,
"learning_rate": 1.4417433073633649e-05,
"loss": 0.2019,
"step": 11020
},
{
"epoch": 1.9284902526444618,
"grad_norm": 2.9172937870025635,
"learning_rate": 1.439400152305079e-05,
"loss": 0.1894,
"step": 11030
},
{
"epoch": 1.9302386572252819,
"grad_norm": 2.5305731296539307,
"learning_rate": 1.437056997246793e-05,
"loss": 0.2269,
"step": 11040
},
{
"epoch": 1.931987061806102,
"grad_norm": 1.8299009799957275,
"learning_rate": 1.4347138421885069e-05,
"loss": 0.1715,
"step": 11050
},
{
"epoch": 1.933735466386922,
"grad_norm": 2.175562620162964,
"learning_rate": 1.432370687130221e-05,
"loss": 0.1625,
"step": 11060
},
{
"epoch": 1.935483870967742,
"grad_norm": 1.5124961137771606,
"learning_rate": 1.430027532071935e-05,
"loss": 0.1808,
"step": 11070
},
{
"epoch": 1.937232275548562,
"grad_norm": 2.2984073162078857,
"learning_rate": 1.427684377013649e-05,
"loss": 0.1946,
"step": 11080
},
{
"epoch": 1.938980680129382,
"grad_norm": 1.8547557592391968,
"learning_rate": 1.4253412219553631e-05,
"loss": 0.1866,
"step": 11090
},
{
"epoch": 1.940729084710202,
"grad_norm": 2.1592087745666504,
"learning_rate": 1.4229980668970772e-05,
"loss": 0.1768,
"step": 11100
},
{
"epoch": 1.942477489291022,
"grad_norm": 2.334019660949707,
"learning_rate": 1.420654911838791e-05,
"loss": 0.1688,
"step": 11110
},
{
"epoch": 1.944225893871842,
"grad_norm": 1.6526786088943481,
"learning_rate": 1.4183117567805051e-05,
"loss": 0.1959,
"step": 11120
},
{
"epoch": 1.9459742984526618,
"grad_norm": 1.7332444190979004,
"learning_rate": 1.4159686017222192e-05,
"loss": 0.1918,
"step": 11130
},
{
"epoch": 1.9477227030334818,
"grad_norm": 1.4149800539016724,
"learning_rate": 1.4136254466639332e-05,
"loss": 0.1459,
"step": 11140
},
{
"epoch": 1.9494711076143019,
"grad_norm": 1.6035906076431274,
"learning_rate": 1.4112822916056473e-05,
"loss": 0.1686,
"step": 11150
},
{
"epoch": 1.951219512195122,
"grad_norm": 2.4894981384277344,
"learning_rate": 1.408939136547361e-05,
"loss": 0.1983,
"step": 11160
},
{
"epoch": 1.952967916775942,
"grad_norm": 1.827898621559143,
"learning_rate": 1.406595981489075e-05,
"loss": 0.1856,
"step": 11170
},
{
"epoch": 1.954716321356762,
"grad_norm": 1.7103666067123413,
"learning_rate": 1.4042528264307891e-05,
"loss": 0.1987,
"step": 11180
},
{
"epoch": 1.956464725937582,
"grad_norm": 1.692668080329895,
"learning_rate": 1.4019096713725032e-05,
"loss": 0.1761,
"step": 11190
},
{
"epoch": 1.958213130518402,
"grad_norm": 2.089463710784912,
"learning_rate": 1.399566516314217e-05,
"loss": 0.1899,
"step": 11200
},
{
"epoch": 1.959961535099222,
"grad_norm": 1.9206026792526245,
"learning_rate": 1.3972233612559311e-05,
"loss": 0.184,
"step": 11210
},
{
"epoch": 1.961709939680042,
"grad_norm": 1.8816583156585693,
"learning_rate": 1.3948802061976452e-05,
"loss": 0.1976,
"step": 11220
},
{
"epoch": 1.963458344260862,
"grad_norm": 4.568369388580322,
"learning_rate": 1.3925370511393592e-05,
"loss": 0.1745,
"step": 11230
},
{
"epoch": 1.965206748841682,
"grad_norm": 1.8514816761016846,
"learning_rate": 1.3901938960810733e-05,
"loss": 0.1673,
"step": 11240
},
{
"epoch": 1.9669551534225018,
"grad_norm": 4.430703163146973,
"learning_rate": 1.3878507410227872e-05,
"loss": 0.1754,
"step": 11250
},
{
"epoch": 1.9687035580033219,
"grad_norm": 1.8397603034973145,
"learning_rate": 1.3855075859645013e-05,
"loss": 0.254,
"step": 11260
},
{
"epoch": 1.970451962584142,
"grad_norm": 1.7431299686431885,
"learning_rate": 1.3831644309062153e-05,
"loss": 0.1611,
"step": 11270
},
{
"epoch": 1.972200367164962,
"grad_norm": 2.219205141067505,
"learning_rate": 1.3808212758479294e-05,
"loss": 0.1885,
"step": 11280
},
{
"epoch": 1.973948771745782,
"grad_norm": 2.130847930908203,
"learning_rate": 1.3784781207896434e-05,
"loss": 0.1949,
"step": 11290
},
{
"epoch": 1.975697176326602,
"grad_norm": 3.2657315731048584,
"learning_rate": 1.3761349657313575e-05,
"loss": 0.1534,
"step": 11300
},
{
"epoch": 1.977445580907422,
"grad_norm": 3.336939811706543,
"learning_rate": 1.3737918106730714e-05,
"loss": 0.1989,
"step": 11310
},
{
"epoch": 1.979193985488242,
"grad_norm": 1.9921077489852905,
"learning_rate": 1.3714486556147854e-05,
"loss": 0.1959,
"step": 11320
},
{
"epoch": 1.9809423900690621,
"grad_norm": 1.6148415803909302,
"learning_rate": 1.3691055005564995e-05,
"loss": 0.1564,
"step": 11330
},
{
"epoch": 1.982690794649882,
"grad_norm": 1.0459388494491577,
"learning_rate": 1.3667623454982136e-05,
"loss": 0.1766,
"step": 11340
},
{
"epoch": 1.984439199230702,
"grad_norm": 3.012587308883667,
"learning_rate": 1.3644191904399276e-05,
"loss": 0.1731,
"step": 11350
},
{
"epoch": 1.986187603811522,
"grad_norm": 1.4650547504425049,
"learning_rate": 1.3620760353816415e-05,
"loss": 0.1637,
"step": 11360
},
{
"epoch": 1.9879360083923419,
"grad_norm": 1.4707012176513672,
"learning_rate": 1.3597328803233556e-05,
"loss": 0.1917,
"step": 11370
},
{
"epoch": 1.989684412973162,
"grad_norm": 1.6757373809814453,
"learning_rate": 1.3573897252650696e-05,
"loss": 0.1974,
"step": 11380
},
{
"epoch": 1.991432817553982,
"grad_norm": 2.444448471069336,
"learning_rate": 1.3550465702067837e-05,
"loss": 0.1702,
"step": 11390
},
{
"epoch": 1.993181222134802,
"grad_norm": 2.135993003845215,
"learning_rate": 1.3527034151484974e-05,
"loss": 0.1989,
"step": 11400
},
{
"epoch": 1.994929626715622,
"grad_norm": 1.804882526397705,
"learning_rate": 1.3503602600902115e-05,
"loss": 0.1994,
"step": 11410
},
{
"epoch": 1.996678031296442,
"grad_norm": 1.5056926012039185,
"learning_rate": 1.3480171050319255e-05,
"loss": 0.1721,
"step": 11420
},
{
"epoch": 1.998426435877262,
"grad_norm": 1.6799705028533936,
"learning_rate": 1.3456739499736396e-05,
"loss": 0.1924,
"step": 11430
},
{
"epoch": 2.000174840458082,
"grad_norm": 1.5842691659927368,
"learning_rate": 1.3433307949153536e-05,
"loss": 0.2088,
"step": 11440
},
{
"epoch": 2.001923245038902,
"grad_norm": 1.5139780044555664,
"learning_rate": 1.3409876398570675e-05,
"loss": 0.1146,
"step": 11450
},
{
"epoch": 2.003671649619722,
"grad_norm": 1.719301462173462,
"learning_rate": 1.3386444847987816e-05,
"loss": 0.1513,
"step": 11460
},
{
"epoch": 2.005420054200542,
"grad_norm": 2.1831586360931396,
"learning_rate": 1.3363013297404956e-05,
"loss": 0.1308,
"step": 11470
},
{
"epoch": 2.007168458781362,
"grad_norm": 2.1777830123901367,
"learning_rate": 1.3339581746822097e-05,
"loss": 0.1188,
"step": 11480
},
{
"epoch": 2.008916863362182,
"grad_norm": 1.8375096321105957,
"learning_rate": 1.3316150196239238e-05,
"loss": 0.1419,
"step": 11490
},
{
"epoch": 2.010665267943002,
"grad_norm": 2.3195183277130127,
"learning_rate": 1.3292718645656378e-05,
"loss": 0.125,
"step": 11500
},
{
"epoch": 2.012413672523822,
"grad_norm": 1.6732579469680786,
"learning_rate": 1.3269287095073517e-05,
"loss": 0.158,
"step": 11510
},
{
"epoch": 2.014162077104642,
"grad_norm": 1.6776469945907593,
"learning_rate": 1.3245855544490658e-05,
"loss": 0.1204,
"step": 11520
},
{
"epoch": 2.015910481685462,
"grad_norm": 1.415498971939087,
"learning_rate": 1.3222423993907798e-05,
"loss": 0.1534,
"step": 11530
},
{
"epoch": 2.017658886266282,
"grad_norm": 1.47972571849823,
"learning_rate": 1.3198992443324939e-05,
"loss": 0.1335,
"step": 11540
},
{
"epoch": 2.019407290847102,
"grad_norm": 3.020838499069214,
"learning_rate": 1.317556089274208e-05,
"loss": 0.1177,
"step": 11550
},
{
"epoch": 2.021155695427922,
"grad_norm": 2.071444034576416,
"learning_rate": 1.3152129342159218e-05,
"loss": 0.1442,
"step": 11560
},
{
"epoch": 2.022904100008742,
"grad_norm": 4.56058931350708,
"learning_rate": 1.3128697791576359e-05,
"loss": 0.1271,
"step": 11570
},
{
"epoch": 2.024652504589562,
"grad_norm": 2.9579317569732666,
"learning_rate": 1.31052662409935e-05,
"loss": 0.1311,
"step": 11580
},
{
"epoch": 2.026400909170382,
"grad_norm": 1.6421356201171875,
"learning_rate": 1.308183469041064e-05,
"loss": 0.1333,
"step": 11590
},
{
"epoch": 2.028149313751202,
"grad_norm": 3.032768726348877,
"learning_rate": 1.305840313982778e-05,
"loss": 0.1532,
"step": 11600
},
{
"epoch": 2.029897718332022,
"grad_norm": 3.688626766204834,
"learning_rate": 1.303497158924492e-05,
"loss": 0.1257,
"step": 11610
},
{
"epoch": 2.031646122912842,
"grad_norm": 1.906921148300171,
"learning_rate": 1.301154003866206e-05,
"loss": 0.1316,
"step": 11620
},
{
"epoch": 2.033394527493662,
"grad_norm": 1.6427615880966187,
"learning_rate": 1.29881084880792e-05,
"loss": 0.1173,
"step": 11630
},
{
"epoch": 2.035142932074482,
"grad_norm": 2.043480157852173,
"learning_rate": 1.296467693749634e-05,
"loss": 0.1348,
"step": 11640
},
{
"epoch": 2.036891336655302,
"grad_norm": 2.482868194580078,
"learning_rate": 1.2941245386913478e-05,
"loss": 0.1229,
"step": 11650
},
{
"epoch": 2.038639741236122,
"grad_norm": 3.356874465942383,
"learning_rate": 1.2917813836330619e-05,
"loss": 0.1163,
"step": 11660
},
{
"epoch": 2.040388145816942,
"grad_norm": 1.3664606809616089,
"learning_rate": 1.289438228574776e-05,
"loss": 0.1256,
"step": 11670
},
{
"epoch": 2.042136550397762,
"grad_norm": 2.038381338119507,
"learning_rate": 1.28709507351649e-05,
"loss": 0.1458,
"step": 11680
},
{
"epoch": 2.0438849549785822,
"grad_norm": 2.1345996856689453,
"learning_rate": 1.284751918458204e-05,
"loss": 0.1234,
"step": 11690
},
{
"epoch": 2.045633359559402,
"grad_norm": 1.422678828239441,
"learning_rate": 1.282408763399918e-05,
"loss": 0.117,
"step": 11700
},
{
"epoch": 2.047381764140222,
"grad_norm": 1.686991810798645,
"learning_rate": 1.280065608341632e-05,
"loss": 0.1389,
"step": 11710
},
{
"epoch": 2.049130168721042,
"grad_norm": 1.8135253190994263,
"learning_rate": 1.277722453283346e-05,
"loss": 0.1121,
"step": 11720
},
{
"epoch": 2.050878573301862,
"grad_norm": 1.7057915925979614,
"learning_rate": 1.2753792982250601e-05,
"loss": 0.1496,
"step": 11730
},
{
"epoch": 2.052626977882682,
"grad_norm": 2.205564022064209,
"learning_rate": 1.2730361431667742e-05,
"loss": 0.1321,
"step": 11740
},
{
"epoch": 2.054375382463502,
"grad_norm": 2.5153441429138184,
"learning_rate": 1.2706929881084883e-05,
"loss": 0.1138,
"step": 11750
},
{
"epoch": 2.056123787044322,
"grad_norm": 1.5551224946975708,
"learning_rate": 1.2683498330502021e-05,
"loss": 0.1317,
"step": 11760
},
{
"epoch": 2.057872191625142,
"grad_norm": 1.755265712738037,
"learning_rate": 1.2660066779919162e-05,
"loss": 0.1465,
"step": 11770
},
{
"epoch": 2.059620596205962,
"grad_norm": 1.6050928831100464,
"learning_rate": 1.2636635229336303e-05,
"loss": 0.1178,
"step": 11780
},
{
"epoch": 2.061369000786782,
"grad_norm": 0.8174687623977661,
"learning_rate": 1.2613203678753443e-05,
"loss": 0.1355,
"step": 11790
},
{
"epoch": 2.0631174053676022,
"grad_norm": 2.0146546363830566,
"learning_rate": 1.2589772128170584e-05,
"loss": 0.1395,
"step": 11800
},
{
"epoch": 2.0648658099484223,
"grad_norm": 3.2247419357299805,
"learning_rate": 1.2566340577587723e-05,
"loss": 0.1496,
"step": 11810
},
{
"epoch": 2.066614214529242,
"grad_norm": 2.9578254222869873,
"learning_rate": 1.2542909027004863e-05,
"loss": 0.1384,
"step": 11820
},
{
"epoch": 2.068362619110062,
"grad_norm": 1.7490684986114502,
"learning_rate": 1.2519477476422004e-05,
"loss": 0.1363,
"step": 11830
},
{
"epoch": 2.070111023690882,
"grad_norm": 1.931545376777649,
"learning_rate": 1.2496045925839144e-05,
"loss": 0.1652,
"step": 11840
},
{
"epoch": 2.071859428271702,
"grad_norm": 1.7428362369537354,
"learning_rate": 1.2472614375256285e-05,
"loss": 0.1309,
"step": 11850
},
{
"epoch": 2.073607832852522,
"grad_norm": 3.5689780712127686,
"learning_rate": 1.2449182824673424e-05,
"loss": 0.121,
"step": 11860
},
{
"epoch": 2.075356237433342,
"grad_norm": 1.8899126052856445,
"learning_rate": 1.2425751274090565e-05,
"loss": 0.1452,
"step": 11870
},
{
"epoch": 2.077104642014162,
"grad_norm": 2.014786720275879,
"learning_rate": 1.2402319723507703e-05,
"loss": 0.1327,
"step": 11880
},
{
"epoch": 2.078853046594982,
"grad_norm": 2.1480844020843506,
"learning_rate": 1.2378888172924844e-05,
"loss": 0.1378,
"step": 11890
},
{
"epoch": 2.080601451175802,
"grad_norm": 2.5206010341644287,
"learning_rate": 1.2355456622341983e-05,
"loss": 0.1684,
"step": 11900
},
{
"epoch": 2.0823498557566222,
"grad_norm": 2.49927020072937,
"learning_rate": 1.2332025071759123e-05,
"loss": 0.1402,
"step": 11910
},
{
"epoch": 2.0840982603374423,
"grad_norm": 1.4586842060089111,
"learning_rate": 1.2308593521176264e-05,
"loss": 0.1285,
"step": 11920
},
{
"epoch": 2.085846664918262,
"grad_norm": 1.743302583694458,
"learning_rate": 1.2285161970593405e-05,
"loss": 0.1117,
"step": 11930
},
{
"epoch": 2.087595069499082,
"grad_norm": 1.992079257965088,
"learning_rate": 1.2261730420010545e-05,
"loss": 0.1409,
"step": 11940
},
{
"epoch": 2.089343474079902,
"grad_norm": 1.6316864490509033,
"learning_rate": 1.2238298869427686e-05,
"loss": 0.1229,
"step": 11950
},
{
"epoch": 2.091091878660722,
"grad_norm": 1.8108314275741577,
"learning_rate": 1.2214867318844825e-05,
"loss": 0.1306,
"step": 11960
},
{
"epoch": 2.092840283241542,
"grad_norm": 1.7207168340682983,
"learning_rate": 1.2191435768261965e-05,
"loss": 0.1316,
"step": 11970
},
{
"epoch": 2.094588687822362,
"grad_norm": 2.498771905899048,
"learning_rate": 1.2168004217679106e-05,
"loss": 0.1368,
"step": 11980
},
{
"epoch": 2.096337092403182,
"grad_norm": 1.751407504081726,
"learning_rate": 1.2144572667096246e-05,
"loss": 0.1416,
"step": 11990
},
{
"epoch": 2.098085496984002,
"grad_norm": 1.6396225690841675,
"learning_rate": 1.2121141116513387e-05,
"loss": 0.1482,
"step": 12000
},
{
"epoch": 2.0987848588163303,
"eval_loss": 0.3506932854652405,
"eval_runtime": 1791.7022,
"eval_samples_per_second": 8.083,
"eval_steps_per_second": 1.011,
"step": 12004
},
{
"epoch": 2.099833901564822,
"grad_norm": 2.5797410011291504,
"learning_rate": 1.2097709565930526e-05,
"loss": 0.1339,
"step": 12010
},
{
"epoch": 2.1015823061456422,
"grad_norm": 3.2276177406311035,
"learning_rate": 1.2074278015347667e-05,
"loss": 0.132,
"step": 12020
},
{
"epoch": 2.1033307107264623,
"grad_norm": 1.973792314529419,
"learning_rate": 1.2050846464764807e-05,
"loss": 0.1232,
"step": 12030
},
{
"epoch": 2.1050791153072823,
"grad_norm": 1.5933607816696167,
"learning_rate": 1.2027414914181948e-05,
"loss": 0.1249,
"step": 12040
},
{
"epoch": 2.106827519888102,
"grad_norm": 2.8495397567749023,
"learning_rate": 1.2003983363599088e-05,
"loss": 0.1822,
"step": 12050
},
{
"epoch": 2.108575924468922,
"grad_norm": 2.044532060623169,
"learning_rate": 1.1980551813016227e-05,
"loss": 0.1298,
"step": 12060
},
{
"epoch": 2.110324329049742,
"grad_norm": 2.275475263595581,
"learning_rate": 1.1957120262433368e-05,
"loss": 0.1242,
"step": 12070
},
{
"epoch": 2.112072733630562,
"grad_norm": 2.919373035430908,
"learning_rate": 1.1933688711850508e-05,
"loss": 0.1314,
"step": 12080
},
{
"epoch": 2.113821138211382,
"grad_norm": 2.3667123317718506,
"learning_rate": 1.1910257161267649e-05,
"loss": 0.1338,
"step": 12090
},
{
"epoch": 2.115569542792202,
"grad_norm": 2.1862826347351074,
"learning_rate": 1.188682561068479e-05,
"loss": 0.1493,
"step": 12100
},
{
"epoch": 2.117317947373022,
"grad_norm": 2.557068347930908,
"learning_rate": 1.186339406010193e-05,
"loss": 0.1367,
"step": 12110
},
{
"epoch": 2.119066351953842,
"grad_norm": 2.359553813934326,
"learning_rate": 1.1839962509519067e-05,
"loss": 0.1549,
"step": 12120
},
{
"epoch": 2.1208147565346622,
"grad_norm": 1.9232834577560425,
"learning_rate": 1.1816530958936208e-05,
"loss": 0.132,
"step": 12130
},
{
"epoch": 2.1225631611154823,
"grad_norm": 1.867737889289856,
"learning_rate": 1.1793099408353348e-05,
"loss": 0.1318,
"step": 12140
},
{
"epoch": 2.1243115656963023,
"grad_norm": 2.331395149230957,
"learning_rate": 1.1769667857770487e-05,
"loss": 0.1428,
"step": 12150
},
{
"epoch": 2.1260599702771223,
"grad_norm": 4.814284324645996,
"learning_rate": 1.1746236307187628e-05,
"loss": 0.1303,
"step": 12160
},
{
"epoch": 2.127808374857942,
"grad_norm": 1.9960688352584839,
"learning_rate": 1.1722804756604769e-05,
"loss": 0.1218,
"step": 12170
},
{
"epoch": 2.129556779438762,
"grad_norm": 1.694406270980835,
"learning_rate": 1.1699373206021909e-05,
"loss": 0.1435,
"step": 12180
},
{
"epoch": 2.131305184019582,
"grad_norm": 2.0222890377044678,
"learning_rate": 1.167594165543905e-05,
"loss": 0.1449,
"step": 12190
},
{
"epoch": 2.133053588600402,
"grad_norm": 1.690718650817871,
"learning_rate": 1.165251010485619e-05,
"loss": 0.1452,
"step": 12200
},
{
"epoch": 2.134801993181222,
"grad_norm": 2.7942721843719482,
"learning_rate": 1.162907855427333e-05,
"loss": 0.1302,
"step": 12210
},
{
"epoch": 2.136550397762042,
"grad_norm": 2.274010181427002,
"learning_rate": 1.1607990158748756e-05,
"loss": 0.1433,
"step": 12220
},
{
"epoch": 2.138298802342862,
"grad_norm": 2.7134246826171875,
"learning_rate": 1.1584558608165896e-05,
"loss": 0.1297,
"step": 12230
},
{
"epoch": 2.140047206923682,
"grad_norm": 2.353001832962036,
"learning_rate": 1.1561127057583037e-05,
"loss": 0.1512,
"step": 12240
},
{
"epoch": 2.1417956115045023,
"grad_norm": 1.4769039154052734,
"learning_rate": 1.1537695507000177e-05,
"loss": 0.134,
"step": 12250
},
{
"epoch": 2.1435440160853223,
"grad_norm": 2.0705323219299316,
"learning_rate": 1.1514263956417318e-05,
"loss": 0.1114,
"step": 12260
},
{
"epoch": 2.1452924206661423,
"grad_norm": 2.1008477210998535,
"learning_rate": 1.1490832405834459e-05,
"loss": 0.1321,
"step": 12270
},
{
"epoch": 2.147040825246962,
"grad_norm": 1.299492597579956,
"learning_rate": 1.1467400855251598e-05,
"loss": 0.1174,
"step": 12280
},
{
"epoch": 2.148789229827782,
"grad_norm": 1.4347295761108398,
"learning_rate": 1.1443969304668738e-05,
"loss": 0.1049,
"step": 12290
},
{
"epoch": 2.150537634408602,
"grad_norm": 1.9035148620605469,
"learning_rate": 1.1420537754085879e-05,
"loss": 0.1322,
"step": 12300
},
{
"epoch": 2.152286038989422,
"grad_norm": 1.6976128816604614,
"learning_rate": 1.1397106203503016e-05,
"loss": 0.1165,
"step": 12310
},
{
"epoch": 2.154034443570242,
"grad_norm": 4.1831955909729,
"learning_rate": 1.1373674652920156e-05,
"loss": 0.1407,
"step": 12320
},
{
"epoch": 2.155782848151062,
"grad_norm": 1.985929012298584,
"learning_rate": 1.1350243102337297e-05,
"loss": 0.1475,
"step": 12330
},
{
"epoch": 2.157531252731882,
"grad_norm": 1.6526029109954834,
"learning_rate": 1.1326811551754438e-05,
"loss": 0.1476,
"step": 12340
},
{
"epoch": 2.159279657312702,
"grad_norm": 2.873518228530884,
"learning_rate": 1.1303380001171578e-05,
"loss": 0.1206,
"step": 12350
},
{
"epoch": 2.1610280618935223,
"grad_norm": 3.4296302795410156,
"learning_rate": 1.1279948450588719e-05,
"loss": 0.1226,
"step": 12360
},
{
"epoch": 2.1627764664743423,
"grad_norm": 4.733137607574463,
"learning_rate": 1.1256516900005858e-05,
"loss": 0.1748,
"step": 12370
},
{
"epoch": 2.1645248710551623,
"grad_norm": 1.851542353630066,
"learning_rate": 1.1233085349422998e-05,
"loss": 0.1483,
"step": 12380
},
{
"epoch": 2.166273275635982,
"grad_norm": 1.5884191989898682,
"learning_rate": 1.1209653798840139e-05,
"loss": 0.1383,
"step": 12390
},
{
"epoch": 2.168021680216802,
"grad_norm": 2.071790933609009,
"learning_rate": 1.118622224825728e-05,
"loss": 0.1534,
"step": 12400
},
{
"epoch": 2.169770084797622,
"grad_norm": 2.419951915740967,
"learning_rate": 1.116279069767442e-05,
"loss": 0.1177,
"step": 12410
},
{
"epoch": 2.171518489378442,
"grad_norm": 1.8410372734069824,
"learning_rate": 1.1139359147091559e-05,
"loss": 0.1211,
"step": 12420
},
{
"epoch": 2.173266893959262,
"grad_norm": 2.593384265899658,
"learning_rate": 1.11159275965087e-05,
"loss": 0.1481,
"step": 12430
},
{
"epoch": 2.175015298540082,
"grad_norm": 1.5354266166687012,
"learning_rate": 1.109249604592584e-05,
"loss": 0.1076,
"step": 12440
},
{
"epoch": 2.176763703120902,
"grad_norm": 2.548050880432129,
"learning_rate": 1.106906449534298e-05,
"loss": 0.1375,
"step": 12450
},
{
"epoch": 2.178512107701722,
"grad_norm": 2.1876955032348633,
"learning_rate": 1.1047976099818407e-05,
"loss": 0.1366,
"step": 12460
},
{
"epoch": 2.1802605122825423,
"grad_norm": 2.163553237915039,
"learning_rate": 1.1024544549235548e-05,
"loss": 0.1371,
"step": 12470
},
{
"epoch": 2.1820089168633623,
"grad_norm": 2.0126430988311768,
"learning_rate": 1.1001112998652688e-05,
"loss": 0.1324,
"step": 12480
},
{
"epoch": 2.1837573214441823,
"grad_norm": 3.8415536880493164,
"learning_rate": 1.0977681448069827e-05,
"loss": 0.1586,
"step": 12490
},
{
"epoch": 2.1855057260250024,
"grad_norm": 1.8899825811386108,
"learning_rate": 1.0954249897486966e-05,
"loss": 0.1255,
"step": 12500
},
{
"epoch": 2.1872541306058224,
"grad_norm": 1.9570482969284058,
"learning_rate": 1.0930818346904107e-05,
"loss": 0.1243,
"step": 12510
},
{
"epoch": 2.189002535186642,
"grad_norm": 0.8766506910324097,
"learning_rate": 1.0907386796321247e-05,
"loss": 0.1176,
"step": 12520
},
{
"epoch": 2.190750939767462,
"grad_norm": 0.7959820628166199,
"learning_rate": 1.0883955245738386e-05,
"loss": 0.1356,
"step": 12530
},
{
"epoch": 2.192499344348282,
"grad_norm": 2.3093817234039307,
"learning_rate": 1.0860523695155527e-05,
"loss": 0.1562,
"step": 12540
},
{
"epoch": 2.194247748929102,
"grad_norm": 2.737586259841919,
"learning_rate": 1.0837092144572667e-05,
"loss": 0.1354,
"step": 12550
},
{
"epoch": 2.195996153509922,
"grad_norm": 1.244848370552063,
"learning_rate": 1.0813660593989808e-05,
"loss": 0.1086,
"step": 12560
},
{
"epoch": 2.197744558090742,
"grad_norm": 1.8399499654769897,
"learning_rate": 1.0790229043406949e-05,
"loss": 0.1102,
"step": 12570
},
{
"epoch": 2.1994929626715622,
"grad_norm": 1.1957368850708008,
"learning_rate": 1.0766797492824087e-05,
"loss": 0.1208,
"step": 12580
},
{
"epoch": 2.2012413672523823,
"grad_norm": 3.7022900581359863,
"learning_rate": 1.0743365942241228e-05,
"loss": 0.141,
"step": 12590
},
{
"epoch": 2.2029897718332023,
"grad_norm": 2.1159870624542236,
"learning_rate": 1.0719934391658369e-05,
"loss": 0.1192,
"step": 12600
},
{
"epoch": 2.2047381764140224,
"grad_norm": 2.370440721511841,
"learning_rate": 1.069650284107551e-05,
"loss": 0.1346,
"step": 12610
},
{
"epoch": 2.2064865809948424,
"grad_norm": 1.9298361539840698,
"learning_rate": 1.067307129049265e-05,
"loss": 0.1471,
"step": 12620
},
{
"epoch": 2.208234985575662,
"grad_norm": 1.9388532638549805,
"learning_rate": 1.0649639739909789e-05,
"loss": 0.1305,
"step": 12630
},
{
"epoch": 2.209983390156482,
"grad_norm": 1.9433221817016602,
"learning_rate": 1.062620818932693e-05,
"loss": 0.126,
"step": 12640
},
{
"epoch": 2.211731794737302,
"grad_norm": 2.007972478866577,
"learning_rate": 1.060277663874407e-05,
"loss": 0.1147,
"step": 12650
},
{
"epoch": 2.213480199318122,
"grad_norm": 1.1948915719985962,
"learning_rate": 1.057934508816121e-05,
"loss": 0.1036,
"step": 12660
},
{
"epoch": 2.215228603898942,
"grad_norm": 1.7573200464248657,
"learning_rate": 1.0555913537578351e-05,
"loss": 0.1361,
"step": 12670
},
{
"epoch": 2.216977008479762,
"grad_norm": 1.6941572427749634,
"learning_rate": 1.0532481986995492e-05,
"loss": 0.1513,
"step": 12680
},
{
"epoch": 2.2187254130605822,
"grad_norm": 2.2214457988739014,
"learning_rate": 1.050905043641263e-05,
"loss": 0.1638,
"step": 12690
},
{
"epoch": 2.2204738176414023,
"grad_norm": 2.1484594345092773,
"learning_rate": 1.0485618885829771e-05,
"loss": 0.1191,
"step": 12700
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.3132258653640747,
"learning_rate": 1.0462187335246912e-05,
"loss": 0.153,
"step": 12710
},
{
"epoch": 2.2239706268030424,
"grad_norm": 1.7616349458694458,
"learning_rate": 1.0438755784664052e-05,
"loss": 0.1358,
"step": 12720
},
{
"epoch": 2.2257190313838624,
"grad_norm": 1.7969423532485962,
"learning_rate": 1.0415324234081193e-05,
"loss": 0.096,
"step": 12730
},
{
"epoch": 2.227467435964682,
"grad_norm": 1.4737247228622437,
"learning_rate": 1.039189268349833e-05,
"loss": 0.1194,
"step": 12740
},
{
"epoch": 2.229215840545502,
"grad_norm": 1.7292555570602417,
"learning_rate": 1.036846113291547e-05,
"loss": 0.1314,
"step": 12750
},
{
"epoch": 2.230964245126322,
"grad_norm": 2.5975306034088135,
"learning_rate": 1.0345029582332611e-05,
"loss": 0.1552,
"step": 12760
},
{
"epoch": 2.232712649707142,
"grad_norm": 1.841124176979065,
"learning_rate": 1.0321598031749752e-05,
"loss": 0.15,
"step": 12770
},
{
"epoch": 2.234461054287962,
"grad_norm": 2.365156888961792,
"learning_rate": 1.029816648116689e-05,
"loss": 0.125,
"step": 12780
},
{
"epoch": 2.236209458868782,
"grad_norm": 2.7648537158966064,
"learning_rate": 1.0274734930584031e-05,
"loss": 0.1316,
"step": 12790
},
{
"epoch": 2.2379578634496022,
"grad_norm": 2.3384718894958496,
"learning_rate": 1.0251303380001172e-05,
"loss": 0.1273,
"step": 12800
},
{
"epoch": 2.2397062680304223,
"grad_norm": 1.648476004600525,
"learning_rate": 1.0227871829418312e-05,
"loss": 0.1088,
"step": 12810
},
{
"epoch": 2.2414546726112423,
"grad_norm": 1.8477935791015625,
"learning_rate": 1.0204440278835453e-05,
"loss": 0.1511,
"step": 12820
},
{
"epoch": 2.2432030771920624,
"grad_norm": 1.7724605798721313,
"learning_rate": 1.0181008728252592e-05,
"loss": 0.1376,
"step": 12830
},
{
"epoch": 2.2449514817728824,
"grad_norm": 3.3595921993255615,
"learning_rate": 1.0157577177669733e-05,
"loss": 0.1095,
"step": 12840
},
{
"epoch": 2.2466998863537024,
"grad_norm": 2.554070472717285,
"learning_rate": 1.0134145627086873e-05,
"loss": 0.1462,
"step": 12850
},
{
"epoch": 2.2484482909345225,
"grad_norm": 2.068704605102539,
"learning_rate": 1.0110714076504014e-05,
"loss": 0.1277,
"step": 12860
},
{
"epoch": 2.250196695515342,
"grad_norm": 1.4557998180389404,
"learning_rate": 1.0087282525921154e-05,
"loss": 0.1457,
"step": 12870
},
{
"epoch": 2.251945100096162,
"grad_norm": 1.990447998046875,
"learning_rate": 1.0063850975338295e-05,
"loss": 0.1192,
"step": 12880
},
{
"epoch": 2.253693504676982,
"grad_norm": 1.5465627908706665,
"learning_rate": 1.0040419424755434e-05,
"loss": 0.1253,
"step": 12890
},
{
"epoch": 2.255441909257802,
"grad_norm": 2.6916444301605225,
"learning_rate": 1.0016987874172574e-05,
"loss": 0.1491,
"step": 12900
},
{
"epoch": 2.2571903138386222,
"grad_norm": 1.3198771476745605,
"learning_rate": 9.993556323589715e-06,
"loss": 0.1174,
"step": 12910
},
{
"epoch": 2.2589387184194423,
"grad_norm": 4.6871256828308105,
"learning_rate": 9.970124773006854e-06,
"loss": 0.1412,
"step": 12920
},
{
"epoch": 2.2606871230002623,
"grad_norm": 1.9112443923950195,
"learning_rate": 9.946693222423994e-06,
"loss": 0.1339,
"step": 12930
},
{
"epoch": 2.2624355275810824,
"grad_norm": 2.208272933959961,
"learning_rate": 9.923261671841135e-06,
"loss": 0.1121,
"step": 12940
},
{
"epoch": 2.2641839321619024,
"grad_norm": 1.507631540298462,
"learning_rate": 9.899830121258276e-06,
"loss": 0.1426,
"step": 12950
},
{
"epoch": 2.2659323367427224,
"grad_norm": 1.6952465772628784,
"learning_rate": 9.876398570675414e-06,
"loss": 0.1428,
"step": 12960
},
{
"epoch": 2.2676807413235425,
"grad_norm": 1.4942928552627563,
"learning_rate": 9.852967020092555e-06,
"loss": 0.1296,
"step": 12970
},
{
"epoch": 2.269429145904362,
"grad_norm": 2.4580042362213135,
"learning_rate": 9.829535469509696e-06,
"loss": 0.1206,
"step": 12980
},
{
"epoch": 2.271177550485182,
"grad_norm": 2.046311855316162,
"learning_rate": 9.806103918926836e-06,
"loss": 0.1252,
"step": 12990
},
{
"epoch": 2.272925955066002,
"grad_norm": 1.9215744733810425,
"learning_rate": 9.782672368343977e-06,
"loss": 0.1239,
"step": 13000
},
{
"epoch": 2.274674359646822,
"grad_norm": 1.7655632495880127,
"learning_rate": 9.759240817761116e-06,
"loss": 0.1512,
"step": 13010
},
{
"epoch": 2.2764227642276422,
"grad_norm": 1.9617197513580322,
"learning_rate": 9.735809267178256e-06,
"loss": 0.0958,
"step": 13020
},
{
"epoch": 2.2781711688084623,
"grad_norm": 2.193418502807617,
"learning_rate": 9.712377716595397e-06,
"loss": 0.1101,
"step": 13030
},
{
"epoch": 2.2799195733892823,
"grad_norm": 1.809605598449707,
"learning_rate": 9.688946166012536e-06,
"loss": 0.1691,
"step": 13040
},
{
"epoch": 2.2816679779701023,
"grad_norm": 1.4769902229309082,
"learning_rate": 9.665514615429676e-06,
"loss": 0.1594,
"step": 13050
},
{
"epoch": 2.2834163825509224,
"grad_norm": 3.269402027130127,
"learning_rate": 9.642083064846817e-06,
"loss": 0.1122,
"step": 13060
},
{
"epoch": 2.2851647871317424,
"grad_norm": 1.951167345046997,
"learning_rate": 9.618651514263957e-06,
"loss": 0.1327,
"step": 13070
},
{
"epoch": 2.2869131917125625,
"grad_norm": 1.3698724508285522,
"learning_rate": 9.595219963681096e-06,
"loss": 0.1447,
"step": 13080
},
{
"epoch": 2.288661596293382,
"grad_norm": 2.255122423171997,
"learning_rate": 9.571788413098237e-06,
"loss": 0.1374,
"step": 13090
},
{
"epoch": 2.290410000874202,
"grad_norm": 2.1527059078216553,
"learning_rate": 9.548356862515378e-06,
"loss": 0.153,
"step": 13100
},
{
"epoch": 2.292158405455022,
"grad_norm": 4.404884338378906,
"learning_rate": 9.524925311932518e-06,
"loss": 0.1195,
"step": 13110
},
{
"epoch": 2.293906810035842,
"grad_norm": 1.2485246658325195,
"learning_rate": 9.501493761349659e-06,
"loss": 0.1142,
"step": 13120
},
{
"epoch": 2.295655214616662,
"grad_norm": 1.492285966873169,
"learning_rate": 9.4780622107668e-06,
"loss": 0.1213,
"step": 13130
},
{
"epoch": 2.2974036191974823,
"grad_norm": 2.0897228717803955,
"learning_rate": 9.454630660183938e-06,
"loss": 0.1439,
"step": 13140
},
{
"epoch": 2.2991520237783023,
"grad_norm": 1.5162708759307861,
"learning_rate": 9.431199109601079e-06,
"loss": 0.1107,
"step": 13150
},
{
"epoch": 2.3009004283591223,
"grad_norm": 2.7167162895202637,
"learning_rate": 9.407767559018218e-06,
"loss": 0.1276,
"step": 13160
},
{
"epoch": 2.3026488329399424,
"grad_norm": 1.5149487257003784,
"learning_rate": 9.384336008435358e-06,
"loss": 0.1313,
"step": 13170
},
{
"epoch": 2.3043972375207624,
"grad_norm": 1.6607292890548706,
"learning_rate": 9.360904457852499e-06,
"loss": 0.1323,
"step": 13180
},
{
"epoch": 2.3061456421015825,
"grad_norm": 2.0336623191833496,
"learning_rate": 9.33747290726964e-06,
"loss": 0.1125,
"step": 13190
},
{
"epoch": 2.3078940466824025,
"grad_norm": 2.8644120693206787,
"learning_rate": 9.31404135668678e-06,
"loss": 0.1332,
"step": 13200
},
{
"epoch": 2.3096424512632225,
"grad_norm": 1.8403265476226807,
"learning_rate": 9.290609806103919e-06,
"loss": 0.1344,
"step": 13210
},
{
"epoch": 2.311390855844042,
"grad_norm": 2.001629114151001,
"learning_rate": 9.26717825552106e-06,
"loss": 0.1537,
"step": 13220
},
{
"epoch": 2.313139260424862,
"grad_norm": 1.4118083715438843,
"learning_rate": 9.2437467049382e-06,
"loss": 0.1025,
"step": 13230
},
{
"epoch": 2.314887665005682,
"grad_norm": 1.8369883298873901,
"learning_rate": 9.22031515435534e-06,
"loss": 0.1232,
"step": 13240
},
{
"epoch": 2.3166360695865023,
"grad_norm": 1.7884759902954102,
"learning_rate": 9.196883603772481e-06,
"loss": 0.1231,
"step": 13250
},
{
"epoch": 2.3183844741673223,
"grad_norm": 2.710341453552246,
"learning_rate": 9.17345205318962e-06,
"loss": 0.1242,
"step": 13260
},
{
"epoch": 2.3201328787481423,
"grad_norm": 2.240281105041504,
"learning_rate": 9.15002050260676e-06,
"loss": 0.1504,
"step": 13270
},
{
"epoch": 2.3218812833289624,
"grad_norm": 1.7557798624038696,
"learning_rate": 9.1265889520239e-06,
"loss": 0.1403,
"step": 13280
},
{
"epoch": 2.3236296879097824,
"grad_norm": 1.6217379570007324,
"learning_rate": 9.10315740144104e-06,
"loss": 0.1245,
"step": 13290
},
{
"epoch": 2.3253780924906025,
"grad_norm": 1.895262360572815,
"learning_rate": 9.07972585085818e-06,
"loss": 0.1342,
"step": 13300
},
{
"epoch": 2.3271264970714225,
"grad_norm": 1.5638673305511475,
"learning_rate": 9.056294300275321e-06,
"loss": 0.1352,
"step": 13310
},
{
"epoch": 2.3288749016522425,
"grad_norm": 2.0554001331329346,
"learning_rate": 9.032862749692462e-06,
"loss": 0.1365,
"step": 13320
},
{
"epoch": 2.330623306233062,
"grad_norm": 6.10771369934082,
"learning_rate": 9.009431199109603e-06,
"loss": 0.1243,
"step": 13330
},
{
"epoch": 2.332371710813882,
"grad_norm": 2.100159168243408,
"learning_rate": 8.985999648526741e-06,
"loss": 0.1182,
"step": 13340
},
{
"epoch": 2.334120115394702,
"grad_norm": 3.7410552501678467,
"learning_rate": 8.962568097943882e-06,
"loss": 0.1278,
"step": 13350
},
{
"epoch": 2.3358685199755223,
"grad_norm": 1.9370155334472656,
"learning_rate": 8.939136547361023e-06,
"loss": 0.1128,
"step": 13360
},
{
"epoch": 2.3376169245563423,
"grad_norm": 1.367945671081543,
"learning_rate": 8.915704996778163e-06,
"loss": 0.1242,
"step": 13370
},
{
"epoch": 2.3393653291371623,
"grad_norm": 2.3016417026519775,
"learning_rate": 8.892273446195304e-06,
"loss": 0.1357,
"step": 13380
},
{
"epoch": 2.3411137337179824,
"grad_norm": 1.5979362726211548,
"learning_rate": 8.868841895612443e-06,
"loss": 0.1251,
"step": 13390
},
{
"epoch": 2.3428621382988024,
"grad_norm": 1.9017846584320068,
"learning_rate": 8.845410345029583e-06,
"loss": 0.1266,
"step": 13400
},
{
"epoch": 2.3446105428796225,
"grad_norm": 2.5822110176086426,
"learning_rate": 8.821978794446722e-06,
"loss": 0.1234,
"step": 13410
},
{
"epoch": 2.3463589474604425,
"grad_norm": 2.033761501312256,
"learning_rate": 8.798547243863863e-06,
"loss": 0.126,
"step": 13420
},
{
"epoch": 2.3481073520412625,
"grad_norm": 1.3017164468765259,
"learning_rate": 8.775115693281003e-06,
"loss": 0.1296,
"step": 13430
},
{
"epoch": 2.349855756622082,
"grad_norm": 4.223972320556641,
"learning_rate": 8.751684142698144e-06,
"loss": 0.1409,
"step": 13440
},
{
"epoch": 2.351604161202902,
"grad_norm": 1.4217913150787354,
"learning_rate": 8.728252592115284e-06,
"loss": 0.1412,
"step": 13450
},
{
"epoch": 2.353352565783722,
"grad_norm": 2.801734685897827,
"learning_rate": 8.704821041532423e-06,
"loss": 0.1182,
"step": 13460
},
{
"epoch": 2.3551009703645422,
"grad_norm": 2.7878241539001465,
"learning_rate": 8.681389490949564e-06,
"loss": 0.1563,
"step": 13470
},
{
"epoch": 2.3568493749453623,
"grad_norm": 1.362971544265747,
"learning_rate": 8.657957940366705e-06,
"loss": 0.1282,
"step": 13480
},
{
"epoch": 2.3585977795261823,
"grad_norm": 1.0237337350845337,
"learning_rate": 8.634526389783845e-06,
"loss": 0.1142,
"step": 13490
},
{
"epoch": 2.3603461841070024,
"grad_norm": 1.7820348739624023,
"learning_rate": 8.611094839200986e-06,
"loss": 0.1226,
"step": 13500
},
{
"epoch": 2.3620945886878224,
"grad_norm": 4.237933158874512,
"learning_rate": 8.587663288618126e-06,
"loss": 0.1334,
"step": 13510
},
{
"epoch": 2.3638429932686424,
"grad_norm": 2.8852193355560303,
"learning_rate": 8.564231738035265e-06,
"loss": 0.1396,
"step": 13520
},
{
"epoch": 2.3655913978494625,
"grad_norm": 1.597548484802246,
"learning_rate": 8.540800187452404e-06,
"loss": 0.1273,
"step": 13530
},
{
"epoch": 2.3673398024302825,
"grad_norm": 2.1600868701934814,
"learning_rate": 8.517368636869545e-06,
"loss": 0.1395,
"step": 13540
},
{
"epoch": 2.369088207011102,
"grad_norm": 2.460857391357422,
"learning_rate": 8.493937086286685e-06,
"loss": 0.1441,
"step": 13550
},
{
"epoch": 2.3708366115919226,
"grad_norm": 1.3445568084716797,
"learning_rate": 8.470505535703826e-06,
"loss": 0.1064,
"step": 13560
},
{
"epoch": 2.372585016172742,
"grad_norm": 2.0153238773345947,
"learning_rate": 8.447073985120966e-06,
"loss": 0.1093,
"step": 13570
},
{
"epoch": 2.3743334207535622,
"grad_norm": 3.338841438293457,
"learning_rate": 8.423642434538107e-06,
"loss": 0.1395,
"step": 13580
},
{
"epoch": 2.3760818253343823,
"grad_norm": 1.538512110710144,
"learning_rate": 8.400210883955246e-06,
"loss": 0.1251,
"step": 13590
},
{
"epoch": 2.3778302299152023,
"grad_norm": 1.4861085414886475,
"learning_rate": 8.376779333372386e-06,
"loss": 0.1409,
"step": 13600
},
{
"epoch": 2.3795786344960224,
"grad_norm": 2.402609348297119,
"learning_rate": 8.353347782789527e-06,
"loss": 0.1561,
"step": 13610
},
{
"epoch": 2.3813270390768424,
"grad_norm": 2.3510336875915527,
"learning_rate": 8.329916232206668e-06,
"loss": 0.1369,
"step": 13620
},
{
"epoch": 2.3830754436576624,
"grad_norm": 2.3919291496276855,
"learning_rate": 8.306484681623808e-06,
"loss": 0.149,
"step": 13630
},
{
"epoch": 2.3848238482384825,
"grad_norm": 2.825187921524048,
"learning_rate": 8.283053131040947e-06,
"loss": 0.1312,
"step": 13640
},
{
"epoch": 2.3865722528193025,
"grad_norm": 1.6704410314559937,
"learning_rate": 8.259621580458088e-06,
"loss": 0.146,
"step": 13650
},
{
"epoch": 2.3883206574001226,
"grad_norm": 1.7998132705688477,
"learning_rate": 8.236190029875227e-06,
"loss": 0.1159,
"step": 13660
},
{
"epoch": 2.3900690619809426,
"grad_norm": 1.5917441844940186,
"learning_rate": 8.212758479292367e-06,
"loss": 0.1266,
"step": 13670
},
{
"epoch": 2.391817466561762,
"grad_norm": 1.5092450380325317,
"learning_rate": 8.189326928709508e-06,
"loss": 0.1542,
"step": 13680
},
{
"epoch": 2.3935658711425822,
"grad_norm": 3.4303741455078125,
"learning_rate": 8.165895378126648e-06,
"loss": 0.1276,
"step": 13690
},
{
"epoch": 2.3953142757234023,
"grad_norm": 1.8061717748641968,
"learning_rate": 8.142463827543789e-06,
"loss": 0.1133,
"step": 13700
},
{
"epoch": 2.3970626803042223,
"grad_norm": 2.527982711791992,
"learning_rate": 8.119032276960928e-06,
"loss": 0.1493,
"step": 13710
},
{
"epoch": 2.3988110848850424,
"grad_norm": 1.5481889247894287,
"learning_rate": 8.095600726378068e-06,
"loss": 0.1252,
"step": 13720
},
{
"epoch": 2.4005594894658624,
"grad_norm": 4.928106784820557,
"learning_rate": 8.072169175795209e-06,
"loss": 0.1466,
"step": 13730
},
{
"epoch": 2.4023078940466824,
"grad_norm": 1.5288153886795044,
"learning_rate": 8.04873762521235e-06,
"loss": 0.1263,
"step": 13740
},
{
"epoch": 2.4040562986275025,
"grad_norm": 1.9449552297592163,
"learning_rate": 8.02530607462949e-06,
"loss": 0.102,
"step": 13750
},
{
"epoch": 2.4058047032083225,
"grad_norm": 2.1797351837158203,
"learning_rate": 8.001874524046629e-06,
"loss": 0.1107,
"step": 13760
},
{
"epoch": 2.4075531077891426,
"grad_norm": 1.4594932794570923,
"learning_rate": 7.97844297346377e-06,
"loss": 0.1392,
"step": 13770
},
{
"epoch": 2.4093015123699626,
"grad_norm": 2.8186988830566406,
"learning_rate": 7.95501142288091e-06,
"loss": 0.1256,
"step": 13780
},
{
"epoch": 2.411049916950782,
"grad_norm": 2.1541221141815186,
"learning_rate": 7.931579872298049e-06,
"loss": 0.1344,
"step": 13790
},
{
"epoch": 2.4127983215316022,
"grad_norm": 1.6721254587173462,
"learning_rate": 7.90814832171519e-06,
"loss": 0.1142,
"step": 13800
},
{
"epoch": 2.4145467261124223,
"grad_norm": 1.4992693662643433,
"learning_rate": 7.88471677113233e-06,
"loss": 0.1053,
"step": 13810
},
{
"epoch": 2.4162951306932423,
"grad_norm": 2.316558361053467,
"learning_rate": 7.861285220549471e-06,
"loss": 0.1284,
"step": 13820
},
{
"epoch": 2.4180435352740624,
"grad_norm": 1.6623950004577637,
"learning_rate": 7.837853669966611e-06,
"loss": 0.1349,
"step": 13830
},
{
"epoch": 2.4197919398548824,
"grad_norm": 1.9431278705596924,
"learning_rate": 7.81442211938375e-06,
"loss": 0.1378,
"step": 13840
},
{
"epoch": 2.4215403444357024,
"grad_norm": 2.124650478363037,
"learning_rate": 7.790990568800891e-06,
"loss": 0.1406,
"step": 13850
},
{
"epoch": 2.4232887490165225,
"grad_norm": 2.7442266941070557,
"learning_rate": 7.767559018218032e-06,
"loss": 0.1208,
"step": 13860
},
{
"epoch": 2.4250371535973425,
"grad_norm": 1.9426761865615845,
"learning_rate": 7.744127467635172e-06,
"loss": 0.1504,
"step": 13870
},
{
"epoch": 2.4267855581781625,
"grad_norm": 1.6392885446548462,
"learning_rate": 7.720695917052311e-06,
"loss": 0.1389,
"step": 13880
},
{
"epoch": 2.4285339627589826,
"grad_norm": 2.3085715770721436,
"learning_rate": 7.697264366469452e-06,
"loss": 0.1455,
"step": 13890
},
{
"epoch": 2.430282367339802,
"grad_norm": 1.8429359197616577,
"learning_rate": 7.673832815886592e-06,
"loss": 0.1173,
"step": 13900
},
{
"epoch": 2.4320307719206227,
"grad_norm": 1.861633062362671,
"learning_rate": 7.650401265303731e-06,
"loss": 0.1499,
"step": 13910
},
{
"epoch": 2.4337791765014423,
"grad_norm": 1.6089733839035034,
"learning_rate": 7.6269697147208725e-06,
"loss": 0.1182,
"step": 13920
},
{
"epoch": 2.4355275810822623,
"grad_norm": 3.4693145751953125,
"learning_rate": 7.603538164138012e-06,
"loss": 0.1621,
"step": 13930
},
{
"epoch": 2.4372759856630823,
"grad_norm": 2.4453048706054688,
"learning_rate": 7.580106613555153e-06,
"loss": 0.1203,
"step": 13940
},
{
"epoch": 2.4390243902439024,
"grad_norm": 2.6296722888946533,
"learning_rate": 7.5566750629722926e-06,
"loss": 0.1384,
"step": 13950
},
{
"epoch": 2.4407727948247224,
"grad_norm": 2.1992902755737305,
"learning_rate": 7.533243512389433e-06,
"loss": 0.1251,
"step": 13960
},
{
"epoch": 2.4425211994055425,
"grad_norm": 2.368910551071167,
"learning_rate": 7.509811961806574e-06,
"loss": 0.1267,
"step": 13970
},
{
"epoch": 2.4442696039863625,
"grad_norm": 2.3806991577148438,
"learning_rate": 7.4863804112237135e-06,
"loss": 0.1169,
"step": 13980
},
{
"epoch": 2.4460180085671825,
"grad_norm": 0.9917481541633606,
"learning_rate": 7.462948860640854e-06,
"loss": 0.1358,
"step": 13990
},
{
"epoch": 2.4477664131480026,
"grad_norm": 1.9190022945404053,
"learning_rate": 7.439517310057993e-06,
"loss": 0.1495,
"step": 14000
},
{
"epoch": 2.4495148177288226,
"grad_norm": 1.8634378910064697,
"learning_rate": 7.4160857594751335e-06,
"loss": 0.1033,
"step": 14010
},
{
"epoch": 2.4512632223096427,
"grad_norm": 2.452369451522827,
"learning_rate": 7.392654208892274e-06,
"loss": 0.1424,
"step": 14020
},
{
"epoch": 2.4530116268904623,
"grad_norm": 1.8152307271957397,
"learning_rate": 7.369222658309414e-06,
"loss": 0.1383,
"step": 14030
},
{
"epoch": 2.4547600314712823,
"grad_norm": 2.709925651550293,
"learning_rate": 7.3457911077265544e-06,
"loss": 0.1052,
"step": 14040
},
{
"epoch": 2.4565084360521023,
"grad_norm": 1.8516377210617065,
"learning_rate": 7.322359557143694e-06,
"loss": 0.0941,
"step": 14050
},
{
"epoch": 2.4582568406329224,
"grad_norm": 1.1404094696044922,
"learning_rate": 7.298928006560835e-06,
"loss": 0.1253,
"step": 14060
},
{
"epoch": 2.4600052452137424,
"grad_norm": 1.4594693183898926,
"learning_rate": 7.275496455977975e-06,
"loss": 0.1384,
"step": 14070
},
{
"epoch": 2.4617536497945625,
"grad_norm": 2.0537307262420654,
"learning_rate": 7.252064905395115e-06,
"loss": 0.12,
"step": 14080
},
{
"epoch": 2.4635020543753825,
"grad_norm": 1.586864709854126,
"learning_rate": 7.228633354812256e-06,
"loss": 0.135,
"step": 14090
},
{
"epoch": 2.4652504589562025,
"grad_norm": 1.7124476432800293,
"learning_rate": 7.205201804229396e-06,
"loss": 0.1189,
"step": 14100
}
],
"logging_steps": 10,
"max_steps": 17157,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.839868867932848e+18,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}